implemented safety features to prevent system damage

This commit is contained in:
2026-02-27 02:47:51 +01:00
parent 4c4026a600
commit f0925a3ab3
9 changed files with 373 additions and 83 deletions

View File

@@ -14,9 +14,10 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::path::PathBuf;
use crate::sal::traits::{PlatformSal, SafetyStatus};
use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
use crate::sal::heuristic::discovery::SystemFactSheet;
use crate::load::Workload;
use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts};
use crate::load::{Workload, IntensityProfile};
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
@@ -44,6 +45,9 @@ pub struct BenchmarkOrchestrator {
/// CLI override for the configuration output path.
optional_config_out: Option<PathBuf>,
/// The safety membrane protecting the system.
safeguard: Option<HardwareStateGuard>,
/// Sliding window of power readings (Watts).
history_watts: VecDeque<f32>,
/// Sliding window of temperature readings (Celsius).
@@ -97,12 +101,13 @@ impl BenchmarkOrchestrator {
emergency_abort: Arc::new(AtomicBool::new(false)),
emergency_reason: Arc::new(Mutex::new(None)),
optional_config_out,
safeguard: None,
}
}
/// Executes the full benchmark sequence.
///
/// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop]
/// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload]
/// are called regardless of whether the benchmark succeeds or fails.
pub fn run(&mut self) -> Result<OptimizationResult> {
self.log("Starting ember-tune Benchmark Sequence.")?;
@@ -111,8 +116,16 @@ impl BenchmarkOrchestrator {
let result = self.execute_benchmark();
// --- MANDATORY CLEANUP ---
self.log("Benchmark sequence finished. Restoring hardware defaults...")?;
let _ = self.workload.stop();
let _ = self.workload.stop_workload();
if let Some(mut sg) = self.safeguard.take() {
if let Err(e) = sg.release() {
anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e);
}
}
if let Err(e) = self.sal.restore() {
anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e);
}
@@ -125,6 +138,19 @@ impl BenchmarkOrchestrator {
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?;
// 1. Snapshot & Arm Safeguard
let mut target_files = self.facts.rapl_paths.iter()
.map(|p| p.join("constraint_0_power_limit_uw"))
.collect::<Vec<_>>();
target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw")));
if let Some(tp) = self.facts.paths.configs.get("throttled") {
target_files.push(tp.clone());
}
let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()];
self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?);
// Phase 1: Audit & Baseline
self.phase = BenchmarkPhase::Auditing;
for step in self.sal.audit() {
if let Err(e) = step.outcome {
@@ -132,9 +158,11 @@ impl BenchmarkOrchestrator {
}
}
self.workload.initialize().context("Failed to initialize workload")?;
self.log("Suppressing background services (tlp, thermald)...")?;
self.sal.suppress().context("Failed to suppress background services")?;
// Baseline (Idle Calibration)
self.phase = BenchmarkPhase::IdleCalibration;
self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?;
self.sal.set_fan_mode("auto")?;
@@ -152,6 +180,7 @@ impl BenchmarkOrchestrator {
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
// Phase 2: Stress Stepping
self.phase = BenchmarkPhase::StressTesting;
self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
self.sal.set_fan_mode("max")?;
@@ -159,10 +188,16 @@ impl BenchmarkOrchestrator {
let steps = bench_cfg.power_steps_watts.clone();
for &pl in &steps {
self.log(&format!("Testing PL1 = {:.0}W...", pl))?;
self.sal.set_sustained_power_limit(pl)?;
self.sal.set_burst_power_limit(pl + 5.0)?;
self.workload.start(num_cpus::get(), 100)?;
let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?;
let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?;
self.sal.set_sustained_power_limit(pl1_uw)?;
self.sal.set_burst_power_limit(pl2_uw)?;
self.workload.run_workload(
Duration::from_secs(bench_cfg.stress_duration_max_s),
IntensityProfile { threads: num_cpus::get(), load_percentage: 100 }
)?;
let step_start = Instant::now();
let mut step_temps = VecDeque::with_capacity(30);
@@ -188,26 +223,28 @@ impl BenchmarkOrchestrator {
thread::sleep(Duration::from_millis(500));
}
// Record data point
let avg_p = self.sal.get_power_w().unwrap_or(0.0);
let avg_t = self.sal.get_temp().unwrap_or(0.0);
let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0);
let fans = self.sal.get_fan_rpms().unwrap_or_default();
let primary_fan = fans.first().cloned().unwrap_or(0);
let tp = self.workload.get_throughput().unwrap_or(0.0);
let metrics = self.workload.get_current_metrics().unwrap_or_default();
self.profile.points.push(ThermalPoint {
power_w: avg_p,
temp_c: avg_t,
freq_mhz: avg_f,
fan_rpm: primary_fan,
throughput: tp,
throughput: metrics.primary_ops_per_sec,
});
self.workload.stop()?;
self.workload.stop_workload()?;
self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?;
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
}
// Phase 4: Physical Modeling
self.phase = BenchmarkPhase::PhysicalModeling;
self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?;
@@ -218,6 +255,7 @@ impl BenchmarkOrchestrator {
thread::sleep(Duration::from_secs(3));
// Phase 5: Finalizing
self.phase = BenchmarkPhase::Finalizing;
self.log("Benchmark sequence complete. Generating configurations...")?;
@@ -227,8 +265,6 @@ impl BenchmarkOrchestrator {
trip_temp: res.max_temp_c.max(95.0),
};
// 1. Throttled (Merged if exists)
// PRIORITY: optional_config_out > facts discovery > fallback
let throttled_path = self.optional_config_out.clone()
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
@@ -238,7 +274,6 @@ impl BenchmarkOrchestrator {
res.config_paths.insert("throttled".to_string(), path.clone());
}
// 2. i8kmon
if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
t_ambient: self.profile.ambient_temp,