implemented safety features to prevent system damage
This commit is contained in:
@@ -14,9 +14,10 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::sal::traits::{PlatformSal, SafetyStatus};
|
||||
use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
use crate::load::Workload;
|
||||
use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts};
|
||||
use crate::load::{Workload, IntensityProfile};
|
||||
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
||||
|
||||
@@ -44,6 +45,9 @@ pub struct BenchmarkOrchestrator {
|
||||
/// CLI override for the configuration output path.
|
||||
optional_config_out: Option<PathBuf>,
|
||||
|
||||
/// The safety membrane protecting the system.
|
||||
safeguard: Option<HardwareStateGuard>,
|
||||
|
||||
/// Sliding window of power readings (Watts).
|
||||
history_watts: VecDeque<f32>,
|
||||
/// Sliding window of temperature readings (Celsius).
|
||||
@@ -97,12 +101,13 @@ impl BenchmarkOrchestrator {
|
||||
emergency_abort: Arc::new(AtomicBool::new(false)),
|
||||
emergency_reason: Arc::new(Mutex::new(None)),
|
||||
optional_config_out,
|
||||
safeguard: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Executes the full benchmark sequence.
|
||||
///
|
||||
/// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop]
|
||||
/// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload]
|
||||
/// are called regardless of whether the benchmark succeeds or fails.
|
||||
pub fn run(&mut self) -> Result<OptimizationResult> {
|
||||
self.log("Starting ember-tune Benchmark Sequence.")?;
|
||||
@@ -111,8 +116,16 @@ impl BenchmarkOrchestrator {
|
||||
|
||||
let result = self.execute_benchmark();
|
||||
|
||||
// --- MANDATORY CLEANUP ---
|
||||
self.log("Benchmark sequence finished. Restoring hardware defaults...")?;
|
||||
let _ = self.workload.stop();
|
||||
let _ = self.workload.stop_workload();
|
||||
|
||||
if let Some(mut sg) = self.safeguard.take() {
|
||||
if let Err(e) = sg.release() {
|
||||
anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = self.sal.restore() {
|
||||
anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e);
|
||||
}
|
||||
@@ -125,6 +138,19 @@ impl BenchmarkOrchestrator {
|
||||
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
|
||||
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?;
|
||||
|
||||
// 1. Snapshot & Arm Safeguard
|
||||
let mut target_files = self.facts.rapl_paths.iter()
|
||||
.map(|p| p.join("constraint_0_power_limit_uw"))
|
||||
.collect::<Vec<_>>();
|
||||
target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw")));
|
||||
if let Some(tp) = self.facts.paths.configs.get("throttled") {
|
||||
target_files.push(tp.clone());
|
||||
}
|
||||
|
||||
let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()];
|
||||
self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?);
|
||||
|
||||
// Phase 1: Audit & Baseline
|
||||
self.phase = BenchmarkPhase::Auditing;
|
||||
for step in self.sal.audit() {
|
||||
if let Err(e) = step.outcome {
|
||||
@@ -132,9 +158,11 @@ impl BenchmarkOrchestrator {
|
||||
}
|
||||
}
|
||||
|
||||
self.workload.initialize().context("Failed to initialize workload")?;
|
||||
self.log("Suppressing background services (tlp, thermald)...")?;
|
||||
self.sal.suppress().context("Failed to suppress background services")?;
|
||||
|
||||
// Baseline (Idle Calibration)
|
||||
self.phase = BenchmarkPhase::IdleCalibration;
|
||||
self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?;
|
||||
self.sal.set_fan_mode("auto")?;
|
||||
@@ -152,6 +180,7 @@ impl BenchmarkOrchestrator {
|
||||
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
|
||||
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
|
||||
|
||||
// Phase 2: Stress Stepping
|
||||
self.phase = BenchmarkPhase::StressTesting;
|
||||
self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
|
||||
self.sal.set_fan_mode("max")?;
|
||||
@@ -159,10 +188,16 @@ impl BenchmarkOrchestrator {
|
||||
let steps = bench_cfg.power_steps_watts.clone();
|
||||
for &pl in &steps {
|
||||
self.log(&format!("Testing PL1 = {:.0}W...", pl))?;
|
||||
self.sal.set_sustained_power_limit(pl)?;
|
||||
self.sal.set_burst_power_limit(pl + 5.0)?;
|
||||
|
||||
self.workload.start(num_cpus::get(), 100)?;
|
||||
let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?;
|
||||
let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?;
|
||||
self.sal.set_sustained_power_limit(pl1_uw)?;
|
||||
self.sal.set_burst_power_limit(pl2_uw)?;
|
||||
|
||||
self.workload.run_workload(
|
||||
Duration::from_secs(bench_cfg.stress_duration_max_s),
|
||||
IntensityProfile { threads: num_cpus::get(), load_percentage: 100 }
|
||||
)?;
|
||||
|
||||
let step_start = Instant::now();
|
||||
let mut step_temps = VecDeque::with_capacity(30);
|
||||
@@ -188,26 +223,28 @@ impl BenchmarkOrchestrator {
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
// Record data point
|
||||
let avg_p = self.sal.get_power_w().unwrap_or(0.0);
|
||||
let avg_t = self.sal.get_temp().unwrap_or(0.0);
|
||||
let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0);
|
||||
let fans = self.sal.get_fan_rpms().unwrap_or_default();
|
||||
let primary_fan = fans.first().cloned().unwrap_or(0);
|
||||
let tp = self.workload.get_throughput().unwrap_or(0.0);
|
||||
let metrics = self.workload.get_current_metrics().unwrap_or_default();
|
||||
|
||||
self.profile.points.push(ThermalPoint {
|
||||
power_w: avg_p,
|
||||
temp_c: avg_t,
|
||||
freq_mhz: avg_f,
|
||||
fan_rpm: primary_fan,
|
||||
throughput: tp,
|
||||
throughput: metrics.primary_ops_per_sec,
|
||||
});
|
||||
|
||||
self.workload.stop()?;
|
||||
self.workload.stop_workload()?;
|
||||
self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?;
|
||||
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
|
||||
}
|
||||
|
||||
// Phase 3: Physical Modeling
|
||||
self.phase = BenchmarkPhase::PhysicalModeling;
|
||||
self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?;
|
||||
|
||||
@@ -218,6 +255,7 @@ impl BenchmarkOrchestrator {
|
||||
|
||||
thread::sleep(Duration::from_secs(3));
|
||||
|
||||
// Phase 5: Finalizing
|
||||
self.phase = BenchmarkPhase::Finalizing;
|
||||
self.log("Benchmark sequence complete. Generating configurations...")?;
|
||||
|
||||
@@ -227,8 +265,6 @@ impl BenchmarkOrchestrator {
|
||||
trip_temp: res.max_temp_c.max(95.0),
|
||||
};
|
||||
|
||||
// 1. Throttled (Merged if exists)
|
||||
// PRIORITY: optional_config_out > facts discovery > fallback
|
||||
let throttled_path = self.optional_config_out.clone()
|
||||
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
|
||||
|
||||
@@ -238,7 +274,6 @@ impl BenchmarkOrchestrator {
|
||||
res.config_paths.insert("throttled".to_string(), path.clone());
|
||||
}
|
||||
|
||||
// 2. i8kmon
|
||||
if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
|
||||
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
|
||||
t_ambient: self.profile.ambient_temp,
|
||||
|
||||
Reference in New Issue
Block a user