diff --git a/src/load/mod.rs b/src/load/mod.rs index 501e5fd..3ec7956 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -1,61 +1,136 @@ -//! Defines the `Workload` trait for generating synthetic CPU/GPU load. +//! Load generation and performance measurement subsystem. -use anyhow::Result; -use std::process::Child; +use anyhow::{Result, Context, anyhow}; +use std::process::{Child, Command, Stdio}; use std::time::{Duration, Instant}; use std::thread; +use std::io::{BufRead, BufReader}; +use std::sync::{Arc, Mutex}; +use serde::{Deserialize, Serialize}; -/// A trait for objects that can generate a measurable system load. -pub trait Workload: Send + Sync { - /// Starts the workload with the specified number of threads and load percentage. - /// - /// # Errors - /// Returns an error if the underlying stress test process fails to spawn. - fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>; - - /// Stops the workload gracefully. - /// - /// # Errors - /// This method should aim to not fail, but may return an error if - /// forcefully killing the child process fails. - fn stop(&mut self) -> Result<()>; - - /// Returns the current throughput of the workload (e.g., ops/sec). - /// - /// # Errors - /// Returns an error if throughput cannot be measured. - fn get_throughput(&self) -> Result; +/// Standardized telemetry returned by any workload implementation. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct WorkloadMetrics { + /// Primary performance heuristic (e.g., Bogo Ops/s) + pub primary_ops_per_sec: f64, + /// Time elapsed since the workload started + pub elapsed_time: Duration, } -/// An implementation of `Workload` that uses the `stress-ng` utility. +/// A normalized profile defining the intensity and constraints of the workload. +#[derive(Debug, Clone)] +pub struct IntensityProfile { + pub threads: usize, + pub load_percentage: u8, +} + +/// The replaceable interface for load generation and performance measurement. +pub trait Workload: Send + Sync { + /// Sets up prerequisites (e.g., binary checks). + fn initialize(&mut self) -> Result<()>; + + /// Executes the load asynchronously. + fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()>; + + /// Returns the current standardized telemetry object. + fn get_current_metrics(&self) -> Result; + + /// Gracefully and forcefully terminates the workload. + fn stop_workload(&mut self) -> Result<()>; +} + +/// Implementation of the Benchmarking Interface using stress-ng matrix stressors. pub struct StressNg { child: Option, + start_time: Option, + latest_metrics: Arc>, } impl StressNg { pub fn new() -> Self { - Self { child: None } + Self { + child: None, + start_time: None, + latest_metrics: Arc::new(Mutex::new(WorkloadMetrics::default())), + } } } impl Workload for StressNg { - fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> { - self.stop()?; + fn initialize(&mut self) -> Result<()> { + let status = Command::new("stress-ng") + .arg("--version") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .context("stress-ng binary not found in PATH")?; - let child = std::process::Command::new("stress-ng") + if !status.success() { + return Err(anyhow!("stress-ng failed to initialize")); + } + Ok(()) + } + + fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()> { + self.stop_workload()?; // Ensure clean state + + let threads = profile.threads.to_string(); + let timeout = format!("{}s", duration.as_secs()); + let load = profile.load_percentage.to_string(); + + let mut child = Command::new("stress-ng") .args([ - "--cpu", &threads.to_string(), - "--cpu-load", &load_percent.to_string(), - "--quiet" + "--matrix", &threads, + "--cpu-load", &load, + "--timeout", &timeout, + "--metrics-brief", + "--metrics-brief", // Repeat for stderr/stdout consistency ]) - .spawn()?; + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn stress-ng")?; + + self.start_time = Some(Instant::now()); + // Spawn metrics parser thread + let metrics_ref = Arc::clone(&self.latest_metrics); + let stderr = child.stderr.take().expect("Failed to capture stderr"); + + thread::spawn(move || { + let reader = BufReader::new(stderr); + for line in reader.lines().flatten() { + // Parse stress-ng metrics line: + // stress-ng: info: [PID] matrix [OPS] [TIME] [BOGO OPS/S] + if line.contains("matrix") && line.contains("bogo ops/s") { + let parts: Vec<&str> = line.split_whitespace().collect(); + if let Some(ops_idx) = parts.iter().position(|&p| p == "ops/s") { + if let Some(ops_val) = parts.get(ops_idx - 1) { + if let Ok(ops) = ops_val.parse::() { + let mut m = metrics_ref.lock().unwrap(); + m.primary_ops_per_sec = ops; + } + } + } + } + } + }); + self.child = Some(child); Ok(()) } - fn stop(&mut self) -> Result<()> { + fn get_current_metrics(&self) -> Result { + let mut m = self.latest_metrics.lock().unwrap().clone(); + if let Some(start) = self.start_time { + m.elapsed_time = start.elapsed(); + } + Ok(m) + } + + fn stop_workload(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { + // Polite SIGTERM #[cfg(unix)] { use libc::{kill, SIGTERM}; @@ -77,19 +152,13 @@ impl Workload for StressNg { let _ = child.wait(); } } + self.start_time = None; Ok(()) } - - /// Returns the current throughput of the workload (e.g., ops/sec). - /// - /// This is currently a stub and does not parse `stress-ng` output. - fn get_throughput(&self) -> Result { - Ok(0.0) - } } impl Drop for StressNg { fn drop(&mut self) { - let _ = self.stop(); + let _ = self.stop_workload(); } } diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index ebe99da..7e42825 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -14,9 +14,10 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; use std::path::PathBuf; -use crate::sal::traits::{PlatformSal, SafetyStatus}; +use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; -use crate::load::Workload; +use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use crate::load::{Workload, IntensityProfile}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; @@ -44,6 +45,9 @@ pub struct BenchmarkOrchestrator { /// CLI override for the configuration output path. optional_config_out: Option, + /// The safety membrane protecting the system. + safeguard: Option, + /// Sliding window of power readings (Watts). history_watts: VecDeque, /// Sliding window of temperature readings (Celsius). @@ -97,12 +101,13 @@ impl BenchmarkOrchestrator { emergency_abort: Arc::new(AtomicBool::new(false)), emergency_reason: Arc::new(Mutex::new(None)), optional_config_out, + safeguard: None, } } /// Executes the full benchmark sequence. /// - /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop] + /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload] /// are called regardless of whether the benchmark succeeds or fails. pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; @@ -111,8 +116,16 @@ impl BenchmarkOrchestrator { let result = self.execute_benchmark(); + // --- MANDATORY CLEANUP --- self.log("Benchmark sequence finished. Restoring hardware defaults...")?; - let _ = self.workload.stop(); + let _ = self.workload.stop_workload(); + + if let Some(mut sg) = self.safeguard.take() { + if let Err(e) = sg.release() { + anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e); + } + } + if let Err(e) = self.sal.restore() { anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e); } @@ -125,6 +138,19 @@ impl BenchmarkOrchestrator { fn execute_benchmark(&mut self) -> Result { let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; + // 1. Snapshot & Arm Safeguard + let mut target_files = self.facts.rapl_paths.iter() + .map(|p| p.join("constraint_0_power_limit_uw")) + .collect::>(); + target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw"))); + if let Some(tp) = self.facts.paths.configs.get("throttled") { + target_files.push(tp.clone()); + } + + let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()]; + self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?); + + // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { if let Err(e) = step.outcome { @@ -132,9 +158,11 @@ impl BenchmarkOrchestrator { } } + self.workload.initialize().context("Failed to initialize workload")?; self.log("Suppressing background services (tlp, thermald)...")?; self.sal.suppress().context("Failed to suppress background services")?; + // Baseline (Idle Calibration) self.phase = BenchmarkPhase::IdleCalibration; self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; self.sal.set_fan_mode("auto")?; @@ -152,6 +180,7 @@ impl BenchmarkOrchestrator { self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; + // Phase 2: Stress Stepping self.phase = BenchmarkPhase::StressTesting; self.log("Phase 2: Starting Synthetic Stress Matrix.")?; self.sal.set_fan_mode("max")?; @@ -159,10 +188,16 @@ impl BenchmarkOrchestrator { let steps = bench_cfg.power_steps_watts.clone(); for &pl in &steps { self.log(&format!("Testing PL1 = {:.0}W...", pl))?; - self.sal.set_sustained_power_limit(pl)?; - self.sal.set_burst_power_limit(pl + 5.0)?; - self.workload.start(num_cpus::get(), 100)?; + let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?; + let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?; + self.sal.set_sustained_power_limit(pl1_uw)?; + self.sal.set_burst_power_limit(pl2_uw)?; + + self.workload.run_workload( + Duration::from_secs(bench_cfg.stress_duration_max_s), + IntensityProfile { threads: num_cpus::get(), load_percentage: 100 } + )?; let step_start = Instant::now(); let mut step_temps = VecDeque::with_capacity(30); @@ -188,26 +223,28 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_millis(500)); } + // Record data point let avg_p = self.sal.get_power_w().unwrap_or(0.0); let avg_t = self.sal.get_temp().unwrap_or(0.0); let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); let fans = self.sal.get_fan_rpms().unwrap_or_default(); let primary_fan = fans.first().cloned().unwrap_or(0); - let tp = self.workload.get_throughput().unwrap_or(0.0); + let metrics = self.workload.get_current_metrics().unwrap_or_default(); self.profile.points.push(ThermalPoint { power_w: avg_p, temp_c: avg_t, freq_mhz: avg_f, fan_rpm: primary_fan, - throughput: tp, + throughput: metrics.primary_ops_per_sec, }); - self.workload.stop()?; + self.workload.stop_workload()?; self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } + // Phase 4: Physical Modeling self.phase = BenchmarkPhase::PhysicalModeling; self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?; @@ -218,6 +255,7 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(3)); + // Phase 5: Finalizing self.phase = BenchmarkPhase::Finalizing; self.log("Benchmark sequence complete. Generating configurations...")?; @@ -227,8 +265,6 @@ impl BenchmarkOrchestrator { trip_temp: res.max_temp_c.max(95.0), }; - // 1. Throttled (Merged if exists) - // PRIORITY: optional_config_out > facts discovery > fallback let throttled_path = self.optional_config_out.clone() .or_else(|| self.facts.paths.configs.get("throttled").cloned()); @@ -238,7 +274,6 @@ impl BenchmarkOrchestrator { res.config_paths.insert("throttled".to_string(), path.clone()); } - // 2. i8kmon if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index dcc73ae..b6ca209 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,10 +1,10 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; +use crate::sal::safety::TdpLimitMicroWatts; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; use crate::sal::heuristic::discovery::SystemFactSheet; pub struct DellXps9380Sal { @@ -151,7 +151,6 @@ impl EnvironmentGuard for DellXps9380Sal { let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { - debug!("Suppressing service: {}", s); let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } @@ -251,18 +250,18 @@ impl ActuatorBus for DellXps9380Sal { match mode { "max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; } "auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; } - _ => { debug!("Unknown fan mode: {}", mode); } + _ => {} } Ok(()) } - fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - fs::write(&self.pl1_path, ((watts * 1_000_000.0) as u64).to_string())?; + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + fs::write(&self.pl1_path, limit.as_u64().to_string())?; Ok(()) } - fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - fs::write(&self.pl2_path, ((watts * 1_000_000.0) as u64).to_string())?; + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + fs::write(&self.pl2_path, limit.as_u64().to_string())?; Ok(()) } } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 7007b25..ea1498e 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -6,6 +6,7 @@ use std::sync::Mutex; use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; +use crate::sal::safety::TdpLimitMicroWatts; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; @@ -15,7 +16,7 @@ pub struct GenericLinuxSal { db: HardwareDb, suppressed_services: Mutex>, last_valid_temp: Mutex<(f32, Instant)>, - current_pl1: Mutex, + current_pl1: Mutex, last_energy: Mutex<(u64, Instant)>, // --- Original State for Restoration --- @@ -35,7 +36,7 @@ impl GenericLinuxSal { db, suppressed_services: Mutex::new(Vec::new()), last_valid_temp: Mutex::new((0.0, Instant::now())), - current_pl1: Mutex::new(15.0), + current_pl1: Mutex::new(15_000_000), last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, @@ -151,16 +152,16 @@ impl ActuatorBus for GenericLinuxSal { } else { Ok(()) } } - fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; - fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; - *self.current_pl1.lock().unwrap() = watts; + fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?; + *self.current_pl1.lock().unwrap() = limit.as_u64(); Ok(()) } - fn set_burst_power_limit(&self, watts: f32) -> Result<()> { + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?; - fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; + fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?; Ok(()) } } diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 98aaf14..28b5691 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,4 +1,5 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; +use crate::sal::safety::TdpLimitMicroWatts; use anyhow::Result; pub struct MockSal { @@ -59,10 +60,10 @@ impl ActuatorBus for MockSal { fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } - fn set_sustained_power_limit(&self, _watts: f32) -> Result<()> { + fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { Ok(()) } - fn set_burst_power_limit(&self, _watts: f32) -> Result<()> { + fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { Ok(()) } } diff --git a/src/sal/mod.rs b/src/sal/mod.rs index 16526ac..d2f276f 100644 --- a/src/sal/mod.rs +++ b/src/sal/mod.rs @@ -3,3 +3,4 @@ pub mod mock; pub mod dell_xps_9380; pub mod generic_linux; pub mod heuristic; +pub mod safety; diff --git a/src/sal/safety.rs b/src/sal/safety.rs new file mode 100644 index 0000000..5ccce10 --- /dev/null +++ b/src/sal/safety.rs @@ -0,0 +1,175 @@ +//! Universal Safeguard Architecture (USA) and Hardware Primitives. +//! +//! This module provides the `HardwareStateGuard` for guaranteed state +//! restoration and type-safe primitives to prevent dangerous hardware states. + +use anyhow::{Result, bail, Context}; +use std::collections::HashMap; +use std::fs; +use std::path::{Path, PathBuf}; +use tracing::{info, warn, error}; + +// --- Type-Driven Safety Primitives --- + +/// Represents a safe TDP limit in microwatts. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct TdpLimitMicroWatts(u64); + +impl TdpLimitMicroWatts { + /// Strict bounds to prevent hardware bricking. + pub const MIN_SAFE_UW: u64 = 5_000_000; // 5 Watts + pub const MAX_SAFE_UW: u64 = 80_000_000; // 80 Watts + + /// Constructs a new TdpLimitMicroWatts, enforcing safety bounds. + /// + /// # Errors + /// Returns a `HardwareSafetyError` (via `anyhow::bail`) if the value is out of bounds. + pub fn new(microwatts: u64) -> Result { + if microwatts < Self::MIN_SAFE_UW { + bail!("HardwareSafetyError: Requested TDP {} uW is below the absolute safety floor of {} uW.", microwatts, Self::MIN_SAFE_UW); + } + if microwatts > Self::MAX_SAFE_UW { + bail!("HardwareSafetyError: Requested TDP {} uW exceeds absolute maximum of {} uW.", microwatts, Self::MAX_SAFE_UW); + } + Ok(Self(microwatts)) + } + + pub fn as_u64(&self) -> u64 { + self.0 + } + + pub fn as_watts(&self) -> f32 { + self.0 as f32 / 1_000_000.0 + } +} + +/// Represents a safe Fan Speed in Percentage (0-100). +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FanSpeedPercentage(u8); + +impl FanSpeedPercentage { + /// Constructs a new FanSpeedPercentage, enforcing safety bounds. + pub fn new(percent: u8) -> Result { + if percent > 100 { + bail!("HardwareSafetyError: Fan speed percentage {} exceeds 100%.", percent); + } + Ok(Self(percent)) + } + + pub fn as_u8(&self) -> u8 { + self.0 + } +} + +/// Represents a safe Thermal Threshold in Celsius. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] +pub struct ThermalThresholdCelsius(f32); + +impl ThermalThresholdCelsius { + pub const MAX_SAFE_C: f32 = 98.0; + + /// Constructs a new ThermalThresholdCelsius, enforcing safety bounds. + pub fn new(celsius: f32) -> Result { + if celsius < 0.0 || celsius > Self::MAX_SAFE_C { + bail!("HardwareSafetyError: Thermal threshold {}°C is outside safe bounds (0.0 - {}).", celsius, Self::MAX_SAFE_C); + } + Ok(Self(celsius)) + } + + pub fn as_f32(&self) -> f32 { + self.0 + } +} + +// --- The HardwareStateGuard (RAII Restorer) --- + +/// Represents a deep snapshot of the system state before benchmarking. +#[derive(Debug, Default, Clone)] +pub struct SystemSnapshot { + /// Maps file paths to their raw string content (e.g., RAPL limits). + pub sysfs_nodes: HashMap, + /// List of services that were active and subsequently stopped. + pub suppressed_services: Vec, +} + +/// The Universal Safeguard wrapper. +/// +/// Implements the "Ironclad Restorer" pattern via the [Drop] trait. +pub struct HardwareStateGuard { + snapshot: SystemSnapshot, + is_armed: bool, +} + +impl HardwareStateGuard { + /// Arms the safeguard by taking a snapshot of the target files and services. + /// + /// # Errors + /// Returns an error if any critical sysfs node cannot be read. + pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result { + let mut snapshot = SystemSnapshot::default(); + + info!("USA: Arming safeguard and snapshotting system state..."); + + for path in target_files { + if path.exists() { + let content = fs::read_to_string(path) + .with_context(|| format!("Failed to snapshot {:?}", path))?; + snapshot.sysfs_nodes.insert(path.clone(), content.trim().to_string()); + } else { + warn!("USA: Target node {:?} does not exist, skipping snapshot.", path); + } + } + + for service in target_services { + let status = std::process::Command::new("systemctl") + .args(["is-active", "--quiet", service]) + .status(); + + if let Ok(s) = status { + if s.success() { + snapshot.suppressed_services.push(service.clone()); + } + } + } + + Ok(Self { + snapshot, + is_armed: true, + }) + } + + /// Explicit manual restoration (can be called upon successful exit). + pub fn release(&mut self) -> Result<()> { + if !self.is_armed { + return Ok(()); + } + + info!("USA: Initiating Ironclad Restoration..."); + + // 1. Restore Power/Sysfs states + for (path, content) in &self.snapshot.sysfs_nodes { + if let Err(e) = fs::write(path, content) { + error!("USA RESTORATION FAILURE: Could not revert {:?}: {}", path, e); + } + } + + // 2. Restart Services + for service in &self.snapshot.suppressed_services { + let _ = std::process::Command::new("systemctl") + .args(["start", service]) + .status(); + } + + self.is_armed = false; + Ok(()) + } +} + +impl Drop for HardwareStateGuard { + fn drop(&mut self) { + if self.is_armed { + warn!("USA: HardwareStateGuard triggered via Drop (panic/unexpected exit). Reverting system state..."); + let _ = self.release(); + } + } +} diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 704ce5c..bae1ae8 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -157,6 +157,8 @@ impl SensorBus for Arc { } } +use crate::sal::safety::TdpLimitMicroWatts; + /// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { /// Sets the fan control mode (e.g., "auto" or "max"). @@ -165,28 +167,28 @@ pub trait ActuatorBus: Send + Sync { /// Returns an error if the fan control command or `sysfs` write fails. fn set_fan_mode(&self, mode: &str) -> Result<()>; - /// Sets the sustained power limit (PL1) in Watts. + /// Sets the sustained power limit (PL1) using a validated wrapper. /// /// # Errors /// Returns an error if the RAPL `sysfs` node cannot be written to. - fn set_sustained_power_limit(&self, watts: f32) -> Result<()>; + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; - /// Sets the burst power limit (PL2) in Watts. + /// Sets the burst power limit (PL2) using a validated wrapper. /// /// # Errors /// Returns an error if the RAPL `sysfs` node cannot be written to. - fn set_burst_power_limit(&self, watts: f32) -> Result<()>; + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; } impl ActuatorBus for Arc { fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) } - fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - (**self).set_sustained_power_limit(watts) + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + (**self).set_sustained_power_limit(limit) } - fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - (**self).set_burst_power_limit(watts) + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + (**self).set_burst_power_limit(limit) } } diff --git a/tests/orchestrator_e2e_test.rs b/tests/orchestrator_e2e_test.rs index f445c4c..964517f 100644 --- a/tests/orchestrator_e2e_test.rs +++ b/tests/orchestrator_e2e_test.rs @@ -1,16 +1,23 @@ use ember_tune_rs::orchestrator::BenchmarkOrchestrator; use ember_tune_rs::sal::mock::MockSal; use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet; -use ember_tune_rs::load::Workload; +use ember_tune_rs::load::{Workload, IntensityProfile, WorkloadMetrics}; +use std::time::Duration; +use anyhow::Result; use std::sync::mpsc; use std::sync::Arc; -use anyhow::Result; struct MockWorkload; impl Workload for MockWorkload { - fn start(&mut self, _threads: usize, _load_percent: usize) -> Result<()> { Ok(()) } - fn stop(&mut self) -> Result<()> { Ok(()) } - fn get_throughput(&self) -> Result { Ok(100.0) } + fn initialize(&mut self) -> Result<()> { Ok(()) } + fn run_workload(&mut self, _duration: Duration, _profile: IntensityProfile) -> Result<()> { Ok(()) } + fn get_current_metrics(&self) -> Result { + Ok(WorkloadMetrics { + primary_ops_per_sec: 100.0, + elapsed_time: Duration::from_secs(1), + }) + } + fn stop_workload(&mut self) -> Result<()> { Ok(()) } } #[test]