From fe1f58b5ce273ed41478da2749377ef0adf54bed Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Fri, 27 Feb 2026 02:59:23 +0100 Subject: [PATCH] implemented more safeguards and autodiscovery --- src/orchestrator/mod.rs | 84 +++++++++++++----- src/sal/dell_xps_9380.rs | 54 ++++-------- src/sal/generic_linux.rs | 7 +- src/sal/mock.rs | 5 +- src/sal/safety.rs | 181 +++++++++++++++++++++------------------ src/sal/traits.rs | 17 ++-- tests/safety_test.rs | 56 ++++++++++++ 7 files changed, 248 insertions(+), 156 deletions(-) create mode 100644 tests/safety_test.rs diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 7e42825..9fe8341 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -4,6 +4,7 @@ //! using a [Workload], and feeds telemetry to the frontend via MPSC channels. use anyhow::{Result, Context}; +use tracing::warn; use std::sync::mpsc; use std::time::{Duration, Instant}; use std::thread; @@ -14,17 +15,14 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; use std::path::PathBuf; -use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; +use crate::sal::traits::{PlatformSal, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; -use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts, ConfigurationTransaction, ThermalThresholdCelsius}; use crate::load::{Workload, IntensityProfile}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; /// The central state machine responsible for coordinating the thermal benchmark. -/// -/// It manages hardware interactions through the [PlatformSal], generates stress -/// using a [Workload], and feeds telemetry to the frontend via MPSC channels. pub struct BenchmarkOrchestrator { /// Injected hardware abstraction layer. sal: Arc, @@ -106,14 +104,12 @@ impl BenchmarkOrchestrator { } /// Executes the full benchmark sequence. - /// - /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload] - /// are called regardless of whether the benchmark succeeds or fails. pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; let _watchdog_handle = self.spawn_watchdog_monitor(); + // Core execution wrapped in cleanup logic let result = self.execute_benchmark(); // --- MANDATORY CLEANUP --- @@ -126,9 +122,11 @@ impl BenchmarkOrchestrator { } } + // SAL restore should only handle OEM-specific non-sysfs state not covered by guard if let Err(e) = self.sal.restore() { - anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e); + warn!("Failed to perform secondary SAL restoration: {}", e); } + self.log("✓ Hardware state restored.")?; result @@ -148,7 +146,19 @@ impl BenchmarkOrchestrator { } let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()]; - self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?); + let mut sg = HardwareStateGuard::acquire(&target_files, &target_services)?; + + // # SAFETY: Register fan restoration command if we are on Dell + if self.facts.vendor.to_lowercase().contains("dell") { + if let Some(tool_path) = self.facts.paths.tools.get("dell_fan_ctrl") { + let tool_str = tool_path.to_string_lossy().to_string(); + sg.on_rollback(Box::new(move || { + let _ = std::process::Command::new(tool_str).arg("1").status(); + })); + } + } + + self.safeguard = Some(sg); // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; @@ -159,7 +169,6 @@ impl BenchmarkOrchestrator { } self.workload.initialize().context("Failed to initialize workload")?; - self.log("Suppressing background services (tlp, thermald)...")?; self.sal.suppress().context("Failed to suppress background services")?; // Baseline (Idle Calibration) @@ -185,14 +194,22 @@ impl BenchmarkOrchestrator { self.log("Phase 2: Starting Synthetic Stress Matrix.")?; self.sal.set_fan_mode("max")?; - let steps = bench_cfg.power_steps_watts.clone(); - for &pl in &steps { - self.log(&format!("Testing PL1 = {:.0}W...", pl))?; + let mut current_pl = 10.0_f32; // Start at 10W + let mut previous_ops = 0.0; + + loop { + self.log(&format!("Testing PL1 = {:.0}W...", current_pl))?; - let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?; - let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?; - self.sal.set_sustained_power_limit(pl1_uw)?; - self.sal.set_burst_power_limit(pl2_uw)?; + // # SAFETY: Transactional Commit for Power Limits + let pl1_uw = TdpLimitMicroWatts::from_watts(current_pl)?; + let pl2_uw = TdpLimitMicroWatts::from_watts(current_pl + 5.0)?; + + let mut tx = ConfigurationTransaction::default(); + if let Some(p) = self.facts.rapl_paths.first() { + tx.add_change(p.join("constraint_0_power_limit_uw"), pl1_uw.as_u64().to_string()); + tx.add_change(p.join("constraint_1_power_limit_uw"), pl2_uw.as_u64().to_string()); + } + tx.commit().context("Failed to commit power limit transaction")?; self.workload.run_workload( Duration::from_secs(bench_cfg.stress_duration_max_s), @@ -240,6 +257,32 @@ impl BenchmarkOrchestrator { }); self.workload.stop_workload()?; + + // 1. Check Thermal Ceiling Halt Condition + let max_safe_temp = ThermalThresholdCelsius::MAX_SAFE_C - 5.0; // Margin + if avg_t >= max_safe_temp { + self.log(&format!("Thermal ceiling reached ({:.1}°C). Terminating Identification phase.", avg_t))?; + break; + } + + // 2. Check Diminishing Returns Halt Condition (< 1% gain) + if previous_ops > 0.0 { + let gain_percent = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; + if gain_percent < 1.0 { + self.log(&format!("Performance gain ({:.1}%) fell below 1%. Terminating Identification phase.", gain_percent))?; + break; + } + } + + // 3. Absolute Maximum Power Check + if current_pl >= 60.0 { + self.log("Maximum theoretical power limit reached. Terminating Identification phase.")?; + break; + } + + previous_ops = metrics.primary_ops_per_sec; + current_pl += 2.0; + self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } @@ -288,7 +331,6 @@ impl BenchmarkOrchestrator { Ok(res) } - /// Spawns a concurrent monitor that polls safety sensors every 100ms. fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { let abort = self.emergency_abort.clone(); let reason_store = self.emergency_reason.clone(); @@ -340,7 +382,6 @@ impl BenchmarkOrchestrator { }) } - /// Generates the final [OptimizationResult] based on current measurements. pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { let r_theta = self.engine.calculate_thermal_resistance(&self.profile); let knee = self.engine.find_silicon_knee(&self.profile); @@ -358,7 +399,6 @@ impl BenchmarkOrchestrator { } } - /// Checks if the benchmark has been aborted by the user or the watchdog. fn check_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); @@ -375,7 +415,6 @@ impl BenchmarkOrchestrator { Ok(()) } - /// Helper to send log messages to the frontend. fn log(&self, msg: &str) -> Result<()> { let state = TelemetryState { cpu_model: self.cpu_model.clone(), @@ -401,7 +440,6 @@ impl BenchmarkOrchestrator { self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } - /// Collects current sensors and sends a complete [TelemetryState] to the frontend. fn send_telemetry(&mut self, tick: u64) -> Result<()> { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index b6ca209..6c81d1a 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,10 +1,11 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; use std::sync::Mutex; +use tracing::{debug}; use crate::sal::heuristic::discovery::SystemFactSheet; pub struct DellXps9380Sal { @@ -22,11 +23,6 @@ pub struct DellXps9380Sal { suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, - - // --- Original State for Restoration --- - original_pl1: Mutex>, - original_pl2: Mutex>, - original_fan_mode: Mutex>, } impl DellXps9380Sal { @@ -58,9 +54,6 @@ impl DellXps9380Sal { last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, - original_pl1: Mutex::new(None), - original_pl2: Mutex::new(None), - original_fan_mode: Mutex::new(None), }) } @@ -134,23 +127,11 @@ impl PreflightAuditor for DellXps9380Sal { impl EnvironmentGuard for DellXps9380Sal { fn suppress(&self) -> Result<()> { - // 1. Snapshot Power Limits - if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { - *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); - } - if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { - *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); - } - - // 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active) - // We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1) - *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); - - // 3. Stop Services - let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); + let services = ["tlp", "thermald", "i8kmon"]; for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { + debug!("Suppressing service: {}", s); let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } @@ -159,20 +140,6 @@ impl EnvironmentGuard for DellXps9380Sal { } fn restore(&self) -> Result<()> { - // 1. Restore Power Limits - if let Some(pl1) = *self.original_pl1.lock().unwrap() { - let _ = fs::write(&self.pl1_path, pl1.to_string()); - } - if let Some(pl2) = *self.original_pl2.lock().unwrap() { - let _ = fs::write(&self.pl2_path, pl2.to_string()); - } - - // 2. Restore Fan Mode (BIOS Control) - if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { - let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); - } - - // 3. Restart Services let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &s]); @@ -196,7 +163,6 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - // FIX: Ensure we always read from energy_uj if available for delta calculation let rapl_base = self.pl1_path.parent().context("RAPL path error")?; let energy_path = rapl_base.join("energy_uj"); @@ -212,7 +178,6 @@ impl SensorBus for DellXps9380Sal { if delta_t < 0.05 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } else { - // Fallback to power1_average if it exists (units are µW) let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) } @@ -255,6 +220,17 @@ impl ActuatorBus for DellXps9380Sal { Ok(()) } + fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { + let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl") + .ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?; + let tool_str = tool_path.to_string_lossy(); + + if speed.as_u8() > 50 { + let _ = self.ctx.runner.run(&tool_str, &["0"]); + } + Ok(()) + } + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { fs::write(&self.pl1_path, limit.as_u64().to_string())?; Ok(()) diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index ea1498e..e003ce6 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -3,10 +3,9 @@ use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; @@ -152,6 +151,10 @@ impl ActuatorBus for GenericLinuxSal { } else { Ok(()) } } + fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + Ok(()) + } + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?; diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 28b5691..ecddb91 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,5 +1,5 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; use anyhow::Result; pub struct MockSal { @@ -60,6 +60,9 @@ impl ActuatorBus for MockSal { fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } + fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + Ok(()) + } fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { Ok(()) } diff --git a/src/sal/safety.rs b/src/sal/safety.rs index 5ccce10..f33689d 100644 --- a/src/sal/safety.rs +++ b/src/sal/safety.rs @@ -1,175 +1,194 @@ -//! Universal Safeguard Architecture (USA) and Hardware Primitives. -//! -//! This module provides the `HardwareStateGuard` for guaranteed state -//! restoration and type-safe primitives to prevent dangerous hardware states. +//! # Hardware Safety & Universal Safeguard Architecture +//! +//! This module implements the core safety logic for `ember-tune`. It uses the Rust +//! type system to enforce hardware bounds and RAII patterns to guarantee that +//! the system is restored to a safe state even after a crash. use anyhow::{Result, bail, Context}; use std::collections::HashMap; use std::fs; -use std::path::{Path, PathBuf}; +use std::path::{PathBuf}; use tracing::{info, warn, error}; -// --- Type-Driven Safety Primitives --- +// --- 1. Type-Driven Bounds Checking --- -/// Represents a safe TDP limit in microwatts. +/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct TdpLimitMicroWatts(u64); impl TdpLimitMicroWatts { - /// Strict bounds to prevent hardware bricking. - pub const MIN_SAFE_UW: u64 = 5_000_000; // 5 Watts - pub const MAX_SAFE_UW: u64 = 80_000_000; // 80 Watts + /// # SAFETY: + /// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability. + pub const MIN_SAFE_UW: u64 = 5_000_000; + /// # SAFETY: + /// Values above 80W can exceed the thermal and electrical design limits of XPS chassis. + pub const MAX_SAFE_UW: u64 = 80_000_000; - /// Constructs a new TdpLimitMicroWatts, enforcing safety bounds. - /// - /// # Errors - /// Returns a `HardwareSafetyError` (via `anyhow::bail`) if the value is out of bounds. + /// Validates and constructs a new TDP limit. pub fn new(microwatts: u64) -> Result { if microwatts < Self::MIN_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {} uW is below the absolute safety floor of {} uW.", microwatts, Self::MIN_SAFE_UW); + bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts); } if microwatts > Self::MAX_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {} uW exceeds absolute maximum of {} uW.", microwatts, Self::MAX_SAFE_UW); + bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts); } Ok(Self(microwatts)) } - pub fn as_u64(&self) -> u64 { - self.0 - } - - pub fn as_watts(&self) -> f32 { - self.0 as f32 / 1_000_000.0 + pub fn from_watts(watts: f32) -> Result { + Self::new((watts * 1_000_000.0) as u64) } + + pub fn as_u64(&self) -> u64 { self.0 } } -/// Represents a safe Fan Speed in Percentage (0-100). -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +/// Represents a fan speed percentage (0-100%). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct FanSpeedPercentage(u8); impl FanSpeedPercentage { - /// Constructs a new FanSpeedPercentage, enforcing safety bounds. pub fn new(percent: u8) -> Result { if percent > 100 { - bail!("HardwareSafetyError: Fan speed percentage {} exceeds 100%.", percent); + bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent); } Ok(Self(percent)) } - - pub fn as_u8(&self) -> u8 { - self.0 - } + pub fn as_u8(&self) -> u8 { self.0 } } -/// Represents a safe Thermal Threshold in Celsius. +/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C). #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] pub struct ThermalThresholdCelsius(f32); impl ThermalThresholdCelsius { pub const MAX_SAFE_C: f32 = 98.0; - /// Constructs a new ThermalThresholdCelsius, enforcing safety bounds. pub fn new(celsius: f32) -> Result { - if celsius < 0.0 || celsius > Self::MAX_SAFE_C { - bail!("HardwareSafetyError: Thermal threshold {}°C is outside safe bounds (0.0 - {}).", celsius, Self::MAX_SAFE_C); + if celsius > Self::MAX_SAFE_C { + bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius); } Ok(Self(celsius)) } - - pub fn as_f32(&self) -> f32 { - self.0 - } + pub fn as_f32(&self) -> f32 { self.0 } } -// --- The HardwareStateGuard (RAII Restorer) --- +// --- 2. The HardwareStateGuard (RAII Restorer) --- -/// Represents a deep snapshot of the system state before benchmarking. -#[derive(Debug, Default, Clone)] -pub struct SystemSnapshot { - /// Maps file paths to their raw string content (e.g., RAPL limits). - pub sysfs_nodes: HashMap, - /// List of services that were active and subsequently stopped. - pub suppressed_services: Vec, -} +/// Defines an arbitrary action to take during restoration. +pub type RollbackAction = Box; -/// The Universal Safeguard wrapper. -/// -/// Implements the "Ironclad Restorer" pattern via the [Drop] trait. +/// Holds a snapshot of the system state. Restores everything on Drop. pub struct HardwareStateGuard { - snapshot: SystemSnapshot, - is_armed: bool, + /// Maps sysfs paths to their original string contents. + snapshots: HashMap, + /// Services that were stopped and must be restarted. + suppressed_services: Vec, + /// Arbitrary actions to perform on restoration (e.g., reset fan mode). + rollback_actions: Vec, + is_active: bool, } impl HardwareStateGuard { - /// Arms the safeguard by taking a snapshot of the target files and services. - /// - /// # Errors - /// Returns an error if any critical sysfs node cannot be read. + /// Snapshots the requested files and neutralizes competing services. pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result { - let mut snapshot = SystemSnapshot::default(); + let mut snapshots = HashMap::new(); + let mut suppressed = Vec::new(); - info!("USA: Arming safeguard and snapshotting system state..."); + info!("USA: Arming HardwareStateGuard. Snapshotting critical registers..."); for path in target_files { if path.exists() { let content = fs::read_to_string(path) .with_context(|| format!("Failed to snapshot {:?}", path))?; - snapshot.sysfs_nodes.insert(path.clone(), content.trim().to_string()); - } else { - warn!("USA: Target node {:?} does not exist, skipping snapshot.", path); + snapshots.insert(path.clone(), content.trim().to_string()); } } - for service in target_services { + for svc in target_services { let status = std::process::Command::new("systemctl") - .args(["is-active", "--quiet", service]) + .args(["is-active", "--quiet", svc]) .status(); if let Ok(s) = status { if s.success() { - snapshot.suppressed_services.push(service.clone()); + info!("USA: Neutralizing service '{}'", svc); + let _ = std::process::Command::new("systemctl").args(["stop", svc]).status(); + suppressed.push(svc.clone()); } } } Ok(Self { - snapshot, - is_armed: true, + snapshots, + suppressed_services: suppressed, + rollback_actions: Vec::new(), + is_active: true, }) } - /// Explicit manual restoration (can be called upon successful exit). - pub fn release(&mut self) -> Result<()> { - if !self.is_armed { - return Ok(()); - } + /// Registers a custom action to be performed when the guard is released. + pub fn on_rollback(&mut self, action: RollbackAction) { + self.rollback_actions.push(action); + } - info!("USA: Initiating Ironclad Restoration..."); + /// Explicitly release and restore the hardware state. + pub fn release(&mut self) -> Result<()> { + if !self.is_active { return Ok(()); } + + info!("USA: Releasing guard. Restoring hardware to pre-flight state..."); // 1. Restore Power/Sysfs states - for (path, content) in &self.snapshot.sysfs_nodes { + for (path, content) in &self.snapshots { if let Err(e) = fs::write(path, content) { - error!("USA RESTORATION FAILURE: Could not revert {:?}: {}", path, e); + error!("CRITICAL: Failed to restore {:?}: {}", path, e); } } // 2. Restart Services - for service in &self.snapshot.suppressed_services { - let _ = std::process::Command::new("systemctl") - .args(["start", service]) - .status(); + for svc in &self.suppressed_services { + let _ = std::process::Command::new("systemctl").args(["start", svc]).status(); } - self.is_armed = false; + // 3. Perform Custom Rollback Actions + for action in self.rollback_actions.drain(..) { + (action)(); + } + + self.is_active = false; Ok(()) } } impl Drop for HardwareStateGuard { fn drop(&mut self) { - if self.is_armed { - warn!("USA: HardwareStateGuard triggered via Drop (panic/unexpected exit). Reverting system state..."); + if self.is_active { + warn!("USA: Guard dropped prematurely (panic/SIGTERM). Force-restoring system..."); let _ = self.release(); } } } + +// --- 3. Transactional Configuration --- + +/// A staged set of changes to be applied to the hardware. +#[derive(Default)] +pub struct ConfigurationTransaction { + changes: Vec<(PathBuf, String)>, +} + +impl ConfigurationTransaction { + pub fn add_change(&mut self, path: PathBuf, value: String) { + self.changes.push((path, value)); + } + + /// # SAFETY: + /// Commits all changes. If any write fails, it returns an error but the + /// HardwareStateGuard will still restore everything on drop. + pub fn commit(self) -> Result<()> { + for (path, val) in self.changes { + fs::write(&path, val) + .with_context(|| format!("Failed to apply change to {:?}", path))?; + } + Ok(()) + } +} diff --git a/src/sal/traits.rs b/src/sal/traits.rs index bae1ae8..235f6b1 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -157,26 +157,20 @@ impl SensorBus for Arc { } } -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; /// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { /// Sets the fan control mode (e.g., "auto" or "max"). - /// - /// # Errors - /// Returns an error if the fan control command or `sysfs` write fails. fn set_fan_mode(&self, mode: &str) -> Result<()>; + /// Sets the fan speed directly using a validated percentage. + fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>; + /// Sets the sustained power limit (PL1) using a validated wrapper. - /// - /// # Errors - /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; /// Sets the burst power limit (PL2) using a validated wrapper. - /// - /// # Errors - /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; } @@ -184,6 +178,9 @@ impl ActuatorBus for Arc { fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) } + fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { + (**self).set_fan_speed(speed) + } fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) } diff --git a/tests/safety_test.rs b/tests/safety_test.rs new file mode 100644 index 0000000..2922019 --- /dev/null +++ b/tests/safety_test.rs @@ -0,0 +1,56 @@ +use anyhow::Result; +use std::fs; +use std::path::PathBuf; +use ember_tune_rs::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use crate::common::fakesys::FakeSysBuilder; + +mod common; + +#[test] +fn test_hardware_state_guard_panic_restoration() { + let fake = FakeSysBuilder::new(); + let pl1_path = fake.base_path().join("sys/class/powercap/intel-rapl:0/constraint_0_power_limit_uw"); + + fake.add_rapl("intel-rapl:0", "1000", "15000000"); // 15W original + + let target_files = vec![pl1_path.clone()]; + + // Simulate a scope where the guard is active + { + let mut _guard = HardwareStateGuard::acquire(&target_files, &[]).expect("Failed to acquire guard"); + + // Modify the file + fs::write(&pl1_path, "25000000").expect("Failed to write new value"); + assert_eq!(fs::read_to_string(&pl1_path).unwrap().trim(), "25000000"); + + // Guard is dropped here (simulating end of scope or panic) + } + + // Verify restoration + let restored = fs::read_to_string(&pl1_path).expect("Failed to read restored file"); + assert_eq!(restored.trim(), "15000000"); +} + +#[test] +fn test_tdp_limit_bounds_checking() { + // 1. Valid value + assert!(TdpLimitMicroWatts::new(15_000_000).is_ok()); + + // 2. Too low (Dangerous 0W or below 5W) + let low_res = TdpLimitMicroWatts::new(1_000_000); + assert!(low_res.is_err()); + assert!(low_res.unwrap_err().to_string().contains("below safety floor")); + + // 3. Too high (> 80W) + let high_res = TdpLimitMicroWatts::new(100_000_000); + assert!(high_res.is_err()); + assert!(high_res.unwrap_err().to_string().contains("exceeds safety ceiling")); +} + +#[test] +fn test_0w_tdp_regression_prevention() { + // The prime directive is to never set 0W. + // Ensure the new() constructor explicitly fails for 0. + let zero_res = TdpLimitMicroWatts::new(0); + assert!(zero_res.is_err()); +}