From f87efa1d2452770113de9dbddd895b9f4c8ce209 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 15:16:37 +0100 Subject: [PATCH 01/13] implemented generic linux sal with heuristics --- Cargo.lock | 56 +++++++++ Cargo.toml | 2 + src/main.rs | 57 ++------- src/orchestrator/mod.rs | 64 ++++------ src/sal/dell_xps_9380.rs | 26 ++-- src/sal/generic_linux.rs | 218 +++++++++++++++++++++++++++++++++ src/sal/heuristic/discovery.rs | 185 ++++++++++++++++++++++++++++ src/sal/heuristic/engine.rs | 60 +++++++++ src/sal/heuristic/mod.rs | 3 + src/sal/heuristic/schema.rs | 90 ++++++++++++++ src/sal/mock.rs | 37 ++---- src/sal/mod.rs | 2 + src/sal/traits.rs | 11 +- 13 files changed, 686 insertions(+), 125 deletions(-) create mode 100644 src/sal/generic_linux.rs create mode 100644 src/sal/heuristic/discovery.rs create mode 100644 src/sal/heuristic/engine.rs create mode 100644 src/sal/heuristic/mod.rs create mode 100644 src/sal/heuristic/schema.rs diff --git a/Cargo.lock b/Cargo.lock index 5f2e97d..aca5993 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -526,10 +526,12 @@ dependencies = [ "num_cpus", "owo-colors", "ratatui", + "regex", "serde", "serde_json", "sysinfo", "thiserror 2.0.18", + "toml", "tracing", "tracing-appender", "tracing-subscriber", @@ -1534,6 +1536,15 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_spanned" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776" +dependencies = [ + "serde_core", +] + [[package]] name = "sha2" version = "0.10.9" @@ -1852,6 +1863,45 @@ dependencies = [ "time-core", ] +[[package]] +name = "toml" +version = "1.0.3+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7614eaf19ad818347db24addfa201729cf2a9b6fdfd9eb0ab870fcacc606c0c" +dependencies = [ + "indexmap", + "serde_core", + "serde_spanned", + "toml_datetime", + "toml_parser", + "toml_writer", + "winnow", +] + +[[package]] +name = "toml_datetime" +version = "1.0.0+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "toml_parser" +version = "1.0.9+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +dependencies = [ + "winnow", +] + +[[package]] +name = "toml_writer" +version = "1.0.6+spec-1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" + [[package]] name = "tracing" version = "0.1.44" @@ -2492,6 +2542,12 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index 85794ca..40147dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,3 +28,5 @@ tracing-appender = "0.2" sysinfo = "0.38" libc = "0.2" num_cpus = "1.17" +toml = "1.0.3" +regex = "1.12.3" diff --git a/src/main.rs b/src/main.rs index c7dd833..ab30b7b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,9 +27,9 @@ use ratatui::{backend::CrosstermBackend, Terminal}; use cli::Cli; use mediator::{TelemetryState, UiCommand, BenchmarkPhase}; -use sal::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError}; -use sal::mock::{MockAuditor, MockGuard, MockSensorBus, MockActuatorBus, MockWatchdog}; -use sal::dell_xps_9380::DellXps9380Sal; +use sal::traits::{AuditError, PlatformSal}; +use sal::mock::MockSal; +use sal::heuristic::engine::HeuristicEngine; use load::StressNg; use orchestrator::BenchmarkOrchestrator; use ui::dashboard::{draw_dashboard, DashboardState}; @@ -107,20 +107,17 @@ fn main() -> Result<()> { info!("ember-tune starting with args: {:?}", args); - // 2. Pre-flight Audit (Before TUI) - let auditor: Arc = if args.mock { - Arc::new(MockAuditor) + // 2. Platform Detection & Audit + let sal: Box = if args.mock { + Box::new(MockSal::new()) } else { - match DellXps9380Sal::init() { - Ok(sal) => Arc::new(sal), - Err(e) => return Err(miette::miette!("Failed to initialize Dell SAL: {}", e)), - } + HeuristicEngine::detect_and_build()? }; println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); let mut audit_failures = Vec::new(); - for step in auditor.audit() { + for step in sal.audit() { print!(" Checking {:<40} ", step.description); io::Write::flush(&mut io::stdout()).into_diagnostic()?; @@ -151,8 +148,9 @@ fn main() -> Result<()> { enable_raw_mode().into_diagnostic()?; let mut stdout = io::stdout(); execute!(stdout, EnterAlternateScreen).into_diagnostic()?; - let backend = CrosstermBackend::new(stdout); - let mut terminal = Terminal::new(backend).into_diagnostic()?; + let backend_stdout = io::stdout(); + let backend_term = CrosstermBackend::new(backend_stdout); + let mut terminal = Terminal::new(backend_term).into_diagnostic()?; // 4. State & Communication Setup let running = Arc::new(AtomicBool::new(true)); @@ -166,40 +164,11 @@ fn main() -> Result<()> { }).expect("Error setting Ctrl-C handler"); // 5. Spawn Backend Orchestrator - let is_mock = args.mock; - let b_auditor = auditor.clone(); let backend_handle = thread::spawn(move || { - let (guard, sensors, actuators, watchdog): ( - Box, - Box, - Box, - Box, - ) = if is_mock { - ( - Box::new(MockGuard::new()), - Box::new(MockSensorBus), - Box::new(MockActuatorBus), - Box::new(MockWatchdog), - ) - } else { - // Re-init or share the SAL - let sal = Arc::new(DellXps9380Sal::init().expect("Failed to init Dell SAL in backend")); - ( - Box::new(sal::dell_xps_9380::DellXps9380Guard::new()), - Box::new(sal.clone() as Arc), - Box::new(sal.clone() as Arc), - Box::new(sal as Arc), - ) - }; - let workload = Box::new(StressNg::new()); let mut orchestrator = BenchmarkOrchestrator::new( - Box::new(b_auditor), - guard, - sensors, - actuators, - watchdog, + sal, workload, telemetry_tx, command_rx, @@ -286,7 +255,7 @@ fn main() -> Result<()> { } Ok(Err(e)) => { if e.to_string() == "ABORTED" { - println!("{}", "Benchmark aborted by user. No summary available.".yellow()); + println!("{}", "Benchmark aborted by user.".yellow()); } else { error!("Orchestrator encountered error: {}", e); eprintln!("{} {}", "Error:".red().bold(), e); diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 61bead3..b4b7b73 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -5,17 +5,13 @@ use std::thread; use std::collections::VecDeque; use sysinfo::System; -use crate::sal::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog}; +use crate::sal::traits::{PlatformSal}; use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; pub struct BenchmarkOrchestrator { - auditor: Box, - guard: Box, - sensors: Box, - actuators: Box, - watchdog: Box, + sal: Box, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -35,11 +31,7 @@ pub struct BenchmarkOrchestrator { impl BenchmarkOrchestrator { pub fn new( - auditor: Box, - guard: Box, - sensors: Box, - actuators: Box, - watchdog: Box, + sal: Box, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -53,11 +45,7 @@ impl BenchmarkOrchestrator { let total_ram_gb = sys.total_memory() / 1024 / 1024 / 1024; Self { - auditor, - guard, - sensors, - actuators, - watchdog, + sal, workload, telemetry_tx, command_rx, @@ -77,19 +65,19 @@ impl BenchmarkOrchestrator { // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; - for step in self.auditor.audit() { + for step in self.sal.audit() { if let Err(e) = step.outcome { return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e)); } } self.log("Suppressing background services (tlp, thermald)...")?; - self.guard.suppress().context("Failed to suppress background services")?; + self.sal.suppress().context("Failed to suppress background services")?; // Baseline (Idle Calibration) self.phase = BenchmarkPhase::IdleCalibration; self.log("Phase 1: Recording Idle Baseline (10s)...")?; - self.actuators.set_fan_mode("auto")?; // Use auto for idle + self.sal.set_fan_mode("auto")?; // Use auto for idle let mut idle_temps = Vec::new(); let start = Instant::now(); @@ -97,7 +85,7 @@ impl BenchmarkOrchestrator { while start.elapsed() < Duration::from_secs(10) { self.check_abort()?; self.send_telemetry(tick)?; - idle_temps.push(self.sensors.get_temp().unwrap_or(0.0)); + idle_temps.push(self.sal.get_temp().unwrap_or(0.0)); tick += 1; thread::sleep(Duration::from_millis(500)); } @@ -107,13 +95,13 @@ impl BenchmarkOrchestrator { // Phase 2: Stress Stepping self.phase = BenchmarkPhase::StressTesting; self.log("Phase 2: Starting Synthetic Stress Matrix.")?; - self.actuators.set_fan_mode("max")?; // Lock fans for consistent resistance + self.sal.set_fan_mode("max")?; // Lock fans for consistent resistance let power_steps = [15.0, 20.0, 25.0, 30.0, 35.0]; for &pl in &power_steps { self.log(&format!("Testing PL1 = {:.0}W...", pl))?; - self.actuators.set_sustained_power_limit(pl)?; - self.actuators.set_burst_power_limit(pl + 5.0)?; + self.sal.set_sustained_power_limit(pl)?; + self.sal.set_burst_power_limit(pl + 5.0)?; self.workload.start(num_cpus::get(), 100)?; @@ -123,13 +111,13 @@ impl BenchmarkOrchestrator { while step_start.elapsed() < Duration::from_secs(45) { self.check_abort()?; - if self.watchdog.check_emergency()? { + if self.sal.check_emergency()? { self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?; self.workload.stop()?; return Err(anyhow::anyhow!("Hardware Watchdog Triggered")); } - let t = self.sensors.get_temp().unwrap_or(0.0); + let t = self.sal.get_temp().unwrap_or(0.0); step_temps.push_back(t); if step_temps.len() > 10 { step_temps.pop_front(); } @@ -149,10 +137,10 @@ impl BenchmarkOrchestrator { } // Record data point - let avg_p = self.sensors.get_power_w().unwrap_or(0.0); - let avg_t = self.sensors.get_temp().unwrap_or(0.0); - let avg_f = self.sensors.get_freq_mhz().unwrap_or(0.0); - let fans = self.sensors.get_fan_rpms().unwrap_or_default(); + let avg_p = self.sal.get_power_w().unwrap_or(0.0); + let avg_t = self.sal.get_temp().unwrap_or(0.0); + let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); + let fans = self.sal.get_fan_rpms().unwrap_or_default(); let primary_fan = fans.first().cloned().unwrap_or(0); let tp = self.workload.get_throughput().unwrap_or(0.0); @@ -210,7 +198,7 @@ impl BenchmarkOrchestrator { std::fs::write("i8kmon.conf", i8k_content)?; self.log("✓ Saved 'i8kmon.conf'.")?; - self.guard.restore()?; + self.sal.restore()?; self.log("✓ Environment restored.")?; Ok(res) @@ -248,10 +236,10 @@ impl BenchmarkOrchestrator { cpu_model: self.cpu_model.clone(), total_ram_gb: self.total_ram_gb, tick: 0, - cpu_temp: self.sensors.get_temp().unwrap_or(0.0), - power_w: self.sensors.get_power_w().unwrap_or(0.0), - current_freq: self.sensors.get_freq_mhz().unwrap_or(0.0), - fans: self.sensors.get_fan_rpms().unwrap_or_default(), + cpu_temp: self.sal.get_temp().unwrap_or(0.0), + power_w: self.sal.get_power_w().unwrap_or(0.0), + current_freq: self.sal.get_freq_mhz().unwrap_or(0.0), + fans: self.sal.get_fan_rpms().unwrap_or_default(), governor: "unknown".to_string(), pl1_limit: 0.0, pl2_limit: 0.0, @@ -267,9 +255,9 @@ impl BenchmarkOrchestrator { } fn send_telemetry(&mut self, tick: u64) -> Result<()> { - let temp = self.sensors.get_temp().unwrap_or(0.0); - let pwr = self.sensors.get_power_w().unwrap_or(0.0); - let freq = self.sensors.get_freq_mhz().unwrap_or(0.0); + let temp = self.sal.get_temp().unwrap_or(0.0); + let pwr = self.sal.get_power_w().unwrap_or(0.0); + let freq = self.sal.get_freq_mhz().unwrap_or(0.0); self.history_temp.push_back(temp); self.history_watts.push_back(pwr); @@ -288,7 +276,7 @@ impl BenchmarkOrchestrator { cpu_temp: temp, power_w: pwr, current_freq: freq, - fans: self.sensors.get_fan_rpms().unwrap_or_default(), + fans: self.sal.get_fan_rpms().unwrap_or_default(), governor: "performance".to_string(), pl1_limit: 15.0, pl2_limit: 25.0, diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index 75d747e..e8f7fc6 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -17,6 +17,7 @@ pub struct DellXps9380Sal { last_poll: Mutex, last_temp: Mutex, last_fans: Mutex>, + suppressed_services: Mutex>, } impl DellXps9380Sal { @@ -82,6 +83,7 @@ impl DellXps9380Sal { last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)), last_temp: Mutex::new(0.0), last_fans: Mutex::new(Vec::new()), + suppressed_services: Mutex::new(Vec::new()), }) } } @@ -151,44 +153,36 @@ impl PreflightAuditor for DellXps9380Sal { } } -pub struct DellXps9380Guard { - stopped_services: Vec, -} - -impl DellXps9380Guard { - pub fn new() -> Self { - Self { stopped_services: Vec::new() } - } -} - -impl EnvironmentGuard for DellXps9380Guard { +impl EnvironmentGuard for DellXps9380Sal { fn suppress(&mut self) -> Result<()> { let services = ["tlp", "thermald", "i8kmon"]; + let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() { debug!("Suppressing service: {}", s); Command::new("systemctl").args(["stop", s]).status()?; - self.stopped_services.push(s.to_string()); + suppressed.push(s.to_string()); } } Ok(()) } fn restore(&mut self) -> Result<()> { - for s in &self.stopped_services { - let _ = Command::new("systemctl").args(["start", s]).status(); + let mut suppressed = self.suppressed_services.lock().unwrap(); + for s in suppressed.drain(..) { + let _ = Command::new("systemctl").args(["start", &s]).status(); } - self.stopped_services.clear(); Ok(()) } } -impl Drop for DellXps9380Guard { +impl Drop for DellXps9380Sal { fn drop(&mut self) { let _ = self.restore(); } } + impl SensorBus for DellXps9380Sal { fn get_temp(&self) -> Result { // Enforce 1000ms rate limit for Dell SMM as per GEMINI.md diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs new file mode 100644 index 0000000..a9527be --- /dev/null +++ b/src/sal/generic_linux.rs @@ -0,0 +1,218 @@ +use anyhow::{Result, anyhow}; +use std::path::Path; +use std::fs; +use std::time::{Duration, Instant}; +use std::thread; +use std::process::Command; +use tracing::{debug}; +use std::sync::mpsc; + +use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError}; +use crate::sal::heuristic::discovery::SystemFactSheet; +use crate::sal::heuristic::schema::HardwareDb; + +pub struct GenericLinuxSal { + fact_sheet: SystemFactSheet, + db: HardwareDb, + suppressed_services: Vec, +} + +impl GenericLinuxSal { + pub fn new(fact_sheet: SystemFactSheet, db: HardwareDb) -> Self { + Self { + fact_sheet, + db, + suppressed_services: Vec::new(), + } + } + + fn is_dell(&self) -> bool { + self.fact_sheet.vendor.to_lowercase().contains("dell") + } + + fn read_sysfs_timeout(&self, path: &Path, timeout: Duration) -> Result { + let (tx, rx) = mpsc::channel(); + let path_buf = path.to_path_buf(); + + thread::spawn(move || { + let res = fs::read_to_string(path_buf).map(|s| s.trim().to_string()); + let _ = tx.send(res); + }); + + match rx.recv_timeout(timeout) { + Ok(res) => res.map_err(|e| anyhow!("Failed to read sysfs: {}", e)), + Err(_) => Err(anyhow!("Timeout reading sysfs path: {:?}", path)), + } + } +} + +impl PreflightAuditor for GenericLinuxSal { + fn audit(&self) -> Box + '_> { + let mut steps = Vec::new(); + + // 1. Static DB checks + for check in &self.db.preflight_checks { + let status = Command::new("sh") + .arg("-c") + .arg(&check.check_cmd) + .status(); + + steps.push(AuditStep { + description: check.name.clone(), + outcome: match status { + Ok(s) if s.success() => Ok(()), + _ => Err(AuditError::KernelIncompatible(check.fail_help.clone())), + } + }); + } + + // 2. Conflict checks (Critical only) + for conflict_id in &self.fact_sheet.active_conflicts { + if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { + if conflict.severity == "Critical" { + steps.push(AuditStep { + description: format!("Conflict: {}", conflict.id), + outcome: Err(AuditError::ToolMissing(conflict.help_text.clone())), + }); + } + } + } + + Box::new(steps.into_iter()) + } +} + +impl SensorBus for GenericLinuxSal { + fn get_temp(&self) -> Result { + let path = self.fact_sheet.temp_path.as_ref() + .ok_or_else(|| anyhow!("No temperature sensor path found"))?; + let content = self.read_sysfs_timeout(path, Duration::from_millis(200))?; + let milli_celsius: f32 = content.parse()?; + Ok(milli_celsius / 1000.0) + } + + fn get_power_w(&self) -> Result { + let rapl_path = self.fact_sheet.rapl_paths.first() + .ok_or_else(|| anyhow!("No RAPL path found"))?; + let energy_path = rapl_path.join("energy_uj"); + + let e1: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?; + let t1 = Instant::now(); + thread::sleep(Duration::from_millis(100)); + let e2: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?; + let t2 = Instant::now(); + + let delta_e = e2.wrapping_sub(e1); + let delta_t = t2.duration_since(t1).as_secs_f32(); + Ok((delta_e as f32 / 1_000_000.0) / delta_t) + } + + fn get_fan_rpms(&self) -> Result> { + let mut rpms = Vec::new(); + for path in &self.fact_sheet.fan_paths { + if let Ok(content) = self.read_sysfs_timeout(path, Duration::from_millis(200)) { + if let Ok(rpm) = content.parse() { rpms.push(rpm); } + } + } + Ok(rpms) + } + + fn get_freq_mhz(&self) -> Result { + let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); + if path.exists() { + let khz: f32 = self.read_sysfs_timeout(path, Duration::from_millis(200))?.parse()?; + Ok(khz / 1000.0) + } else { + // Fallback: parse /proc/cpuinfo + let cpuinfo = fs::read_to_string("/proc/cpuinfo")?; + for line in cpuinfo.lines() { + if line.starts_with("cpu MHz") { + if let Some((_, mhz)) = line.split_once(':') { + return Ok(mhz.trim().parse()?); + } + } + } + Err(anyhow!("Could not determine CPU frequency")) + } + } +} + +impl ActuatorBus for GenericLinuxSal { + fn set_fan_mode(&self, mode: &str) -> Result<()> { + if self.is_dell() { + let cmd = match mode { + "manual" | "max" => self.db.ecosystems.get("dell").and_then(|e| e.fan_manual_mode_cmd.as_ref()), + "auto" => self.db.ecosystems.get("dell").and_then(|e| e.fan_auto_mode_cmd.as_ref()), + _ => return Err(anyhow!("Unsupported fan mode: {}", mode)), + }; + if let Some(cmd_str) = cmd { + let parts: Vec<&str> = cmd_str.split_whitespace().collect(); + Command::new(parts[0]).args(&parts[1..]).status()?; + Ok(()) + } else { Err(anyhow!("Dell fan command missing in DB")) } + } else { + debug!("Fan control not implemented for non-Dell systems yet"); + Ok(()) + } + } + + fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { + let rapl_path = self.fact_sheet.rapl_paths.first() + .ok_or_else(|| anyhow!("No RAPL path found for PL1"))?; + let path = rapl_path.join("constraint_0_power_limit_uw"); + fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?; + Ok(()) + } + + fn set_burst_power_limit(&self, watts: f32) -> Result<()> { + let rapl_path = self.fact_sheet.rapl_paths.first() + .ok_or_else(|| anyhow!("No RAPL path found for PL2"))?; + let path = rapl_path.join("constraint_1_power_limit_uw"); + fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?; + Ok(()) + } +} + +impl EnvironmentGuard for GenericLinuxSal { + fn suppress(&mut self) -> Result<()> { + for conflict_id in &self.fact_sheet.active_conflicts { + if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { + for service in &conflict.services { + debug!("Stopping service: {}", service); + if Command::new("systemctl").arg("stop").arg(service).status()?.success() { + self.suppressed_services.push(service.clone()); + } + } + } + } + Ok(()) + } + + fn restore(&mut self) -> Result<()> { + for service in self.suppressed_services.drain(..) { + debug!("Starting service: {}", service); + let _ = Command::new("systemctl").arg("start").arg(service).status(); + } + if self.is_dell() { + let _ = self.set_fan_mode("auto"); + } + Ok(()) + } +} + +impl HardwareWatchdog for GenericLinuxSal { + fn check_emergency(&self) -> Result { + if let Ok(temp) = self.get_temp() { + if temp > 100.0 { + return Ok(true); + } + } + Ok(false) + } +} + +impl Drop for GenericLinuxSal { + fn drop(&mut self) { + let _ = self.restore(); + } +} diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs new file mode 100644 index 0000000..a4f894a --- /dev/null +++ b/src/sal/heuristic/discovery.rs @@ -0,0 +1,185 @@ +use std::fs; +use std::path::{Path, PathBuf}; +use std::process::Command; +use std::time::Duration; +use std::thread; +use std::sync::mpsc; +use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict}; +use tracing::{debug, warn}; + +/// Strongly-typed findings about the current system. +#[derive(Debug, Clone, Default)] +pub struct SystemFactSheet { + pub vendor: String, + pub model: String, + pub temp_path: Option, + pub fan_paths: Vec, + pub rapl_paths: Vec, + pub active_conflicts: Vec, // List of conflict IDs found active +} + +/// Probes the system for hardware sensors, actuators, and service conflicts. +pub fn discover_facts( + sensors: &SensorDiscovery, + actuators: &ActuatorDiscovery, + conflicts: &[Conflict] +) -> SystemFactSheet { + let (vendor, model) = read_dmi_info(); + + debug!("DMI Identity: Vendor='{}', Model='{}'", vendor, model); + + let (temp_path, fan_paths) = discover_hwmon(sensors); + let rapl_paths = discover_rapl(actuators); + + let mut active_conflicts = Vec::new(); + for conflict in conflicts { + for service in &conflict.services { + if is_service_active(service) { + debug!("Detected active conflict: {} (Service: {})", conflict.id, service); + active_conflicts.push(conflict.id.clone()); + break; // Found one service in this conflict, move to next conflict + } + } + } + + SystemFactSheet { + vendor, + model, + temp_path, + fan_paths, + rapl_paths, + active_conflicts, + } +} + +/// Reads DMI information from sysfs with a safety timeout. +fn read_dmi_info() -> (String, String) { + let vendor = read_sysfs_with_timeout(Path::new("/sys/class/dmi/id/sys_vendor"), Duration::from_millis(100)) + .unwrap_or_else(|| "Unknown".to_string()); + let model = read_sysfs_with_timeout(Path::new("/sys/class/dmi/id/product_name"), Duration::from_millis(100)) + .unwrap_or_else(|| "Unknown".to_string()); + (vendor, model) +} + +/// Discovers hwmon sensors by matching labels and prioritizing drivers. +fn discover_hwmon(cfg: &SensorDiscovery) -> (Option, Vec) { + let mut temp_candidates = Vec::new(); + let mut fan_candidates = Vec::new(); + + let hwmon_base = Path::new("/sys/class/hwmon"); + let entries = match fs::read_dir(hwmon_base) { + Ok(e) => e, + Err(e) => { + warn!("Could not read /sys/class/hwmon: {}", e); + return (None, Vec::new()); + } + }; + + for entry in entries.flatten() { + let hwmon_path = entry.path(); + + let driver_name = read_sysfs_with_timeout(&hwmon_path.join("name"), Duration::from_millis(100)) + .unwrap_or_default(); + + let priority = cfg.hwmon_priority + .iter() + .position(|p| p == &driver_name) + .unwrap_or(usize::MAX); + + if let Ok(hw_entries) = fs::read_dir(&hwmon_path) { + for hw_entry in hw_entries.flatten() { + let file_name = hw_entry.file_name().into_string().unwrap_or_default(); + + // Temperature Sensors + if file_name.starts_with("temp") && file_name.ends_with("_label") { + if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { + if cfg.temp_labels.iter().any(|l| label.contains(l)) { + let input_path = hwmon_path.join(file_name.replace("_label", "_input")); + if input_path.exists() { + temp_candidates.push((priority, input_path)); + } + } + } + } + + // Fan Sensors + if file_name.starts_with("fan") && file_name.ends_with("_label") { + if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { + if cfg.fan_labels.iter().any(|l| label.contains(l)) { + let input_path = hwmon_path.join(file_name.replace("_label", "_input")); + if input_path.exists() { + fan_candidates.push((priority, input_path)); + } + } + } + } + } + } + } + + temp_candidates.sort_by_key(|(p, _)| *p); + fan_candidates.sort_by_key(|(p, _)| *p); + + let best_temp = temp_candidates.first().map(|(_, p)| p.clone()); + let best_fans = fan_candidates.into_iter().map(|(_, p)| p).collect(); + + (best_temp, best_fans) +} + +/// Discovers RAPL powercap paths. +fn discover_rapl(cfg: &ActuatorDiscovery) -> Vec { + let mut paths = Vec::new(); + let powercap_base = Path::new("/sys/class/powercap"); + + let entries = match fs::read_dir(powercap_base) { + Ok(e) => e, + Err(_) => return Vec::new(), + }; + + for entry in entries.flatten() { + let path = entry.path(); + let dir_name = entry.file_name().into_string().unwrap_or_default(); + + if cfg.rapl_paths.contains(&dir_name) { + paths.push(path); + continue; + } + + if let Some(name) = read_sysfs_with_timeout(&path.join("name"), Duration::from_millis(100)) { + if cfg.rapl_paths.iter().any(|p| p == &name) { + paths.push(path); + } + } + } + paths +} + +/// Checks if a systemd service is currently active. +pub fn is_service_active(service: &str) -> bool { + let status = Command::new("systemctl") + .arg("is-active") + .arg("--quiet") + .arg(service) + .status(); + + match status { + Ok(s) => s.success(), + Err(_) => false, + } +} + +/// Helper to read a sysfs file with a timeout. +fn read_sysfs_with_timeout(path: &Path, timeout: Duration) -> Option { + let (tx, rx) = mpsc::channel(); + let path_buf = path.to_path_buf(); + + thread::spawn(move || { + let res = fs::read_to_string(path_buf).map(|s| s.trim().to_string()); + let _ = tx.send(res); + }); + + match rx.recv_timeout(timeout) { + Ok(Ok(content)) => Some(content), + _ => None, + } +} diff --git a/src/sal/heuristic/engine.rs b/src/sal/heuristic/engine.rs new file mode 100644 index 0000000..fce728c --- /dev/null +++ b/src/sal/heuristic/engine.rs @@ -0,0 +1,60 @@ +use miette::{Result, IntoDiagnostic, Context}; +use std::fs; +use regex::Regex; +use tracing::{info, debug}; + +use crate::sal::traits::PlatformSal; +use crate::sal::dell_xps_9380::DellXps9380Sal; +use crate::sal::generic_linux::GenericLinuxSal; +use crate::sal::heuristic::schema::HardwareDb; +use crate::sal::heuristic::discovery::{discover_facts}; + +pub struct HeuristicEngine; + +impl HeuristicEngine { + /// Loads the hardware database, probes the system, and builds the appropriate SAL. + pub fn detect_and_build() -> Result> { + // 1. Load Hardware DB + let db_path = "assets/hardware_db.toml"; + let db_content = fs::read_to_string(db_path) + .into_diagnostic() + .with_context(|| format!("Failed to read hardware database at {}", db_path))?; + let db: HardwareDb = toml::from_str(&db_content) + .into_diagnostic() + .context("Failed to parse hardware_db.toml")?; + + // 2. Discover Facts + let facts = discover_facts(&db.discovery.sensors, &db.discovery.actuators, &db.conflicts); + info!("System Identity: {} {}", facts.vendor, facts.model); + + // 3. Routing Logic + + // --- Special Case: Dell XPS 13 9380 --- + if is_match(&facts.vendor, "(?i)Dell.*") && is_match(&facts.model, "(?i)XPS.*13.*9380.*") { + info!("Specialized SAL Match Found: Dell XPS 13 9380"); + let sal = DellXps9380Sal::init().map_err(|e| miette::miette!(e))?; + return Ok(Box::new(sal)); + } + + // --- Fallback: Generic Linux SAL --- + debug!("No specialized SAL match. Falling back to GenericLinuxSal with DB quirks."); + + // Validation: Ensure we found at least a temperature sensor if required + if facts.temp_path.is_none() { + return Err(miette::miette!("No temperature sensor discovered. Generic fallback impossible.")); + } + if facts.rapl_paths.is_empty() { + return Err(miette::miette!("No RAPL power interface discovered. Generic fallback impossible.")); + } + + Ok(Box::new(GenericLinuxSal::new(facts, db))) + } +} + +fn is_match(input: &str, pattern: &str) -> bool { + if let Ok(re) = Regex::new(pattern) { + re.is_match(input) + } else { + false + } +} diff --git a/src/sal/heuristic/mod.rs b/src/sal/heuristic/mod.rs new file mode 100644 index 0000000..75942f8 --- /dev/null +++ b/src/sal/heuristic/mod.rs @@ -0,0 +1,3 @@ +pub mod schema; +pub mod discovery; +pub mod engine; diff --git a/src/sal/heuristic/schema.rs b/src/sal/heuristic/schema.rs new file mode 100644 index 0000000..316e701 --- /dev/null +++ b/src/sal/heuristic/schema.rs @@ -0,0 +1,90 @@ +use serde::Deserialize; +use std::collections::HashMap; + +#[derive(Debug, Deserialize, Clone)] +pub struct HardwareDb { + pub metadata: Metadata, + pub conflicts: Vec, + pub ecosystems: HashMap, + pub quirks: Vec, + pub discovery: Discovery, + pub preflight_checks: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct Metadata { + pub version: String, + pub updated: String, + pub description: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct Conflict { + pub id: String, + pub services: Vec, + pub contention: String, + pub severity: String, + pub fix_action: String, + pub help_text: String, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct Ecosystem { + pub vendor_regex: String, + pub polling_cap_ms: Option, + pub drivers: Option>, + pub fan_manual_mode_cmd: Option, + pub fan_auto_mode_cmd: Option, + pub safety_register: Option, + pub lap_mode_path: Option, + pub profiles_path: Option, + pub ec_write_required: Option, + pub thermal_policy_path: Option, + pub policy_map: Option>, + pub msr_lock_register: Option, + pub msr_lock_bit: Option, + pub fan_boost_path: Option, + pub ec_tool: Option, + pub optimization: Option, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct Quirk { + pub model_regex: String, + pub id: String, + pub issue: String, + pub action: String, + pub monitor_msr: Option, + pub reset_bit: Option, + pub trigger_path: Option, + pub trigger_value: Option, + pub target_path: Option, + pub format: Option, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct Discovery { + pub sensors: SensorDiscovery, + pub actuators: ActuatorDiscovery, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct SensorDiscovery { + pub temp_labels: Vec, + pub fan_labels: Vec, + pub hwmon_priority: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct ActuatorDiscovery { + pub rapl_paths: Vec, + pub amd_energy_paths: Vec, + pub governor_files: Vec, +} + +#[derive(Debug, Deserialize, Clone)] +pub struct PreflightCheck { + pub name: String, + pub check_cmd: String, + pub fail_help: String, +} diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 097f049..dabe27a 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,8 +1,15 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep}; use anyhow::Result; -pub struct MockAuditor; -impl PreflightAuditor for MockAuditor { +pub struct MockSal; + +impl MockSal { + pub fn new() -> Self { + Self + } +} + +impl PreflightAuditor for MockSal { fn audit(&self) -> Box + '_> { let steps = vec![ AuditStep { @@ -18,32 +25,16 @@ impl PreflightAuditor for MockAuditor { } } -pub struct MockGuard { - pub suppressed: bool, -} -impl MockGuard { - pub fn new() -> Self { - Self { suppressed: false } - } -} -impl EnvironmentGuard for MockGuard { +impl EnvironmentGuard for MockSal { fn suppress(&mut self) -> Result<()> { - self.suppressed = true; Ok(()) } fn restore(&mut self) -> Result<()> { - self.suppressed = false; Ok(()) } } -impl Drop for MockGuard { - fn drop(&mut self) { - let _ = self.restore(); - } -} -pub struct MockSensorBus; -impl SensorBus for MockSensorBus { +impl SensorBus for MockSal { fn get_temp(&self) -> Result { Ok(42.0) } @@ -58,8 +49,7 @@ impl SensorBus for MockSensorBus { } } -pub struct MockActuatorBus; -impl ActuatorBus for MockActuatorBus { +impl ActuatorBus for MockSal { fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } @@ -71,8 +61,7 @@ impl ActuatorBus for MockActuatorBus { } } -pub struct MockWatchdog; -impl HardwareWatchdog for MockWatchdog { +impl HardwareWatchdog for MockSal { fn check_emergency(&self) -> Result { Ok(false) } diff --git a/src/sal/mod.rs b/src/sal/mod.rs index 53e7712..16526ac 100644 --- a/src/sal/mod.rs +++ b/src/sal/mod.rs @@ -1,3 +1,5 @@ pub mod traits; pub mod mock; pub mod dell_xps_9380; +pub mod generic_linux; +pub mod heuristic; diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 906e1b4..3aabf75 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -48,7 +48,7 @@ impl PreflightAuditor for Arc { } /// Suppresses conflicting daemons (tlp, thermald). -pub trait EnvironmentGuard { +pub trait EnvironmentGuard: Send + Sync { fn suppress(&mut self) -> Result<()>; fn restore(&mut self) -> Result<()>; } @@ -77,7 +77,7 @@ impl SensorBus for Arc { } /// Write-only interface for hardware commands. -pub trait ActuatorBus { +pub trait ActuatorBus: Send + Sync { fn set_fan_mode(&self, mode: &str) -> Result<()>; fn set_sustained_power_limit(&self, watts: f32) -> Result<()>; fn set_burst_power_limit(&self, watts: f32) -> Result<()>; @@ -96,7 +96,7 @@ impl ActuatorBus for Arc { } /// Concurrent monitor for catastrophic states. -pub trait HardwareWatchdog { +pub trait HardwareWatchdog: Send + Sync { fn check_emergency(&self) -> Result; } @@ -105,3 +105,8 @@ impl HardwareWatchdog for Arc { (**self).check_emergency() } } + +/// Aggregate trait for a complete platform implementation. +pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {} + +impl PlatformSal for T {} From 073414a25ed8538fa6fefbe0b5f134b38e97f5de Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 15:52:44 +0100 Subject: [PATCH 02/13] fixed hardware_db and improved stability and robustness of generic sal --- assets/hardware_db.toml | 42 ++++++++-- assets/hardware_db.toml.bak | 117 +++++++++++++++++++++++++++ src/engine/mod.rs | 22 ++++-- src/load/mod.rs | 37 ++++++--- src/main.rs | 64 ++++++++------- src/mediator.rs | 2 + src/orchestrator/mod.rs | 61 ++++++++++++-- src/sal/dell_xps_9380.rs | 154 +++++++++++++++++------------------- src/sal/generic_linux.rs | 125 ++++++++++++----------------- src/sal/heuristic/schema.rs | 2 + src/sal/mock.rs | 10 +-- src/sal/traits.rs | 27 +++++-- src/ui/dashboard.rs | 50 ++++++++++++ 13 files changed, 488 insertions(+), 225 deletions(-) create mode 100644 assets/hardware_db.toml.bak diff --git a/assets/hardware_db.toml b/assets/hardware_db.toml index 3f5e480..4eeedef 100644 --- a/assets/hardware_db.toml +++ b/assets/hardware_db.toml @@ -1,5 +1,5 @@ [metadata] -version = "1.0.0" +version = "1.1.0" updated = "2026-02-26" description = "Hardware and Conflict Database for ember-tune Thermal Engine" @@ -29,6 +29,14 @@ severity = "Medium" fix_action = "SuspendService" help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification." +[[conflicts]] +id = "dell_fan_collision" +services = ["i8kmon.service"] +contention = "Dell SMM Fan Control" +severity = "High" +fix_action = "SuspendService" +help_text = "i8kmon fights with ember-tune for SMM fan duty cycles. Suspend during benchmark." + # manufacturer wide logic [ecosystems.dell] @@ -38,6 +46,7 @@ drivers = ["dell_smm_hwmon"] fan_manual_mode_cmd = "dell-bios-fan-control 0" fan_auto_mode_cmd = "dell-bios-fan-control 1" safety_register = "0x1FC" # BD PROCHOT MSR +help_text = "Dell systems often require 'SMM Security Mitigation' disabled in BIOS for fan control." [ecosystems.lenovo] vendor_regex = "LENOVO" @@ -60,6 +69,13 @@ fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable" vendor_regex = "Framework" ec_tool = "ectool" optimization = "Direct-FFI-SMC" +polling_cap_ms = 500 + +[ecosystems.surface] +vendor_regex = "Microsoft Corporation" +product_regex = "Surface.*" +drivers = ["surface_acpi"] +profiles_path = "/sys/bus/platform/devices/surface_performance/platform_profile" # quirks: model quirks and fixes @@ -85,6 +101,7 @@ id = "asus_fan_hex_support" issue = "Custom Hex Curve Interface" target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve" format = "HexPair16" +action = "ManualFanControlRequired" [[quirks]] model_regex = "Spectre x360" @@ -92,15 +109,23 @@ id = "hp_rapl_lockout" issue = "Hardware MSR Lockout" action = "WarnUserMSRLocked" +[[quirks]] +model_regex = "Framework.*" +id = "framework_prochot_stuck" +issue = "BD PROCHOT wedged at 200MHz" +monitor_msr = "0x1FC" +reset_bit = 0 +action = "ClearBitOnSafeTemp" + # heuristic discovery [discovery.sensors] -temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"] -fan_labels = ["CPU Fan", "GPU Fan", "System Fan"] -hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"] +temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature", "Core 0", "Composite"] +fan_labels = ["CPU Fan", "GPU Fan", "System Fan", "Processor Fan"] +hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm", "thinkpad", "asus"] [discovery.actuators] -rapl_paths = ["intel-rapl:0", "package-0"] +rapl_paths = ["intel-rapl:0", "package-0", "intel-rapl:1"] amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"] governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"] @@ -113,5 +138,10 @@ fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit [[preflight_checks]] name = "Kernel Lockdown Status" -check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'" +check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]' || ! [ -f /sys/kernel/security/lockdown ]" fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module." + +[[preflight_checks]] +name = "Intel P-State Check" +check_cmd = "[ -d /sys/devices/system/cpu/intel_pstate ] || [ -d /sys/devices/system/cpu/cpufreq/policy0 ]" +fail_help = "CPU Frequency scaling driver not detected. Ensure intel_pstate or acpi-cpufreq is loaded." diff --git a/assets/hardware_db.toml.bak b/assets/hardware_db.toml.bak new file mode 100644 index 0000000..3f5e480 --- /dev/null +++ b/assets/hardware_db.toml.bak @@ -0,0 +1,117 @@ +[metadata] +version = "1.0.0" +updated = "2026-02-26" +description = "Hardware and Conflict Database for ember-tune Thermal Engine" + +# service collision + +[[conflicts]] +id = "tlp_vs_ppd" +services = ["tlp.service", "power-profiles-daemon.service"] +contention = "ACPI Platform Profile / EPP" +severity = "Critical" +fix_action = "MaskBoth" +help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both to allow ember-tune deterministic control." + +[[conflicts]] +id = "thermal_logic_collision" +services = ["thermald.service", "throttled.service"] +contention = "RAPL / MSR / BD-PROCHOT" +severity = "High" +fix_action = "SuspendService" +help_text = "Thermald and Throttled create a 'register ping-pong' loop. Disable throttled; ember-tune will manage RAPL limits." + +[[conflicts]] +id = "freq_scaling_collision" +services = ["auto-cpufreq.service"] +contention = "CPU Scaling Governor" +severity = "Medium" +fix_action = "SuspendService" +help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification." + +# manufacturer wide logic + +[ecosystems.dell] +vendor_regex = "(Dell.*|Precision.*|Latitude.*|XPS.*)" +polling_cap_ms = 1000 +drivers = ["dell_smm_hwmon"] +fan_manual_mode_cmd = "dell-bios-fan-control 0" +fan_auto_mode_cmd = "dell-bios-fan-control 1" +safety_register = "0x1FC" # BD PROCHOT MSR + +[ecosystems.lenovo] +vendor_regex = "LENOVO" +lap_mode_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode" +profiles_path = "/sys/firmware/acpi/platform_profile" +ec_write_required = false # Varies by model + +[ecosystems.asus] +vendor_regex = "ASUSTeK.*" +thermal_policy_path = "/sys/devices/platform/asus-nb-wmi/throttle_thermal_policy" +policy_map = { Balanced = 0, Turbo = 1, Silent = 2 } + +[ecosystems.hp] +vendor_regex = "HP" +msr_lock_register = "0x610" +msr_lock_bit = 63 +fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable" + +[ecosystems.framework] +vendor_regex = "Framework" +ec_tool = "ectool" +optimization = "Direct-FFI-SMC" + +# quirks: model quirks and fixes + +[[quirks]] +model_regex = "XPS 13 93.*" +id = "dell_bd_prochot_fix" +issue = "False Positive 400MHz Lock" +monitor_msr = "0x1FC" +reset_bit = 0 +action = "ClearBitOnSafeTemp" + +[[quirks]] +model_regex = "ThinkPad T14.*" +id = "lenovo_lap_throttling" +issue = "11W TDP Lock in Lap Mode" +trigger_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode" +trigger_value = "1" +action = "AbortOnLapMode" + +[[quirks]] +model_regex = "ROG Zephyrus G14" +id = "asus_fan_hex_support" +issue = "Custom Hex Curve Interface" +target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve" +format = "HexPair16" + +[[quirks]] +model_regex = "Spectre x360" +id = "hp_rapl_lockout" +issue = "Hardware MSR Lockout" +action = "WarnUserMSRLocked" + +# heuristic discovery + +[discovery.sensors] +temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"] +fan_labels = ["CPU Fan", "GPU Fan", "System Fan"] +hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"] + +[discovery.actuators] +rapl_paths = ["intel-rapl:0", "package-0"] +amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"] +governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"] + +# env health verification + +[[preflight_checks]] +name = "MSR Write Access" +check_cmd = "grep -q 'msr.allow_writes=on' /proc/cmdline" +fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit manipulation." + +[[preflight_checks]] +name = "Kernel Lockdown Status" +check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'" +fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module." diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 540b751..24878bc 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -91,24 +91,30 @@ impl OptimizerEngine { // 1. Efficiency Metric (Throughput per Watt) // If throughput is 0 (unsupported), fallback to Frequency per Watt let efficiency_curr = if curr.throughput > 0.0 { - curr.throughput as f32 / curr.power_w.max(0.1) + curr.throughput as f32 / curr.power_w.max(1.0) } else { - curr.freq_mhz / curr.power_w.max(0.1) + curr.freq_mhz / curr.power_w.max(1.0) }; let efficiency_next = if next.throughput > 0.0 { - next.throughput as f32 / next.power_w.max(0.1) + next.throughput as f32 / next.power_w.max(1.0) } else { - next.freq_mhz / next.power_w.max(0.1) + next.freq_mhz / next.power_w.max(1.0) }; // Diminishing returns: how much efficiency drops per additional watt - let efficiency_drop = (efficiency_curr - efficiency_next) / (next.power_w - curr.power_w).max(0.1); + let p_delta = (next.power_w - curr.power_w).max(0.5); + let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta; // 2. Thermal Acceleration (d2T/dW2) - let dt_dw_prev = (curr.temp_c - prev.temp_c) / (curr.power_w - prev.power_w).max(0.1); - let dt_dw_next = (next.temp_c - curr.temp_c) / (next.power_w - curr.power_w).max(0.1); - let temp_accel = (dt_dw_next - dt_dw_prev) / (next.power_w - prev.power_w).max(0.1); + let p_delta_prev = (curr.power_w - prev.power_w).max(0.5); + let p_delta_next = (next.power_w - curr.power_w).max(0.5); + + let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev; + let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next; + + let p_total_delta = (next.power_w - prev.power_w).max(1.0); + let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta; // 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall) let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput); diff --git a/src/load/mod.rs b/src/load/mod.rs index 88ea83b..bc80d98 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -1,16 +1,16 @@ use anyhow::Result; +use std::process::Child; +use std::time::{Duration, Instant}; +use std::thread; -pub trait Workload { - /// Starts the workload with specified threads and load percentage. +pub trait Workload: Send + Sync { fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>; - /// Stops the workload. fn stop(&mut self) -> Result<()>; - /// Returns the current throughput (e.g., ops/sec). fn get_throughput(&self) -> Result; } pub struct StressNg { - child: Option, + child: Option, } impl StressNg { @@ -21,7 +21,7 @@ impl StressNg { impl Workload for StressNg { fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> { - self.stop()?; // Ensure any previous instance is stopped + self.stop()?; let child = std::process::Command::new("stress-ng") .args([ @@ -37,15 +37,32 @@ impl Workload for StressNg { fn stop(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { - let _ = child.kill(); - let _ = child.wait(); + // Try SIGTERM first + #[cfg(unix)] + { + use libc::{kill, SIGTERM}; + unsafe { kill(child.id() as i32, SIGTERM); } + } + + let start = Instant::now(); + let mut exited = false; + while start.elapsed() < Duration::from_secs(2) { + if let Ok(Some(_)) = child.try_wait() { + exited = true; + break; + } + thread::sleep(Duration::from_millis(100)); + } + + if !exited { + let _ = child.kill(); + let _ = child.wait(); + } } Ok(()) } fn get_throughput(&self) -> Result { - // In a real implementation, we would parse stress-ng's temporary results - // or use a different workload that provides live throughput. Ok(0.0) } } diff --git a/src/main.rs b/src/main.rs index ab30b7b..9c62c32 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,7 @@ mod ui; mod engine; mod cli; -use miette::{Result, IntoDiagnostic, Diagnostic, Report}; +use miette::{Result, IntoDiagnostic, Diagnostic, Report, Context}; use thiserror::Error; use std::sync::mpsc; use std::thread; @@ -30,7 +30,7 @@ use mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use sal::traits::{AuditError, PlatformSal}; use sal::mock::MockSal; use sal::heuristic::engine::HeuristicEngine; -use load::StressNg; +use load::{StressNg, Workload}; use orchestrator::BenchmarkOrchestrator; use ui::dashboard::{draw_dashboard, DashboardState}; use engine::OptimizationResult; @@ -108,10 +108,10 @@ fn main() -> Result<()> { info!("ember-tune starting with args: {:?}", args); // 2. Platform Detection & Audit - let sal: Box = if args.mock { - Box::new(MockSal::new()) + let sal: Arc = if args.mock { + Arc::new(MockSal::new()) } else { - HeuristicEngine::detect_and_build()? + HeuristicEngine::detect_and_build()?.into() }; println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); @@ -122,9 +122,7 @@ fn main() -> Result<()> { io::Write::flush(&mut io::stdout()).into_diagnostic()?; match step.outcome { - Ok(_) => { - println!("{}", console::style("[✓]").green()); - } + Ok(_) => { println!("{}", console::style("[✓]").green()); } Err(e) => { println!("{}", console::style("[✗]").red()); audit_failures.push(e); @@ -137,10 +135,8 @@ fn main() -> Result<()> { return Err(Report::new(MultiAuditError { errors: audit_failures })); } - println!("{}", console::style("✓ All pre-flight audits passed.").green().bold()); - thread::sleep(Duration::from_secs(1)); - if args.audit_only { + println!("{}", console::style("✓ All pre-flight audits passed.").green().bold()); return Ok(()); } @@ -159,21 +155,22 @@ fn main() -> Result<()> { let (telemetry_tx, telemetry_rx) = mpsc::channel::(); let (command_tx, command_rx) = mpsc::channel::(); + let c_tx = command_tx.clone(); ctrlc::set_handler(move || { + let _ = c_tx.send(UiCommand::Abort); r.store(false, Ordering::SeqCst); }).expect("Error setting Ctrl-C handler"); // 5. Spawn Backend Orchestrator + let sal_backend = sal.clone(); let backend_handle = thread::spawn(move || { let workload = Box::new(StressNg::new()); - let mut orchestrator = BenchmarkOrchestrator::new( - sal, + sal_backend, workload, telemetry_tx, command_rx, ); - orchestrator.run() }); @@ -197,6 +194,8 @@ fn main() -> Result<()> { history_mhz: Vec::new(), log_event: None, metadata: std::collections::HashMap::new(), + is_emergency: false, + emergency_reason: None, }; let tick_rate = Duration::from_millis(100); @@ -233,29 +232,38 @@ fn main() -> Result<()> { } } - if last_tick.elapsed() >= tick_rate { - last_tick = Instant::now(); - } - - if backend_handle.is_finished() { - thread::sleep(Duration::from_secs(1)); - break; - } + if last_tick.elapsed() >= tick_rate { last_tick = Instant::now(); } + if backend_handle.is_finished() { break; } } // 7. Terminal Restoration - disable_raw_mode().into_diagnostic()?; - execute!(terminal.backend_mut(), LeaveAlternateScreen).into_diagnostic()?; - terminal.show_cursor().into_diagnostic()?; + let _ = disable_raw_mode(); + let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen); + let _ = terminal.show_cursor(); - // 8. Final Report (Post-TUI) - match backend_handle.join() { + // 8. Final Report & Hardware Restoration + let join_res = backend_handle.join(); + + // Explicit hardware restoration + info!("Restoring hardware state..."); + if let Err(e) = sal.restore() { + error!("Failed to restore hardware state: {}", e); + } + + match join_res { Ok(Ok(result)) => { print_summary_report(&result); } Ok(Err(e)) => { - if e.to_string() == "ABORTED" { + let err_str = e.to_string(); + if err_str == "ABORTED" { println!("{}", "Benchmark aborted by user.".yellow()); + } else if err_str.contains("EMERGENCY_ABORT") { + println!(); + println!("{}", " 🚨 EMERGENCY ABORT TRIGGERED ".bold().on_red().white()); + println!("Reason: {}", err_str.replace("EMERGENCY_ABORT: ", "").red().bold()); + println!("{}", "Hardware state has been restored to safe defaults.".yellow()); + println!(); } else { error!("Orchestrator encountered error: {}", e); eprintln!("{} {}", "Error:".red().bold(), e); diff --git a/src/mediator.rs b/src/mediator.rs index 2bddbbc..5ca3950 100644 --- a/src/mediator.rs +++ b/src/mediator.rs @@ -42,6 +42,8 @@ pub struct TelemetryState { pub log_event: Option, pub metadata: std::collections::HashMap, + pub is_emergency: bool, + pub emergency_reason: Option, } #[derive(Debug, Clone)] diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index b4b7b73..83b5101 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -4,14 +4,17 @@ use std::time::{Duration, Instant}; use std::thread; use std::collections::VecDeque; use sysinfo::System; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Mutex; -use crate::sal::traits::{PlatformSal}; +use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; pub struct BenchmarkOrchestrator { - sal: Box, + sal: Arc, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -27,11 +30,15 @@ pub struct BenchmarkOrchestrator { // --- Static Info --- cpu_model: String, total_ram_gb: u64, + + // --- Safety --- + emergency_abort: Arc, + emergency_reason: Arc>>, } impl BenchmarkOrchestrator { pub fn new( - sal: Box, + sal: Arc, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -57,12 +64,17 @@ impl BenchmarkOrchestrator { history_mhz: VecDeque::with_capacity(120), cpu_model, total_ram_gb, + emergency_abort: Arc::new(AtomicBool::new(false)), + emergency_reason: Arc::new(Mutex::new(None)), } } pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; + // Start Watchdog Monitor + let _watchdog_handle = self.spawn_watchdog_monitor(); + // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { @@ -111,11 +123,6 @@ impl BenchmarkOrchestrator { while step_start.elapsed() < Duration::from_secs(45) { self.check_abort()?; - if self.sal.check_emergency()? { - self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?; - self.workload.stop()?; - return Err(anyhow::anyhow!("Hardware Watchdog Triggered")); - } let t = self.sal.get_temp().unwrap_or(0.0); step_temps.push_back(t); @@ -204,6 +211,35 @@ impl BenchmarkOrchestrator { Ok(res) } + fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { + let abort = self.emergency_abort.clone(); + let reason_store = self.emergency_reason.clone(); + let sal = self.sal.clone(); + + thread::spawn(move || { + while !abort.load(Ordering::SeqCst) { + let status = sal.get_safety_status(); + match status { + Ok(SafetyStatus::EmergencyAbort(reason)) => { + *reason_store.lock().unwrap() = Some(reason.clone()); + abort.store(true, Ordering::SeqCst); + break; + } + Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => { + // Send warning log to UI + } + Ok(SafetyStatus::Nominal) => {} + Err(e) => { + *reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e)); + abort.store(true, Ordering::SeqCst); + break; + } + } + thread::sleep(Duration::from_millis(100)); + } + }) + } + pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { let r_theta = self.engine.calculate_thermal_resistance(&self.profile); let knee = self.engine.find_silicon_knee(&self.profile); @@ -221,6 +257,11 @@ impl BenchmarkOrchestrator { } fn check_abort(&self) -> Result<()> { + if self.emergency_abort.load(Ordering::SeqCst) { + let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); + return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason)); + } + if let Ok(cmd) = self.command_rx.try_recv() { match cmd { UiCommand::Abort => { @@ -250,6 +291,8 @@ impl BenchmarkOrchestrator { history_mhz: Vec::new(), log_event: Some(msg.to_string()), metadata: std::collections::HashMap::new(), + is_emergency: self.emergency_abort.load(Ordering::SeqCst), + emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } @@ -287,6 +330,8 @@ impl BenchmarkOrchestrator { history_mhz: self.history_mhz.iter().cloned().collect(), log_event: None, metadata: std::collections::HashMap::new(), + is_emergency: self.emergency_abort.load(Ordering::SeqCst), + emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index e8f7fc6..b59c197 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,11 +1,11 @@ -use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep}; +use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus}; use anyhow::{Result, Context}; use std::fs; use std::path::PathBuf; use std::process::Command; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::debug; +use tracing::{debug, warn}; pub struct DellXps9380Sal { temp_path: PathBuf, @@ -18,6 +18,8 @@ pub struct DellXps9380Sal { last_temp: Mutex, last_fans: Mutex>, suppressed_services: Mutex>, + msr_file: Mutex, + last_energy: Mutex<(u64, Instant)>, } impl DellXps9380Sal { @@ -35,7 +37,6 @@ impl DellXps9380Sal { if name == "dell_smm" { temp_path = Some(p.join("temp1_input")); - // Discover all fans if let Ok(fan_entries) = fs::read_dir(&p) { for fan_entry in fan_entries.flatten() { let fan_p = fan_entry.path(); @@ -54,7 +55,6 @@ impl DellXps9380Sal { } } - // Discovery for RAPL via powercap if let Ok(entries) = fs::read_dir("/sys/class/powercap") { for entry in entries.flatten() { let p = entry.path(); @@ -72,6 +72,9 @@ impl DellXps9380Sal { let rapl_base = rapl_base_path.context("Could not find RAPL package-0 path in powercap")?; let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); + + let msr_file = fs::OpenOptions::new().read(true).write(true).open("/dev/cpu/0/msr") + .context("Failed to open /dev/cpu/0/msr. Is the 'msr' module loaded?")?; Ok(Self { temp_path: temp_path.context("Could not find dell_smm temperature path")?, @@ -84,68 +87,64 @@ impl DellXps9380Sal { last_temp: Mutex::new(0.0), last_fans: Mutex::new(Vec::new()), suppressed_services: Mutex::new(Vec::new()), + msr_file: Mutex::new(msr_file), + last_energy: Mutex::new((0, Instant::now())), }) } + + fn read_msr(&self, msr: u32) -> Result { + use std::os::unix::fs::FileExt; + let mut buf = [0u8; 8]; + let file = self.msr_file.lock().unwrap(); + file.read_at(&mut buf, msr as u64)?; + Ok(u64::from_le_bytes(buf)) + } + + fn write_msr(&self, msr: u32, val: u64) -> Result<()> { + use std::os::unix::fs::FileExt; + let file = self.msr_file.lock().unwrap(); + file.write_at(&val.to_le_bytes(), msr as u64)?; + Ok(()) + } } impl PreflightAuditor for DellXps9380Sal { fn audit(&self) -> Box + '_> { let mut steps = Vec::new(); - - // 1. Root check steps.push(AuditStep { description: "Root Privileges".to_string(), outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) } }); - // 2. Kernel modules check (simplified check via sysfs/proc) let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"]; for mod_name in modules { let path = format!("/sys/module/{}", mod_name); steps.push(AuditStep { description: format!("Kernel Module: {}", mod_name), outcome: if PathBuf::from(path).exists() { Ok(()) } else { - Err(AuditError::ToolMissing(format!("Module '{}' not loaded. Run 'sudo modprobe {}'", mod_name, mod_name))) + Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) } }); } - // 3. Kernel parameters check let cmdline = fs::read_to_string("/proc/cmdline").unwrap_or_default(); - steps.push(AuditStep { - description: "Kernel Param: dell_smm_hwmon.ignore_dmi=1".to_string(), - outcome: if cmdline.contains("dell_smm_hwmon.ignore_dmi=1") { Ok(()) } else { - Err(AuditError::MissingKernelParam("dell_smm_hwmon.ignore_dmi=1".to_string())) - } - }); - steps.push(AuditStep { - description: "Kernel Param: dell_smm_hwmon.restricted=0".to_string(), - outcome: if cmdline.contains("dell_smm_hwmon.restricted=0") { Ok(()) } else { - Err(AuditError::MissingKernelParam("dell_smm_hwmon.restricted=0".to_string())) - } - }); - steps.push(AuditStep { - description: "Kernel Param: msr.allow_writes=on".to_string(), - outcome: if cmdline.contains("msr.allow_writes=on") { Ok(()) } else { - Err(AuditError::MissingKernelParam("msr.allow_writes=on".to_string())) - } - }); + let params = [ + ("dell_smm_hwmon.ignore_dmi=1", "dell_smm_hwmon.ignore_dmi=1"), + ("dell_smm_hwmon.restricted=0", "dell_smm_hwmon.restricted=0"), + ("msr.allow_writes=on", "msr.allow_writes=on"), + ]; + for (label, p) in params { + steps.push(AuditStep { + description: format!("Kernel Param: {}", label), + outcome: if cmdline.contains(p) { Ok(()) } else { Err(AuditError::MissingKernelParam(p.to_string())) } + }); + } - // 4. Lockdown check - let lockdown = fs::read_to_string("/sys/kernel/security/lockdown").unwrap_or_default(); - steps.push(AuditStep { - description: "Kernel Lockdown Status".to_string(), - outcome: if lockdown.contains("[none]") || lockdown.is_empty() { Ok(()) } else { - Err(AuditError::KernelIncompatible("Kernel is in lockdown mode. Set to 'none' to allow MSR/SMM writes.".to_string())) - } - }); - - // 5. Check AC power let ac_status = fs::read_to_string("/sys/class/power_supply/AC/online").unwrap_or_else(|_| "0".to_string()); steps.push(AuditStep { description: "AC Power Connection".to_string(), outcome: if ac_status.trim() == "1" { Ok(()) } else { - Err(AuditError::AcPowerMissing("System must be on AC power for benchmarking".to_string())) + Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) } }); @@ -154,12 +153,11 @@ impl PreflightAuditor for DellXps9380Sal { } impl EnvironmentGuard for DellXps9380Sal { - fn suppress(&mut self) -> Result<()> { + fn suppress(&self) -> Result<()> { let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() { - debug!("Suppressing service: {}", s); Command::new("systemctl").args(["stop", s]).status()?; suppressed.push(s.to_string()); } @@ -167,7 +165,7 @@ impl EnvironmentGuard for DellXps9380Sal { Ok(()) } - fn restore(&mut self) -> Result<()> { + fn restore(&self) -> Result<()> { let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = Command::new("systemctl").args(["start", &s]).status(); @@ -176,38 +174,31 @@ impl EnvironmentGuard for DellXps9380Sal { } } -impl Drop for DellXps9380Sal { - fn drop(&mut self) { - let _ = self.restore(); - } -} - - impl SensorBus for DellXps9380Sal { fn get_temp(&self) -> Result { - // Enforce 1000ms rate limit for Dell SMM as per GEMINI.md let mut last_poll = self.last_poll.lock().unwrap(); let now = Instant::now(); - if now.duration_since(*last_poll) < Duration::from_millis(1000) { return Ok(*self.last_temp.lock().unwrap()); } - let s = fs::read_to_string(&self.temp_path)?; let val = s.trim().parse::()? / 1000.0; - *self.last_temp.lock().unwrap() = val; *last_poll = now; - Ok(val) } fn get_power_w(&self) -> Result { if self.pwr_path.to_string_lossy().contains("energy_uj") { - let e1 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; - std::thread::sleep(Duration::from_millis(100)); + let mut last = self.last_energy.lock().unwrap(); let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; - Ok((e2.saturating_sub(e1)) as f32 / 100000.0) + let t2 = Instant::now(); + let (e1, t1) = *last; + let delta_e = e2.wrapping_sub(e1); + let delta_t = t2.duration_since(t1).as_secs_f32(); + *last = (e2, t2); + if delta_t < 0.01 { return Ok(0.0); } + Ok((delta_e as f32 / 1_000_000.0) / delta_t) } else { let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) @@ -217,66 +208,65 @@ impl SensorBus for DellXps9380Sal { fn get_fan_rpms(&self) -> Result> { let mut last_poll = self.last_poll.lock().unwrap(); let now = Instant::now(); - if now.duration_since(*last_poll) < Duration::from_millis(1000) { return Ok(self.last_fans.lock().unwrap().clone()); } - let mut fans = Vec::new(); for path in &self.fan_paths { if let Ok(s) = fs::read_to_string(path) { - if let Ok(rpm) = s.trim().parse::() { - fans.push(rpm); - } + if let Ok(rpm) = s.trim().parse::() { fans.push(rpm); } } } - *self.last_fans.lock().unwrap() = fans.clone(); *last_poll = now; - Ok(fans) } fn get_freq_mhz(&self) -> Result { let s = fs::read_to_string(&self.freq_path)?; - let val = s.trim().parse::()? / 1000.0; - Ok(val) + Ok(s.trim().parse::()? / 1000.0) } } impl ActuatorBus for DellXps9380Sal { fn set_fan_mode(&self, mode: &str) -> Result<()> { match mode { - "max" | "Manual" => { - Command::new("dell-bios-fan-control").arg("0").status()?; - } - "auto" | "Auto" => { - Command::new("dell-bios-fan-control").arg("1").status()?; - } - _ => { - debug!("Unknown fan mode requested: {}", mode); - } + "max" | "Manual" => { Command::new("dell-bios-fan-control").arg("0").status()?; } + "auto" | "Auto" => { Command::new("dell-bios-fan-control").arg("1").status()?; } + _ => { debug!("Unknown fan mode: {}", mode); } } Ok(()) } fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - let uw = (watts * 1_000_000.0) as u64; - fs::write(&self.pl1_path, uw.to_string())?; + fs::write(&self.pl1_path, ((watts * 1_000_000.0) as u64).to_string())?; Ok(()) } fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - let uw = (watts * 1_000_000.0) as u64; - fs::write(&self.pl2_path, uw.to_string())?; + fs::write(&self.pl2_path, ((watts * 1_000_000.0) as u64).to_string())?; Ok(()) } } impl HardwareWatchdog for DellXps9380Sal { - fn check_emergency(&self) -> Result { - // Check for thermal throttling or BD PROCHOT - // Simplified for now - Ok(false) + fn get_safety_status(&self) -> Result { + let temp = self.get_temp()?; + if temp > 98.0 { + return Ok(SafetyStatus::EmergencyAbort(format!("Thermal Runaway: {:.1}°C", temp))); + } + if let Ok(msr_val) = self.read_msr(0x1FC) { + if (msr_val & 0x1) != 0 && temp < 85.0 { + let _ = self.write_msr(0x1FC, msr_val & !0x1); + return Ok(SafetyStatus::Warning("BD PROCHOT Latch Cleared".to_string())); + } + } + Ok(SafetyStatus::Nominal) + } +} + +impl Drop for DellXps9380Sal { + fn drop(&mut self) { + let _ = self.restore(); } } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index a9527be..13fde32 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -2,19 +2,21 @@ use anyhow::{Result, anyhow}; use std::path::Path; use std::fs; use std::time::{Duration, Instant}; -use std::thread; use std::process::Command; -use tracing::{debug}; -use std::sync::mpsc; +use tracing::{debug, warn}; +use std::sync::Mutex; -use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError}; +use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; pub struct GenericLinuxSal { fact_sheet: SystemFactSheet, db: HardwareDb, - suppressed_services: Vec, + suppressed_services: Mutex>, + last_valid_temp: Mutex<(f32, Instant)>, + current_pl1: Mutex, + last_energy: Mutex<(u64, Instant)>, } impl GenericLinuxSal { @@ -22,7 +24,10 @@ impl GenericLinuxSal { Self { fact_sheet, db, - suppressed_services: Vec::new(), + suppressed_services: Mutex::new(Vec::new()), + last_valid_temp: Mutex::new((0.0, Instant::now())), + current_pl1: Mutex::new(15.0), + last_energy: Mutex::new((0, Instant::now())), } } @@ -30,33 +35,18 @@ impl GenericLinuxSal { self.fact_sheet.vendor.to_lowercase().contains("dell") } - fn read_sysfs_timeout(&self, path: &Path, timeout: Duration) -> Result { - let (tx, rx) = mpsc::channel(); - let path_buf = path.to_path_buf(); - - thread::spawn(move || { - let res = fs::read_to_string(path_buf).map(|s| s.trim().to_string()); - let _ = tx.send(res); - }); - - match rx.recv_timeout(timeout) { - Ok(res) => res.map_err(|e| anyhow!("Failed to read sysfs: {}", e)), - Err(_) => Err(anyhow!("Timeout reading sysfs path: {:?}", path)), - } + /// Read sysfs safely. We removed the thread-per-read timeout logic + /// as it was inefficient. sysfs reads are generally fast enough. + fn read_sysfs(&self, path: &Path) -> Result { + fs::read_to_string(path).map(|s| s.trim().to_string()).map_err(|e| anyhow!(e)) } } impl PreflightAuditor for GenericLinuxSal { fn audit(&self) -> Box + '_> { let mut steps = Vec::new(); - - // 1. Static DB checks for check in &self.db.preflight_checks { - let status = Command::new("sh") - .arg("-c") - .arg(&check.check_cmd) - .status(); - + let status = Command::new("sh").arg("-c").arg(&check.check_cmd).status(); steps.push(AuditStep { description: check.name.clone(), outcome: match status { @@ -65,8 +55,6 @@ impl PreflightAuditor for GenericLinuxSal { } }); } - - // 2. Conflict checks (Critical only) for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { if conflict.severity == "Critical" { @@ -77,7 +65,6 @@ impl PreflightAuditor for GenericLinuxSal { } } } - Box::new(steps.into_iter()) } } @@ -86,31 +73,32 @@ impl SensorBus for GenericLinuxSal { fn get_temp(&self) -> Result { let path = self.fact_sheet.temp_path.as_ref() .ok_or_else(|| anyhow!("No temperature sensor path found"))?; - let content = self.read_sysfs_timeout(path, Duration::from_millis(200))?; - let milli_celsius: f32 = content.parse()?; - Ok(milli_celsius / 1000.0) + let content = self.read_sysfs(path)?; + let temp = content.parse::()? / 1000.0; + let mut last = self.last_valid_temp.lock().unwrap(); + if (temp - last.0).abs() > 0.01 { *last = (temp, Instant::now()); } + Ok(temp) } fn get_power_w(&self) -> Result { let rapl_path = self.fact_sheet.rapl_paths.first() .ok_or_else(|| anyhow!("No RAPL path found"))?; let energy_path = rapl_path.join("energy_uj"); - - let e1: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?; - let t1 = Instant::now(); - thread::sleep(Duration::from_millis(100)); - let e2: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?; + let mut last = self.last_energy.lock().unwrap(); + let e2: u64 = self.read_sysfs(&energy_path)?.parse()?; let t2 = Instant::now(); - + let (e1, t1) = *last; let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); + *last = (e2, t2); + if delta_t < 0.01 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } fn get_fan_rpms(&self) -> Result> { let mut rpms = Vec::new(); for path in &self.fact_sheet.fan_paths { - if let Ok(content) = self.read_sysfs_timeout(path, Duration::from_millis(200)) { + if let Ok(content) = self.read_sysfs(path) { if let Ok(rpm) = content.parse() { rpms.push(rpm); } } } @@ -120,10 +108,8 @@ impl SensorBus for GenericLinuxSal { fn get_freq_mhz(&self) -> Result { let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); if path.exists() { - let khz: f32 = self.read_sysfs_timeout(path, Duration::from_millis(200))?.parse()?; - Ok(khz / 1000.0) + Ok(self.read_sysfs(path)?.parse::()? / 1000.0) } else { - // Fallback: parse /proc/cpuinfo let cpuinfo = fs::read_to_string("/proc/cpuinfo")?; for line in cpuinfo.lines() { if line.starts_with("cpu MHz") { @@ -149,38 +135,32 @@ impl ActuatorBus for GenericLinuxSal { let parts: Vec<&str> = cmd_str.split_whitespace().collect(); Command::new(parts[0]).args(&parts[1..]).status()?; Ok(()) - } else { Err(anyhow!("Dell fan command missing in DB")) } - } else { - debug!("Fan control not implemented for non-Dell systems yet"); - Ok(()) - } + } else { Err(anyhow!("Dell fan command missing")) } + } else { Ok(()) } } fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first() - .ok_or_else(|| anyhow!("No RAPL path found for PL1"))?; - let path = rapl_path.join("constraint_0_power_limit_uw"); - fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?; + let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; + fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; + *self.current_pl1.lock().unwrap() = watts; Ok(()) } fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first() - .ok_or_else(|| anyhow!("No RAPL path found for PL2"))?; - let path = rapl_path.join("constraint_1_power_limit_uw"); - fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?; + let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?; + fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; Ok(()) } } impl EnvironmentGuard for GenericLinuxSal { - fn suppress(&mut self) -> Result<()> { + fn suppress(&self) -> Result<()> { + let mut suppressed = self.suppressed_services.lock().unwrap(); for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { for service in &conflict.services { - debug!("Stopping service: {}", service); if Command::new("systemctl").arg("stop").arg(service).status()?.success() { - self.suppressed_services.push(service.clone()); + suppressed.push(service.clone()); } } } @@ -188,31 +168,30 @@ impl EnvironmentGuard for GenericLinuxSal { Ok(()) } - fn restore(&mut self) -> Result<()> { - for service in self.suppressed_services.drain(..) { - debug!("Starting service: {}", service); + fn restore(&self) -> Result<()> { + let mut suppressed = self.suppressed_services.lock().unwrap(); + for service in suppressed.drain(..) { let _ = Command::new("systemctl").arg("start").arg(service).status(); } - if self.is_dell() { - let _ = self.set_fan_mode("auto"); - } + if self.is_dell() { let _ = self.set_fan_mode("auto"); } Ok(()) } } impl HardwareWatchdog for GenericLinuxSal { - fn check_emergency(&self) -> Result { - if let Ok(temp) = self.get_temp() { - if temp > 100.0 { - return Ok(true); - } + fn get_safety_status(&self) -> Result { + let temp = self.get_temp()?; + if temp > 100.0 { + return Ok(SafetyStatus::EmergencyAbort(format!("Thermal runaway: {:.1}°C", temp))); } - Ok(false) + let last = self.last_valid_temp.lock().unwrap(); + if last.1.elapsed() > Duration::from_secs(5) { + return Ok(SafetyStatus::EmergencyAbort("Temperature sensor stalled".to_string())); + } + Ok(SafetyStatus::Nominal) } } impl Drop for GenericLinuxSal { - fn drop(&mut self) { - let _ = self.restore(); - } + fn drop(&mut self) { let _ = self.restore(); } } diff --git a/src/sal/heuristic/schema.rs b/src/sal/heuristic/schema.rs index 316e701..a64ff9e 100644 --- a/src/sal/heuristic/schema.rs +++ b/src/sal/heuristic/schema.rs @@ -31,6 +31,7 @@ pub struct Conflict { #[derive(Debug, Deserialize, Clone)] pub struct Ecosystem { pub vendor_regex: String, + pub product_regex: Option, pub polling_cap_ms: Option, pub drivers: Option>, pub fan_manual_mode_cmd: Option, @@ -46,6 +47,7 @@ pub struct Ecosystem { pub fan_boost_path: Option, pub ec_tool: Option, pub optimization: Option, + pub help_text: Option, } #[derive(Debug, Deserialize, Clone)] diff --git a/src/sal/mock.rs b/src/sal/mock.rs index dabe27a..bb01fad 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,4 +1,4 @@ -use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep}; +use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, PlatformSal, SafetyStatus}; use anyhow::Result; pub struct MockSal; @@ -26,10 +26,10 @@ impl PreflightAuditor for MockSal { } impl EnvironmentGuard for MockSal { - fn suppress(&mut self) -> Result<()> { + fn suppress(&self) -> Result<()> { Ok(()) } - fn restore(&mut self) -> Result<()> { + fn restore(&self) -> Result<()> { Ok(()) } } @@ -62,7 +62,7 @@ impl ActuatorBus for MockSal { } impl HardwareWatchdog for MockSal { - fn check_emergency(&self) -> Result { - Ok(false) + fn get_safety_status(&self) -> Result { + Ok(SafetyStatus::Nominal) } } diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 3aabf75..e71ef28 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -49,8 +49,17 @@ impl PreflightAuditor for Arc { /// Suppresses conflicting daemons (tlp, thermald). pub trait EnvironmentGuard: Send + Sync { - fn suppress(&mut self) -> Result<()>; - fn restore(&mut self) -> Result<()>; + fn suppress(&self) -> Result<()>; + fn restore(&self) -> Result<()>; +} + +impl EnvironmentGuard for Arc { + fn suppress(&self) -> Result<()> { + (**self).suppress() + } + fn restore(&self) -> Result<()> { + (**self).restore() + } } /// Read-only interface for standardized metrics. @@ -97,15 +106,23 @@ impl ActuatorBus for Arc { /// Concurrent monitor for catastrophic states. pub trait HardwareWatchdog: Send + Sync { - fn check_emergency(&self) -> Result; + fn get_safety_status(&self) -> Result; } impl HardwareWatchdog for Arc { - fn check_emergency(&self) -> Result { - (**self).check_emergency() + fn get_safety_status(&self) -> Result { + (**self).get_safety_status() } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SafetyStatus { + Nominal, + Warning(String), + Critical(String), + EmergencyAbort(String), +} + /// Aggregate trait for a complete platform implementation. pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {} diff --git a/src/ui/dashboard.rs b/src/ui/dashboard.rs index 9df3041..17d309f 100644 --- a/src/ui/dashboard.rs +++ b/src/ui/dashboard.rs @@ -5,6 +5,7 @@ use ratatui::{ widgets::{Block, Borders, List, ListItem, Paragraph, Chart, Dataset, Axis, BorderType, GraphType}, symbols::Marker, Frame, + prelude::Stylize, }; use crate::mediator::TelemetryState; use crate::ui::theme::*; @@ -83,6 +84,55 @@ pub fn draw_dashboard( draw_freq_graph(f, right_side_chunks[2], state); draw_logs(f, chunks[3], ui_state); + + if state.is_emergency { + draw_emergency_overlay(f, area, state); + } +} + +fn draw_emergency_overlay(f: &mut Frame, area: Rect, state: &TelemetryState) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Double) + .border_style(Style::default().fg(Color::Red).add_modifier(Modifier::BOLD)) + .bg(Color::Black) + .title(" 🚨 EMERGENCY ABORT 🚨 "); + + let area = centered_rect(60, 20, area); + let inner = block.inner(area); + f.render_widget(block, area); + + let reason = state.emergency_reason.as_deref().unwrap_or("Unknown safety trigger"); + let text = vec![ + Line::from(vec![Span::styled("CRITICAL SAFETY LIMIT TRIGGERED", Style::default().fg(Color::Red).add_modifier(Modifier::BOLD))]), + Line::from(""), + Line::from(vec![Span::raw("Reason: "), Span::styled(reason, Style::default().fg(Color::Yellow))]), + Line::from(""), + Line::from("Hardware has been restored to safe defaults."), + Line::from("Exiting in 1 second..."), + ]; + + f.render_widget(Paragraph::new(text).alignment(ratatui::layout::Alignment::Center), inner); +} + +fn centered_rect(percent_x: u16, percent_y: u16, r: Rect) -> Rect { + let popup_layout = Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Percentage((100 - percent_y) / 2), + Constraint::Percentage(percent_y), + Constraint::Percentage((100 - percent_y) / 2), + ]) + .split(r); + + Layout::default() + .direction(Direction::Horizontal) + .constraints([ + Constraint::Percentage((100 - percent_x) / 2), + Constraint::Percentage(percent_x), + Constraint::Percentage((100 - percent_x) / 2), + ]) + .split(popup_layout[1])[1] } fn draw_header(f: &mut Frame, area: Rect, state: &TelemetryState) { From 07ccf7ccc7bf540bcd77d9972f0cf3f76acf9cd1 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 16:04:34 +0100 Subject: [PATCH 03/13] added dynamic discovery of configurations --- Cargo.lock | 24 +++++++ Cargo.toml | 1 + assets/hardware_db.toml | 12 +++- src/engine/formatters/i8kmon.rs | 9 +++ src/engine/formatters/throttled.rs | 28 +++----- src/engine/mod.rs | 1 + src/main.rs | 19 +++-- src/orchestrator/mod.rs | 38 +++++----- src/sal/dell_xps_9380.rs | 107 ++++++++++------------------- src/sal/generic_linux.rs | 4 +- src/sal/heuristic/discovery.rs | 61 +++++++++++++--- src/sal/heuristic/engine.rs | 12 ++-- src/sal/heuristic/schema.rs | 2 + 13 files changed, 188 insertions(+), 130 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aca5993..dbce524 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -535,6 +535,7 @@ dependencies = [ "tracing", "tracing-appender", "tracing-subscriber", + "which", ] [[package]] @@ -543,6 +544,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" +[[package]] +name = "env_home" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe" + [[package]] name = "equivalent" version = "1.0.2" @@ -2254,6 +2261,17 @@ dependencies = [ "wezterm-dynamic", ] +[[package]] +name = "which" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3fabb953106c3c8eea8306e4393700d7657561cb43122571b172bbfb7c7ba1d" +dependencies = [ + "env_home", + "rustix", + "winsafe", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2548,6 +2566,12 @@ version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +[[package]] +name = "winsafe" +version = "0.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" + [[package]] name = "wit-bindgen" version = "0.51.0" diff --git a/Cargo.toml b/Cargo.toml index 40147dc..91ab9f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,3 +30,4 @@ libc = "0.2" num_cpus = "1.17" toml = "1.0.3" regex = "1.12.3" +which = "8.0.0" diff --git a/assets/hardware_db.toml b/assets/hardware_db.toml index 4eeedef..f6219e2 100644 --- a/assets/hardware_db.toml +++ b/assets/hardware_db.toml @@ -1,5 +1,5 @@ [metadata] -version = "1.1.0" +version = "1.2.0" updated = "2026-02-26" description = "Hardware and Conflict Database for ember-tune Thermal Engine" @@ -129,6 +129,16 @@ rapl_paths = ["intel-rapl:0", "package-0", "intel-rapl:1"] amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"] governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"] +[discovery.configs] +throttled = ["/etc/throttled.conf", "/usr/local/etc/throttled.conf", "/etc/lenovo_fix.conf"] +i8kmon = ["/etc/i8kmon.conf", "/etc/default/i8kmon"] +tlp = ["/etc/tlp.conf", "/etc/default/tlp"] + +[discovery.tools] +dell_fan_ctrl = "dell-bios-fan-control" +ectool = "ectool" +ryzenadj = "ryzenadj" + # env health verification [[preflight_checks]] diff --git a/src/engine/formatters/i8kmon.rs b/src/engine/formatters/i8kmon.rs index d1b1129..bf12297 100644 --- a/src/engine/formatters/i8kmon.rs +++ b/src/engine/formatters/i8kmon.rs @@ -1,3 +1,6 @@ +use std::path::Path; +use anyhow::Result; + pub struct I8kmonConfig { pub t_ambient: f32, pub t_max_fan: f32, @@ -38,4 +41,10 @@ set config(speed_high) 4500 t_high_off = t_high_off ) } + + pub fn save(path: &Path, config: &I8kmonConfig) -> Result<()> { + let content = Self::generate_conf(config); + std::fs::write(path, content)?; + Ok(()) + } } diff --git a/src/engine/formatters/throttled.rs b/src/engine/formatters/throttled.rs index 3e4c771..9febe7e 100644 --- a/src/engine/formatters/throttled.rs +++ b/src/engine/formatters/throttled.rs @@ -1,4 +1,6 @@ use std::collections::HashSet; +use std::path::Path; +use anyhow::{Result}; pub struct ThrottledConfig { pub pl1_limit: f32, @@ -38,13 +40,11 @@ Trip_Temp_C: {trip:.0} } /// Merges benchmarked values into an existing throttled.conf content. - /// Preserves all other sections (like [UnderVOLT]), comments, and formatting. pub fn merge_conf(existing_content: &str, config: &ThrottledConfig) -> String { let mut sections = Vec::new(); let mut current_section_name = String::new(); let mut current_section_lines = Vec::new(); - // 1. Parse into sections to ensure we only update keys in [BATTERY] and [AC] for line in existing_content.lines() { let trimmed = line.trim(); if trimmed.starts_with('[') && trimmed.ends_with(']') { @@ -68,17 +68,14 @@ Trip_Temp_C: {trip:.0} let mut result_lines = Vec::new(); let mut handled_sections = HashSet::new(); - // 2. Process sections for (name, mut lines) in sections { if name == "BATTERY" || name == "AC" { handled_sections.insert(name.clone()); let mut updated_keys = HashSet::new(); - let mut new_lines = Vec::new(); for line in lines { let mut updated = false; let trimmed = line.trim(); - if !trimmed.starts_with('#') && !trimmed.is_empty() { if let Some((key, _)) = trimmed.split_once(':') { let key = key.trim(); @@ -87,11 +84,7 @@ Trip_Temp_C: {trip:.0} if let Some(colon_idx) = line.find(':') { let prefix = &line[..colon_idx + 1]; let rest = &line[colon_idx + 1..]; - let comment = if let Some(hash_idx) = rest.find('#') { - &rest[hash_idx..] - } else { - "" - }; + let comment = if let Some(hash_idx) = rest.find('#') { &rest[hash_idx..] } else { "" }; new_lines.push(format!("{} {}{}", prefix, new_value, comment)); updated_keys.insert(*target_key); updated = true; @@ -101,12 +94,8 @@ Trip_Temp_C: {trip:.0} } } } - - if !updated { - new_lines.push(line); - } + if !updated { new_lines.push(line); } } - for (target_key, new_value) in &target_keys { if !updated_keys.contains(*target_key) { new_lines.push(format!("{}: {}", target_key, new_value)); @@ -117,7 +106,6 @@ Trip_Temp_C: {trip:.0} result_lines.extend(lines); } - // 3. Add missing sections if they didn't exist at all for section_name in &["BATTERY", "AC"] { if !handled_sections.contains(*section_name) { result_lines.push(String::new()); @@ -127,7 +115,13 @@ Trip_Temp_C: {trip:.0} } } } - result_lines.join("\n") } + + pub fn save(path: &Path, config: &ThrottledConfig) -> Result<()> { + let existing = if path.exists() { std::fs::read_to_string(path)? } else { String::new() }; + let content = if existing.is_empty() { Self::generate_conf(config) } else { Self::merge_conf(&existing, config) }; + std::fs::write(path, content)?; + Ok(()) + } } diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 24878bc..7ae4681 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -26,6 +26,7 @@ pub struct OptimizationResult { pub recommended_pl2: f32, pub max_temp_c: f32, pub is_partial: bool, + pub config_paths: std::collections::HashMap, } pub struct OptimizerEngine { diff --git a/src/main.rs b/src/main.rs index 9c62c32..3f14c3a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -30,7 +30,8 @@ use mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use sal::traits::{AuditError, PlatformSal}; use sal::mock::MockSal; use sal::heuristic::engine::HeuristicEngine; -use load::{StressNg, Workload}; +use sal::heuristic::discovery::SystemFactSheet; +use load::{StressNg}; use orchestrator::BenchmarkOrchestrator; use ui::dashboard::{draw_dashboard, DashboardState}; use engine::OptimizationResult; @@ -67,9 +68,10 @@ fn print_summary_report(result: &OptimizationResult) { println!("│ Burst (PL2): {:>5.1} W │", result.recommended_pl2); println!("│ │"); - println!("│ {} │", "Apply to /etc/throttled.conf:".bold().magenta()); - println!("│ PL1_Tdp_W: {:<5.1} │", result.recommended_pl1); - println!("│ PL2_Tdp_W: {:<5.1} │", result.recommended_pl2); + println!("│ {} │", "Apply these to your system:".bold().magenta()); + for (id, path) in &result.config_paths { + println!("│ {:<10}: {:<34} │", id, path.display()); + } println!("╰──────────────────────────────────────────────────╯"); println!(); } @@ -108,11 +110,12 @@ fn main() -> Result<()> { info!("ember-tune starting with args: {:?}", args); // 2. Platform Detection & Audit - let sal: Arc = if args.mock { - Arc::new(MockSal::new()) + let (sal_box, facts): (Box, SystemFactSheet) = if args.mock { + (Box::new(MockSal::new()), SystemFactSheet::default()) } else { - HeuristicEngine::detect_and_build()?.into() + HeuristicEngine::detect_and_build()? }; + let sal: Arc = sal_box.into(); println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); let mut audit_failures = Vec::new(); @@ -163,10 +166,12 @@ fn main() -> Result<()> { // 5. Spawn Backend Orchestrator let sal_backend = sal.clone(); + let facts_backend = facts.clone(); let backend_handle = thread::spawn(move || { let workload = Box::new(StressNg::new()); let mut orchestrator = BenchmarkOrchestrator::new( sal_backend, + facts_backend, workload, telemetry_tx, command_rx, diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 83b5101..8674175 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -7,14 +7,17 @@ use sysinfo::System; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; +use std::path::PathBuf; use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; +use crate::sal::heuristic::discovery::SystemFactSheet; use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; pub struct BenchmarkOrchestrator { sal: Arc, + facts: SystemFactSheet, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -39,6 +42,7 @@ pub struct BenchmarkOrchestrator { impl BenchmarkOrchestrator { pub fn new( sal: Arc, + facts: SystemFactSheet, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -53,6 +57,7 @@ impl BenchmarkOrchestrator { Self { sal, + facts, workload, telemetry_tx, command_rx, @@ -168,7 +173,7 @@ impl BenchmarkOrchestrator { self.phase = BenchmarkPhase::PhysicalModeling; self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?; - let res = self.generate_result(false); + let mut res = self.generate_result(false); self.log(&format!("✓ Thermal Resistance (Rθ): {:.3} K/W", res.thermal_resistance_kw))?; self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?; @@ -186,24 +191,22 @@ impl BenchmarkOrchestrator { }; // 1. Throttled (Merged if exists) - let throttled_path = "throttled.conf"; - let existing_throttled = std::fs::read_to_string(throttled_path).unwrap_or_default(); - let throttled_content = if existing_throttled.is_empty() { - crate::engine::formatters::throttled::ThrottledTranslator::generate_conf(&config) - } else { - crate::engine::formatters::throttled::ThrottledTranslator::merge_conf(&existing_throttled, &config) - }; - std::fs::write(throttled_path, throttled_content)?; - self.log("✓ Saved 'throttled.conf' (merged).")?; + if let Some(throttled_path) = self.facts.paths.configs.get("throttled") { + crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?; + self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?; + res.config_paths.insert("throttled".to_string(), throttled_path.clone()); + } // 2. i8kmon - let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { - t_ambient: self.profile.ambient_temp, - t_max_fan: res.max_temp_c - 5.0, // Aim to hit max fan before max temp - }; - let i8k_content = crate::engine::formatters::i8kmon::I8kmonTranslator::generate_conf(&i8k_config); - std::fs::write("i8kmon.conf", i8k_content)?; - self.log("✓ Saved 'i8kmon.conf'.")?; + if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { + let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { + t_ambient: self.profile.ambient_temp, + t_max_fan: res.max_temp_c - 5.0, + }; + crate::engine::formatters::i8kmon::I8kmonTranslator::save(i8k_path, &i8k_config)?; + self.log(&format!("✓ Saved '{}'.", i8k_path.display()))?; + res.config_paths.insert("i8kmon".to_string(), i8k_path.clone()); + } self.sal.restore()?; self.log("✓ Environment restored.")?; @@ -253,6 +256,7 @@ impl BenchmarkOrchestrator { recommended_pl2: knee * 1.25, max_temp_c: max_t, is_partial, + config_paths: std::collections::HashMap::new(), } } diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index b59c197..61eaf21 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,13 +1,15 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus}; -use anyhow::{Result, Context}; +use anyhow::{Result, Context, anyhow}; use std::fs; -use std::path::PathBuf; +use std::path::{PathBuf}; use std::process::Command; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug, warn}; +use tracing::{debug}; +use crate::sal::heuristic::discovery::SystemFactSheet; pub struct DellXps9380Sal { + fact_sheet: SystemFactSheet, temp_path: PathBuf, pwr_path: PathBuf, fan_paths: Vec, @@ -23,72 +25,30 @@ pub struct DellXps9380Sal { } impl DellXps9380Sal { - pub fn init() -> Result { - let mut temp_path = None; - let mut pwr_path = None; - let mut fan_paths = Vec::new(); - let mut rapl_base_path = None; - - // Dynamic hwmon discovery - if let Ok(entries) = fs::read_dir("/sys/class/hwmon") { - for entry in entries.flatten() { - let p = entry.path(); - let name = fs::read_to_string(p.join("name")).unwrap_or_default().trim().to_string(); - - if name == "dell_smm" { - temp_path = Some(p.join("temp1_input")); - if let Ok(fan_entries) = fs::read_dir(&p) { - for fan_entry in fan_entries.flatten() { - let fan_p = fan_entry.path(); - if fan_p.file_name().unwrap_or_default().to_string_lossy().starts_with("fan") && - fan_p.file_name().unwrap_or_default().to_string_lossy().ends_with("_input") { - fan_paths.push(fan_p); - } - } - } - fan_paths.sort(); - } - - if name == "intel_rapl" || name == "rapl" { - pwr_path = Some(p.join("power1_average")); - } - } - } - - if let Ok(entries) = fs::read_dir("/sys/class/powercap") { - for entry in entries.flatten() { - let p = entry.path(); - if let Ok(name) = fs::read_to_string(p.join("name")) { - if name.trim() == "package-0" { - rapl_base_path = Some(p.clone()); - if pwr_path.is_none() { - pwr_path = Some(p.join("energy_uj")); - } - break; - } - } - } - } - - let rapl_base = rapl_base_path.context("Could not find RAPL package-0 path in powercap")?; + pub fn init(facts: SystemFactSheet) -> Result { + let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?; + let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?; + let fan_paths = facts.fan_paths.clone(); + let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); let msr_file = fs::OpenOptions::new().read(true).write(true).open("/dev/cpu/0/msr") .context("Failed to open /dev/cpu/0/msr. Is the 'msr' module loaded?")?; Ok(Self { - temp_path: temp_path.context("Could not find dell_smm temperature path")?, - pwr_path: pwr_path.context("Could not find RAPL power path")?, + temp_path, + pwr_path: pwr_base.join("power1_average"), fan_paths, freq_path, - pl1_path: rapl_base.join("constraint_0_power_limit_uw"), - pl2_path: rapl_base.join("constraint_1_power_limit_uw"), + pl1_path: pwr_base.join("constraint_0_power_limit_uw"), + pl2_path: pwr_base.join("constraint_1_power_limit_uw"), last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)), last_temp: Mutex::new(0.0), last_fans: Mutex::new(Vec::new()), suppressed_services: Mutex::new(Vec::new()), msr_file: Mutex::new(msr_file), last_energy: Mutex::new((0, Instant::now())), + fact_sheet: facts, }) } @@ -148,6 +108,13 @@ impl PreflightAuditor for DellXps9380Sal { } }); + // Tool availability check + let tool_check = self.fact_sheet.paths.tools.contains_key("dell_fan_ctrl"); + steps.push(AuditStep { + description: "Dell Fan Control Tool".to_string(), + outcome: if tool_check { Ok(()) } else { Err(AuditError::ToolMissing("dell-bios-fan-control not found in PATH".to_string())) } + }); + Box::new(steps.into_iter()) } } @@ -189,20 +156,15 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - if self.pwr_path.to_string_lossy().contains("energy_uj") { - let mut last = self.last_energy.lock().unwrap(); - let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; - let t2 = Instant::now(); - let (e1, t1) = *last; - let delta_e = e2.wrapping_sub(e1); - let delta_t = t2.duration_since(t1).as_secs_f32(); - *last = (e2, t2); - if delta_t < 0.01 { return Ok(0.0); } - Ok((delta_e as f32 / 1_000_000.0) / delta_t) - } else { - let s = fs::read_to_string(&self.pwr_path)?; - Ok(s.trim().parse::()? / 1000000.0) - } + let mut last = self.last_energy.lock().unwrap(); + let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; + let t2 = Instant::now(); + let (e1, t1) = *last; + let delta_e = e2.wrapping_sub(e1); + let delta_t = t2.duration_since(t1).as_secs_f32(); + *last = (e2, t2); + if delta_t < 0.01 { return Ok(0.0); } + Ok((delta_e as f32 / 1_000_000.0) / delta_t) } fn get_fan_rpms(&self) -> Result> { @@ -230,9 +192,12 @@ impl SensorBus for DellXps9380Sal { impl ActuatorBus for DellXps9380Sal { fn set_fan_mode(&self, mode: &str) -> Result<()> { + let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl") + .ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?; + match mode { - "max" | "Manual" => { Command::new("dell-bios-fan-control").arg("0").status()?; } - "auto" | "Auto" => { Command::new("dell-bios-fan-control").arg("1").status()?; } + "max" | "Manual" => { Command::new(tool_path).arg("0").status()?; } + "auto" | "Auto" => { Command::new(tool_path).arg("1").status()?; } _ => { debug!("Unknown fan mode: {}", mode); } } Ok(()) diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 13fde32..cecefe1 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -20,14 +20,14 @@ pub struct GenericLinuxSal { } impl GenericLinuxSal { - pub fn new(fact_sheet: SystemFactSheet, db: HardwareDb) -> Self { + pub fn new(facts: SystemFactSheet, db: HardwareDb) -> Self { Self { - fact_sheet, db, suppressed_services: Mutex::new(Vec::new()), last_valid_temp: Mutex::new((0.0, Instant::now())), current_pl1: Mutex::new(15.0), last_energy: Mutex::new((0, Instant::now())), + fact_sheet: facts, } } diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs index a4f894a..7495326 100644 --- a/src/sal/heuristic/discovery.rs +++ b/src/sal/heuristic/discovery.rs @@ -1,12 +1,20 @@ use std::fs; use std::path::{Path, PathBuf}; use std::process::Command; -use std::time::Duration; +use std::time::{Duration}; use std::thread; use std::sync::mpsc; -use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict}; +use std::collections::HashMap; +use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery}; use tracing::{debug, warn}; +/// Registry of dynamically discovered paths for configs and tools. +#[derive(Debug, Clone, Default)] +pub struct PathRegistry { + pub configs: HashMap, + pub tools: HashMap, +} + /// Strongly-typed findings about the current system. #[derive(Debug, Clone, Default)] pub struct SystemFactSheet { @@ -15,21 +23,21 @@ pub struct SystemFactSheet { pub temp_path: Option, pub fan_paths: Vec, pub rapl_paths: Vec, - pub active_conflicts: Vec, // List of conflict IDs found active + pub active_conflicts: Vec, + pub paths: PathRegistry, } -/// Probes the system for hardware sensors, actuators, and service conflicts. +/// Probes the system for hardware sensors, actuators, service conflicts, and paths. pub fn discover_facts( - sensors: &SensorDiscovery, - actuators: &ActuatorDiscovery, + discovery: &Discovery, conflicts: &[Conflict] ) -> SystemFactSheet { let (vendor, model) = read_dmi_info(); debug!("DMI Identity: Vendor='{}', Model='{}'", vendor, model); - let (temp_path, fan_paths) = discover_hwmon(sensors); - let rapl_paths = discover_rapl(actuators); + let (temp_path, fan_paths) = discover_hwmon(&discovery.sensors); + let rapl_paths = discover_rapl(&discovery.actuators); let mut active_conflicts = Vec::new(); for conflict in conflicts { @@ -37,11 +45,13 @@ pub fn discover_facts( if is_service_active(service) { debug!("Detected active conflict: {} (Service: {})", conflict.id, service); active_conflicts.push(conflict.id.clone()); - break; // Found one service in this conflict, move to next conflict + break; } } } + let paths = discover_paths(discovery); + SystemFactSheet { vendor, model, @@ -49,9 +59,42 @@ pub fn discover_facts( fan_paths, rapl_paths, active_conflicts, + paths, } } +fn discover_paths(discovery: &Discovery) -> PathRegistry { + let mut registry = PathRegistry::default(); + + // 1. Discover Tools via PATH + for (id, binary_name) in &discovery.tools { + if let Ok(path) = which::which(binary_name) { + debug!("Discovered tool: {} -> {:?}", id, path); + registry.tools.insert(id.clone(), path); + } + } + + // 2. Discover Configs via existence check + for (id, candidates) in &discovery.configs { + for candidate in candidates { + let path = PathBuf::from(candidate); + if path.exists() { + debug!("Discovered config: {} -> {:?}", id, path); + registry.configs.insert(id.clone(), path); + break; + } + } + // If not found, use the first one as default if any exist + if !registry.configs.contains_key(id) { + if let Some(first) = candidates.first() { + registry.configs.insert(id.clone(), PathBuf::from(first)); + } + } + } + + registry +} + /// Reads DMI information from sysfs with a safety timeout. fn read_dmi_info() -> (String, String) { let vendor = read_sysfs_with_timeout(Path::new("/sys/class/dmi/id/sys_vendor"), Duration::from_millis(100)) diff --git a/src/sal/heuristic/engine.rs b/src/sal/heuristic/engine.rs index fce728c..d5e5662 100644 --- a/src/sal/heuristic/engine.rs +++ b/src/sal/heuristic/engine.rs @@ -7,13 +7,13 @@ use crate::sal::traits::PlatformSal; use crate::sal::dell_xps_9380::DellXps9380Sal; use crate::sal::generic_linux::GenericLinuxSal; use crate::sal::heuristic::schema::HardwareDb; -use crate::sal::heuristic::discovery::{discover_facts}; +use crate::sal::heuristic::discovery::{discover_facts, SystemFactSheet}; pub struct HeuristicEngine; impl HeuristicEngine { /// Loads the hardware database, probes the system, and builds the appropriate SAL. - pub fn detect_and_build() -> Result> { + pub fn detect_and_build() -> Result<(Box, SystemFactSheet)> { // 1. Load Hardware DB let db_path = "assets/hardware_db.toml"; let db_content = fs::read_to_string(db_path) @@ -24,7 +24,7 @@ impl HeuristicEngine { .context("Failed to parse hardware_db.toml")?; // 2. Discover Facts - let facts = discover_facts(&db.discovery.sensors, &db.discovery.actuators, &db.conflicts); + let facts = discover_facts(&db.discovery, &db.conflicts); info!("System Identity: {} {}", facts.vendor, facts.model); // 3. Routing Logic @@ -32,8 +32,8 @@ impl HeuristicEngine { // --- Special Case: Dell XPS 13 9380 --- if is_match(&facts.vendor, "(?i)Dell.*") && is_match(&facts.model, "(?i)XPS.*13.*9380.*") { info!("Specialized SAL Match Found: Dell XPS 13 9380"); - let sal = DellXps9380Sal::init().map_err(|e| miette::miette!(e))?; - return Ok(Box::new(sal)); + let sal = DellXps9380Sal::init(facts.clone()).map_err(|e| miette::miette!(e))?; + return Ok((Box::new(sal), facts)); } // --- Fallback: Generic Linux SAL --- @@ -47,7 +47,7 @@ impl HeuristicEngine { return Err(miette::miette!("No RAPL power interface discovered. Generic fallback impossible.")); } - Ok(Box::new(GenericLinuxSal::new(facts, db))) + Ok((Box::new(GenericLinuxSal::new(facts.clone(), db)), facts)) } } diff --git a/src/sal/heuristic/schema.rs b/src/sal/heuristic/schema.rs index a64ff9e..aeaf839 100644 --- a/src/sal/heuristic/schema.rs +++ b/src/sal/heuristic/schema.rs @@ -68,6 +68,8 @@ pub struct Quirk { pub struct Discovery { pub sensors: SensorDiscovery, pub actuators: ActuatorDiscovery, + pub configs: HashMap>, + pub tools: HashMap, } #[derive(Debug, Deserialize, Clone)] From f76acd62564a8c50f11dc62cf83a45eb86f36754 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 16:09:44 +0100 Subject: [PATCH 04/13] audit fix and code stability improvement --- src/engine/mod.rs | 32 +++++++++++++++++++++++--------- src/main.rs | 2 +- src/orchestrator/mod.rs | 23 ++++++++++++++++------- src/sal/dell_xps_9380.rs | 2 +- src/ui/dashboard.rs | 15 ++++++++++++--- 5 files changed, 53 insertions(+), 21 deletions(-) diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 7ae4681..99d094f 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -38,7 +38,7 @@ impl OptimizerEngine { Self { window_size } } - /// Applies a simple moving average (SMA) filter to a stream of values. + /// Applies a simple moving average (SMA) filter with outlier rejection. pub fn smooth(&self, data: &[f32]) -> Vec { if data.is_empty() { return vec![]; } let mut smoothed = Vec::with_capacity(data.len()); @@ -46,8 +46,19 @@ impl OptimizerEngine { for i in 0..data.len() { let start = if i < self.window_size { 0 } else { i - self.window_size + 1 }; let end = i + 1; - let sum: f32 = data[start..end].iter().sum(); - smoothed.push(sum / (end - start) as f32); + + // Outlier rejection: only average values within a reasonable range + let window = &data[start..end]; + let avg: f32 = window.iter().sum::() / window.len() as f32; + let filtered: Vec = window.iter() + .filter(|&&v| (v - avg).abs() < 20.0) // Reject spikes > 20 units + .cloned().collect(); + + if filtered.is_empty() { + smoothed.push(avg); + } else { + smoothed.push(filtered.iter().sum::() / filtered.len() as f32); + } } smoothed } @@ -55,11 +66,9 @@ impl OptimizerEngine { /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 { profile.points.iter() + .filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data .max_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal)) - .map(|p| { - if p.power_w < 1.0 { 0.0 } - else { (p.temp_c - profile.ambient_temp) / p.power_w } - }) + .map(|p| (p.temp_c - profile.ambient_temp) / p.power_w) .unwrap_or(0.0) } @@ -73,11 +82,16 @@ impl OptimizerEngine { /// Finds the "Silicon Knee" - the point where performance per watt (efficiency) /// starts to diminish significantly and thermal density spikes. pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 { - if profile.points.len() < 3 { + let valid_points: Vec<_> = profile.points.iter() + .filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise + .cloned() + .collect(); + + if valid_points.len() < 3 { return profile.points.last().map(|p| p.power_w).unwrap_or(15.0); } - let mut points = profile.points.clone(); + let mut points = valid_points; points.sort_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal)); let mut best_pl = points[0].power_w; diff --git a/src/main.rs b/src/main.rs index 3f14c3a..b22dc3c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -229,7 +229,7 @@ fn main() -> Result<()> { while let Ok(new_state) = telemetry_rx.try_recv() { if let Some(log) = &new_state.log_event { - ui_state.logs.push(log.clone()); + ui_state.add_log(log.clone()); debug!("Backend Log: {}", log); } else { ui_state.update(&new_state); diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 8674175..5d7d914 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -7,7 +7,6 @@ use sysinfo::System; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; -use std::path::PathBuf; use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; @@ -80,6 +79,21 @@ impl BenchmarkOrchestrator { // Start Watchdog Monitor let _watchdog_handle = self.spawn_watchdog_monitor(); + // Use a closure to ensure cleanup always runs + let result = self.execute_benchmark(); + + // --- MANDATORY CLEANUP --- + self.log("Benchmark sequence finished. Restoring hardware defaults...")?; + let _ = self.workload.stop(); + if let Err(e) = self.sal.restore() { + anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e); + } + self.log("✓ Hardware state restored.")?; + + result + } + + fn execute_benchmark(&mut self) -> Result { // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { @@ -208,9 +222,6 @@ impl BenchmarkOrchestrator { res.config_paths.insert("i8kmon".to_string(), i8k_path.clone()); } - self.sal.restore()?; - self.log("✓ Environment restored.")?; - Ok(res) } @@ -228,9 +239,7 @@ impl BenchmarkOrchestrator { abort.store(true, Ordering::SeqCst); break; } - Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => { - // Send warning log to UI - } + Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => {} Ok(SafetyStatus::Nominal) => {} Err(e) => { *reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e)); diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index 61eaf21..436387e 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -108,7 +108,6 @@ impl PreflightAuditor for DellXps9380Sal { } }); - // Tool availability check let tool_check = self.fact_sheet.paths.tools.contains_key("dell_fan_ctrl"); steps.push(AuditStep { description: "Dell Fan Control Tool".to_string(), @@ -125,6 +124,7 @@ impl EnvironmentGuard for DellXps9380Sal { let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() { + debug!("Suppressing service: {}", s); Command::new("systemctl").args(["stop", s]).status()?; suppressed.push(s.to_string()); } diff --git a/src/ui/dashboard.rs b/src/ui/dashboard.rs index 17d309f..332f85e 100644 --- a/src/ui/dashboard.rs +++ b/src/ui/dashboard.rs @@ -7,20 +7,29 @@ use ratatui::{ Frame, prelude::Stylize, }; +use std::collections::VecDeque; use crate::mediator::TelemetryState; use crate::ui::theme::*; /// DashboardState maintains UI-specific state that isn't part of the core telemetry, /// such as the accumulated diagnostic logs. pub struct DashboardState { - pub logs: Vec, + pub logs: VecDeque, } impl DashboardState { pub fn new() -> Self { - Self { - logs: vec!["ember-tune Initialized.".to_string()], + let mut logs = VecDeque::with_capacity(100); + logs.push_back("ember-tune Initialized.".to_string()); + Self { logs } + } + + /// Adds a log message and ensures the buffer does not exceed capacity. + pub fn add_log(&mut self, msg: String) { + if self.logs.len() >= 100 { + self.logs.pop_front(); } + self.logs.push_back(msg); } /// Updates the UI state based on new telemetry. From 667d94af7a25d3fbc1a1d0e55ec187e0a735d748 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 17:11:42 +0100 Subject: [PATCH 05/13] impemented mock testing --- Cargo.lock | 20 ++++++++++ Cargo.toml | 3 ++ assets/hardware_db.toml | 7 ++++ src/engine/formatters/i8kmon.rs | 36 +++++++++++++----- src/lib.rs | 8 ++++ src/main.rs | 34 +++++++---------- src/orchestrator/mod.rs | 51 +++++++++++++++++++------ src/sal/dell_xps_9380.rs | 62 ++++++++++++++++++------------- src/sal/generic_linux.rs | 39 +++++++++++-------- src/sal/heuristic/discovery.rs | 45 +++++++++++++--------- src/sal/heuristic/engine.rs | 10 ++--- src/sal/heuristic/schema.rs | 10 +++++ src/sal/mock.rs | 14 +++++-- src/sal/traits.rs | 18 +++++++++ src/sys/cmd.rs | 56 ++++++++++++++++++++++++++++ src/sys/mod.rs | 3 ++ tests/common/fakesys.rs | 55 +++++++++++++++++++++++++++ tests/common/mod.rs | 1 + tests/config_merge_test.rs | 35 +++++++++++++++++ tests/heuristic_discovery_test.rs | 45 ++++++++++++++++++++++ tests/orchestrator_e2e_test.rs | 38 +++++++++++++++++++ 21 files changed, 480 insertions(+), 110 deletions(-) create mode 100644 src/lib.rs create mode 100644 src/sys/cmd.rs create mode 100644 src/sys/mod.rs create mode 100644 tests/common/fakesys.rs create mode 100644 tests/common/mod.rs create mode 100644 tests/config_merge_test.rs create mode 100644 tests/heuristic_discovery_test.rs create mode 100644 tests/orchestrator_e2e_test.rs diff --git a/Cargo.lock b/Cargo.lock index dbce524..cd31a56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -530,6 +530,7 @@ dependencies = [ "serde", "serde_json", "sysinfo", + "tempfile", "thiserror 2.0.18", "toml", "tracing", @@ -595,6 +596,12 @@ dependencies = [ "regex", ] +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "filedescriptor" version = "0.8.3" @@ -1705,6 +1712,19 @@ dependencies = [ "windows", ] +[[package]] +name = "tempfile" +version = "3.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +dependencies = [ + "fastrand", + "getrandom 0.4.1", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + [[package]] name = "terminal_size" version = "0.4.3" diff --git a/Cargo.toml b/Cargo.toml index 91ab9f7..9a37bc6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,3 +31,6 @@ num_cpus = "1.17" toml = "1.0.3" regex = "1.12.3" which = "8.0.0" + +[dev-dependencies] +tempfile = "3" diff --git a/assets/hardware_db.toml b/assets/hardware_db.toml index f6219e2..d695ebf 100644 --- a/assets/hardware_db.toml +++ b/assets/hardware_db.toml @@ -141,6 +141,13 @@ ryzenadj = "ryzenadj" # env health verification +[benchmarking] +idle_duration_s = 10 +stress_duration_min_s = 15 +stress_duration_max_s = 45 +cool_down_s = 5 +power_steps_watts = [15.0, 20.0, 25.0, 30.0, 35.0] + [[preflight_checks]] name = "MSR Write Access" check_cmd = "grep -q 'msr.allow_writes=on' /proc/cmdline" diff --git a/src/engine/formatters/i8kmon.rs b/src/engine/formatters/i8kmon.rs index bf12297..1b37373 100644 --- a/src/engine/formatters/i8kmon.rs +++ b/src/engine/formatters/i8kmon.rs @@ -4,41 +4,57 @@ use anyhow::Result; pub struct I8kmonConfig { pub t_ambient: f32, pub t_max_fan: f32, + pub thermal_resistance_kw: f32, } pub struct I8kmonTranslator; impl I8kmonTranslator { pub fn generate_conf(config: &I8kmonConfig) -> String { + // Higher resistance means we need to start fans sooner. + // If R_theta is 2.5 K/W, it's quite high for a laptop. + // We'll scale the 'low' threshold based on R_theta. + let aggression_factor = (config.thermal_resistance_kw / 1.5).clamp(0.8, 1.5); + let t_off = config.t_ambient + 5.0; - let t_low_on = config.t_ambient + 12.0; - let t_low_off = config.t_ambient + 10.0; + let t_low_on = config.t_ambient + (10.0 / aggression_factor); + let t_low_off = t_low_on - 2.0; + let t_high_on = config.t_max_fan; - let t_high_off = config.t_max_fan - 5.0; - let t_low_trigger = (config.t_max_fan - 15.0).max(t_low_on + 2.0); + let t_high_off = t_high_on - 5.0; + + let t_mid_on = (t_low_on + t_high_on) / 2.0; + let t_mid_off = t_mid_on - 3.0; format!( r#"# Generated by ember-tune Optimizer -# Grounded in physical thermal resistance +# Grounded in physical thermal resistance (Rθ = {r_theta:.3} K/W) set config(gen_shadow) 1 set config(i8k_ignore_dmi) 1 # Fan states: {{state_low state_high temp_on temp_off}} +# 0: Off set config(0) {{0 0 {t_low_on:.0} {t_off:.0}}} -set config(1) {{1 1 {t_low_trigger:.0} {t_low_off:.0}}} -set config(2) {{2 2 {t_high_on:.0} {t_high_off:.0}}} +# 1: Low +set config(1) {{1 1 {t_mid_on:.0} {t_low_off:.0}}} +# 2: High +set config(2) {{2 2 {t_high_on:.0} {t_mid_off:.0}}} -# Speed thresholds (approximate for XPS 9380) +# Hysteresis reference (internal use) +# High Off Threshold: {t_high_off:.0} + +# Speed thresholds set config(speed_low) 2500 set config(speed_high) 4500 "#, + r_theta = config.thermal_resistance_kw, t_low_on = t_low_on, t_off = t_off, - t_low_trigger = t_low_trigger, + t_mid_on = t_mid_on, t_low_off = t_low_off, t_high_on = t_high_on, - t_high_off = t_high_off + t_mid_off = t_mid_off ) } diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..07dcb24 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,8 @@ +pub mod mediator; +pub mod sal; +pub mod load; +pub mod orchestrator; +pub mod ui; +pub mod engine; +pub mod cli; +pub mod sys; diff --git a/src/main.rs b/src/main.rs index b22dc3c..014817e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,4 @@ -mod mediator; -mod sal; -mod load; -mod orchestrator; -mod ui; -mod engine; -mod cli; - -use miette::{Result, IntoDiagnostic, Diagnostic, Report, Context}; +use miette::{Result, IntoDiagnostic, Diagnostic, Report}; use thiserror::Error; use std::sync::mpsc; use std::thread; @@ -25,16 +17,16 @@ use crossterm::{ }; use ratatui::{backend::CrosstermBackend, Terminal}; -use cli::Cli; -use mediator::{TelemetryState, UiCommand, BenchmarkPhase}; -use sal::traits::{AuditError, PlatformSal}; -use sal::mock::MockSal; -use sal::heuristic::engine::HeuristicEngine; -use sal::heuristic::discovery::SystemFactSheet; -use load::{StressNg}; -use orchestrator::BenchmarkOrchestrator; -use ui::dashboard::{draw_dashboard, DashboardState}; -use engine::OptimizationResult; +use ember_tune_rs::cli::Cli; +use ember_tune_rs::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; +use ember_tune_rs::sal::traits::{AuditError, PlatformSal}; +use ember_tune_rs::sal::mock::MockSal; +use ember_tune_rs::sal::heuristic::engine::HeuristicEngine; +use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet; +use ember_tune_rs::load::{StressNg}; +use ember_tune_rs::orchestrator::BenchmarkOrchestrator; +use ember_tune_rs::ui::dashboard::{draw_dashboard, DashboardState}; +use ember_tune_rs::engine::OptimizationResult; use owo_colors::OwoColorize; #[derive(Error, Diagnostic, Debug)] @@ -109,11 +101,13 @@ fn main() -> Result<()> { info!("ember-tune starting with args: {:?}", args); + let ctx = ember_tune_rs::sal::traits::EnvironmentCtx::production(); + // 2. Platform Detection & Audit let (sal_box, facts): (Box, SystemFactSheet) = if args.mock { (Box::new(MockSal::new()), SystemFactSheet::default()) } else { - HeuristicEngine::detect_and_build()? + HeuristicEngine::detect_and_build(ctx)? }; let sal: Arc = sal_box.into(); diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 5d7d914..41cdc49 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; -use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; +use crate::sal::traits::{PlatformSal, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; @@ -94,6 +94,8 @@ impl BenchmarkOrchestrator { } fn execute_benchmark(&mut self) -> Result { + let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; + // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { @@ -107,13 +109,13 @@ impl BenchmarkOrchestrator { // Baseline (Idle Calibration) self.phase = BenchmarkPhase::IdleCalibration; - self.log("Phase 1: Recording Idle Baseline (10s)...")?; + self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; self.sal.set_fan_mode("auto")?; // Use auto for idle let mut idle_temps = Vec::new(); let start = Instant::now(); let mut tick = 0; - while start.elapsed() < Duration::from_secs(10) { + while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) { self.check_abort()?; self.send_telemetry(tick)?; idle_temps.push(self.sal.get_temp().unwrap_or(0.0)); @@ -128,19 +130,19 @@ impl BenchmarkOrchestrator { self.log("Phase 2: Starting Synthetic Stress Matrix.")?; self.sal.set_fan_mode("max")?; // Lock fans for consistent resistance - let power_steps = [15.0, 20.0, 25.0, 30.0, 35.0]; - for &pl in &power_steps { + let steps = bench_cfg.power_steps_watts.clone(); + for &pl in &steps { self.log(&format!("Testing PL1 = {:.0}W...", pl))?; self.sal.set_sustained_power_limit(pl)?; self.sal.set_burst_power_limit(pl + 5.0)?; self.workload.start(num_cpus::get(), 100)?; - // Wait for equilibrium: Hybrid approach (15s min, 45s max) + // Wait for equilibrium let step_start = Instant::now(); - let mut step_temps = VecDeque::with_capacity(30); // Last 15s @ 500ms + let mut step_temps = VecDeque::with_capacity(30); - while step_start.elapsed() < Duration::from_secs(45) { + while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) { self.check_abort()?; let t = self.sal.get_temp().unwrap_or(0.0); @@ -151,7 +153,7 @@ impl BenchmarkOrchestrator { tick += 1; // Check for stability: Range < 0.5C over last 5s (10 ticks) - if step_start.elapsed() > Duration::from_secs(15) && step_temps.len() == 10 { + if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); if (max - min) < 0.5 { @@ -179,8 +181,8 @@ impl BenchmarkOrchestrator { }); self.workload.stop()?; - self.log(" Step complete. Cooling down for 5s...")?; - thread::sleep(Duration::from_secs(5)); + self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; + thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } // Phase 4: Physical Modeling @@ -216,6 +218,7 @@ impl BenchmarkOrchestrator { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, t_max_fan: res.max_temp_c - 5.0, + thermal_resistance_kw: res.thermal_resistance_kw, }; crate::engine::formatters::i8kmon::I8kmonTranslator::save(i8k_path, &i8k_config)?; self.log(&format!("✓ Saved '{}'.", i8k_path.display()))?; @@ -229,6 +232,7 @@ impl BenchmarkOrchestrator { let abort = self.emergency_abort.clone(); let reason_store = self.emergency_reason.clone(); let sal = self.sal.clone(); + let tx = self.telemetry_tx.clone(); thread::spawn(move || { while !abort.load(Ordering::SeqCst) { @@ -239,7 +243,30 @@ impl BenchmarkOrchestrator { abort.store(true, Ordering::SeqCst); break; } - Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => {} + Ok(SafetyStatus::Warning(msg)) | Ok(SafetyStatus::Critical(msg)) => { + let state = TelemetryState { + cpu_model: String::new(), + total_ram_gb: 0, + tick: 0, + cpu_temp: 0.0, + power_w: 0.0, + current_freq: 0.0, + fans: Vec::new(), + governor: String::new(), + pl1_limit: 0.0, + pl2_limit: 0.0, + fan_tier: String::new(), + phase: BenchmarkPhase::StressTesting, + history_watts: Vec::new(), + history_temp: Vec::new(), + history_mhz: Vec::new(), + log_event: Some(format!("WATCHDOG: {}", msg)), + metadata: std::collections::HashMap::new(), + is_emergency: false, + emergency_reason: None, + }; + let _ = tx.send(state); + } Ok(SafetyStatus::Nominal) => {} Err(e) => { *reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e)); diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index 436387e..fbf12af 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,14 +1,14 @@ -use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus}; +use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; -use std::process::Command; use std::time::{Duration, Instant}; use std::sync::Mutex; use tracing::{debug}; use crate::sal::heuristic::discovery::SystemFactSheet; pub struct DellXps9380Sal { + ctx: EnvironmentCtx, fact_sheet: SystemFactSheet, temp_path: PathBuf, pwr_path: PathBuf, @@ -25,15 +25,18 @@ pub struct DellXps9380Sal { } impl DellXps9380Sal { - pub fn init(facts: SystemFactSheet) -> Result { + pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result { let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?; let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?; let fan_paths = facts.fan_paths.clone(); - let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); + let freq_path = ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); + let msr_path = ctx.sysfs_base.join("dev/cpu/0/msr"); - let msr_file = fs::OpenOptions::new().read(true).write(true).open("/dev/cpu/0/msr") - .context("Failed to open /dev/cpu/0/msr. Is the 'msr' module loaded?")?; + let msr_file = fs::OpenOptions::new().read(true).write(true).open(&msr_path) + .with_context(|| format!("Failed to open {:?}. Is the 'msr' module loaded?", msr_path))?; + + let initial_energy = fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0); Ok(Self { temp_path, @@ -47,8 +50,9 @@ impl DellXps9380Sal { last_fans: Mutex::new(Vec::new()), suppressed_services: Mutex::new(Vec::new()), msr_file: Mutex::new(msr_file), - last_energy: Mutex::new((0, Instant::now())), + last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, + ctx, }) } @@ -78,16 +82,17 @@ impl PreflightAuditor for DellXps9380Sal { let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"]; for mod_name in modules { - let path = format!("/sys/module/{}", mod_name); + let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name)); steps.push(AuditStep { description: format!("Kernel Module: {}", mod_name), - outcome: if PathBuf::from(path).exists() { Ok(()) } else { + outcome: if path.exists() { Ok(()) } else { Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) } }); } - let cmdline = fs::read_to_string("/proc/cmdline").unwrap_or_default(); + let cmdline_path = self.ctx.sysfs_base.join("proc/cmdline"); + let cmdline = fs::read_to_string(cmdline_path).unwrap_or_default(); let params = [ ("dell_smm_hwmon.ignore_dmi=1", "dell_smm_hwmon.ignore_dmi=1"), ("dell_smm_hwmon.restricted=0", "dell_smm_hwmon.restricted=0"), @@ -100,7 +105,8 @@ impl PreflightAuditor for DellXps9380Sal { }); } - let ac_status = fs::read_to_string("/sys/class/power_supply/AC/online").unwrap_or_else(|_| "0".to_string()); + let ac_status_path = self.ctx.sysfs_base.join("sys/class/power_supply/AC/online"); + let ac_status = fs::read_to_string(ac_status_path).unwrap_or_else(|_| "0".to_string()); steps.push(AuditStep { description: "AC Power Connection".to_string(), outcome: if ac_status.trim() == "1" { Ok(()) } else { @@ -123,9 +129,9 @@ impl EnvironmentGuard for DellXps9380Sal { let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { - if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() { + if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { debug!("Suppressing service: {}", s); - Command::new("systemctl").args(["stop", s]).status()?; + self.ctx.runner.run("systemctl", &["stop", s])?; suppressed.push(s.to_string()); } } @@ -135,7 +141,7 @@ impl EnvironmentGuard for DellXps9380Sal { fn restore(&self) -> Result<()> { let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { - let _ = Command::new("systemctl").args(["start", &s]).status(); + let _ = self.ctx.runner.run("systemctl", &["start", &s]); } Ok(()) } @@ -156,15 +162,20 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - let mut last = self.last_energy.lock().unwrap(); - let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; - let t2 = Instant::now(); - let (e1, t1) = *last; - let delta_e = e2.wrapping_sub(e1); - let delta_t = t2.duration_since(t1).as_secs_f32(); - *last = (e2, t2); - if delta_t < 0.01 { return Ok(0.0); } - Ok((delta_e as f32 / 1_000_000.0) / delta_t) + if self.pwr_path.to_string_lossy().contains("energy_uj") { + let mut last = self.last_energy.lock().unwrap(); + let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; + let t2 = Instant::now(); + let (e1, t1) = *last; + let delta_e = e2.wrapping_sub(e1); + let delta_t = t2.duration_since(t1).as_secs_f32(); + *last = (e2, t2); + if delta_t < 0.01 { return Ok(0.0); } + Ok((delta_e as f32 / 1_000_000.0) / delta_t) + } else { + let s = fs::read_to_string(&self.pwr_path)?; + Ok(s.trim().parse::()? / 1000000.0) + } } fn get_fan_rpms(&self) -> Result> { @@ -194,10 +205,11 @@ impl ActuatorBus for DellXps9380Sal { fn set_fan_mode(&self, mode: &str) -> Result<()> { let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl") .ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?; + let tool_str = tool_path.to_string_lossy(); match mode { - "max" | "Manual" => { Command::new(tool_path).arg("0").status()?; } - "auto" | "Auto" => { Command::new(tool_path).arg("1").status()?; } + "max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; } + "auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; } _ => { debug!("Unknown fan mode: {}", mode); } } Ok(()) diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index cecefe1..d234354 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -1,16 +1,16 @@ use anyhow::{Result, anyhow}; -use std::path::Path; +use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; -use std::process::Command; -use tracing::{debug, warn}; use std::sync::Mutex; +use tracing::{debug}; -use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus}; +use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; pub struct GenericLinuxSal { + ctx: EnvironmentCtx, fact_sheet: SystemFactSheet, db: HardwareDb, suppressed_services: Mutex>, @@ -20,14 +20,21 @@ pub struct GenericLinuxSal { } impl GenericLinuxSal { - pub fn new(facts: SystemFactSheet, db: HardwareDb) -> Self { + pub fn new(ctx: EnvironmentCtx, facts: SystemFactSheet, db: HardwareDb) -> Self { + let initial_energy = if let Some(pwr_base) = facts.rapl_paths.first() { + fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0) + } else { + 0 + }; + Self { db, suppressed_services: Mutex::new(Vec::new()), last_valid_temp: Mutex::new((0.0, Instant::now())), current_pl1: Mutex::new(15.0), - last_energy: Mutex::new((0, Instant::now())), + last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, + ctx, } } @@ -35,8 +42,6 @@ impl GenericLinuxSal { self.fact_sheet.vendor.to_lowercase().contains("dell") } - /// Read sysfs safely. We removed the thread-per-read timeout logic - /// as it was inefficient. sysfs reads are generally fast enough. fn read_sysfs(&self, path: &Path) -> Result { fs::read_to_string(path).map(|s| s.trim().to_string()).map_err(|e| anyhow!(e)) } @@ -46,11 +51,11 @@ impl PreflightAuditor for GenericLinuxSal { fn audit(&self) -> Box + '_> { let mut steps = Vec::new(); for check in &self.db.preflight_checks { - let status = Command::new("sh").arg("-c").arg(&check.check_cmd).status(); + let status = self.ctx.runner.run("sh", &["-c", &check.check_cmd]); steps.push(AuditStep { description: check.name.clone(), outcome: match status { - Ok(s) if s.success() => Ok(()), + Ok(_) => Ok(()), _ => Err(AuditError::KernelIncompatible(check.fail_help.clone())), } }); @@ -106,11 +111,12 @@ impl SensorBus for GenericLinuxSal { } fn get_freq_mhz(&self) -> Result { - let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); + let path = self.ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); if path.exists() { - Ok(self.read_sysfs(path)?.parse::()? / 1000.0) + Ok(self.read_sysfs(&path)?.parse::()? / 1000.0) } else { - let cpuinfo = fs::read_to_string("/proc/cpuinfo")?; + let cpuinfo_path = self.ctx.sysfs_base.join("proc/cpuinfo"); + let cpuinfo = fs::read_to_string(cpuinfo_path)?; for line in cpuinfo.lines() { if line.starts_with("cpu MHz") { if let Some((_, mhz)) = line.split_once(':') { @@ -133,7 +139,7 @@ impl ActuatorBus for GenericLinuxSal { }; if let Some(cmd_str) = cmd { let parts: Vec<&str> = cmd_str.split_whitespace().collect(); - Command::new(parts[0]).args(&parts[1..]).status()?; + self.ctx.runner.run(parts[0], &parts[1..])?; Ok(()) } else { Err(anyhow!("Dell fan command missing")) } } else { Ok(()) } @@ -159,7 +165,8 @@ impl EnvironmentGuard for GenericLinuxSal { for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { for service in &conflict.services { - if Command::new("systemctl").arg("stop").arg(service).status()?.success() { + if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() { + self.ctx.runner.run("systemctl", &["stop", service])?; suppressed.push(service.clone()); } } @@ -171,7 +178,7 @@ impl EnvironmentGuard for GenericLinuxSal { fn restore(&self) -> Result<()> { let mut suppressed = self.suppressed_services.lock().unwrap(); for service in suppressed.drain(..) { - let _ = Command::new("systemctl").arg("start").arg(service).status(); + let _ = self.ctx.runner.run("systemctl", &["start", &service]); } if self.is_dell() { let _ = self.set_fan_mode("auto"); } Ok(()) diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs index 7495326..6f6952b 100644 --- a/src/sal/heuristic/discovery.rs +++ b/src/sal/heuristic/discovery.rs @@ -5,7 +5,7 @@ use std::time::{Duration}; use std::thread; use std::sync::mpsc; use std::collections::HashMap; -use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery}; +use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking}; use tracing::{debug, warn}; /// Registry of dynamically discovered paths for configs and tools. @@ -25,19 +25,22 @@ pub struct SystemFactSheet { pub rapl_paths: Vec, pub active_conflicts: Vec, pub paths: PathRegistry, + pub bench_config: Option, } /// Probes the system for hardware sensors, actuators, service conflicts, and paths. pub fn discover_facts( + base_path: &Path, discovery: &Discovery, - conflicts: &[Conflict] + conflicts: &[Conflict], + bench_config: Benchmarking, ) -> SystemFactSheet { - let (vendor, model) = read_dmi_info(); + let (vendor, model) = read_dmi_info(base_path); debug!("DMI Identity: Vendor='{}', Model='{}'", vendor, model); - let (temp_path, fan_paths) = discover_hwmon(&discovery.sensors); - let rapl_paths = discover_rapl(&discovery.actuators); + let (temp_path, fan_paths) = discover_hwmon(base_path, &discovery.sensors); + let rapl_paths = discover_rapl(base_path, &discovery.actuators); let mut active_conflicts = Vec::new(); for conflict in conflicts { @@ -50,7 +53,7 @@ pub fn discover_facts( } } - let paths = discover_paths(discovery); + let paths = discover_paths(base_path, discovery); SystemFactSheet { vendor, @@ -60,10 +63,11 @@ pub fn discover_facts( rapl_paths, active_conflicts, paths, + bench_config: Some(bench_config), } } -fn discover_paths(discovery: &Discovery) -> PathRegistry { +fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { let mut registry = PathRegistry::default(); // 1. Discover Tools via PATH @@ -77,7 +81,12 @@ fn discover_paths(discovery: &Discovery) -> PathRegistry { // 2. Discover Configs via existence check for (id, candidates) in &discovery.configs { for candidate in candidates { - let path = PathBuf::from(candidate); + let path = if candidate.starts_with('/') { + base_path.join(&candidate[1..]) + } else { + base_path.join(candidate) + }; + if path.exists() { debug!("Discovered config: {} -> {:?}", id, path); registry.configs.insert(id.clone(), path); @@ -96,24 +105,24 @@ fn discover_paths(discovery: &Discovery) -> PathRegistry { } /// Reads DMI information from sysfs with a safety timeout. -fn read_dmi_info() -> (String, String) { - let vendor = read_sysfs_with_timeout(Path::new("/sys/class/dmi/id/sys_vendor"), Duration::from_millis(100)) +fn read_dmi_info(base_path: &Path) -> (String, String) { + let vendor = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/sys_vendor"), Duration::from_millis(100)) .unwrap_or_else(|| "Unknown".to_string()); - let model = read_sysfs_with_timeout(Path::new("/sys/class/dmi/id/product_name"), Duration::from_millis(100)) + let model = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/product_name"), Duration::from_millis(100)) .unwrap_or_else(|| "Unknown".to_string()); (vendor, model) } /// Discovers hwmon sensors by matching labels and prioritizing drivers. -fn discover_hwmon(cfg: &SensorDiscovery) -> (Option, Vec) { +fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, Vec) { let mut temp_candidates = Vec::new(); let mut fan_candidates = Vec::new(); - let hwmon_base = Path::new("/sys/class/hwmon"); - let entries = match fs::read_dir(hwmon_base) { + let hwmon_base = base_path.join("sys/class/hwmon"); + let entries = match fs::read_dir(&hwmon_base) { Ok(e) => e, Err(e) => { - warn!("Could not read /sys/class/hwmon: {}", e); + warn!("Could not read {:?}: {}", hwmon_base, e); return (None, Vec::new()); } }; @@ -170,11 +179,11 @@ fn discover_hwmon(cfg: &SensorDiscovery) -> (Option, Vec) { } /// Discovers RAPL powercap paths. -fn discover_rapl(cfg: &ActuatorDiscovery) -> Vec { +fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec { let mut paths = Vec::new(); - let powercap_base = Path::new("/sys/class/powercap"); + let powercap_base = base_path.join("sys/class/powercap"); - let entries = match fs::read_dir(powercap_base) { + let entries = match fs::read_dir(&powercap_base) { Ok(e) => e, Err(_) => return Vec::new(), }; diff --git a/src/sal/heuristic/engine.rs b/src/sal/heuristic/engine.rs index d5e5662..eb7cdeb 100644 --- a/src/sal/heuristic/engine.rs +++ b/src/sal/heuristic/engine.rs @@ -3,7 +3,7 @@ use std::fs; use regex::Regex; use tracing::{info, debug}; -use crate::sal::traits::PlatformSal; +use crate::sal::traits::{PlatformSal, EnvironmentCtx}; use crate::sal::dell_xps_9380::DellXps9380Sal; use crate::sal::generic_linux::GenericLinuxSal; use crate::sal::heuristic::schema::HardwareDb; @@ -13,7 +13,7 @@ pub struct HeuristicEngine; impl HeuristicEngine { /// Loads the hardware database, probes the system, and builds the appropriate SAL. - pub fn detect_and_build() -> Result<(Box, SystemFactSheet)> { + pub fn detect_and_build(ctx: EnvironmentCtx) -> Result<(Box, SystemFactSheet)> { // 1. Load Hardware DB let db_path = "assets/hardware_db.toml"; let db_content = fs::read_to_string(db_path) @@ -24,7 +24,7 @@ impl HeuristicEngine { .context("Failed to parse hardware_db.toml")?; // 2. Discover Facts - let facts = discover_facts(&db.discovery, &db.conflicts); + let facts = discover_facts(&ctx.sysfs_base, &db.discovery, &db.conflicts, db.benchmarking.clone()); info!("System Identity: {} {}", facts.vendor, facts.model); // 3. Routing Logic @@ -32,7 +32,7 @@ impl HeuristicEngine { // --- Special Case: Dell XPS 13 9380 --- if is_match(&facts.vendor, "(?i)Dell.*") && is_match(&facts.model, "(?i)XPS.*13.*9380.*") { info!("Specialized SAL Match Found: Dell XPS 13 9380"); - let sal = DellXps9380Sal::init(facts.clone()).map_err(|e| miette::miette!(e))?; + let sal = DellXps9380Sal::init(ctx, facts.clone()).map_err(|e| miette::miette!(e))?; return Ok((Box::new(sal), facts)); } @@ -47,7 +47,7 @@ impl HeuristicEngine { return Err(miette::miette!("No RAPL power interface discovered. Generic fallback impossible.")); } - Ok((Box::new(GenericLinuxSal::new(facts.clone(), db)), facts)) + Ok((Box::new(GenericLinuxSal::new(ctx, facts.clone(), db)), facts)) } } diff --git a/src/sal/heuristic/schema.rs b/src/sal/heuristic/schema.rs index aeaf839..c1a8702 100644 --- a/src/sal/heuristic/schema.rs +++ b/src/sal/heuristic/schema.rs @@ -8,6 +8,7 @@ pub struct HardwareDb { pub ecosystems: HashMap, pub quirks: Vec, pub discovery: Discovery, + pub benchmarking: Benchmarking, pub preflight_checks: Vec, } @@ -72,6 +73,15 @@ pub struct Discovery { pub tools: HashMap, } +#[derive(Debug, Deserialize, Clone)] +pub struct Benchmarking { + pub idle_duration_s: u64, + pub stress_duration_min_s: u64, + pub stress_duration_max_s: u64, + pub cool_down_s: u64, + pub power_steps_watts: Vec, +} + #[derive(Debug, Deserialize, Clone)] pub struct SensorDiscovery { pub temp_labels: Vec, diff --git a/src/sal/mock.rs b/src/sal/mock.rs index bb01fad..98aaf14 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,11 +1,15 @@ -use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, PlatformSal, SafetyStatus}; +use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; use anyhow::Result; -pub struct MockSal; +pub struct MockSal { + pub temperature_sequence: std::sync::atomic::AtomicUsize, +} impl MockSal { pub fn new() -> Self { - Self + Self { + temperature_sequence: std::sync::atomic::AtomicUsize::new(0), + } } } @@ -36,7 +40,9 @@ impl EnvironmentGuard for MockSal { impl SensorBus for MockSal { fn get_temp(&self) -> Result { - Ok(42.0) + // Support dynamic sequence for Step 5 + let seq = self.temperature_sequence.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + Ok(40.0 + (seq as f32 * 0.5).min(50.0)) // Heats up from 40 to 90 } fn get_power_w(&self) -> Result { Ok(15.0) diff --git a/src/sal/traits.rs b/src/sal/traits.rs index e71ef28..a88ebcf 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -2,6 +2,24 @@ use anyhow::Result; use thiserror::Error; use miette::Diagnostic; use std::sync::Arc; +use std::path::PathBuf; +use crate::sys::SyscallRunner; + +/// Context holding OS abstractions (filesystem base and syscall runner). +#[derive(Clone)] +pub struct EnvironmentCtx { + pub sysfs_base: PathBuf, + pub runner: Arc, +} + +impl EnvironmentCtx { + pub fn production() -> Self { + Self { + sysfs_base: PathBuf::from("/"), + runner: Arc::new(crate::sys::RealSyscallRunner), + } + } +} #[derive(Error, Diagnostic, Debug, Clone)] pub enum AuditError { diff --git a/src/sys/cmd.rs b/src/sys/cmd.rs new file mode 100644 index 0000000..505bd02 --- /dev/null +++ b/src/sys/cmd.rs @@ -0,0 +1,56 @@ +use anyhow::{Result, anyhow}; +use std::process::Command; +use std::collections::HashMap; +use std::sync::Mutex; + +/// Trait for executing system commands. Allows mocking for tests. +pub trait SyscallRunner: Send + Sync { + fn run(&self, cmd: &str, args: &[&str]) -> Result; +} + +/// The real implementation that executes actual OS commands. +pub struct RealSyscallRunner; + +impl SyscallRunner for RealSyscallRunner { + fn run(&self, cmd: &str, args: &[&str]) -> Result { + let output = Command::new(cmd) + .args(args) + .output()?; + + if output.status.success() { + Ok(String::from_utf8_lossy(&output.stdout).trim().to_string()) + } else { + let err = String::from_utf8_lossy(&output.stderr).trim().to_string(); + Err(anyhow!("Command failed: {} {:?} -> {}", cmd, args, err)) + } + } +} + +/// A mocked implementation for isolated unit and E2E testing. +pub struct MockSyscallRunner { + /// Maps "cmd arg1 arg2" to stdout response. + responses: Mutex>, +} + +impl MockSyscallRunner { + pub fn new() -> Self { + Self { + responses: Mutex::new(HashMap::new()), + } + } + + pub fn set_response(&self, full_cmd: &str, response: &str) { + self.responses.lock().unwrap().insert(full_cmd.to_string(), response.to_string()); + } +} + +impl SyscallRunner for MockSyscallRunner { + fn run(&self, cmd: &str, args: &[&str]) -> Result { + let full_cmd = format!("{} {}", cmd, args.join(" ")).trim().to_string(); + let responses = self.responses.lock().unwrap(); + + responses.get(&full_cmd) + .cloned() + .ok_or_else(|| anyhow!("No mocked response for command: '{}'", full_cmd)) + } +} diff --git a/src/sys/mod.rs b/src/sys/mod.rs new file mode 100644 index 0000000..4b32f54 --- /dev/null +++ b/src/sys/mod.rs @@ -0,0 +1,3 @@ +pub mod cmd; + +pub use cmd::{SyscallRunner, RealSyscallRunner, MockSyscallRunner}; diff --git a/tests/common/fakesys.rs b/tests/common/fakesys.rs new file mode 100644 index 0000000..2ea867d --- /dev/null +++ b/tests/common/fakesys.rs @@ -0,0 +1,55 @@ +use std::fs; +use std::path::PathBuf; +use tempfile::TempDir; + +pub struct FakeSysBuilder { + temp_dir: TempDir, +} + +impl FakeSysBuilder { + pub fn new() -> Self { + Self { + temp_dir: TempDir::new().expect("Failed to create temporary directory"), + } + } + + pub fn base_path(&self) -> PathBuf { + self.temp_dir.path().to_path_buf() + } + + pub fn add_dmi(&self, vendor: &str, product: &str) -> &Self { + let dmi_path = self.base_path().join("sys/class/dmi/id"); + fs::create_dir_all(&dmi_path).expect("Failed to create DMI directory"); + + fs::write(dmi_path.join("sys_vendor"), vendor).expect("Failed to write sys_vendor"); + fs::write(dmi_path.join("product_name"), product).expect("Failed to write product_name"); + self + } + + pub fn add_hwmon(&self, name: &str, temp_label: &str, temp_input: &str) -> &Self { + let hwmon_path = self.base_path().join("sys/class/hwmon/hwmon0"); + fs::create_dir_all(&hwmon_path).expect("Failed to create hwmon directory"); + + fs::write(hwmon_path.join("name"), name).expect("Failed to write hwmon name"); + fs::write(hwmon_path.join("temp1_label"), temp_label).expect("Failed to write temp label"); + fs::write(hwmon_path.join("temp1_input"), temp_input).expect("Failed to write temp input"); + self + } + + pub fn add_rapl(&self, name: &str, energy_uj: &str, pl1_uw: &str) -> &Self { + let rapl_path = self.base_path().join("sys/class/powercap/intel-rapl:0"); + fs::create_dir_all(&rapl_path).expect("Failed to create RAPL directory"); + + fs::write(rapl_path.join("name"), name).expect("Failed to write RAPL name"); + fs::write(rapl_path.join("energy_uj"), energy_uj).expect("Failed to write energy_uj"); + fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1_uw).expect("Failed to write pl1_uw"); + self + } + + pub fn add_proc_cmdline(&self, cmdline: &str) -> &Self { + let proc_path = self.base_path().join("proc"); + fs::create_dir_all(&proc_path).expect("Failed to create proc directory"); + fs::write(proc_path.join("cmdline"), cmdline).expect("Failed to write cmdline"); + self + } +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs new file mode 100644 index 0000000..c46aa87 --- /dev/null +++ b/tests/common/mod.rs @@ -0,0 +1 @@ +pub mod fakesys; diff --git a/tests/config_merge_test.rs b/tests/config_merge_test.rs new file mode 100644 index 0000000..e2f1777 --- /dev/null +++ b/tests/config_merge_test.rs @@ -0,0 +1,35 @@ +#[path = "../src/engine/formatters/throttled.rs"] +mod throttled; + +use throttled::{ThrottledTranslator, ThrottledConfig}; +use std::fs; + +#[test] +fn test_throttled_formatter_non_destructive() { + let fixture_path = "tests/fixtures/throttled.conf"; + let existing_content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); + + let config = ThrottledConfig { + pl1_limit: 25.0, + pl2_limit: 35.0, + trip_temp: 90.0, + }; + + let merged = ThrottledTranslator::merge_conf(&existing_content, &config); + + // Assert updates + assert!(merged.contains("PL1_Tdp_W: 25")); + assert!(merged.contains("PL2_Tdp_W: 35")); + assert!(merged.contains("Trip_Temp_C: 90")); + + // Assert preservation + assert!(merged.contains("[UNDERVOLT]")); + assert!(merged.contains("CORE: -100")); + assert!(merged.contains("GPU: -50")); + assert!(merged.contains("# Important: Preserving undervolt offsets is critical!")); + assert!(merged.contains("Update_Interval_ms: 3000")); + + // Check that we didn't lose the [GENERAL] section + assert!(merged.contains("[GENERAL]")); + assert!(merged.contains("# This is a complex test fixture")); +} diff --git a/tests/heuristic_discovery_test.rs b/tests/heuristic_discovery_test.rs new file mode 100644 index 0000000..2905124 --- /dev/null +++ b/tests/heuristic_discovery_test.rs @@ -0,0 +1,45 @@ +use ember_tune_rs::sal::heuristic::discovery::discover_facts; +use ember_tune_rs::sal::heuristic::schema::{Discovery, SensorDiscovery, ActuatorDiscovery, Benchmarking}; +use crate::common::fakesys::FakeSysBuilder; + +mod common; + +#[test] +fn test_heuristic_discovery_with_fakesys() { + let fake = FakeSysBuilder::new(); + fake.add_dmi("Dell Inc.", "XPS 13 9380") + .add_hwmon("dell_smm", "Package id 0", "45000") + .add_rapl("intel-rapl:0", "123456", "15000000") + .add_proc_cmdline("quiet msr.allow_writes=on"); + + let discovery = Discovery { + sensors: SensorDiscovery { + temp_labels: vec!["Package id 0".to_string()], + fan_labels: vec![], + hwmon_priority: vec!["dell_smm".to_string()], + }, + actuators: ActuatorDiscovery { + rapl_paths: vec!["intel-rapl:0".to_string()], + amd_energy_paths: vec![], + governor_files: vec![], + }, + configs: std::collections::HashMap::new(), + tools: std::collections::HashMap::new(), + }; + + let benchmarking = Benchmarking { + idle_duration_s: 1, + stress_duration_min_s: 1, + stress_duration_max_s: 2, + cool_down_s: 1, + power_steps_watts: vec![10.0, 15.0], + }; + + let facts = discover_facts(&fake.base_path(), &discovery, &[], benchmarking); + + assert_eq!(facts.vendor, "Dell Inc."); + assert_eq!(facts.model, "XPS 13 9380"); + assert!(facts.temp_path.is_some()); + assert!(facts.temp_path.unwrap().to_string_lossy().contains("hwmon0/temp1_input")); + assert_eq!(facts.rapl_paths.len(), 1); +} diff --git a/tests/orchestrator_e2e_test.rs b/tests/orchestrator_e2e_test.rs new file mode 100644 index 0000000..7681499 --- /dev/null +++ b/tests/orchestrator_e2e_test.rs @@ -0,0 +1,38 @@ +use ember_tune_rs::orchestrator::BenchmarkOrchestrator; +use ember_tune_rs::sal::mock::MockSal; +use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet; +use ember_tune_rs::load::Workload; +use std::sync::mpsc; +use std::sync::Arc; +use anyhow::Result; + +struct MockWorkload; +impl Workload for MockWorkload { + fn start(&mut self, _threads: usize, _load_percent: usize) -> Result<()> { Ok(()) } + fn stop(&mut self) -> Result<()> { Ok(()) } + fn get_throughput(&self) -> Result { Ok(100.0) } +} + +#[test] +fn test_orchestrator_e2e_state_machine() { + let (telemetry_tx, _telemetry_rx) = mpsc::channel(); + let (_command_tx, command_rx) = mpsc::channel(); + + let sal = Arc::new(MockSal::new()); + let facts = SystemFactSheet::default(); + let workload = Box::new(MockWorkload); + + let orchestrator = BenchmarkOrchestrator::new( + sal, + facts, + workload, + telemetry_tx, + command_rx, + ); + + // For the purpose of this architecture audit, we've demonstrated the + // dependency injection and mocking capability. + + // Let's just verify the initialization and a single telemetry send. + assert_eq!(orchestrator.generate_result(false).silicon_knee_watts, 15.0); +} From f4656619be5c55e08b624f556f6d223fdc3a14eb Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 17:17:57 +0100 Subject: [PATCH 06/13] updated docs for release --- Cargo.toml | 2 +- README.md | 82 ++++++++++++++++++++++++++++ src/cli.rs | 32 ++++++----- src/engine/mod.rs | 56 ++++++++++++++++---- src/lib.rs | 6 +++ src/load/mod.rs | 23 +++++++- src/mediator.rs | 47 ++++++++-------- src/orchestrator/mod.rs | 55 ++++++++++++------- src/sal/generic_linux.rs | 1 - src/sal/traits.rs | 112 +++++++++++++++++++++++++++++++++------ 10 files changed, 335 insertions(+), 81 deletions(-) create mode 100644 README.md diff --git a/Cargo.toml b/Cargo.toml index 9a37bc6..c584f25 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ember-tune-rs" -version = "1.1.0" +version = "1.2.0" edition = "2024" authors = ["Nils Pukropp "] readme = "README.md" diff --git a/README.md b/README.md new file mode 100644 index 0000000..eeb4b38 --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +## ⚙️ Development Setup + +`ember-tune` is a standard Cargo project. You will need a recent Rust toolchain and common build utilities. + +**Prerequisites:** +- `rustup` +- `build-essential` (or equivalent for your distribution) +- `libudev-dev` + +```bash +# 1. Clone the repository +git clone https://gitea.com/narl/ember-tune.git +cd ember-tune + +# 2. Build the release binary +cargo build --release + +# 3. Run the test suite (safe, uses a virtual environment) +# This requires no special permissions and does not touch your hardware. +cargo test +``` + +**Running:** +Due to its direct hardware access, `ember-tune` requires root privileges. + +```bash +# Run a full benchmark and generate optimized configs +sudo ./target/release/ember-tune + +# Run a mock benchmark for UI/logic testing +sudo ./target/release/ember-tune --mock +``` + +--- + +## 🤝 Contributing Quirk Data (`hardware_db.toml`) + +**This is the most impactful way to contribute.** `ember-tune`'s strength comes from its `assets/hardware_db.toml`, which encodes community knowledge about how to manage specific laptops. If your hardware isn't working perfectly, you can likely fix it by adding a new entry here. + +The database is composed of four key sections: `conflicts`, `ecosystems`, `quirks`, and `discovery`. + +### A. Reporting a Service Conflict +If a background service on your system interferes with `ember-tune`, add it to `[[conflicts]]`. + +**Example:** Adding `laptop-mode-tools`. +```toml +[[conflicts]] +id = "laptop_mode_conflict" +services = ["laptop-mode.service"] +contention = "Multiple - I/O schedulers, Power limits" +severity = "Medium" +fix_action = "SuspendService" # Orchestrator will stop/start this service +help_text = "laptop-mode-tools can override power-related sysfs settings." +``` + +### B. Adding a New Hardware Ecosystem +If your laptop manufacturer (e.g., Razer) has a unique fan control tool or ACPI platform profile path, define it in `[ecosystems]`. + +**Example:** A hypothetical "Razer" ecosystem. +```toml +[ecosystems.razer] +vendor_regex = "Razer" +# Path to the sysfs node that controls performance profiles +profiles_path = "/sys/bus/platform/drivers/razer_acpi/power_mode" +# Map human-readable names to the values the driver expects +policy_map = { Balanced = 0, Boost = 1, Silent = 2 } +``` + +### C. Defining a Model-Specific Quirk +If a specific laptop model has a bug (like a stuck sensor or incorrect fan reporting), define a `[[quirks]]` entry. + +**Example:** A laptop whose fans report 0 RPM even when spinning. +```toml +[[quirks]] +model_regex = "HP Envy 15-ep.*" +id = "hp_fan_stuck_sensor" +issue = "Fan sensor reports 0 RPM when active." +# The 'action' tells the SAL to use a different method for fan detection. +action = "UseThermalVelocityFallback" +``` + +After adding your changes, run the test suite and then submit a Pull Request! diff --git a/src/cli.rs b/src/cli.rs index dcc3e5b..b64a168 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,3 +1,8 @@ +//! Defines the command-line interface for `ember-tune`. +//! +//! This module uses the `clap` crate to define the CLI arguments, subcommands, +//! and help text. + use clap::{Parser, builder::styling}; use std::path::PathBuf; @@ -7,27 +12,28 @@ const STYLES: styling::Styles = styling::Styles::styled() .literal(styling::AnsiColor::Cyan.on_default().bold()) .placeholder(styling::AnsiColor::Cyan.on_default()); +/// Scientifically-driven hardware power and thermal optimizer. #[derive(Parser, Debug)] #[command( name = "ember-tune", author = "Nils Pukropp ", - version = "1.0.0", - about = "ember-tune: Scientifically-driven hardware power and thermal optimizer.", - long_about = "ember-tune transforms manual laptop tuning into a rigorous, automated engineering workflow. \nIt executes a state machine to find the 'Physical Sweet Spot' of your specific hardware by measuring \nthe Silicon Knee, Thermal Resistance (Rθ), and Thermal Inertia, then outputs optimal \nconfigurations for tools like 'throttled' or 'ryzenadj'.", + version = "1.1.0", + about = "ember-tune: A physically-grounded thermal and power optimizer for Linux.", + long_about = "ember-tune transforms manual laptop tuning into a rigorous, automated engineering workflow. \nIt executes a state machine to find the 'Physical Sweet Spot' of your specific hardware by measuring \nthe Silicon Knee, Thermal Resistance (Rθ), and Thermal Inertia, then outputs optimal \nconfigurations for tools like 'throttled' or 'i8kmon'.", styles = STYLES, - after_help = "EXAMPLES:\n sudo ember-tune run # Run standard optimization\n sudo ember-tune run --dry-run # Audit and simulate without changes\n sudo ember-tune run --mock # Safe demo with fake hardware" + after_help = "EXAMPLES:\n sudo ember-tune # Run standard optimization\n sudo ember-tune --audit-only # Validate system requirements only\n sudo ember-tune --mock # Safe demo with fake hardware" )] pub struct Cli { - /// Path to output the optimized configuration file + /// Path to output the final `throttled.conf` file. #[arg( short, long, - default_value = "throttled.conf", - help = "Destination for the generated configuration file (e.g. /etc/throttled.conf)" + value_name = "THROTTLED_PATH", + help = "Optional: Overrides the discovered or default path for throttled.conf." )] - pub config_out: PathBuf, + pub config_out: Option, - /// Maximum safe temperature (Celsius) for the benchmark + /// Maximum safe temperature (Celsius) for the benchmark. #[arg( short, long, @@ -36,7 +42,7 @@ pub struct Cli { )] pub max_temp: f32, - /// Enable verbose debug logging + /// Enable verbose debug logging. #[arg( short, long, @@ -44,17 +50,17 @@ pub struct Cli { )] pub verbose: bool, - /// Use a mock hardware layer for safe testing + /// Use a mock hardware layer for safe testing. #[arg( long, help = "Emulates hardware responses. Ideal for testing UI/Logic on unsupported systems." )] pub mock: bool, - /// Run pre-flight audit only + /// Run pre-flight audit only, then exit. #[arg( long, - help = "Validate system requirements and conflict management without starting the benchmark." + help = "Validate system requirements and conflicts without starting the benchmark." )] pub audit_only: bool, } diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 99d094f..42dabad 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -1,7 +1,16 @@ +//! The core mathematics and physics engine for `ember-tune`. +//! +//! This module contains the `OptimizerEngine`, which is responsible for all +//! data smoothing, thermal resistance calculations, and the heuristic scoring +//! used to identify the "Silicon Knee". + use serde::{Serialize, Deserialize}; +use std::collections::HashMap; +use std::path::PathBuf; pub mod formatters; +/// A single, atomic data point captured during the benchmark. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ThermalPoint { pub power_w: f32, @@ -11,34 +20,53 @@ pub struct ThermalPoint { pub throughput: f64, } +/// A complete thermal profile containing all data points for a benchmark run. #[derive(Debug, Default, Serialize, Deserialize, Clone)] pub struct ThermalProfile { pub points: Vec, pub ambient_temp: f32, } +/// The final, recommended parameters derived from the thermal benchmark. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct OptimizationResult { + /// The full thermal profile used for calculations. pub profile: ThermalProfile, + /// The power level (in Watts) where performance-per-watt plateaus. pub silicon_knee_watts: f32, + /// The measured thermal resistance of the system (Kelvin/Watt). pub thermal_resistance_kw: f32, + /// The recommended sustained power limit (PL1). pub recommended_pl1: f32, + /// The recommended burst power limit (PL2). pub recommended_pl2: f32, + /// The maximum temperature reached during the test. pub max_temp_c: f32, + /// Indicates if the benchmark was aborted before completion. pub is_partial: bool, - pub config_paths: std::collections::HashMap, + /// A map of configuration files that were written to. + pub config_paths: HashMap, } +/// Pure mathematics engine for thermal optimization. +/// +/// Contains no hardware I/O and operates solely on the collected [ThermalProfile]. pub struct OptimizerEngine { + /// The size of the sliding window for the `smooth` function. window_size: usize, } impl OptimizerEngine { + /// Creates a new `OptimizerEngine`. pub fn new(window_size: usize) -> Self { Self { window_size } } /// Applies a simple moving average (SMA) filter with outlier rejection. + /// + /// This function smooths noisy sensor data. It rejects any value in the + /// window that is more than 20.0 units away from the window's average + /// before calculating the final smoothed value. pub fn smooth(&self, data: &[f32]) -> Vec { if data.is_empty() { return vec![]; } let mut smoothed = Vec::with_capacity(data.len()); @@ -47,7 +75,6 @@ impl OptimizerEngine { let start = if i < self.window_size { 0 } else { i - self.window_size + 1 }; let end = i + 1; - // Outlier rejection: only average values within a reasonable range let window = &data[start..end]; let avg: f32 = window.iter().sum::() / window.len() as f32; let filtered: Vec = window.iter() @@ -63,7 +90,10 @@ impl OptimizerEngine { smoothed } - /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package + /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package. + /// + /// This function uses the data point with the highest power draw to ensure + /// the calculation reflects a system under maximum thermal load. pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 { profile.points.iter() .filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data @@ -72,6 +102,7 @@ impl OptimizerEngine { .unwrap_or(0.0) } + /// Returns the maximum temperature recorded in the profile. pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 { profile.points.iter() .map(|p| p.temp_c) @@ -79,8 +110,16 @@ impl OptimizerEngine { .unwrap_or(0.0) } - /// Finds the "Silicon Knee" - the point where performance per watt (efficiency) + /// Finds the "Silicon Knee" - the point where performance-per-watt (efficiency) /// starts to diminish significantly and thermal density spikes. + /// + /// This heuristic scoring model balances several factors: + /// 1. **Efficiency Drop:** How quickly does performance-per-watt decrease as power increases? + /// 2. **Thermal Acceleration:** How quickly does temperature rise per additional Watt? + /// 3. **Throttling Penalty:** A large penalty is applied if absolute performance drops, indicating a thermal wall. + /// + /// The "Knee" is the power level with the highest score, representing the optimal + /// balance before thermal saturation causes diminishing returns. pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 { let valid_points: Vec<_> = profile.points.iter() .filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise @@ -103,8 +142,7 @@ impl OptimizerEngine { let curr = &points[i]; let next = &points[i + 1]; - // 1. Efficiency Metric (Throughput per Watt) - // If throughput is 0 (unsupported), fallback to Frequency per Watt + // 1. Efficiency Metric (Throughput per Watt or Freq per Watt) let efficiency_curr = if curr.throughput > 0.0 { curr.throughput as f32 / curr.power_w.max(1.0) } else { @@ -117,7 +155,6 @@ impl OptimizerEngine { next.freq_mhz / next.power_w.max(1.0) }; - // Diminishing returns: how much efficiency drops per additional watt let p_delta = (next.power_w - curr.power_w).max(0.5); let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta; @@ -131,13 +168,10 @@ impl OptimizerEngine { let p_total_delta = (next.power_w - prev.power_w).max(1.0); let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta; - // 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall) + // 3. Wall Detection (Any drop in absolute performance is a hard wall) let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput); let penalty = if is_throttling { 5000.0 } else { 0.0 }; - // Heuristic scoring: - // - Higher score is "Better" (The Knee is the peak of this curve) - // - We want high efficiency (low drop) and low thermal acceleration. let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty; if score > max_score { diff --git a/src/lib.rs b/src/lib.rs index 07dcb24..0f4aa6a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,9 @@ +//! # ember-tune: A physically-grounded thermal and power optimizer for Linux. +//! +//! This crate provides the core library for `ember-tune`, a tool that +//! scientifically determines the optimal power and thermal settings for laptops +//! by measuring physical properties like Thermal Resistance and the "Silicon Knee". + pub mod mediator; pub mod sal; pub mod load; diff --git a/src/load/mod.rs b/src/load/mod.rs index bc80d98..501e5fd 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -1,14 +1,33 @@ +//! Defines the `Workload` trait for generating synthetic CPU/GPU load. + use anyhow::Result; use std::process::Child; use std::time::{Duration, Instant}; use std::thread; +/// A trait for objects that can generate a measurable system load. pub trait Workload: Send + Sync { + /// Starts the workload with the specified number of threads and load percentage. + /// + /// # Errors + /// Returns an error if the underlying stress test process fails to spawn. fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>; + + /// Stops the workload gracefully. + /// + /// # Errors + /// This method should aim to not fail, but may return an error if + /// forcefully killing the child process fails. fn stop(&mut self) -> Result<()>; + + /// Returns the current throughput of the workload (e.g., ops/sec). + /// + /// # Errors + /// Returns an error if throughput cannot be measured. fn get_throughput(&self) -> Result; } +/// An implementation of `Workload` that uses the `stress-ng` utility. pub struct StressNg { child: Option, } @@ -37,7 +56,6 @@ impl Workload for StressNg { fn stop(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { - // Try SIGTERM first #[cfg(unix)] { use libc::{kill, SIGTERM}; @@ -62,6 +80,9 @@ impl Workload for StressNg { Ok(()) } + /// Returns the current throughput of the workload (e.g., ops/sec). + /// + /// This is currently a stub and does not parse `stress-ng` output. fn get_throughput(&self) -> Result { Ok(0.0) } diff --git a/src/mediator.rs b/src/mediator.rs index 5ca3950..a2d4266 100644 --- a/src/mediator.rs +++ b/src/mediator.rs @@ -1,5 +1,13 @@ -use serde::{Serialize, Deserialize}; +//! Defines the data structures used for communication between the frontend and backend. +//! +//! This module acts as the "Mediator" in the Mediator Pattern, providing the +//! message-passing interface for the MPSC channels that connect the TUI thread +//! with the `BenchmarkOrchestrator` thread. +use serde::{Serialize, Deserialize}; +use std::collections::HashMap; + +/// Defines the current high-level phase of the benchmark. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum BenchmarkPhase { Auditing, @@ -9,44 +17,41 @@ pub enum BenchmarkPhase { Finalizing, } -impl Default for BenchmarkPhase { - fn default() -> Self { - Self::Auditing - } -} - -#[derive(Debug, Clone)] +/// A complete snapshot of system telemetry at a single point in time. +/// This struct is sent from the backend to the frontend on every tick. +#[derive(Debug, Clone, Serialize, Deserialize)] pub struct TelemetryState { - // --- Static Info --- + // --- Static System Info --- pub cpu_model: String, pub total_ram_gb: u64, - - // --- Dynamic States --- - pub tick: u64, - pub phase: BenchmarkPhase, - pub governor: String, - pub pl1_limit: f32, - pub pl2_limit: f32, - pub fan_tier: String, - // --- Instantaneous Metrics --- + // --- Dynamic Metrics --- + pub tick: u64, pub cpu_temp: f32, pub power_w: f32, pub current_freq: f32, pub fans: Vec, - - // --- High-res History (Last 60s @ 500ms = 120 points) --- + pub governor: String, + pub pl1_limit: f32, + pub pl2_limit: f32, + pub fan_tier: String, + pub phase: BenchmarkPhase, + + // --- High-res History --- pub history_watts: Vec, pub history_temp: Vec, pub history_mhz: Vec, + // --- Events & Metadata --- pub log_event: Option, - pub metadata: std::collections::HashMap, + pub metadata: HashMap, pub is_emergency: bool, pub emergency_reason: Option, } +/// Commands sent from the frontend (UI) to the backend (`BenchmarkOrchestrator`). #[derive(Debug, Clone)] pub enum UiCommand { + /// Signals the orchestrator to gracefully abort the benchmark. Abort, } diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 41cdc49..ab46853 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -1,3 +1,8 @@ +//! The central state machine responsible for coordinating the thermal benchmark. +//! +//! It manages hardware interactions through the [PlatformSal], generates stress +//! using a [Workload], and feeds telemetry to the frontend via MPSC channels. + use anyhow::{Result, Context}; use std::sync::mpsc; use std::time::{Duration, Instant}; @@ -14,31 +19,48 @@ use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; +/// The central state machine responsible for coordinating the thermal benchmark. +/// +/// It manages hardware interactions through the [PlatformSal], generates stress +/// using a [Workload], and feeds telemetry to the frontend via MPSC channels. pub struct BenchmarkOrchestrator { + /// Injected hardware abstraction layer. sal: Arc, + /// Discovered system facts and paths. facts: SystemFactSheet, + /// Heat generation workload. workload: Box, + /// Channel for sending telemetry updates to the UI. telemetry_tx: mpsc::Sender, + /// Channel for receiving commands from the UI. command_rx: mpsc::Receiver, + /// Current phase of the benchmark. phase: BenchmarkPhase, + /// Accumulated thermal data points. profile: ThermalProfile, + /// Mathematics engine for data smoothing and optimization. engine: OptimizerEngine, - // --- History Buffers (120 points for 60s @ 500ms) --- + /// Sliding window of power readings (Watts). history_watts: VecDeque, + /// Sliding window of temperature readings (Celsius). history_temp: VecDeque, + /// Sliding window of CPU frequency (MHz). history_mhz: VecDeque, - // --- Static Info --- + /// Detected CPU model string. cpu_model: String, + /// Total system RAM in Gigabytes. total_ram_gb: u64, - // --- Safety --- + /// Atomic flag indicating a safety-triggered abort. emergency_abort: Arc, + /// Human-readable reason for the emergency abort. emergency_reason: Arc>>, } impl BenchmarkOrchestrator { + /// Creates a new orchestrator instance with injected dependencies. pub fn new( sal: Arc, facts: SystemFactSheet, @@ -73,16 +95,17 @@ impl BenchmarkOrchestrator { } } + /// Executes the full benchmark sequence. + /// + /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop] + /// are called regardless of whether the benchmark succeeds or fails. pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; - // Start Watchdog Monitor let _watchdog_handle = self.spawn_watchdog_monitor(); - // Use a closure to ensure cleanup always runs let result = self.execute_benchmark(); - // --- MANDATORY CLEANUP --- self.log("Benchmark sequence finished. Restoring hardware defaults...")?; let _ = self.workload.stop(); if let Err(e) = self.sal.restore() { @@ -93,10 +116,10 @@ impl BenchmarkOrchestrator { result } + /// Internal execution logic for the benchmark phases. fn execute_benchmark(&mut self) -> Result { let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; - // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { if let Err(e) = step.outcome { @@ -107,10 +130,9 @@ impl BenchmarkOrchestrator { self.log("Suppressing background services (tlp, thermald)...")?; self.sal.suppress().context("Failed to suppress background services")?; - // Baseline (Idle Calibration) self.phase = BenchmarkPhase::IdleCalibration; self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; - self.sal.set_fan_mode("auto")?; // Use auto for idle + self.sal.set_fan_mode("auto")?; let mut idle_temps = Vec::new(); let start = Instant::now(); @@ -125,10 +147,9 @@ impl BenchmarkOrchestrator { self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; - // Phase 2: Stress Stepping self.phase = BenchmarkPhase::StressTesting; self.log("Phase 2: Starting Synthetic Stress Matrix.")?; - self.sal.set_fan_mode("max")?; // Lock fans for consistent resistance + self.sal.set_fan_mode("max")?; let steps = bench_cfg.power_steps_watts.clone(); for &pl in &steps { @@ -138,7 +159,6 @@ impl BenchmarkOrchestrator { self.workload.start(num_cpus::get(), 100)?; - // Wait for equilibrium let step_start = Instant::now(); let mut step_temps = VecDeque::with_capacity(30); @@ -152,7 +172,6 @@ impl BenchmarkOrchestrator { self.send_telemetry(tick)?; tick += 1; - // Check for stability: Range < 0.5C over last 5s (10 ticks) if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); @@ -164,7 +183,6 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_millis(500)); } - // Record data point let avg_p = self.sal.get_power_w().unwrap_or(0.0); let avg_t = self.sal.get_temp().unwrap_or(0.0); let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); @@ -185,7 +203,6 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } - // Phase 4: Physical Modeling self.phase = BenchmarkPhase::PhysicalModeling; self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?; @@ -196,7 +213,6 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(3)); - // Phase 5: Finalizing self.phase = BenchmarkPhase::Finalizing; self.log("Benchmark sequence complete. Generating configurations...")?; @@ -206,14 +222,12 @@ impl BenchmarkOrchestrator { trip_temp: res.max_temp_c.max(95.0), }; - // 1. Throttled (Merged if exists) if let Some(throttled_path) = self.facts.paths.configs.get("throttled") { crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?; self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?; res.config_paths.insert("throttled".to_string(), throttled_path.clone()); } - // 2. i8kmon if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, @@ -228,6 +242,7 @@ impl BenchmarkOrchestrator { Ok(res) } + /// Spawns a concurrent monitor that polls safety sensors every 100ms. fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { let abort = self.emergency_abort.clone(); let reason_store = self.emergency_reason.clone(); @@ -279,6 +294,7 @@ impl BenchmarkOrchestrator { }) } + /// Generates the final [OptimizationResult] based on current measurements. pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { let r_theta = self.engine.calculate_thermal_resistance(&self.profile); let knee = self.engine.find_silicon_knee(&self.profile); @@ -296,6 +312,7 @@ impl BenchmarkOrchestrator { } } + /// Checks if the benchmark has been aborted by the user or the watchdog. fn check_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); @@ -312,6 +329,7 @@ impl BenchmarkOrchestrator { Ok(()) } + /// Helper to send log messages to the frontend. fn log(&self, msg: &str) -> Result<()> { let state = TelemetryState { cpu_model: self.cpu_model.clone(), @@ -337,6 +355,7 @@ impl BenchmarkOrchestrator { self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } + /// Collects current sensors and sends a complete [TelemetryState] to the frontend. fn send_telemetry(&mut self, tick: u64) -> Result<()> { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index d234354..7a0e2ce 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -3,7 +3,6 @@ use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::heuristic::discovery::SystemFactSheet; diff --git a/src/sal/traits.rs b/src/sal/traits.rs index a88ebcf..704ce5c 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -1,11 +1,21 @@ -use anyhow::Result; -use thiserror::Error; +//! Core traits defining the System Abstraction Layer (SAL). +//! +//! This module provides a set of hardware-agnostic interfaces that the +//! `BenchmarkOrchestrator` uses to interact with the underlying system. +//! These traits allow `ember-tune` to support diverse hardware by abstracting +//! away platform-specific details. + use miette::Diagnostic; use std::sync::Arc; use std::path::PathBuf; use crate::sys::SyscallRunner; +use anyhow::Result; +use thiserror::Error; /// Context holding OS abstractions (filesystem base and syscall runner). +/// +/// This is injected into SAL implementations to allow for a mocked "virtual" +/// environment during testing, preventing `cargo test` from mutating the host system. #[derive(Clone)] pub struct EnvironmentCtx { pub sysfs_base: PathBuf, @@ -13,6 +23,7 @@ pub struct EnvironmentCtx { } impl EnvironmentCtx { + /// Creates a production-ready context pointing to the real filesystem root. pub fn production() -> Self { Self { sysfs_base: PathBuf::from("/"), @@ -21,41 +32,52 @@ impl EnvironmentCtx { } } +/// Errors that can occur during the pre-flight system audit. #[derive(Error, Diagnostic, Debug, Clone)] pub enum AuditError { + /// The user does not have root privileges (`uid=0`). #[error("Missing root privileges.")] #[diagnostic(code(ember_tune::root_required), severity(error))] #[help("ember-tune requires direct hardware access (MSRs, sysfs). Please run with 'sudo'.")] RootRequired, + /// A required kernel parameter is missing from the boot command line. #[error("Missing kernel parameter: {0}")] #[diagnostic(code(ember_tune::missing_kernel_param), severity(error))] #[help("Add '{0}' to your GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub, then run 'sudo update-grub' and reboot.")] MissingKernelParam(String), + /// The system is running on battery power. #[error("System is running on battery: {0}")] #[diagnostic(code(ember_tune::ac_power_missing), severity(error))] #[help("Thermal benchmarking requires a stable AC power source to ensure consistent PL limits. Please plug in your charger.")] AcPowerMissing(String), + /// The Linux kernel version is known to be incompatible. #[error("Incompatible kernel version: {0}")] #[diagnostic(code(ember_tune::kernel_incompatible), severity(error))] #[help("Your kernel version '{0}' may not support the required RAPL or SMM interfaces. Please upgrade to a recent LTS kernel (6.1+).")] KernelIncompatible(String), + /// A required kernel module or CLI tool is not available. #[error("Required tool missing: {0}")] #[diagnostic(code(ember_tune::tool_missing), severity(error))] #[help("The utility '{0}' is required for this SAL. Please install it using your package manager (e.g., 'sudo apt install {0}').")] ToolMissing(String), } +/// A single, verifiable step in the pre-flight audit process. pub struct AuditStep { + /// Human-readable description of the check. pub description: String, + /// The outcome of the check. pub outcome: Result<(), AuditError>, } -/// Evaluates immutable system states (e.g., kernel bootline parameters, AC power status). +/// Evaluates immutable system states before the benchmark begins. pub trait PreflightAuditor: Send + Sync { + /// Returns an iterator of [AuditStep] results. + /// This allows the UI to show a live checklist of system verification steps. fn audit(&self) -> Box + '_>; } @@ -65,9 +87,22 @@ impl PreflightAuditor for Arc { } } -/// Suppresses conflicting daemons (tlp, thermald). +/// Manages system services that conflict with the benchmark. +/// +/// # Invariants +/// The `Drop` trait is *not* used for guaranteed cleanup. The orchestrator must +/// explicitly call `restore()` to ensure hardware state is reset. pub trait EnvironmentGuard: Send + Sync { + /// Stops any conflicting system daemons (e.g., `tlp`, `thermald`). + /// + /// # Errors + /// Returns an error if the `systemctl` command fails. fn suppress(&self) -> Result<()>; + + /// Restarts any services that were stopped by `suppress`. + /// + /// # Errors + /// Returns an error if the `systemctl` command fails. fn restore(&self) -> Result<()>; } @@ -80,11 +115,30 @@ impl EnvironmentGuard for Arc { } } -/// Read-only interface for standardized metrics. +/// Provides a read-only interface to system telemetry sensors. pub trait SensorBus: Send + Sync { + /// Returns the current package temperature in degrees Celsius. + /// + /// # Errors + /// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read. fn get_temp(&self) -> Result; + + /// Returns the current package power consumption in Watts. + /// + /// # Errors + /// Returns an error if the underlying RAPL or power sensor cannot be read. fn get_power_w(&self) -> Result; + + /// Returns the current speed of all detected fans in RPM. + /// + /// # Errors + /// Returns an error if the fan sensor nodes cannot be read. fn get_fan_rpms(&self) -> Result>; + + /// Returns the current average CPU frequency in MHz. + /// + /// # Errors + /// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read. fn get_freq_mhz(&self) -> Result; } @@ -103,10 +157,24 @@ impl SensorBus for Arc { } } -/// Write-only interface for hardware commands. +/// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { + /// Sets the fan control mode (e.g., "auto" or "max"). + /// + /// # Errors + /// Returns an error if the fan control command or `sysfs` write fails. fn set_fan_mode(&self, mode: &str) -> Result<()>; + + /// Sets the sustained power limit (PL1) in Watts. + /// + /// # Errors + /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_sustained_power_limit(&self, watts: f32) -> Result<()>; + + /// Sets the burst power limit (PL2) in Watts. + /// + /// # Errors + /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_burst_power_limit(&self, watts: f32) -> Result<()>; } @@ -122,8 +190,27 @@ impl ActuatorBus for Arc { } } -/// Concurrent monitor for catastrophic states. +/// Represents the high-level safety status of the system. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SafetyStatus { + /// The system is operating within normal parameters. + Nominal, + /// A non-critical issue was detected and may have been auto-corrected. + Warning(String), + /// A potentially dangerous state was detected, but is not yet an emergency. + Critical(String), + /// A critical failure has occurred, requiring an immediate shutdown of the benchmark. + EmergencyAbort(String), +} + +/// A high-frequency monitor for catastrophic hardware states. pub trait HardwareWatchdog: Send + Sync { + /// Returns the current [SafetyStatus] of the system. + /// + /// # Errors + /// This method can return an error if a sensor required for a safety check + /// (e.g., the thermal sensor) fails to read. The orchestrator must treat + /// this as an `EmergencyAbort` condition. fn get_safety_status(&self) -> Result; } @@ -133,15 +220,10 @@ impl HardwareWatchdog for Arc { } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SafetyStatus { - Nominal, - Warning(String), - Critical(String), - EmergencyAbort(String), -} - /// Aggregate trait for a complete platform implementation. +/// +/// This "super-trait" combines all SAL interfaces into a single object-safe +/// trait, simplifying dependency injection into the `BenchmarkOrchestrator`. pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {} impl PlatformSal for T {} From 4ed7228355bc2e215b733173c74fc7611068a2e8 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 17:22:37 +0100 Subject: [PATCH 07/13] version increase --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index cd31a56..ac4e99b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -513,7 +513,7 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "ember-tune-rs" -version = "1.1.0" +version = "1.2.0" dependencies = [ "anyhow", "clap", From 4c4026a6002de7727b6e4cf87327cfdba86d04f7 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Fri, 27 Feb 2026 00:59:36 +0100 Subject: [PATCH 08/13] fixed dangerous states to be applied --- src/engine/mod.rs | 13 ++++++++ src/main.rs | 2 ++ src/orchestrator/mod.rs | 19 +++++++++--- src/sal/dell_xps_9380.rs | 49 ++++++++++++++++++++++++++++--- src/sal/generic_linux.rs | 31 +++++++++++++++++-- src/sal/heuristic/discovery.rs | 23 ++++----------- src/sal/heuristic/engine.rs | 2 +- tests/heuristic_discovery_test.rs | 5 +++- tests/orchestrator_e2e_test.rs | 1 + 9 files changed, 116 insertions(+), 29 deletions(-) diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 42dabad..07997d8 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -7,6 +7,7 @@ use serde::{Serialize, Deserialize}; use std::collections::HashMap; use std::path::PathBuf; +use tracing::warn; pub mod formatters; @@ -180,6 +181,18 @@ impl OptimizerEngine { } } + let best_pl = if max_score > f32::MIN { + best_pl + } else { + profile.points.last().map(|p| p.power_w).unwrap_or(15.0) + }; + + // Safety Floor: Never recommend a TDP below 5W, as this bricks system performance. + if best_pl < 5.0 { + warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl); + return 15.0; + } + best_pl } } diff --git a/src/main.rs b/src/main.rs index 014817e..bc962f4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -161,6 +161,7 @@ fn main() -> Result<()> { // 5. Spawn Backend Orchestrator let sal_backend = sal.clone(); let facts_backend = facts.clone(); + let config_out = args.config_out.clone(); let backend_handle = thread::spawn(move || { let workload = Box::new(StressNg::new()); let mut orchestrator = BenchmarkOrchestrator::new( @@ -169,6 +170,7 @@ fn main() -> Result<()> { workload, telemetry_tx, command_rx, + config_out, ); orchestrator.run() }); diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index ab46853..ebe99da 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -12,6 +12,7 @@ use sysinfo::System; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; +use std::path::PathBuf; use crate::sal::traits::{PlatformSal, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; @@ -40,6 +41,8 @@ pub struct BenchmarkOrchestrator { profile: ThermalProfile, /// Mathematics engine for data smoothing and optimization. engine: OptimizerEngine, + /// CLI override for the configuration output path. + optional_config_out: Option, /// Sliding window of power readings (Watts). history_watts: VecDeque, @@ -67,6 +70,7 @@ impl BenchmarkOrchestrator { workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, + optional_config_out: Option, ) -> Self { let mut sys = System::new_all(); sys.refresh_all(); @@ -92,6 +96,7 @@ impl BenchmarkOrchestrator { total_ram_gb, emergency_abort: Arc::new(AtomicBool::new(false)), emergency_reason: Arc::new(Mutex::new(None)), + optional_config_out, } } @@ -222,12 +227,18 @@ impl BenchmarkOrchestrator { trip_temp: res.max_temp_c.max(95.0), }; - if let Some(throttled_path) = self.facts.paths.configs.get("throttled") { - crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?; - self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?; - res.config_paths.insert("throttled".to_string(), throttled_path.clone()); + // 1. Throttled (Merged if exists) + // PRIORITY: optional_config_out > facts discovery > fallback + let throttled_path = self.optional_config_out.clone() + .or_else(|| self.facts.paths.configs.get("throttled").cloned()); + + if let Some(path) = throttled_path { + crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?; + self.log(&format!("✓ Saved '{}'.", path.display()))?; + res.config_paths.insert("throttled".to_string(), path.clone()); } + // 2. i8kmon if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index fbf12af..dcc73ae 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -22,6 +22,11 @@ pub struct DellXps9380Sal { suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, + + // --- Original State for Restoration --- + original_pl1: Mutex>, + original_pl2: Mutex>, + original_fan_mode: Mutex>, } impl DellXps9380Sal { @@ -53,6 +58,9 @@ impl DellXps9380Sal { last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, + original_pl1: Mutex::new(None), + original_pl2: Mutex::new(None), + original_fan_mode: Mutex::new(None), }) } @@ -126,12 +134,25 @@ impl PreflightAuditor for DellXps9380Sal { impl EnvironmentGuard for DellXps9380Sal { fn suppress(&self) -> Result<()> { + // 1. Snapshot Power Limits + if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { + *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); + } + if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { + *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); + } + + // 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active) + // We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1) + *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); + + // 3. Stop Services let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { debug!("Suppressing service: {}", s); - self.ctx.runner.run("systemctl", &["stop", s])?; + let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } } @@ -139,6 +160,20 @@ impl EnvironmentGuard for DellXps9380Sal { } fn restore(&self) -> Result<()> { + // 1. Restore Power Limits + if let Some(pl1) = *self.original_pl1.lock().unwrap() { + let _ = fs::write(&self.pl1_path, pl1.to_string()); + } + if let Some(pl2) = *self.original_pl2.lock().unwrap() { + let _ = fs::write(&self.pl2_path, pl2.to_string()); + } + + // 2. Restore Fan Mode (BIOS Control) + if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { + let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); + } + + // 3. Restart Services let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &s]); @@ -162,17 +197,23 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - if self.pwr_path.to_string_lossy().contains("energy_uj") { + // FIX: Ensure we always read from energy_uj if available for delta calculation + let rapl_base = self.pl1_path.parent().context("RAPL path error")?; + let energy_path = rapl_base.join("energy_uj"); + + if energy_path.exists() { let mut last = self.last_energy.lock().unwrap(); - let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; + let e2_str = fs::read_to_string(&energy_path)?; + let e2 = e2_str.trim().parse::()?; let t2 = Instant::now(); let (e1, t1) = *last; let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); *last = (e2, t2); - if delta_t < 0.01 { return Ok(0.0); } + if delta_t < 0.05 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } else { + // Fallback to power1_average if it exists (units are µW) let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 7a0e2ce..7007b25 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -3,6 +3,7 @@ use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; use std::sync::Mutex; +use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::heuristic::discovery::SystemFactSheet; @@ -16,6 +17,10 @@ pub struct GenericLinuxSal { last_valid_temp: Mutex<(f32, Instant)>, current_pl1: Mutex, last_energy: Mutex<(u64, Instant)>, + + // --- Original State for Restoration --- + original_pl1: Mutex>, + original_pl2: Mutex>, } impl GenericLinuxSal { @@ -34,6 +39,8 @@ impl GenericLinuxSal { last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, + original_pl1: Mutex::new(None), + original_pl2: Mutex::new(None), } } @@ -95,7 +102,7 @@ impl SensorBus for GenericLinuxSal { let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); *last = (e2, t2); - if delta_t < 0.01 { return Ok(0.0); } + if delta_t < 0.05 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } @@ -160,12 +167,22 @@ impl ActuatorBus for GenericLinuxSal { impl EnvironmentGuard for GenericLinuxSal { fn suppress(&self) -> Result<()> { + // Snapshot Power Limits + if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { + if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) { + *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); + } + if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) { + *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); + } + } + let mut suppressed = self.suppressed_services.lock().unwrap(); for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { for service in &conflict.services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() { - self.ctx.runner.run("systemctl", &["stop", service])?; + let _ = self.ctx.runner.run("systemctl", &["stop", service]); suppressed.push(service.clone()); } } @@ -175,6 +192,16 @@ impl EnvironmentGuard for GenericLinuxSal { } fn restore(&self) -> Result<()> { + // Restore Power Limits + if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { + if let Some(pl1) = *self.original_pl1.lock().unwrap() { + let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string()); + } + if let Some(pl2) = *self.original_pl2.lock().unwrap() { + let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string()); + } + } + let mut suppressed = self.suppressed_services.lock().unwrap(); for service in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &service]); diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs index 6f6952b..3dce223 100644 --- a/src/sal/heuristic/discovery.rs +++ b/src/sal/heuristic/discovery.rs @@ -1,11 +1,11 @@ use std::fs; use std::path::{Path, PathBuf}; -use std::process::Command; use std::time::{Duration}; use std::thread; use std::sync::mpsc; use std::collections::HashMap; use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking}; +use crate::sys::SyscallRunner; use tracing::{debug, warn}; /// Registry of dynamically discovered paths for configs and tools. @@ -31,6 +31,7 @@ pub struct SystemFactSheet { /// Probes the system for hardware sensors, actuators, service conflicts, and paths. pub fn discover_facts( base_path: &Path, + runner: &dyn SyscallRunner, discovery: &Discovery, conflicts: &[Conflict], bench_config: Benchmarking, @@ -45,7 +46,7 @@ pub fn discover_facts( let mut active_conflicts = Vec::new(); for conflict in conflicts { for service in &conflict.services { - if is_service_active(service) { + if is_service_active(runner, service) { debug!("Detected active conflict: {} (Service: {})", conflict.id, service); active_conflicts.push(conflict.id.clone()); break; @@ -93,7 +94,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { break; } } - // If not found, use the first one as default if any exist if !registry.configs.contains_key(id) { if let Some(first) = candidates.first() { registry.configs.insert(id.clone(), PathBuf::from(first)); @@ -142,7 +142,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, for hw_entry in hw_entries.flatten() { let file_name = hw_entry.file_name().into_string().unwrap_or_default(); - // Temperature Sensors if file_name.starts_with("temp") && file_name.ends_with("_label") { if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { if cfg.temp_labels.iter().any(|l| label.contains(l)) { @@ -154,7 +153,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, } } - // Fan Sensors if file_name.starts_with("fan") && file_name.ends_with("_label") { if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { if cfg.fan_labels.iter().any(|l| label.contains(l)) { @@ -206,18 +204,9 @@ fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec { paths } -/// Checks if a systemd service is currently active. -pub fn is_service_active(service: &str) -> bool { - let status = Command::new("systemctl") - .arg("is-active") - .arg("--quiet") - .arg(service) - .status(); - - match status { - Ok(s) => s.success(), - Err(_) => false, - } +/// Checks if a systemd service is currently active using the injected runner. +pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool { + runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() } /// Helper to read a sysfs file with a timeout. diff --git a/src/sal/heuristic/engine.rs b/src/sal/heuristic/engine.rs index eb7cdeb..04974f9 100644 --- a/src/sal/heuristic/engine.rs +++ b/src/sal/heuristic/engine.rs @@ -24,7 +24,7 @@ impl HeuristicEngine { .context("Failed to parse hardware_db.toml")?; // 2. Discover Facts - let facts = discover_facts(&ctx.sysfs_base, &db.discovery, &db.conflicts, db.benchmarking.clone()); + let facts = discover_facts(&ctx.sysfs_base, ctx.runner.as_ref(), &db.discovery, &db.conflicts, db.benchmarking.clone()); info!("System Identity: {} {}", facts.vendor, facts.model); // 3. Routing Logic diff --git a/tests/heuristic_discovery_test.rs b/tests/heuristic_discovery_test.rs index 2905124..a6dfc2d 100644 --- a/tests/heuristic_discovery_test.rs +++ b/tests/heuristic_discovery_test.rs @@ -1,5 +1,6 @@ use ember_tune_rs::sal::heuristic::discovery::discover_facts; use ember_tune_rs::sal::heuristic::schema::{Discovery, SensorDiscovery, ActuatorDiscovery, Benchmarking}; +use ember_tune_rs::sys::MockSyscallRunner; use crate::common::fakesys::FakeSysBuilder; mod common; @@ -35,7 +36,9 @@ fn test_heuristic_discovery_with_fakesys() { power_steps_watts: vec![10.0, 15.0], }; - let facts = discover_facts(&fake.base_path(), &discovery, &[], benchmarking); + let runner = MockSyscallRunner::new(); + + let facts = discover_facts(&fake.base_path(), &runner, &discovery, &[], benchmarking); assert_eq!(facts.vendor, "Dell Inc."); assert_eq!(facts.model, "XPS 13 9380"); diff --git a/tests/orchestrator_e2e_test.rs b/tests/orchestrator_e2e_test.rs index 7681499..f445c4c 100644 --- a/tests/orchestrator_e2e_test.rs +++ b/tests/orchestrator_e2e_test.rs @@ -28,6 +28,7 @@ fn test_orchestrator_e2e_state_machine() { workload, telemetry_tx, command_rx, + None, ); // For the purpose of this architecture audit, we've demonstrated the From f0925a3ab33408308ed53cc951469a28535497ea Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Fri, 27 Feb 2026 02:47:51 +0100 Subject: [PATCH 09/13] implemented safety features to prevent system damage --- src/load/mod.rs | 151 ++++++++++++++++++++-------- src/orchestrator/mod.rs | 61 +++++++++--- src/sal/dell_xps_9380.rs | 13 ++- src/sal/generic_linux.rs | 15 +-- src/sal/mock.rs | 5 +- src/sal/mod.rs | 1 + src/sal/safety.rs | 175 +++++++++++++++++++++++++++++++++ src/sal/traits.rs | 18 ++-- tests/orchestrator_e2e_test.rs | 17 +++- 9 files changed, 373 insertions(+), 83 deletions(-) create mode 100644 src/sal/safety.rs diff --git a/src/load/mod.rs b/src/load/mod.rs index 501e5fd..3ec7956 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -1,61 +1,136 @@ -//! Defines the `Workload` trait for generating synthetic CPU/GPU load. +//! Load generation and performance measurement subsystem. -use anyhow::Result; -use std::process::Child; +use anyhow::{Result, Context, anyhow}; +use std::process::{Child, Command, Stdio}; use std::time::{Duration, Instant}; use std::thread; +use std::io::{BufRead, BufReader}; +use std::sync::{Arc, Mutex}; +use serde::{Deserialize, Serialize}; -/// A trait for objects that can generate a measurable system load. -pub trait Workload: Send + Sync { - /// Starts the workload with the specified number of threads and load percentage. - /// - /// # Errors - /// Returns an error if the underlying stress test process fails to spawn. - fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>; - - /// Stops the workload gracefully. - /// - /// # Errors - /// This method should aim to not fail, but may return an error if - /// forcefully killing the child process fails. - fn stop(&mut self) -> Result<()>; - - /// Returns the current throughput of the workload (e.g., ops/sec). - /// - /// # Errors - /// Returns an error if throughput cannot be measured. - fn get_throughput(&self) -> Result; +/// Standardized telemetry returned by any workload implementation. +#[derive(Debug, Clone, Serialize, Deserialize, Default)] +pub struct WorkloadMetrics { + /// Primary performance heuristic (e.g., Bogo Ops/s) + pub primary_ops_per_sec: f64, + /// Time elapsed since the workload started + pub elapsed_time: Duration, } -/// An implementation of `Workload` that uses the `stress-ng` utility. +/// A normalized profile defining the intensity and constraints of the workload. +#[derive(Debug, Clone)] +pub struct IntensityProfile { + pub threads: usize, + pub load_percentage: u8, +} + +/// The replaceable interface for load generation and performance measurement. +pub trait Workload: Send + Sync { + /// Sets up prerequisites (e.g., binary checks). + fn initialize(&mut self) -> Result<()>; + + /// Executes the load asynchronously. + fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()>; + + /// Returns the current standardized telemetry object. + fn get_current_metrics(&self) -> Result; + + /// Gracefully and forcefully terminates the workload. + fn stop_workload(&mut self) -> Result<()>; +} + +/// Implementation of the Benchmarking Interface using stress-ng matrix stressors. pub struct StressNg { child: Option, + start_time: Option, + latest_metrics: Arc>, } impl StressNg { pub fn new() -> Self { - Self { child: None } + Self { + child: None, + start_time: None, + latest_metrics: Arc::new(Mutex::new(WorkloadMetrics::default())), + } } } impl Workload for StressNg { - fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> { - self.stop()?; + fn initialize(&mut self) -> Result<()> { + let status = Command::new("stress-ng") + .arg("--version") + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .status() + .context("stress-ng binary not found in PATH")?; - let child = std::process::Command::new("stress-ng") + if !status.success() { + return Err(anyhow!("stress-ng failed to initialize")); + } + Ok(()) + } + + fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()> { + self.stop_workload()?; // Ensure clean state + + let threads = profile.threads.to_string(); + let timeout = format!("{}s", duration.as_secs()); + let load = profile.load_percentage.to_string(); + + let mut child = Command::new("stress-ng") .args([ - "--cpu", &threads.to_string(), - "--cpu-load", &load_percent.to_string(), - "--quiet" + "--matrix", &threads, + "--cpu-load", &load, + "--timeout", &timeout, + "--metrics-brief", + "--metrics-brief", // Repeat for stderr/stdout consistency ]) - .spawn()?; + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn stress-ng")?; + + self.start_time = Some(Instant::now()); + // Spawn metrics parser thread + let metrics_ref = Arc::clone(&self.latest_metrics); + let stderr = child.stderr.take().expect("Failed to capture stderr"); + + thread::spawn(move || { + let reader = BufReader::new(stderr); + for line in reader.lines().flatten() { + // Parse stress-ng metrics line: + // stress-ng: info: [PID] matrix [OPS] [TIME] [BOGO OPS/S] + if line.contains("matrix") && line.contains("bogo ops/s") { + let parts: Vec<&str> = line.split_whitespace().collect(); + if let Some(ops_idx) = parts.iter().position(|&p| p == "ops/s") { + if let Some(ops_val) = parts.get(ops_idx - 1) { + if let Ok(ops) = ops_val.parse::() { + let mut m = metrics_ref.lock().unwrap(); + m.primary_ops_per_sec = ops; + } + } + } + } + } + }); + self.child = Some(child); Ok(()) } - fn stop(&mut self) -> Result<()> { + fn get_current_metrics(&self) -> Result { + let mut m = self.latest_metrics.lock().unwrap().clone(); + if let Some(start) = self.start_time { + m.elapsed_time = start.elapsed(); + } + Ok(m) + } + + fn stop_workload(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { + // Polite SIGTERM #[cfg(unix)] { use libc::{kill, SIGTERM}; @@ -77,19 +152,13 @@ impl Workload for StressNg { let _ = child.wait(); } } + self.start_time = None; Ok(()) } - - /// Returns the current throughput of the workload (e.g., ops/sec). - /// - /// This is currently a stub and does not parse `stress-ng` output. - fn get_throughput(&self) -> Result { - Ok(0.0) - } } impl Drop for StressNg { fn drop(&mut self) { - let _ = self.stop(); + let _ = self.stop_workload(); } } diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index ebe99da..7e42825 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -14,9 +14,10 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; use std::path::PathBuf; -use crate::sal::traits::{PlatformSal, SafetyStatus}; +use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; -use crate::load::Workload; +use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use crate::load::{Workload, IntensityProfile}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; @@ -44,6 +45,9 @@ pub struct BenchmarkOrchestrator { /// CLI override for the configuration output path. optional_config_out: Option, + /// The safety membrane protecting the system. + safeguard: Option, + /// Sliding window of power readings (Watts). history_watts: VecDeque, /// Sliding window of temperature readings (Celsius). @@ -97,12 +101,13 @@ impl BenchmarkOrchestrator { emergency_abort: Arc::new(AtomicBool::new(false)), emergency_reason: Arc::new(Mutex::new(None)), optional_config_out, + safeguard: None, } } /// Executes the full benchmark sequence. /// - /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop] + /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload] /// are called regardless of whether the benchmark succeeds or fails. pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; @@ -111,8 +116,16 @@ impl BenchmarkOrchestrator { let result = self.execute_benchmark(); + // --- MANDATORY CLEANUP --- self.log("Benchmark sequence finished. Restoring hardware defaults...")?; - let _ = self.workload.stop(); + let _ = self.workload.stop_workload(); + + if let Some(mut sg) = self.safeguard.take() { + if let Err(e) = sg.release() { + anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e); + } + } + if let Err(e) = self.sal.restore() { anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e); } @@ -125,6 +138,19 @@ impl BenchmarkOrchestrator { fn execute_benchmark(&mut self) -> Result { let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; + // 1. Snapshot & Arm Safeguard + let mut target_files = self.facts.rapl_paths.iter() + .map(|p| p.join("constraint_0_power_limit_uw")) + .collect::>(); + target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw"))); + if let Some(tp) = self.facts.paths.configs.get("throttled") { + target_files.push(tp.clone()); + } + + let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()]; + self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?); + + // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { if let Err(e) = step.outcome { @@ -132,9 +158,11 @@ impl BenchmarkOrchestrator { } } + self.workload.initialize().context("Failed to initialize workload")?; self.log("Suppressing background services (tlp, thermald)...")?; self.sal.suppress().context("Failed to suppress background services")?; + // Baseline (Idle Calibration) self.phase = BenchmarkPhase::IdleCalibration; self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; self.sal.set_fan_mode("auto")?; @@ -152,6 +180,7 @@ impl BenchmarkOrchestrator { self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; + // Phase 2: Stress Stepping self.phase = BenchmarkPhase::StressTesting; self.log("Phase 2: Starting Synthetic Stress Matrix.")?; self.sal.set_fan_mode("max")?; @@ -159,10 +188,16 @@ impl BenchmarkOrchestrator { let steps = bench_cfg.power_steps_watts.clone(); for &pl in &steps { self.log(&format!("Testing PL1 = {:.0}W...", pl))?; - self.sal.set_sustained_power_limit(pl)?; - self.sal.set_burst_power_limit(pl + 5.0)?; - self.workload.start(num_cpus::get(), 100)?; + let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?; + let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?; + self.sal.set_sustained_power_limit(pl1_uw)?; + self.sal.set_burst_power_limit(pl2_uw)?; + + self.workload.run_workload( + Duration::from_secs(bench_cfg.stress_duration_max_s), + IntensityProfile { threads: num_cpus::get(), load_percentage: 100 } + )?; let step_start = Instant::now(); let mut step_temps = VecDeque::with_capacity(30); @@ -188,26 +223,28 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_millis(500)); } + // Record data point let avg_p = self.sal.get_power_w().unwrap_or(0.0); let avg_t = self.sal.get_temp().unwrap_or(0.0); let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); let fans = self.sal.get_fan_rpms().unwrap_or_default(); let primary_fan = fans.first().cloned().unwrap_or(0); - let tp = self.workload.get_throughput().unwrap_or(0.0); + let metrics = self.workload.get_current_metrics().unwrap_or_default(); self.profile.points.push(ThermalPoint { power_w: avg_p, temp_c: avg_t, freq_mhz: avg_f, fan_rpm: primary_fan, - throughput: tp, + throughput: metrics.primary_ops_per_sec, }); - self.workload.stop()?; + self.workload.stop_workload()?; self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } + // Phase 4: Physical Modeling self.phase = BenchmarkPhase::PhysicalModeling; self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?; @@ -218,6 +255,7 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(3)); + // Phase 5: Finalizing self.phase = BenchmarkPhase::Finalizing; self.log("Benchmark sequence complete. Generating configurations...")?; @@ -227,8 +265,6 @@ impl BenchmarkOrchestrator { trip_temp: res.max_temp_c.max(95.0), }; - // 1. Throttled (Merged if exists) - // PRIORITY: optional_config_out > facts discovery > fallback let throttled_path = self.optional_config_out.clone() .or_else(|| self.facts.paths.configs.get("throttled").cloned()); @@ -238,7 +274,6 @@ impl BenchmarkOrchestrator { res.config_paths.insert("throttled".to_string(), path.clone()); } - // 2. i8kmon if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index dcc73ae..b6ca209 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,10 +1,10 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; +use crate::sal::safety::TdpLimitMicroWatts; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; use crate::sal::heuristic::discovery::SystemFactSheet; pub struct DellXps9380Sal { @@ -151,7 +151,6 @@ impl EnvironmentGuard for DellXps9380Sal { let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { - debug!("Suppressing service: {}", s); let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } @@ -251,18 +250,18 @@ impl ActuatorBus for DellXps9380Sal { match mode { "max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; } "auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; } - _ => { debug!("Unknown fan mode: {}", mode); } + _ => {} } Ok(()) } - fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - fs::write(&self.pl1_path, ((watts * 1_000_000.0) as u64).to_string())?; + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + fs::write(&self.pl1_path, limit.as_u64().to_string())?; Ok(()) } - fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - fs::write(&self.pl2_path, ((watts * 1_000_000.0) as u64).to_string())?; + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + fs::write(&self.pl2_path, limit.as_u64().to_string())?; Ok(()) } } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 7007b25..ea1498e 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -6,6 +6,7 @@ use std::sync::Mutex; use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; +use crate::sal::safety::TdpLimitMicroWatts; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; @@ -15,7 +16,7 @@ pub struct GenericLinuxSal { db: HardwareDb, suppressed_services: Mutex>, last_valid_temp: Mutex<(f32, Instant)>, - current_pl1: Mutex, + current_pl1: Mutex, last_energy: Mutex<(u64, Instant)>, // --- Original State for Restoration --- @@ -35,7 +36,7 @@ impl GenericLinuxSal { db, suppressed_services: Mutex::new(Vec::new()), last_valid_temp: Mutex::new((0.0, Instant::now())), - current_pl1: Mutex::new(15.0), + current_pl1: Mutex::new(15_000_000), last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, @@ -151,16 +152,16 @@ impl ActuatorBus for GenericLinuxSal { } else { Ok(()) } } - fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; - fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; - *self.current_pl1.lock().unwrap() = watts; + fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?; + *self.current_pl1.lock().unwrap() = limit.as_u64(); Ok(()) } - fn set_burst_power_limit(&self, watts: f32) -> Result<()> { + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?; - fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; + fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?; Ok(()) } } diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 98aaf14..28b5691 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,4 +1,5 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; +use crate::sal::safety::TdpLimitMicroWatts; use anyhow::Result; pub struct MockSal { @@ -59,10 +60,10 @@ impl ActuatorBus for MockSal { fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } - fn set_sustained_power_limit(&self, _watts: f32) -> Result<()> { + fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { Ok(()) } - fn set_burst_power_limit(&self, _watts: f32) -> Result<()> { + fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { Ok(()) } } diff --git a/src/sal/mod.rs b/src/sal/mod.rs index 16526ac..d2f276f 100644 --- a/src/sal/mod.rs +++ b/src/sal/mod.rs @@ -3,3 +3,4 @@ pub mod mock; pub mod dell_xps_9380; pub mod generic_linux; pub mod heuristic; +pub mod safety; diff --git a/src/sal/safety.rs b/src/sal/safety.rs new file mode 100644 index 0000000..5ccce10 --- /dev/null +++ b/src/sal/safety.rs @@ -0,0 +1,175 @@ +//! Universal Safeguard Architecture (USA) and Hardware Primitives. +//! +//! This module provides the `HardwareStateGuard` for guaranteed state +//! restoration and type-safe primitives to prevent dangerous hardware states. + +use anyhow::{Result, bail, Context}; +use std::collections::HashMap; +use std::fs; +use std::path::{Path, PathBuf}; +use tracing::{info, warn, error}; + +// --- Type-Driven Safety Primitives --- + +/// Represents a safe TDP limit in microwatts. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct TdpLimitMicroWatts(u64); + +impl TdpLimitMicroWatts { + /// Strict bounds to prevent hardware bricking. + pub const MIN_SAFE_UW: u64 = 5_000_000; // 5 Watts + pub const MAX_SAFE_UW: u64 = 80_000_000; // 80 Watts + + /// Constructs a new TdpLimitMicroWatts, enforcing safety bounds. + /// + /// # Errors + /// Returns a `HardwareSafetyError` (via `anyhow::bail`) if the value is out of bounds. + pub fn new(microwatts: u64) -> Result { + if microwatts < Self::MIN_SAFE_UW { + bail!("HardwareSafetyError: Requested TDP {} uW is below the absolute safety floor of {} uW.", microwatts, Self::MIN_SAFE_UW); + } + if microwatts > Self::MAX_SAFE_UW { + bail!("HardwareSafetyError: Requested TDP {} uW exceeds absolute maximum of {} uW.", microwatts, Self::MAX_SAFE_UW); + } + Ok(Self(microwatts)) + } + + pub fn as_u64(&self) -> u64 { + self.0 + } + + pub fn as_watts(&self) -> f32 { + self.0 as f32 / 1_000_000.0 + } +} + +/// Represents a safe Fan Speed in Percentage (0-100). +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FanSpeedPercentage(u8); + +impl FanSpeedPercentage { + /// Constructs a new FanSpeedPercentage, enforcing safety bounds. + pub fn new(percent: u8) -> Result { + if percent > 100 { + bail!("HardwareSafetyError: Fan speed percentage {} exceeds 100%.", percent); + } + Ok(Self(percent)) + } + + pub fn as_u8(&self) -> u8 { + self.0 + } +} + +/// Represents a safe Thermal Threshold in Celsius. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] +pub struct ThermalThresholdCelsius(f32); + +impl ThermalThresholdCelsius { + pub const MAX_SAFE_C: f32 = 98.0; + + /// Constructs a new ThermalThresholdCelsius, enforcing safety bounds. + pub fn new(celsius: f32) -> Result { + if celsius < 0.0 || celsius > Self::MAX_SAFE_C { + bail!("HardwareSafetyError: Thermal threshold {}°C is outside safe bounds (0.0 - {}).", celsius, Self::MAX_SAFE_C); + } + Ok(Self(celsius)) + } + + pub fn as_f32(&self) -> f32 { + self.0 + } +} + +// --- The HardwareStateGuard (RAII Restorer) --- + +/// Represents a deep snapshot of the system state before benchmarking. +#[derive(Debug, Default, Clone)] +pub struct SystemSnapshot { + /// Maps file paths to their raw string content (e.g., RAPL limits). + pub sysfs_nodes: HashMap, + /// List of services that were active and subsequently stopped. + pub suppressed_services: Vec, +} + +/// The Universal Safeguard wrapper. +/// +/// Implements the "Ironclad Restorer" pattern via the [Drop] trait. +pub struct HardwareStateGuard { + snapshot: SystemSnapshot, + is_armed: bool, +} + +impl HardwareStateGuard { + /// Arms the safeguard by taking a snapshot of the target files and services. + /// + /// # Errors + /// Returns an error if any critical sysfs node cannot be read. + pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result { + let mut snapshot = SystemSnapshot::default(); + + info!("USA: Arming safeguard and snapshotting system state..."); + + for path in target_files { + if path.exists() { + let content = fs::read_to_string(path) + .with_context(|| format!("Failed to snapshot {:?}", path))?; + snapshot.sysfs_nodes.insert(path.clone(), content.trim().to_string()); + } else { + warn!("USA: Target node {:?} does not exist, skipping snapshot.", path); + } + } + + for service in target_services { + let status = std::process::Command::new("systemctl") + .args(["is-active", "--quiet", service]) + .status(); + + if let Ok(s) = status { + if s.success() { + snapshot.suppressed_services.push(service.clone()); + } + } + } + + Ok(Self { + snapshot, + is_armed: true, + }) + } + + /// Explicit manual restoration (can be called upon successful exit). + pub fn release(&mut self) -> Result<()> { + if !self.is_armed { + return Ok(()); + } + + info!("USA: Initiating Ironclad Restoration..."); + + // 1. Restore Power/Sysfs states + for (path, content) in &self.snapshot.sysfs_nodes { + if let Err(e) = fs::write(path, content) { + error!("USA RESTORATION FAILURE: Could not revert {:?}: {}", path, e); + } + } + + // 2. Restart Services + for service in &self.snapshot.suppressed_services { + let _ = std::process::Command::new("systemctl") + .args(["start", service]) + .status(); + } + + self.is_armed = false; + Ok(()) + } +} + +impl Drop for HardwareStateGuard { + fn drop(&mut self) { + if self.is_armed { + warn!("USA: HardwareStateGuard triggered via Drop (panic/unexpected exit). Reverting system state..."); + let _ = self.release(); + } + } +} diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 704ce5c..bae1ae8 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -157,6 +157,8 @@ impl SensorBus for Arc { } } +use crate::sal::safety::TdpLimitMicroWatts; + /// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { /// Sets the fan control mode (e.g., "auto" or "max"). @@ -165,28 +167,28 @@ pub trait ActuatorBus: Send + Sync { /// Returns an error if the fan control command or `sysfs` write fails. fn set_fan_mode(&self, mode: &str) -> Result<()>; - /// Sets the sustained power limit (PL1) in Watts. + /// Sets the sustained power limit (PL1) using a validated wrapper. /// /// # Errors /// Returns an error if the RAPL `sysfs` node cannot be written to. - fn set_sustained_power_limit(&self, watts: f32) -> Result<()>; + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; - /// Sets the burst power limit (PL2) in Watts. + /// Sets the burst power limit (PL2) using a validated wrapper. /// /// # Errors /// Returns an error if the RAPL `sysfs` node cannot be written to. - fn set_burst_power_limit(&self, watts: f32) -> Result<()>; + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; } impl ActuatorBus for Arc { fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) } - fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - (**self).set_sustained_power_limit(watts) + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + (**self).set_sustained_power_limit(limit) } - fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - (**self).set_burst_power_limit(watts) + fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { + (**self).set_burst_power_limit(limit) } } diff --git a/tests/orchestrator_e2e_test.rs b/tests/orchestrator_e2e_test.rs index f445c4c..964517f 100644 --- a/tests/orchestrator_e2e_test.rs +++ b/tests/orchestrator_e2e_test.rs @@ -1,16 +1,23 @@ use ember_tune_rs::orchestrator::BenchmarkOrchestrator; use ember_tune_rs::sal::mock::MockSal; use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet; -use ember_tune_rs::load::Workload; +use ember_tune_rs::load::{Workload, IntensityProfile, WorkloadMetrics}; +use std::time::Duration; +use anyhow::Result; use std::sync::mpsc; use std::sync::Arc; -use anyhow::Result; struct MockWorkload; impl Workload for MockWorkload { - fn start(&mut self, _threads: usize, _load_percent: usize) -> Result<()> { Ok(()) } - fn stop(&mut self) -> Result<()> { Ok(()) } - fn get_throughput(&self) -> Result { Ok(100.0) } + fn initialize(&mut self) -> Result<()> { Ok(()) } + fn run_workload(&mut self, _duration: Duration, _profile: IntensityProfile) -> Result<()> { Ok(()) } + fn get_current_metrics(&self) -> Result { + Ok(WorkloadMetrics { + primary_ops_per_sec: 100.0, + elapsed_time: Duration::from_secs(1), + }) + } + fn stop_workload(&mut self) -> Result<()> { Ok(()) } } #[test] From fe1f58b5ce273ed41478da2749377ef0adf54bed Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Fri, 27 Feb 2026 02:59:23 +0100 Subject: [PATCH 10/13] implemented more safeguards and autodiscovery --- src/orchestrator/mod.rs | 84 +++++++++++++----- src/sal/dell_xps_9380.rs | 54 ++++-------- src/sal/generic_linux.rs | 7 +- src/sal/mock.rs | 5 +- src/sal/safety.rs | 181 +++++++++++++++++++++------------------ src/sal/traits.rs | 17 ++-- tests/safety_test.rs | 56 ++++++++++++ 7 files changed, 248 insertions(+), 156 deletions(-) create mode 100644 tests/safety_test.rs diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 7e42825..9fe8341 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -4,6 +4,7 @@ //! using a [Workload], and feeds telemetry to the frontend via MPSC channels. use anyhow::{Result, Context}; +use tracing::warn; use std::sync::mpsc; use std::time::{Duration, Instant}; use std::thread; @@ -14,17 +15,14 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; use std::path::PathBuf; -use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; +use crate::sal::traits::{PlatformSal, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; -use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts, ConfigurationTransaction, ThermalThresholdCelsius}; use crate::load::{Workload, IntensityProfile}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; /// The central state machine responsible for coordinating the thermal benchmark. -/// -/// It manages hardware interactions through the [PlatformSal], generates stress -/// using a [Workload], and feeds telemetry to the frontend via MPSC channels. pub struct BenchmarkOrchestrator { /// Injected hardware abstraction layer. sal: Arc, @@ -106,14 +104,12 @@ impl BenchmarkOrchestrator { } /// Executes the full benchmark sequence. - /// - /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload] - /// are called regardless of whether the benchmark succeeds or fails. pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; let _watchdog_handle = self.spawn_watchdog_monitor(); + // Core execution wrapped in cleanup logic let result = self.execute_benchmark(); // --- MANDATORY CLEANUP --- @@ -126,9 +122,11 @@ impl BenchmarkOrchestrator { } } + // SAL restore should only handle OEM-specific non-sysfs state not covered by guard if let Err(e) = self.sal.restore() { - anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e); + warn!("Failed to perform secondary SAL restoration: {}", e); } + self.log("✓ Hardware state restored.")?; result @@ -148,7 +146,19 @@ impl BenchmarkOrchestrator { } let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()]; - self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?); + let mut sg = HardwareStateGuard::acquire(&target_files, &target_services)?; + + // # SAFETY: Register fan restoration command if we are on Dell + if self.facts.vendor.to_lowercase().contains("dell") { + if let Some(tool_path) = self.facts.paths.tools.get("dell_fan_ctrl") { + let tool_str = tool_path.to_string_lossy().to_string(); + sg.on_rollback(Box::new(move || { + let _ = std::process::Command::new(tool_str).arg("1").status(); + })); + } + } + + self.safeguard = Some(sg); // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; @@ -159,7 +169,6 @@ impl BenchmarkOrchestrator { } self.workload.initialize().context("Failed to initialize workload")?; - self.log("Suppressing background services (tlp, thermald)...")?; self.sal.suppress().context("Failed to suppress background services")?; // Baseline (Idle Calibration) @@ -185,14 +194,22 @@ impl BenchmarkOrchestrator { self.log("Phase 2: Starting Synthetic Stress Matrix.")?; self.sal.set_fan_mode("max")?; - let steps = bench_cfg.power_steps_watts.clone(); - for &pl in &steps { - self.log(&format!("Testing PL1 = {:.0}W...", pl))?; + let mut current_pl = 10.0_f32; // Start at 10W + let mut previous_ops = 0.0; + + loop { + self.log(&format!("Testing PL1 = {:.0}W...", current_pl))?; - let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?; - let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?; - self.sal.set_sustained_power_limit(pl1_uw)?; - self.sal.set_burst_power_limit(pl2_uw)?; + // # SAFETY: Transactional Commit for Power Limits + let pl1_uw = TdpLimitMicroWatts::from_watts(current_pl)?; + let pl2_uw = TdpLimitMicroWatts::from_watts(current_pl + 5.0)?; + + let mut tx = ConfigurationTransaction::default(); + if let Some(p) = self.facts.rapl_paths.first() { + tx.add_change(p.join("constraint_0_power_limit_uw"), pl1_uw.as_u64().to_string()); + tx.add_change(p.join("constraint_1_power_limit_uw"), pl2_uw.as_u64().to_string()); + } + tx.commit().context("Failed to commit power limit transaction")?; self.workload.run_workload( Duration::from_secs(bench_cfg.stress_duration_max_s), @@ -240,6 +257,32 @@ impl BenchmarkOrchestrator { }); self.workload.stop_workload()?; + + // 1. Check Thermal Ceiling Halt Condition + let max_safe_temp = ThermalThresholdCelsius::MAX_SAFE_C - 5.0; // Margin + if avg_t >= max_safe_temp { + self.log(&format!("Thermal ceiling reached ({:.1}°C). Terminating Identification phase.", avg_t))?; + break; + } + + // 2. Check Diminishing Returns Halt Condition (< 1% gain) + if previous_ops > 0.0 { + let gain_percent = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; + if gain_percent < 1.0 { + self.log(&format!("Performance gain ({:.1}%) fell below 1%. Terminating Identification phase.", gain_percent))?; + break; + } + } + + // 3. Absolute Maximum Power Check + if current_pl >= 60.0 { + self.log("Maximum theoretical power limit reached. Terminating Identification phase.")?; + break; + } + + previous_ops = metrics.primary_ops_per_sec; + current_pl += 2.0; + self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } @@ -288,7 +331,6 @@ impl BenchmarkOrchestrator { Ok(res) } - /// Spawns a concurrent monitor that polls safety sensors every 100ms. fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { let abort = self.emergency_abort.clone(); let reason_store = self.emergency_reason.clone(); @@ -340,7 +382,6 @@ impl BenchmarkOrchestrator { }) } - /// Generates the final [OptimizationResult] based on current measurements. pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { let r_theta = self.engine.calculate_thermal_resistance(&self.profile); let knee = self.engine.find_silicon_knee(&self.profile); @@ -358,7 +399,6 @@ impl BenchmarkOrchestrator { } } - /// Checks if the benchmark has been aborted by the user or the watchdog. fn check_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); @@ -375,7 +415,6 @@ impl BenchmarkOrchestrator { Ok(()) } - /// Helper to send log messages to the frontend. fn log(&self, msg: &str) -> Result<()> { let state = TelemetryState { cpu_model: self.cpu_model.clone(), @@ -401,7 +440,6 @@ impl BenchmarkOrchestrator { self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } - /// Collects current sensors and sends a complete [TelemetryState] to the frontend. fn send_telemetry(&mut self, tick: u64) -> Result<()> { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index b6ca209..6c81d1a 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,10 +1,11 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; use std::sync::Mutex; +use tracing::{debug}; use crate::sal::heuristic::discovery::SystemFactSheet; pub struct DellXps9380Sal { @@ -22,11 +23,6 @@ pub struct DellXps9380Sal { suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, - - // --- Original State for Restoration --- - original_pl1: Mutex>, - original_pl2: Mutex>, - original_fan_mode: Mutex>, } impl DellXps9380Sal { @@ -58,9 +54,6 @@ impl DellXps9380Sal { last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, - original_pl1: Mutex::new(None), - original_pl2: Mutex::new(None), - original_fan_mode: Mutex::new(None), }) } @@ -134,23 +127,11 @@ impl PreflightAuditor for DellXps9380Sal { impl EnvironmentGuard for DellXps9380Sal { fn suppress(&self) -> Result<()> { - // 1. Snapshot Power Limits - if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { - *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); - } - if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { - *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); - } - - // 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active) - // We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1) - *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); - - // 3. Stop Services - let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); + let services = ["tlp", "thermald", "i8kmon"]; for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { + debug!("Suppressing service: {}", s); let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } @@ -159,20 +140,6 @@ impl EnvironmentGuard for DellXps9380Sal { } fn restore(&self) -> Result<()> { - // 1. Restore Power Limits - if let Some(pl1) = *self.original_pl1.lock().unwrap() { - let _ = fs::write(&self.pl1_path, pl1.to_string()); - } - if let Some(pl2) = *self.original_pl2.lock().unwrap() { - let _ = fs::write(&self.pl2_path, pl2.to_string()); - } - - // 2. Restore Fan Mode (BIOS Control) - if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { - let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); - } - - // 3. Restart Services let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &s]); @@ -196,7 +163,6 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - // FIX: Ensure we always read from energy_uj if available for delta calculation let rapl_base = self.pl1_path.parent().context("RAPL path error")?; let energy_path = rapl_base.join("energy_uj"); @@ -212,7 +178,6 @@ impl SensorBus for DellXps9380Sal { if delta_t < 0.05 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } else { - // Fallback to power1_average if it exists (units are µW) let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) } @@ -255,6 +220,17 @@ impl ActuatorBus for DellXps9380Sal { Ok(()) } + fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { + let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl") + .ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?; + let tool_str = tool_path.to_string_lossy(); + + if speed.as_u8() > 50 { + let _ = self.ctx.runner.run(&tool_str, &["0"]); + } + Ok(()) + } + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { fs::write(&self.pl1_path, limit.as_u64().to_string())?; Ok(()) diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index ea1498e..e003ce6 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -3,10 +3,9 @@ use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; @@ -152,6 +151,10 @@ impl ActuatorBus for GenericLinuxSal { } else { Ok(()) } } + fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + Ok(()) + } + fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?; diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 28b5691..ecddb91 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,5 +1,5 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; use anyhow::Result; pub struct MockSal { @@ -60,6 +60,9 @@ impl ActuatorBus for MockSal { fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } + fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + Ok(()) + } fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { Ok(()) } diff --git a/src/sal/safety.rs b/src/sal/safety.rs index 5ccce10..f33689d 100644 --- a/src/sal/safety.rs +++ b/src/sal/safety.rs @@ -1,175 +1,194 @@ -//! Universal Safeguard Architecture (USA) and Hardware Primitives. -//! -//! This module provides the `HardwareStateGuard` for guaranteed state -//! restoration and type-safe primitives to prevent dangerous hardware states. +//! # Hardware Safety & Universal Safeguard Architecture +//! +//! This module implements the core safety logic for `ember-tune`. It uses the Rust +//! type system to enforce hardware bounds and RAII patterns to guarantee that +//! the system is restored to a safe state even after a crash. use anyhow::{Result, bail, Context}; use std::collections::HashMap; use std::fs; -use std::path::{Path, PathBuf}; +use std::path::{PathBuf}; use tracing::{info, warn, error}; -// --- Type-Driven Safety Primitives --- +// --- 1. Type-Driven Bounds Checking --- -/// Represents a safe TDP limit in microwatts. +/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct TdpLimitMicroWatts(u64); impl TdpLimitMicroWatts { - /// Strict bounds to prevent hardware bricking. - pub const MIN_SAFE_UW: u64 = 5_000_000; // 5 Watts - pub const MAX_SAFE_UW: u64 = 80_000_000; // 80 Watts + /// # SAFETY: + /// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability. + pub const MIN_SAFE_UW: u64 = 5_000_000; + /// # SAFETY: + /// Values above 80W can exceed the thermal and electrical design limits of XPS chassis. + pub const MAX_SAFE_UW: u64 = 80_000_000; - /// Constructs a new TdpLimitMicroWatts, enforcing safety bounds. - /// - /// # Errors - /// Returns a `HardwareSafetyError` (via `anyhow::bail`) if the value is out of bounds. + /// Validates and constructs a new TDP limit. pub fn new(microwatts: u64) -> Result { if microwatts < Self::MIN_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {} uW is below the absolute safety floor of {} uW.", microwatts, Self::MIN_SAFE_UW); + bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts); } if microwatts > Self::MAX_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {} uW exceeds absolute maximum of {} uW.", microwatts, Self::MAX_SAFE_UW); + bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts); } Ok(Self(microwatts)) } - pub fn as_u64(&self) -> u64 { - self.0 - } - - pub fn as_watts(&self) -> f32 { - self.0 as f32 / 1_000_000.0 + pub fn from_watts(watts: f32) -> Result { + Self::new((watts * 1_000_000.0) as u64) } + + pub fn as_u64(&self) -> u64 { self.0 } } -/// Represents a safe Fan Speed in Percentage (0-100). -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +/// Represents a fan speed percentage (0-100%). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct FanSpeedPercentage(u8); impl FanSpeedPercentage { - /// Constructs a new FanSpeedPercentage, enforcing safety bounds. pub fn new(percent: u8) -> Result { if percent > 100 { - bail!("HardwareSafetyError: Fan speed percentage {} exceeds 100%.", percent); + bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent); } Ok(Self(percent)) } - - pub fn as_u8(&self) -> u8 { - self.0 - } + pub fn as_u8(&self) -> u8 { self.0 } } -/// Represents a safe Thermal Threshold in Celsius. +/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C). #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] pub struct ThermalThresholdCelsius(f32); impl ThermalThresholdCelsius { pub const MAX_SAFE_C: f32 = 98.0; - /// Constructs a new ThermalThresholdCelsius, enforcing safety bounds. pub fn new(celsius: f32) -> Result { - if celsius < 0.0 || celsius > Self::MAX_SAFE_C { - bail!("HardwareSafetyError: Thermal threshold {}°C is outside safe bounds (0.0 - {}).", celsius, Self::MAX_SAFE_C); + if celsius > Self::MAX_SAFE_C { + bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius); } Ok(Self(celsius)) } - - pub fn as_f32(&self) -> f32 { - self.0 - } + pub fn as_f32(&self) -> f32 { self.0 } } -// --- The HardwareStateGuard (RAII Restorer) --- +// --- 2. The HardwareStateGuard (RAII Restorer) --- -/// Represents a deep snapshot of the system state before benchmarking. -#[derive(Debug, Default, Clone)] -pub struct SystemSnapshot { - /// Maps file paths to their raw string content (e.g., RAPL limits). - pub sysfs_nodes: HashMap, - /// List of services that were active and subsequently stopped. - pub suppressed_services: Vec, -} +/// Defines an arbitrary action to take during restoration. +pub type RollbackAction = Box; -/// The Universal Safeguard wrapper. -/// -/// Implements the "Ironclad Restorer" pattern via the [Drop] trait. +/// Holds a snapshot of the system state. Restores everything on Drop. pub struct HardwareStateGuard { - snapshot: SystemSnapshot, - is_armed: bool, + /// Maps sysfs paths to their original string contents. + snapshots: HashMap, + /// Services that were stopped and must be restarted. + suppressed_services: Vec, + /// Arbitrary actions to perform on restoration (e.g., reset fan mode). + rollback_actions: Vec, + is_active: bool, } impl HardwareStateGuard { - /// Arms the safeguard by taking a snapshot of the target files and services. - /// - /// # Errors - /// Returns an error if any critical sysfs node cannot be read. + /// Snapshots the requested files and neutralizes competing services. pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result { - let mut snapshot = SystemSnapshot::default(); + let mut snapshots = HashMap::new(); + let mut suppressed = Vec::new(); - info!("USA: Arming safeguard and snapshotting system state..."); + info!("USA: Arming HardwareStateGuard. Snapshotting critical registers..."); for path in target_files { if path.exists() { let content = fs::read_to_string(path) .with_context(|| format!("Failed to snapshot {:?}", path))?; - snapshot.sysfs_nodes.insert(path.clone(), content.trim().to_string()); - } else { - warn!("USA: Target node {:?} does not exist, skipping snapshot.", path); + snapshots.insert(path.clone(), content.trim().to_string()); } } - for service in target_services { + for svc in target_services { let status = std::process::Command::new("systemctl") - .args(["is-active", "--quiet", service]) + .args(["is-active", "--quiet", svc]) .status(); if let Ok(s) = status { if s.success() { - snapshot.suppressed_services.push(service.clone()); + info!("USA: Neutralizing service '{}'", svc); + let _ = std::process::Command::new("systemctl").args(["stop", svc]).status(); + suppressed.push(svc.clone()); } } } Ok(Self { - snapshot, - is_armed: true, + snapshots, + suppressed_services: suppressed, + rollback_actions: Vec::new(), + is_active: true, }) } - /// Explicit manual restoration (can be called upon successful exit). - pub fn release(&mut self) -> Result<()> { - if !self.is_armed { - return Ok(()); - } + /// Registers a custom action to be performed when the guard is released. + pub fn on_rollback(&mut self, action: RollbackAction) { + self.rollback_actions.push(action); + } - info!("USA: Initiating Ironclad Restoration..."); + /// Explicitly release and restore the hardware state. + pub fn release(&mut self) -> Result<()> { + if !self.is_active { return Ok(()); } + + info!("USA: Releasing guard. Restoring hardware to pre-flight state..."); // 1. Restore Power/Sysfs states - for (path, content) in &self.snapshot.sysfs_nodes { + for (path, content) in &self.snapshots { if let Err(e) = fs::write(path, content) { - error!("USA RESTORATION FAILURE: Could not revert {:?}: {}", path, e); + error!("CRITICAL: Failed to restore {:?}: {}", path, e); } } // 2. Restart Services - for service in &self.snapshot.suppressed_services { - let _ = std::process::Command::new("systemctl") - .args(["start", service]) - .status(); + for svc in &self.suppressed_services { + let _ = std::process::Command::new("systemctl").args(["start", svc]).status(); } - self.is_armed = false; + // 3. Perform Custom Rollback Actions + for action in self.rollback_actions.drain(..) { + (action)(); + } + + self.is_active = false; Ok(()) } } impl Drop for HardwareStateGuard { fn drop(&mut self) { - if self.is_armed { - warn!("USA: HardwareStateGuard triggered via Drop (panic/unexpected exit). Reverting system state..."); + if self.is_active { + warn!("USA: Guard dropped prematurely (panic/SIGTERM). Force-restoring system..."); let _ = self.release(); } } } + +// --- 3. Transactional Configuration --- + +/// A staged set of changes to be applied to the hardware. +#[derive(Default)] +pub struct ConfigurationTransaction { + changes: Vec<(PathBuf, String)>, +} + +impl ConfigurationTransaction { + pub fn add_change(&mut self, path: PathBuf, value: String) { + self.changes.push((path, value)); + } + + /// # SAFETY: + /// Commits all changes. If any write fails, it returns an error but the + /// HardwareStateGuard will still restore everything on drop. + pub fn commit(self) -> Result<()> { + for (path, val) in self.changes { + fs::write(&path, val) + .with_context(|| format!("Failed to apply change to {:?}", path))?; + } + Ok(()) + } +} diff --git a/src/sal/traits.rs b/src/sal/traits.rs index bae1ae8..235f6b1 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -157,26 +157,20 @@ impl SensorBus for Arc { } } -use crate::sal::safety::TdpLimitMicroWatts; +use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; /// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { /// Sets the fan control mode (e.g., "auto" or "max"). - /// - /// # Errors - /// Returns an error if the fan control command or `sysfs` write fails. fn set_fan_mode(&self, mode: &str) -> Result<()>; + /// Sets the fan speed directly using a validated percentage. + fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>; + /// Sets the sustained power limit (PL1) using a validated wrapper. - /// - /// # Errors - /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; /// Sets the burst power limit (PL2) using a validated wrapper. - /// - /// # Errors - /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; } @@ -184,6 +178,9 @@ impl ActuatorBus for Arc { fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) } + fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { + (**self).set_fan_speed(speed) + } fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) } diff --git a/tests/safety_test.rs b/tests/safety_test.rs new file mode 100644 index 0000000..2922019 --- /dev/null +++ b/tests/safety_test.rs @@ -0,0 +1,56 @@ +use anyhow::Result; +use std::fs; +use std::path::PathBuf; +use ember_tune_rs::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use crate::common::fakesys::FakeSysBuilder; + +mod common; + +#[test] +fn test_hardware_state_guard_panic_restoration() { + let fake = FakeSysBuilder::new(); + let pl1_path = fake.base_path().join("sys/class/powercap/intel-rapl:0/constraint_0_power_limit_uw"); + + fake.add_rapl("intel-rapl:0", "1000", "15000000"); // 15W original + + let target_files = vec![pl1_path.clone()]; + + // Simulate a scope where the guard is active + { + let mut _guard = HardwareStateGuard::acquire(&target_files, &[]).expect("Failed to acquire guard"); + + // Modify the file + fs::write(&pl1_path, "25000000").expect("Failed to write new value"); + assert_eq!(fs::read_to_string(&pl1_path).unwrap().trim(), "25000000"); + + // Guard is dropped here (simulating end of scope or panic) + } + + // Verify restoration + let restored = fs::read_to_string(&pl1_path).expect("Failed to read restored file"); + assert_eq!(restored.trim(), "15000000"); +} + +#[test] +fn test_tdp_limit_bounds_checking() { + // 1. Valid value + assert!(TdpLimitMicroWatts::new(15_000_000).is_ok()); + + // 2. Too low (Dangerous 0W or below 5W) + let low_res = TdpLimitMicroWatts::new(1_000_000); + assert!(low_res.is_err()); + assert!(low_res.unwrap_err().to_string().contains("below safety floor")); + + // 3. Too high (> 80W) + let high_res = TdpLimitMicroWatts::new(100_000_000); + assert!(high_res.is_err()); + assert!(high_res.unwrap_err().to_string().contains("exceeds safety ceiling")); +} + +#[test] +fn test_0w_tdp_regression_prevention() { + // The prime directive is to never set 0W. + // Ensure the new() constructor explicitly fails for 0. + let zero_res = TdpLimitMicroWatts::new(0); + assert!(zero_res.is_err()); +} From 4f54fd81ce537329efcaa5666c05f6e6a1b5d017 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Fri, 27 Feb 2026 17:04:47 +0100 Subject: [PATCH 11/13] update docs + agents --- README.md | 82 +++++++++++++------------ src/agent_analyst/mod.rs | 100 +++++++++++++++++++++++++++++++ src/agent_integrator/mod.rs | 115 ++++++++++++++++++++++++++++++++++++ src/agent_metrology/mod.rs | 66 +++++++++++++++++++++ src/engine/mod.rs | 2 + src/engine/profiles.rs | 0 src/lib.rs | 3 + src/load/mod.rs | 58 ++++++++++-------- src/main.rs | 1 + src/mediator.rs | 1 + src/orchestrator/mod.rs | 63 +++++++++++++++----- src/sal/dell_xps_9380.rs | 90 +++++++++++++++++++++------- src/sal/generic_linux.rs | 17 ++++++ src/sal/mock.rs | 3 + src/sal/traits.rs | 6 ++ 15 files changed, 508 insertions(+), 99 deletions(-) create mode 100644 src/agent_analyst/mod.rs create mode 100644 src/agent_integrator/mod.rs create mode 100644 src/agent_metrology/mod.rs create mode 100644 src/engine/profiles.rs diff --git a/README.md b/README.md index eeb4b38..4aa18b9 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,61 @@ +# 🔥 ember-tune +```text + __________ ____ ______ ____ ______ __ __ _ __ ______ + / ____/ |/ // __ )/ ____// __ \ /_ __/ / / / // | / // ____/ + / __/ / /|_/ // __ / __/ / /_/ / / / / / / // |/ // __/ +/ /___ / / / // /_/ / /___ / _, _/ / / / /_/ // /| // /___ +/_____//_/ /_//_____/_____//_/ |_| /_/ \____//_/ |_//_____/ + + >>> Physically-grounded thermal & power optimization for Linux <<< +``` + +> ### **Find your hardware's "Physical Sweet Spot" through automated trial-by-fire.** + +`ember-tune` is a scientifically-driven hardware optimizer that replaces guesswork and manual tuning with a rigorous, automated engineering workflow. It determines the unique thermal properties of your specific laptop—including its Thermal Resistance (Rθ) and "Silicon Knee"—to generate optimal configurations for common Linux tuning daemons. + +## ✨ Features + +- **Automated Physical Benchmarking:** Measures real-world thermal performance under load to find the true "sweet spot" where performance-per-watt is maximized before thermal saturation causes diminishing returns. +- **Heuristic Hardware Discovery:** Utilizes a data-driven Hardware Abstraction Layer (SAL) that probes your system and automatically adapts to its unique quirks, drivers, and sensor paths. +- **Non-Destructive Configuration:** Safely merges new, optimized power limits into your existing `throttled.conf`, preserving manual undervolt settings and comments. +- **Universal Safeguard Architecture (USA):** Includes a high-frequency concurrent watchdog and RAII state restoration to guarantee your system is never left in a dangerous state. +- **Real-time TUI Dashboard:** A `ratatui`-based terminal interface provides high-resolution telemetry throughout the benchmark. + +## 🔬 How it Works: The Architecture + +`ember-tune` is built on a decoupled, multi-threaded architecture to ensure the UI is always responsive and that hardware state is managed safely. + +1. **The Heuristic Engine:** On startup, the engine probes your system's DMI, `sysfs`, and active services. It compares these "facts" against the `hardware_db.toml` to select the correct System Abstraction Layer (SAL). +2. **The Orchestrator (Backend Thread):** This is the state machine that executes the benchmark. It communicates with hardware *only* through the SAL traits. +3. **The TUI (Main Thread):** The `ratatui` dashboard renders `TelemetryState` snapshots received from the orchestrator via an MPSC channel. +4. **The Watchdog (Safety Thread):** A high-priority thread that polls safety sensors every 100ms to trigger an atomic `EmergencyAbort` if failure conditions are met. + ## ⚙️ Development Setup -`ember-tune` is a standard Cargo project. You will need a recent Rust toolchain and common build utilities. +`ember-tune` is a standard Cargo project. **Prerequisites:** - `rustup` -- `build-essential` (or equivalent for your distribution) +- `build-essential` - `libudev-dev` +- `stress-ng` (Required for benchmarking) ```bash -# 1. Clone the repository +# 1. Clone and Build git clone https://gitea.com/narl/ember-tune.git cd ember-tune - -# 2. Build the release binary cargo build --release -# 3. Run the test suite (safe, uses a virtual environment) -# This requires no special permissions and does not touch your hardware. +# 2. Run the safe test suite cargo test ``` **Running:** -Due to its direct hardware access, `ember-tune` requires root privileges. - ```bash -# Run a full benchmark and generate optimized configs +# Run a full benchmark sudo ./target/release/ember-tune -# Run a mock benchmark for UI/logic testing +# Run a mock benchmark for UI testing sudo ./target/release/ember-tune --mock ``` @@ -35,48 +63,24 @@ sudo ./target/release/ember-tune --mock ## 🤝 Contributing Quirk Data (`hardware_db.toml`) -**This is the most impactful way to contribute.** `ember-tune`'s strength comes from its `assets/hardware_db.toml`, which encodes community knowledge about how to manage specific laptops. If your hardware isn't working perfectly, you can likely fix it by adding a new entry here. +**This is the most impactful way to contribute.** If your hardware isn't working perfectly, add a new entry to `assets/hardware_db.toml`. -The database is composed of four key sections: `conflicts`, `ecosystems`, `quirks`, and `discovery`. - -### A. Reporting a Service Conflict -If a background service on your system interferes with `ember-tune`, add it to `[[conflicts]]`. - -**Example:** Adding `laptop-mode-tools`. +### Example: Adding a Service Conflict ```toml [[conflicts]] id = "laptop_mode_conflict" services = ["laptop-mode.service"] contention = "Multiple - I/O schedulers, Power limits" severity = "Medium" -fix_action = "SuspendService" # Orchestrator will stop/start this service +fix_action = "SuspendService" help_text = "laptop-mode-tools can override power-related sysfs settings." ``` -### B. Adding a New Hardware Ecosystem -If your laptop manufacturer (e.g., Razer) has a unique fan control tool or ACPI platform profile path, define it in `[ecosystems]`. - -**Example:** A hypothetical "Razer" ecosystem. -```toml -[ecosystems.razer] -vendor_regex = "Razer" -# Path to the sysfs node that controls performance profiles -profiles_path = "/sys/bus/platform/drivers/razer_acpi/power_mode" -# Map human-readable names to the values the driver expects -policy_map = { Balanced = 0, Boost = 1, Silent = 2 } -``` - -### C. Defining a Model-Specific Quirk -If a specific laptop model has a bug (like a stuck sensor or incorrect fan reporting), define a `[[quirks]]` entry. - -**Example:** A laptop whose fans report 0 RPM even when spinning. +### Example: Defining a Model-Specific Quirk ```toml [[quirks]] model_regex = "HP Envy 15-ep.*" id = "hp_fan_stuck_sensor" issue = "Fan sensor reports 0 RPM when active." -# The 'action' tells the SAL to use a different method for fan detection. action = "UseThermalVelocityFallback" ``` - -After adding your changes, run the test suite and then submit a Pull Request! diff --git a/src/agent_analyst/mod.rs b/src/agent_analyst/mod.rs new file mode 100644 index 0000000..c5b3b33 --- /dev/null +++ b/src/agent_analyst/mod.rs @@ -0,0 +1,100 @@ +//! Heuristic Analysis & Optimization Math (Agent Analyst) +//! +//! This module analyzes raw telemetry data to extract the "Optimal Real-World Settings". +//! It calculates the Silicon Knee, Acoustic/Thermal Matrix (Hysteresis), and +//! generates three distinct hardware states: Silent, Balanced, and Sustained Heavy. + +use serde::{Serialize, Deserialize}; +use crate::engine::{ThermalProfile, OptimizerEngine}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FanCurvePoint { + pub temp_on: f32, + pub temp_off: f32, + pub pwm_percent: u8, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SystemProfile { + pub name: String, + pub pl1_watts: f32, + pub pl2_watts: f32, + pub fan_curve: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OptimizationMatrix { + pub silent: SystemProfile, + pub balanced: SystemProfile, + pub performance: SystemProfile, + pub thermal_resistance_kw: f32, +} + +pub struct HeuristicAnalyst { + engine: OptimizerEngine, +} + +impl HeuristicAnalyst { + pub fn new() -> Self { + Self { + engine: OptimizerEngine::new(5), + } + } + + /// Analyzes the raw telemetry to generate the 3 optimal profiles. + pub fn analyze(&self, profile: &ThermalProfile, max_soak_watts: f32) -> OptimizationMatrix { + let r_theta = self.engine.calculate_thermal_resistance(profile); + let silicon_knee = self.engine.find_silicon_knee(profile); + + // 1. State A: Silent / Battery (Scientific Passive Equilibrium) + // Objective: Find P where T_core = 60C with fans OFF. + // T_core = T_ambient + (P * R_theta_passive) + // Note: R_theta measured during benchmark was with fans MAX. + // Passive R_theta is typically 2-3x higher. + let r_theta_passive = r_theta * 2.5; + let silent_watts = ((60.0 - profile.ambient_temp) / r_theta_passive.max(0.1)).clamp(5.0, 15.0); + + let silent_profile = SystemProfile { + name: "Silent".to_string(), + pl1_watts: silent_watts, + pl2_watts: silent_watts * 1.2, + fan_curve: vec![ + FanCurvePoint { temp_on: 65.0, temp_off: 55.0, pwm_percent: 0 }, + FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 30 }, + ], + }; + + // 2. State B: Balanced + // The exact calculated Silicon Knee + let balanced_profile = SystemProfile { + name: "Balanced".to_string(), + pl1_watts: silicon_knee, + pl2_watts: silicon_knee * 1.25, + fan_curve: vec![ + FanCurvePoint { temp_on: 60.0, temp_off: 55.0, pwm_percent: 0 }, + FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 40 }, + FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 70 }, + ], + }; + + // 3. State C: Sustained Heavy + // Based on the max soak watts from Phase 1. + let performance_profile = SystemProfile { + name: "Performance".to_string(), + pl1_watts: max_soak_watts, + pl2_watts: max_soak_watts * 1.3, + fan_curve: vec![ + FanCurvePoint { temp_on: 50.0, temp_off: 45.0, pwm_percent: 30 }, + FanCurvePoint { temp_on: 70.0, temp_off: 60.0, pwm_percent: 60 }, + FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 100 }, + ], + }; + + OptimizationMatrix { + silent: silent_profile, + balanced: balanced_profile, + performance: performance_profile, + thermal_resistance_kw: r_theta, + } + } +} diff --git a/src/agent_integrator/mod.rs b/src/agent_integrator/mod.rs new file mode 100644 index 0000000..8328498 --- /dev/null +++ b/src/agent_integrator/mod.rs @@ -0,0 +1,115 @@ +//! System Service Integration (Agent Integrator) +//! +//! This module translates the mathematical optimums defined by the Analyst +//! into actionable, real-world Linux/OS service configurations. +//! It generates templates for fan daemons (i8kmon, thinkfan) and handles +//! resolution strategies for overlapping daemons. + +use anyhow::Result; +use std::path::Path; +use std::fs; +use crate::agent_analyst::OptimizationMatrix; + +pub struct ServiceIntegrator; + +impl ServiceIntegrator { + /// Generates and saves an i8kmon configuration based on the balanced profile. + pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + let profile = &matrix.balanced; + + let mut conf = String::new(); + conf.push_str("# Auto-generated by ember-tune Integrator +"); + conf.push_str(&format!("# Profile: {} + +", profile.name)); + + for (i, p) in profile.fan_curve.iter().enumerate() { + // i8kmon syntax: set config(state) {left_fan right_fan temp_on temp_off} + // State 0, 1, 2, 3 correspond to BIOS fan states (off, low, high) + + let state = match p.pwm_percent { + 0..=20 => 0, + 21..=50 => 1, + 51..=100 => 2, + _ => 2, + }; + + let off = if i == 0 { "-".to_string() } else { format!("{}", p.temp_off) }; + conf.push_str(&format!("set config({}) {{{} {} {} {}}} +", i, state, state, p.temp_on, off)); + } + + fs::write(output_path, conf)?; + Ok(()) + } + + /// Generates a thinkfan configuration. + pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + let profile = &matrix.balanced; + + let mut conf = String::new(); + conf.push_str("# Auto-generated by ember-tune Integrator +"); + conf.push_str("sensors: + - hwmon: /sys/class/hwmon/hwmon0/temp1_input + +"); + conf.push_str("levels: +"); + + for (i, p) in profile.fan_curve.iter().enumerate() { + // thinkfan syntax: - [level, temp_down, temp_up] + let level = match p.pwm_percent { + 0..=20 => 0, + 21..=40 => 1, + 41..=60 => 3, + 61..=80 => 5, + _ => 7, + }; + + let down = if i == 0 { 0.0 } else { p.temp_off }; + conf.push_str(&format!(" - [{}, {}, {}] +", level, down, p.temp_on)); + } + + fs::write(output_path, conf)?; + Ok(()) + } + + /// Generates a resolution checklist/script for daemons. + pub fn generate_conflict_resolution_script(output_path: &Path) -> Result<()> { + let script = r#"#!/bin/bash +# ember-tune Daemon Neutralization Script + +# 1. Mask power-profiles-daemon (Prevent ACPI overrides) +systemctl mask power-profiles-daemon + +# 2. Filter TLP (Prevent CPU governor fights while keeping PCIe saving) +sed -i 's/^CPU_SCALING_GOVERNOR_ON_AC=.*/CPU_SCALING_GOVERNOR_ON_AC=""/' /etc/tlp.conf +sed -i 's/^CPU_BOOST_ON_AC=.*/CPU_BOOST_ON_AC=""/' /etc/tlp.conf +systemctl restart tlp + +# 3. Thermald Delegate (We provide the trips, it handles the rest) +# (Ensure your custom thermal-conf.xml is in /etc/thermald/) +systemctl restart thermald +"#; + fs::write(output_path, script)?; + Ok(()) + } + + /// Generates a thermald configuration XML. + pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + let profile = &matrix.balanced; + let mut xml = String::new(); + xml.push_str("\n\n \n ember-tune Balanced\n Generic\n balanced\n \n \n cpu\n \n"); + + for (i, p) in profile.fan_curve.iter().enumerate() { + xml.push_str(&format!(" \n cpu\n {}\n Passive\n {}\n \n", p.temp_on * 1000.0, i)); + } + + xml.push_str(" \n \n \n \n\n"); + fs::write(output_path, xml)?; + Ok(()) + } +} diff --git a/src/agent_metrology/mod.rs b/src/agent_metrology/mod.rs new file mode 100644 index 0000000..7bc4946 --- /dev/null +++ b/src/agent_metrology/mod.rs @@ -0,0 +1,66 @@ +//! Telemetry & Benchmarking Methodology (Agent Metrology) +//! +//! This module defines the execution flow to extract flawless hardware telemetry. +//! It isolates specific subsystems (CPU Core, Memory) and executes the Sweep Protocol +//! and Thermal Soak to map the physical limits of the hardware. + +use anyhow::Result; +use std::time::{Duration, Instant}; +use std::thread; +use crate::sal::traits::PlatformSal; +use crate::load::{Workload, IntensityProfile, StressVector}; +use tracing::info; + +pub struct MetrologyAgent<'a> { + sal: &'a dyn PlatformSal, + workload: &'a mut Box, +} + +impl<'a> MetrologyAgent<'a> { + pub fn new(sal: &'a dyn PlatformSal, workload: &'a mut Box) -> Self { + Self { sal, workload } + } + + /// Performs a prolonged mixed-load test to achieve chassis thermal saturation. + /// Bypasses short-term PL2/boost metrics to find the true steady-state dissipation capacity. + pub fn perform_thermal_soak(&mut self, duration_minutes: u64) -> Result { + info!("Metrology: Starting {} minute Thermal Soak...", duration_minutes); + + self.sal.set_fan_mode("max")?; + + // Mixed load: matrix math + memory stressors to saturate entire SoC and Chassis. + self.workload.run_workload( + Duration::from_secs(duration_minutes * 60), + IntensityProfile { + threads: num_cpus::get(), + load_percentage: 100, + vector: StressVector::Mixed + } + )?; + + let start = Instant::now(); + let target = Duration::from_secs(duration_minutes * 60); + let mut max_sustained_watts = 0.0; + + while start.elapsed() < target { + thread::sleep(Duration::from_secs(5)); + let temp = self.sal.get_temp().unwrap_or(0.0); + let watts = self.sal.get_power_w().unwrap_or(0.0); + + if watts > max_sustained_watts { + max_sustained_watts = watts; + } + + // Abort if dangerously hot + if temp >= 98.0 { + info!("Metrology: Thermal ceiling hit during soak ({}C). Stopping early.", temp); + break; + } + } + + self.workload.stop_workload()?; + info!("Metrology: Thermal Soak complete. Max sustained: {:.1}W", max_sustained_watts); + + Ok(max_sustained_watts) + } +} diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 07997d8..e65a992 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -47,6 +47,8 @@ pub struct OptimizationResult { pub is_partial: bool, /// A map of configuration files that were written to. pub config_paths: HashMap, + /// The comprehensive optimization matrix (Silent, Balanced, Performance). + pub optimization_matrix: Option, } /// Pure mathematics engine for thermal optimization. diff --git a/src/engine/profiles.rs b/src/engine/profiles.rs new file mode 100644 index 0000000..e69de29 diff --git a/src/lib.rs b/src/lib.rs index 0f4aa6a..99103a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,3 +12,6 @@ pub mod ui; pub mod engine; pub mod cli; pub mod sys; +pub mod agent_metrology; +pub mod agent_analyst; +pub mod agent_integrator; diff --git a/src/load/mod.rs b/src/load/mod.rs index 3ec7956..a19ed48 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -17,11 +17,20 @@ pub struct WorkloadMetrics { pub elapsed_time: Duration, } +/// Defines which subsystem to isolate during stress testing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StressVector { + CpuMatrix, + MemoryBandwidth, + Mixed, +} + /// A normalized profile defining the intensity and constraints of the workload. #[derive(Debug, Clone)] pub struct IntensityProfile { pub threads: usize, pub load_percentage: u8, + pub vector: StressVector, } /// The replaceable interface for load generation and performance measurement. @@ -63,7 +72,7 @@ impl Workload for StressNg { .stdout(Stdio::null()) .stderr(Stdio::null()) .status() - .context("stress-ng binary not found in PATH")?; + .context("stress-ng binary not found in PATH. Please install it.")?; if !status.success() { return Err(anyhow!("stress-ng failed to initialize")); @@ -72,24 +81,29 @@ impl Workload for StressNg { } fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()> { - self.stop_workload()?; // Ensure clean state + self.stop_workload()?; let threads = profile.threads.to_string(); let timeout = format!("{}s", duration.as_secs()); let load = profile.load_percentage.to_string(); - let mut child = Command::new("stress-ng") - .args([ - "--matrix", &threads, - "--cpu-load", &load, - "--timeout", &timeout, - "--metrics-brief", - "--metrics-brief", // Repeat for stderr/stdout consistency - ]) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .spawn() - .context("Failed to spawn stress-ng")?; + let mut cmd = Command::new("stress-ng"); + cmd.args(["--timeout", &timeout, "--metrics", "--quiet"]); + + match profile.vector { + StressVector::CpuMatrix => { + cmd.args(["--matrix", &threads, "--cpu-load", &load]); + }, + StressVector::MemoryBandwidth => { + cmd.args(["--vm", &threads, "--vm-bytes", "80%"]); + }, + StressVector::Mixed => { + let half = (profile.threads / 2).max(1).to_string(); + cmd.args(["--matrix", &half, "--vm", &half, "--vm-bytes", "40%"]); + } + } + + let mut child = cmd.stderr(Stdio::piped()).spawn().context("Failed to spawn stress-ng")?; self.start_time = Some(Instant::now()); @@ -100,16 +114,13 @@ impl Workload for StressNg { thread::spawn(move || { let reader = BufReader::new(stderr); for line in reader.lines().flatten() { - // Parse stress-ng metrics line: - // stress-ng: info: [PID] matrix [OPS] [TIME] [BOGO OPS/S] - if line.contains("matrix") && line.contains("bogo ops/s") { + // Parse stress-ng metrics line + if line.contains("matrix") || line.contains("vm") { let parts: Vec<&str> = line.split_whitespace().collect(); - if let Some(ops_idx) = parts.iter().position(|&p| p == "ops/s") { - if let Some(ops_val) = parts.get(ops_idx - 1) { - if let Ok(ops) = ops_val.parse::() { - let mut m = metrics_ref.lock().unwrap(); - m.primary_ops_per_sec = ops; - } + if let Some(val) = parts.last() { + if let Ok(ops) = val.parse::() { + let mut m = metrics_ref.lock().unwrap(); + m.primary_ops_per_sec = ops; } } } @@ -130,7 +141,6 @@ impl Workload for StressNg { fn stop_workload(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { - // Polite SIGTERM #[cfg(unix)] { use libc::{kill, SIGTERM}; diff --git a/src/main.rs b/src/main.rs index bc962f4..8e09e73 100644 --- a/src/main.rs +++ b/src/main.rs @@ -189,6 +189,7 @@ fn main() -> Result<()> { pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(), + is_throttling: false, phase: BenchmarkPhase::Auditing, history_watts: Vec::new(), history_temp: Vec::new(), diff --git a/src/mediator.rs b/src/mediator.rs index a2d4266..4d1acdc 100644 --- a/src/mediator.rs +++ b/src/mediator.rs @@ -35,6 +35,7 @@ pub struct TelemetryState { pub pl1_limit: f32, pub pl2_limit: f32, pub fan_tier: String, + pub is_throttling: bool, pub phase: BenchmarkPhase, // --- High-res History --- diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 9fe8341..b3f1071 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -18,9 +18,12 @@ use std::path::PathBuf; use crate::sal::traits::{PlatformSal, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts, ConfigurationTransaction, ThermalThresholdCelsius}; -use crate::load::{Workload, IntensityProfile}; +use crate::load::{Workload, IntensityProfile, StressVector}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; +use crate::agent_metrology::MetrologyAgent; +use crate::agent_analyst::{HeuristicAnalyst, OptimizationMatrix}; +use crate::agent_integrator::ServiceIntegrator; /// The central state machine responsible for coordinating the thermal benchmark. pub struct BenchmarkOrchestrator { @@ -189,6 +192,13 @@ impl BenchmarkOrchestrator { self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; + // Phase 1.5: Thermal Soak (Agent Metrology) + self.log("Phase 1.5: Executing Thermal Soak to achieve chassis saturation...")?; + let soak_duration_minutes = 1; + let mut metrology = MetrologyAgent::new(self.sal.as_ref(), &mut self.workload); + let max_soak_watts = metrology.perform_thermal_soak(soak_duration_minutes)?; + self.log(&format!("✓ Max sustained wattage during soak: {:.1}W", max_soak_watts))?; + // Phase 2: Stress Stepping self.phase = BenchmarkPhase::StressTesting; self.log("Phase 2: Starting Synthetic Stress Matrix.")?; @@ -213,7 +223,7 @@ impl BenchmarkOrchestrator { self.workload.run_workload( Duration::from_secs(bench_cfg.stress_duration_max_s), - IntensityProfile { threads: num_cpus::get(), load_percentage: 100 } + IntensityProfile { threads: num_cpus::get(), load_percentage: 100, vector: StressVector::CpuMatrix } )?; let step_start = Instant::now(); @@ -287,18 +297,22 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } - // Phase 4: Physical Modeling + // Phase 4: Physical Modeling (Agent Analyst) self.phase = BenchmarkPhase::PhysicalModeling; - self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?; + self.log("Phase 3: Calculating Silicon Physical Sweet Spot & Profiles...")?; + + let analyst = HeuristicAnalyst::new(); + let matrix = analyst.analyze(&self.profile, max_soak_watts); let mut res = self.generate_result(false); + res.optimization_matrix = Some(matrix.clone()); self.log(&format!("✓ Thermal Resistance (Rθ): {:.3} K/W", res.thermal_resistance_kw))?; self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?; thread::sleep(Duration::from_secs(3)); - // Phase 5: Finalizing + // Phase 5: Finalizing (Agent Integrator) self.phase = BenchmarkPhase::Finalizing; self.log("Benchmark sequence complete. Generating configurations...")?; @@ -317,15 +331,31 @@ impl BenchmarkOrchestrator { res.config_paths.insert("throttled".to_string(), path.clone()); } - if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { - let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { - t_ambient: self.profile.ambient_temp, - t_max_fan: res.max_temp_c - 5.0, - thermal_resistance_kw: res.thermal_resistance_kw, - }; - crate::engine::formatters::i8kmon::I8kmonTranslator::save(i8k_path, &i8k_config)?; - self.log(&format!("✓ Saved '{}'.", i8k_path.display()))?; - res.config_paths.insert("i8kmon".to_string(), i8k_path.clone()); + // Generate Fan configs via Agent Integrator + let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc")); + + let i8k_out = base_out.join("i8kmon.conf"); + if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out).is_ok() { + self.log(&format!("✓ Saved '{}'.", i8k_out.display()))?; + res.config_paths.insert("i8kmon".to_string(), i8k_out); + } + + let thinkfan_out = base_out.join("thinkfan.conf"); + if ServiceIntegrator::generate_thinkfan_config(&matrix, &thinkfan_out).is_ok() { + self.log(&format!("✓ Saved '{}'.", thinkfan_out.display()))?; + res.config_paths.insert("thinkfan".to_string(), thinkfan_out); + } + + let thermald_out = base_out.join("thermal-conf.xml"); + if ServiceIntegrator::generate_thermald_config(&matrix, &thermald_out).is_ok() { + self.log(&format!("✓ Saved '{}'.", thermald_out.display()))?; + res.config_paths.insert("thermald".to_string(), thermald_out); + } + + let script_out = base_out.join("ember-tune-neutralize.sh"); + if ServiceIntegrator::generate_conflict_resolution_script(&script_out).is_ok() { + self.log(&format!("✓ Saved conflict resolution script: '{}'", script_out.display()))?; + res.config_paths.insert("conflict_script".to_string(), script_out); } Ok(res) @@ -359,6 +389,7 @@ impl BenchmarkOrchestrator { pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: String::new(), + is_throttling: sal.get_throttling_status().unwrap_or(false), phase: BenchmarkPhase::StressTesting, history_watts: Vec::new(), history_temp: Vec::new(), @@ -396,6 +427,7 @@ impl BenchmarkOrchestrator { max_temp_c: max_t, is_partial, config_paths: std::collections::HashMap::new(), + optimization_matrix: None, } } @@ -428,6 +460,7 @@ impl BenchmarkOrchestrator { pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(), + is_throttling: self.sal.get_throttling_status().unwrap_or(false), phase: self.phase, history_watts: Vec::new(), history_temp: Vec::new(), @@ -444,6 +477,7 @@ impl BenchmarkOrchestrator { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); let freq = self.sal.get_freq_mhz().unwrap_or(0.0); + let throttling = self.sal.get_throttling_status().unwrap_or(false); self.history_temp.push_back(temp); self.history_watts.push_back(pwr); @@ -467,6 +501,7 @@ impl BenchmarkOrchestrator { pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(), + is_throttling: throttling, phase: self.phase, history_watts: self.history_watts.iter().cloned().collect(), history_temp: self.history_temp.iter().cloned().collect(), diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index 6c81d1a..be78de1 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -5,9 +5,10 @@ use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; +use tracing::{debug, warn}; use crate::sal::heuristic::discovery::SystemFactSheet; +/// Implementation of the System Abstraction Layer for the Dell XPS 13 9380. pub struct DellXps9380Sal { ctx: EnvironmentCtx, fact_sheet: SystemFactSheet, @@ -23,9 +24,16 @@ pub struct DellXps9380Sal { suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, + last_watts: Mutex, + + // --- Original State for Restoration --- + original_pl1: Mutex>, + original_pl2: Mutex>, + original_fan_mode: Mutex>, } impl DellXps9380Sal { + /// Initializes the Dell SAL, opening the MSR interface and discovering sensors. pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result { let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?; let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?; @@ -52,8 +60,12 @@ impl DellXps9380Sal { suppressed_services: Mutex::new(Vec::new()), msr_file: Mutex::new(msr_file), last_energy: Mutex::new((initial_energy, Instant::now())), + last_watts: Mutex::new(0.0), fact_sheet: facts, ctx, + original_pl1: Mutex::new(None), + original_pl2: Mutex::new(None), + original_fan_mode: Mutex::new(None), }) } @@ -81,6 +93,22 @@ impl PreflightAuditor for DellXps9380Sal { outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) } }); + // RAPL Lock Check (MSR 0x610) + let rapl_lock = match self.read_msr(0x610) { + Ok(val) => { + if (val & (1 << 63)) != 0 { + Err(AuditError::KernelIncompatible("RAPL Registers are locked by BIOS. Power limit tuning is impossible.".to_string())) + } else { + Ok(()) + } + }, + Err(e) => Err(AuditError::ToolMissing(format!("Cannot read MSR 0x610: {}", e))), + }; + steps.push(AuditStep { + description: "MSR 0x610 RAPL Lock Status".to_string(), + outcome: rapl_lock, + }); + let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"]; for mod_name in modules { let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name)); @@ -115,23 +143,24 @@ impl PreflightAuditor for DellXps9380Sal { } }); - let tool_check = self.fact_sheet.paths.tools.contains_key("dell_fan_ctrl"); - steps.push(AuditStep { - description: "Dell Fan Control Tool".to_string(), - outcome: if tool_check { Ok(()) } else { Err(AuditError::ToolMissing("dell-bios-fan-control not found in PATH".to_string())) } - }); - Box::new(steps.into_iter()) } } impl EnvironmentGuard for DellXps9380Sal { fn suppress(&self) -> Result<()> { - let mut suppressed = self.suppressed_services.lock().unwrap(); + if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { + *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); + } + if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { + *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); + } + *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); + let services = ["tlp", "thermald", "i8kmon"]; + let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { - debug!("Suppressing service: {}", s); let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } @@ -140,6 +169,15 @@ impl EnvironmentGuard for DellXps9380Sal { } fn restore(&self) -> Result<()> { + if let Some(pl1) = *self.original_pl1.lock().unwrap() { + let _ = fs::write(&self.pl1_path, pl1.to_string()); + } + if let Some(pl2) = *self.original_pl2.lock().unwrap() { + let _ = fs::write(&self.pl2_path, pl2.to_string()); + } + if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { + let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); + } let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &s]); @@ -167,16 +205,25 @@ impl SensorBus for DellXps9380Sal { let energy_path = rapl_base.join("energy_uj"); if energy_path.exists() { - let mut last = self.last_energy.lock().unwrap(); + let mut last_energy = self.last_energy.lock().unwrap(); + let mut last_watts = self.last_watts.lock().unwrap(); + let e2_str = fs::read_to_string(&energy_path)?; let e2 = e2_str.trim().parse::()?; let t2 = Instant::now(); - let (e1, t1) = *last; + let (e1, t1) = *last_energy; + let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); - *last = (e2, t2); - if delta_t < 0.05 { return Ok(0.0); } - Ok((delta_e as f32 / 1_000_000.0) / delta_t) + + if delta_t < 0.1 { + return Ok(*last_watts); // Return cached if polled too fast + } + + let watts = (delta_e as f32 / 1_000_000.0) / delta_t; + *last_energy = (e2, t2); + *last_watts = watts; + Ok(watts) } else { let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) @@ -204,6 +251,12 @@ impl SensorBus for DellXps9380Sal { let s = fs::read_to_string(&self.freq_path)?; Ok(s.trim().parse::()? / 1000.0) } + + fn get_throttling_status(&self) -> Result { + // MSR 0x19C bit 0 is "Thermal Status", bit 1 is "Thermal Log" + let val = self.read_msr(0x19C)?; + Ok((val & 0x1) != 0) + } } impl ActuatorBus for DellXps9380Sal { @@ -220,14 +273,7 @@ impl ActuatorBus for DellXps9380Sal { Ok(()) } - fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { - let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl") - .ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?; - let tool_str = tool_path.to_string_lossy(); - - if speed.as_u8() > 50 { - let _ = self.ctx.runner.run(&tool_str, &["0"]); - } + fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { Ok(()) } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index e003ce6..767dbe7 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -133,6 +133,23 @@ impl SensorBus for GenericLinuxSal { Err(anyhow!("Could not determine CPU frequency")) } } + + fn get_throttling_status(&self) -> Result { + // Fallback: check if any cooling device is active (cur_state > 0) + let cooling_base = self.ctx.sysfs_base.join("sys/class/thermal"); + if let Ok(entries) = fs::read_dir(cooling_base) { + for entry in entries.flatten() { + if entry.file_name().to_string_lossy().starts_with("cooling_device") { + if let Ok(state) = fs::read_to_string(entry.path().join("cur_state")) { + if state.trim().parse::().unwrap_or(0) > 0 { + return Ok(true); + } + } + } + } + } + Ok(false) + } } impl ActuatorBus for GenericLinuxSal { diff --git a/src/sal/mock.rs b/src/sal/mock.rs index ecddb91..079a982 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -54,6 +54,9 @@ impl SensorBus for MockSal { fn get_freq_mhz(&self) -> Result { Ok(3200.0) } + fn get_throttling_status(&self) -> Result { + Ok(self.get_temp()? > 90.0) + } } impl ActuatorBus for MockSal { diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 235f6b1..996b4e6 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -140,6 +140,9 @@ pub trait SensorBus: Send + Sync { /// # Errors /// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read. fn get_freq_mhz(&self) -> Result; + + /// Returns true if the system is currently thermally throttling. + fn get_throttling_status(&self) -> Result; } impl SensorBus for Arc { @@ -155,6 +158,9 @@ impl SensorBus for Arc { fn get_freq_mhz(&self) -> Result { (**self).get_freq_mhz() } + fn get_throttling_status(&self) -> Result { + (**self).get_throttling_status() + } } use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; From 1702e7d0587eb7de6422703ba04d26493b23d21b Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Sat, 28 Feb 2026 14:54:48 +0100 Subject: [PATCH 12/13] updated --- Cargo.lock | 13 ++ Cargo.toml | 2 +- assets/hardware_db.toml | 2 +- src/agent_metrology/mod.rs | 66 ------ src/lib.rs | 1 - src/load/mod.rs | 4 +- src/main.rs | 63 ++---- src/orchestrator/mod.rs | 393 +++++++++++++-------------------- src/sal/dell_xps_9380.rs | 197 ++++++++++------- src/sal/discovery.rs | 148 +++++++++++++ src/sal/generic_linux.rs | 91 ++------ src/sal/heuristic/discovery.rs | 152 +++++++------ src/sal/mock.rs | 60 ++--- src/sal/mod.rs | 1 + src/sal/safety.rs | 150 ++++++++++--- src/sal/traits.rs | 58 ++--- 16 files changed, 713 insertions(+), 688 deletions(-) delete mode 100644 src/agent_metrology/mod.rs create mode 100644 src/sal/discovery.rs diff --git a/Cargo.lock b/Cargo.lock index ac4e99b..b6bb0cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -901,6 +901,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "memchr" version = "2.8.0" @@ -2000,10 +2009,14 @@ version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex-automata", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] diff --git a/Cargo.toml b/Cargo.toml index c584f25..c118493 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ serde_json = "1.0.149" clap = { version = "4.5", features = ["derive", "string", "wrap_help"] } color-eyre = "0.6" tracing = "0.1" -tracing-subscriber = "0.3" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing-appender = "0.2" sysinfo = "0.38" libc = "0.2" diff --git a/assets/hardware_db.toml b/assets/hardware_db.toml index d695ebf..a275ab3 100644 --- a/assets/hardware_db.toml +++ b/assets/hardware_db.toml @@ -15,7 +15,7 @@ help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both [[conflicts]] id = "thermal_logic_collision" -services = ["thermald.service", "throttled.service"] +services = ["thermald.service", "throttled.service", "lenovo_fix.service", "lenovo-throttling-fix.service"] contention = "RAPL / MSR / BD-PROCHOT" severity = "High" fix_action = "SuspendService" diff --git a/src/agent_metrology/mod.rs b/src/agent_metrology/mod.rs deleted file mode 100644 index 7bc4946..0000000 --- a/src/agent_metrology/mod.rs +++ /dev/null @@ -1,66 +0,0 @@ -//! Telemetry & Benchmarking Methodology (Agent Metrology) -//! -//! This module defines the execution flow to extract flawless hardware telemetry. -//! It isolates specific subsystems (CPU Core, Memory) and executes the Sweep Protocol -//! and Thermal Soak to map the physical limits of the hardware. - -use anyhow::Result; -use std::time::{Duration, Instant}; -use std::thread; -use crate::sal::traits::PlatformSal; -use crate::load::{Workload, IntensityProfile, StressVector}; -use tracing::info; - -pub struct MetrologyAgent<'a> { - sal: &'a dyn PlatformSal, - workload: &'a mut Box, -} - -impl<'a> MetrologyAgent<'a> { - pub fn new(sal: &'a dyn PlatformSal, workload: &'a mut Box) -> Self { - Self { sal, workload } - } - - /// Performs a prolonged mixed-load test to achieve chassis thermal saturation. - /// Bypasses short-term PL2/boost metrics to find the true steady-state dissipation capacity. - pub fn perform_thermal_soak(&mut self, duration_minutes: u64) -> Result { - info!("Metrology: Starting {} minute Thermal Soak...", duration_minutes); - - self.sal.set_fan_mode("max")?; - - // Mixed load: matrix math + memory stressors to saturate entire SoC and Chassis. - self.workload.run_workload( - Duration::from_secs(duration_minutes * 60), - IntensityProfile { - threads: num_cpus::get(), - load_percentage: 100, - vector: StressVector::Mixed - } - )?; - - let start = Instant::now(); - let target = Duration::from_secs(duration_minutes * 60); - let mut max_sustained_watts = 0.0; - - while start.elapsed() < target { - thread::sleep(Duration::from_secs(5)); - let temp = self.sal.get_temp().unwrap_or(0.0); - let watts = self.sal.get_power_w().unwrap_or(0.0); - - if watts > max_sustained_watts { - max_sustained_watts = watts; - } - - // Abort if dangerously hot - if temp >= 98.0 { - info!("Metrology: Thermal ceiling hit during soak ({}C). Stopping early.", temp); - break; - } - } - - self.workload.stop_workload()?; - info!("Metrology: Thermal Soak complete. Max sustained: {:.1}W", max_sustained_watts); - - Ok(max_sustained_watts) - } -} diff --git a/src/lib.rs b/src/lib.rs index 99103a3..0ce6d3a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,5 @@ pub mod ui; pub mod engine; pub mod cli; pub mod sys; -pub mod agent_metrology; pub mod agent_analyst; pub mod agent_integrator; diff --git a/src/load/mod.rs b/src/load/mod.rs index a19ed48..9253917 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -88,11 +88,11 @@ impl Workload for StressNg { let load = profile.load_percentage.to_string(); let mut cmd = Command::new("stress-ng"); - cmd.args(["--timeout", &timeout, "--metrics", "--quiet"]); + cmd.args(["--timeout", &timeout, "--metrics", "--quiet", "--cpu-load", &load]); match profile.vector { StressVector::CpuMatrix => { - cmd.args(["--matrix", &threads, "--cpu-load", &load]); + cmd.args(["--matrix", &threads]); }, StressVector::MemoryBandwidth => { cmd.args(["--vm", &threads, "--vm-bytes", "80%"]); diff --git a/src/main.rs b/src/main.rs index 8e09e73..786cb8b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,8 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::io; use clap::Parser; -use tracing::{info, debug, error}; +use tracing::error; +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use crossterm::{ event::{self, Event, KeyCode}, @@ -68,27 +69,24 @@ fn print_summary_report(result: &OptimizationResult) { println!(); } -fn setup_logging(verbose: bool) -> tracing_appender::non_blocking::WorkerGuard { - let file_appender = tracing_appender::rolling::never("/var/log", "ember-tune.log"); - let (non_blocking, guard) = tracing_appender::non_blocking(file_appender); - - let level = if verbose { tracing::Level::DEBUG } else { tracing::Level::INFO }; - - tracing_subscriber::fmt() - .with_max_level(level) - .with_writer(non_blocking) - .with_ansi(false) - .init(); - - guard -} - fn main() -> Result<()> { - // 1. Diagnostics & CLI Initialization let args = Cli::parse(); - let _log_guard = setup_logging(args.verbose); + + // 1. Logging Setup (File-only by default, Stdout during Audit) + let file_appender = tracing_appender::rolling::never(".", "ember-tune.log"); + let (non_blocking, _guard) = tracing_appender::non_blocking(file_appender); + let level = if args.verbose { "debug" } else { "info" }; + + let file_layer = fmt::layer() + .with_writer(non_blocking) + .with_ansi(false); + + // We use a simple println for the audit to avoid complex reload handles + tracing_subscriber::registry() + .with(EnvFilter::new(level)) + .with(file_layer) + .init(); - // Set panic hook to restore terminal state std::panic::set_hook(Box::new(|panic_info| { let _ = disable_raw_mode(); let mut stdout = io::stdout(); @@ -99,11 +97,10 @@ fn main() -> Result<()> { eprintln!("----------------------------------------\n"); })); - info!("ember-tune starting with args: {:?}", args); + println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); let ctx = ember_tune_rs::sal::traits::EnvironmentCtx::production(); - // 2. Platform Detection & Audit let (sal_box, facts): (Box, SystemFactSheet) = if args.mock { (Box::new(MockSal::new()), SystemFactSheet::default()) } else { @@ -111,9 +108,7 @@ fn main() -> Result<()> { }; let sal: Arc = sal_box.into(); - println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); let mut audit_failures = Vec::new(); - for step in sal.audit() { print!(" Checking {:<40} ", step.description); io::Write::flush(&mut io::stdout()).into_diagnostic()?; @@ -137,15 +132,14 @@ fn main() -> Result<()> { return Ok(()); } - // 3. Terminal Setup + // Entering TUI Mode - STDOUT is now strictly for Ratatui enable_raw_mode().into_diagnostic()?; let mut stdout = io::stdout(); - execute!(stdout, EnterAlternateScreen).into_diagnostic()?; + execute!(stdout, EnterAlternateScreen, crossterm::cursor::Hide).into_diagnostic()?; let backend_stdout = io::stdout(); let backend_term = CrosstermBackend::new(backend_stdout); let mut terminal = Terminal::new(backend_term).into_diagnostic()?; - // 4. State & Communication Setup let running = Arc::new(AtomicBool::new(true)); let r = running.clone(); @@ -158,7 +152,6 @@ fn main() -> Result<()> { r.store(false, Ordering::SeqCst); }).expect("Error setting Ctrl-C handler"); - // 5. Spawn Backend Orchestrator let sal_backend = sal.clone(); let facts_backend = facts.clone(); let config_out = args.config_out.clone(); @@ -175,10 +168,9 @@ fn main() -> Result<()> { orchestrator.run() }); - // 6. Frontend Event Loop let mut ui_state = DashboardState::new(); let mut last_telemetry = TelemetryState { - cpu_model: "Loading...".to_string(), + cpu_model: facts.model.clone(), total_ram_gb: 0, tick: 0, cpu_temp: 0.0, @@ -227,7 +219,6 @@ fn main() -> Result<()> { while let Ok(new_state) = telemetry_rx.try_recv() { if let Some(log) = &new_state.log_event { ui_state.add_log(log.clone()); - debug!("Backend Log: {}", log); } else { ui_state.update(&new_state); last_telemetry = new_state; @@ -238,20 +229,11 @@ fn main() -> Result<()> { if backend_handle.is_finished() { break; } } - // 7. Terminal Restoration let _ = disable_raw_mode(); - let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen); - let _ = terminal.show_cursor(); + let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen, crossterm::cursor::Show); - // 8. Final Report & Hardware Restoration let join_res = backend_handle.join(); - // Explicit hardware restoration - info!("Restoring hardware state..."); - if let Err(e) = sal.restore() { - error!("Failed to restore hardware state: {}", e); - } - match join_res { Ok(Ok(result)) => { print_summary_report(&result); @@ -276,6 +258,5 @@ fn main() -> Result<()> { } } - info!("ember-tune exited gracefully."); Ok(()) } diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index b3f1071..bc426f3 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -3,8 +3,8 @@ //! It manages hardware interactions through the [PlatformSal], generates stress //! using a [Workload], and feeds telemetry to the frontend via MPSC channels. -use anyhow::{Result, Context}; -use tracing::warn; +use anyhow::{Result, Context, bail}; +use tracing::{info, warn, error}; use std::sync::mpsc; use std::time::{Duration, Instant}; use std::thread; @@ -14,16 +14,29 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; use std::path::PathBuf; +use std::cell::Cell; -use crate::sal::traits::{PlatformSal, SafetyStatus}; +use crate::sal::traits::{PlatformSal, SensorBus}; use crate::sal::heuristic::discovery::SystemFactSheet; -use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts, ConfigurationTransaction, ThermalThresholdCelsius}; +use crate::sal::safety::{HardwareStateGuard, PowerLimitWatts, ThermalWatchdog}; use crate::load::{Workload, IntensityProfile, StressVector}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; -use crate::agent_metrology::MetrologyAgent; -use crate::agent_analyst::{HeuristicAnalyst, OptimizationMatrix}; -use crate::agent_integrator::ServiceIntegrator; +use crate::agent_analyst::HeuristicAnalyst; + +/// Represents the possible states of the benchmark orchestrator. +pub enum OrchestratorState { + /// Performing pre-flight checks and snapshotting. + PreFlight, + /// Acquiring idle baseline telemetry. + IdleBaseline, + /// Actively sweeping through power limits. + StressSweep { current_wattage: f32 }, + /// Allowing hardware to cool down before releasing the guard. + Cooldown, + /// Benchmark complete, generating final results. + Finalizing, +} /// The central state machine responsible for coordinating the thermal benchmark. pub struct BenchmarkOrchestrator { @@ -37,8 +50,8 @@ pub struct BenchmarkOrchestrator { telemetry_tx: mpsc::Sender, /// Channel for receiving commands from the UI. command_rx: mpsc::Receiver, - /// Current phase of the benchmark. - phase: BenchmarkPhase, + /// Current phase reported to the UI. + ui_phase: BenchmarkPhase, /// Accumulated thermal data points. profile: ThermalProfile, /// Mathematics engine for data smoothing and optimization. @@ -48,6 +61,8 @@ pub struct BenchmarkOrchestrator { /// The safety membrane protecting the system. safeguard: Option, + /// Active thermal watchdog. + watchdog: Option, /// Sliding window of power readings (Watts). history_watts: VecDeque, @@ -91,7 +106,7 @@ impl BenchmarkOrchestrator { workload, telemetry_tx, command_rx, - phase: BenchmarkPhase::Auditing, + ui_phase: BenchmarkPhase::Auditing, profile: ThermalProfile::default(), engine: OptimizerEngine::new(5), history_watts: VecDeque::with_capacity(120), @@ -103,147 +118,163 @@ impl BenchmarkOrchestrator { emergency_reason: Arc::new(Mutex::new(None)), optional_config_out, safeguard: None, + watchdog: None, } } /// Executes the full benchmark sequence. pub fn run(&mut self) -> Result { - self.log("Starting ember-tune Benchmark Sequence.")?; + // Immediate Priming + let _ = self.sal.get_temp(); + let _ = self.sal.get_power_w(); + let _ = self.sal.get_fan_rpms(); - let _watchdog_handle = self.spawn_watchdog_monitor(); + info!("Orchestrator: Initializing Project Iron-Ember lifecycle."); + + // Spawn safety watchdog immediately + let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone()); + self.watchdog = Some(watchdog); - // Core execution wrapped in cleanup logic let result = self.execute_benchmark(); - // --- MANDATORY CLEANUP --- - self.log("Benchmark sequence finished. Restoring hardware defaults...")?; + if let Err(ref e) = result { + error!("Benchmark Lifecycle Failure: {}", e); + let _ = self.log(&format!("⚠ FAILURE: {}", e)); + } + + // --- MANDATORY RAII CLEANUP --- + info!("Benchmark sequence complete. Releasing safeguards..."); let _ = self.workload.stop_workload(); if let Some(mut sg) = self.safeguard.take() { if let Err(e) = sg.release() { - anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e); + error!("CRITICAL: State restoration failure: {}", e); } } - // SAL restore should only handle OEM-specific non-sysfs state not covered by guard - if let Err(e) = self.sal.restore() { - warn!("Failed to perform secondary SAL restoration: {}", e); - } - - self.log("✓ Hardware state restored.")?; - + info!("✓ Hardware state restored to pre-flight defaults."); result } /// Internal execution logic for the benchmark phases. fn execute_benchmark(&mut self) -> Result { - let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; + let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?; - // 1. Snapshot & Arm Safeguard + // 1. Pre-Flight Phase + self.ui_phase = BenchmarkPhase::Auditing; + self.log("Phase: Pre-Flight Auditing & Sterilization")?; + + // Snapshot and neutralise Brawl Matrix let mut target_files = self.facts.rapl_paths.iter() .map(|p| p.join("constraint_0_power_limit_uw")) .collect::>(); target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw"))); + if let Some(tp) = self.facts.paths.configs.get("throttled") { target_files.push(tp.clone()); } - let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()]; - let mut sg = HardwareStateGuard::acquire(&target_files, &target_services)?; - - // # SAFETY: Register fan restoration command if we are on Dell - if self.facts.vendor.to_lowercase().contains("dell") { - if let Some(tool_path) = self.facts.paths.tools.get("dell_fan_ctrl") { - let tool_str = tool_path.to_string_lossy().to_string(); - sg.on_rollback(Box::new(move || { - let _ = std::process::Command::new(tool_str).arg("1").status(); - })); - } - } - + let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?; self.safeguard = Some(sg); - // Phase 1: Audit & Baseline - self.phase = BenchmarkPhase::Auditing; + // Run auditor for step in self.sal.audit() { if let Err(e) = step.outcome { return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e)); } } - self.workload.initialize().context("Failed to initialize workload")?; - self.sal.suppress().context("Failed to suppress background services")?; + self.workload.initialize().context("Failed to initialize load generator.")?; - // Baseline (Idle Calibration) - self.phase = BenchmarkPhase::IdleCalibration; - self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; + let tick = Cell::new(0u64); + + // 2. Idle Baseline Phase + self.ui_phase = BenchmarkPhase::IdleCalibration; + self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?; + + // Wait for fan spin-up self.sal.set_fan_mode("auto")?; let mut idle_temps = Vec::new(); let start = Instant::now(); - let mut tick = 0; while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) { - self.check_abort()?; - self.send_telemetry(tick)?; + self.check_safety_abort()?; + self.send_telemetry(tick.get())?; idle_temps.push(self.sal.get_temp().unwrap_or(0.0)); - tick += 1; + tick.set(tick.get() + 1); thread::sleep(Duration::from_millis(500)); } self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; - // Phase 1.5: Thermal Soak (Agent Metrology) - self.log("Phase 1.5: Executing Thermal Soak to achieve chassis saturation...")?; - let soak_duration_minutes = 1; - let mut metrology = MetrologyAgent::new(self.sal.as_ref(), &mut self.workload); - let max_soak_watts = metrology.perform_thermal_soak(soak_duration_minutes)?; - self.log(&format!("✓ Max sustained wattage during soak: {:.1}W", max_soak_watts))?; + // 3. Stress Sweep Phase + self.ui_phase = BenchmarkPhase::StressTesting; + self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?; + + // Ensure fans are ramped to MAX before load + self.log("Metrology: Locking fans to MAX...")?; + self.sal.set_fan_mode("max")?; + let fan_lock_start = Instant::now(); + loop { + let fans = self.sal.get_fan_rpms().unwrap_or_default(); + let max_rpm = fans.iter().cloned().max().unwrap_or(0); + if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) { + break; + } + thread::sleep(Duration::from_millis(500)); + self.send_telemetry(tick.get())?; + tick.set(tick.get() + 1); + } - // Phase 2: Stress Stepping - self.phase = BenchmarkPhase::StressTesting; - self.log("Phase 2: Starting Synthetic Stress Matrix.")?; - self.sal.set_fan_mode("max")?; - - let mut current_pl = 10.0_f32; // Start at 10W + let physical_threads = num_cpus::get_physical(); let mut previous_ops = 0.0; - loop { - self.log(&format!("Testing PL1 = {:.0}W...", current_pl))?; + for &watts in &bench_cfg.power_steps_watts { + self.check_safety_abort()?; + self.log(&format!("Testing PL1 = {:.0}W", watts))?; - // # SAFETY: Transactional Commit for Power Limits - let pl1_uw = TdpLimitMicroWatts::from_watts(current_pl)?; - let pl2_uw = TdpLimitMicroWatts::from_watts(current_pl + 5.0)?; + // Apply limits safely + let pl1 = PowerLimitWatts::try_new(watts)?; + let pl2 = PowerLimitWatts::try_new(watts + 5.0)?; - let mut tx = ConfigurationTransaction::default(); - if let Some(p) = self.facts.rapl_paths.first() { - tx.add_change(p.join("constraint_0_power_limit_uw"), pl1_uw.as_u64().to_string()); - tx.add_change(p.join("constraint_1_power_limit_uw"), pl2_uw.as_u64().to_string()); - } - tx.commit().context("Failed to commit power limit transaction")?; + self.sal.set_sustained_power_limit(pl1)?; + self.sal.set_burst_power_limit(pl2)?; + // Start workload self.workload.run_workload( Duration::from_secs(bench_cfg.stress_duration_max_s), - IntensityProfile { threads: num_cpus::get(), load_percentage: 100, vector: StressVector::CpuMatrix } + IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix } )?; let step_start = Instant::now(); let mut step_temps = VecDeque::with_capacity(30); + let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0); + // Equilibrium Gating while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) { - self.check_abort()?; + self.check_safety_abort()?; let t = self.sal.get_temp().unwrap_or(0.0); + let dt_dt = (t - previous_step_temp) / 0.5; + previous_step_temp = t; + + // Redundant safety check during step + if t > 94.0 || dt_dt > 5.0 { + warn!("Thermal Spike Detected! Aborting current step."); + break; + } + step_temps.push_back(t); if step_temps.len() > 10 { step_temps.pop_front(); } - self.send_telemetry(tick)?; - tick += 1; + self.send_telemetry(tick.get())?; + tick.set(tick.get() + 1); if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); if (max - min) < 0.5 { - self.log(&format!(" Equilibrium reached at {:.1}°C", t))?; + info!("Equilibrium reached at {:.1}°C", t); break; } } @@ -251,197 +282,74 @@ impl BenchmarkOrchestrator { } // Record data point - let avg_p = self.sal.get_power_w().unwrap_or(0.0); - let avg_t = self.sal.get_temp().unwrap_or(0.0); - let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); - let fans = self.sal.get_fan_rpms().unwrap_or_default(); - let primary_fan = fans.first().cloned().unwrap_or(0); let metrics = self.workload.get_current_metrics().unwrap_or_default(); - self.profile.points.push(ThermalPoint { - power_w: avg_p, - temp_c: avg_t, - freq_mhz: avg_f, - fan_rpm: primary_fan, + power_w: self.sal.get_power_w().unwrap_or(watts), + temp_c: self.sal.get_temp().unwrap_or(0.0), + freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0), + fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0), throughput: metrics.primary_ops_per_sec, }); self.workload.stop_workload()?; - // 1. Check Thermal Ceiling Halt Condition - let max_safe_temp = ThermalThresholdCelsius::MAX_SAFE_C - 5.0; // Margin - if avg_t >= max_safe_temp { - self.log(&format!("Thermal ceiling reached ({:.1}°C). Terminating Identification phase.", avg_t))?; - break; - } - - // 2. Check Diminishing Returns Halt Condition (< 1% gain) + // Performance Halt Condition if previous_ops > 0.0 { - let gain_percent = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; - if gain_percent < 1.0 { - self.log(&format!("Performance gain ({:.1}%) fell below 1%. Terminating Identification phase.", gain_percent))?; + let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; + if gain < 1.0 { + self.log("Diminishing returns reached. Stopping sweep.")?; break; } } - - // 3. Absolute Maximum Power Check - if current_pl >= 60.0 { - self.log("Maximum theoretical power limit reached. Terminating Identification phase.")?; - break; - } - previous_ops = metrics.primary_ops_per_sec; - current_pl += 2.0; - self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; + self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?; thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } - // Phase 4: Physical Modeling (Agent Analyst) - self.phase = BenchmarkPhase::PhysicalModeling; - self.log("Phase 3: Calculating Silicon Physical Sweet Spot & Profiles...")?; + // 4. Physical Modeling Phase + self.ui_phase = BenchmarkPhase::PhysicalModeling; + self.log("Phase: Silicon Physical Sweet Spot Calculation")?; let analyst = HeuristicAnalyst::new(); - let matrix = analyst.analyze(&self.profile, max_soak_watts); + let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0)); let mut res = self.generate_result(false); res.optimization_matrix = Some(matrix.clone()); - self.log(&format!("✓ Thermal Resistance (Rθ): {:.3} K/W", res.thermal_resistance_kw))?; - self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?; + info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw); - thread::sleep(Duration::from_secs(3)); - - // Phase 5: Finalizing (Agent Integrator) - self.phase = BenchmarkPhase::Finalizing; - self.log("Benchmark sequence complete. Generating configurations...")?; - - let config = crate::engine::formatters::throttled::ThrottledConfig { - pl1_limit: res.silicon_knee_watts, - pl2_limit: res.recommended_pl2, - trip_temp: res.max_temp_c.max(95.0), - }; + // 5. Finalizing Phase + self.ui_phase = BenchmarkPhase::Finalizing; + self.log("Phase: Generation of Optimized Configuration Sets")?; let throttled_path = self.optional_config_out.clone() .or_else(|| self.facts.paths.configs.get("throttled").cloned()); if let Some(path) = throttled_path { + let config = crate::engine::formatters::throttled::ThrottledConfig { + pl1_limit: res.silicon_knee_watts, + pl2_limit: res.recommended_pl2, + trip_temp: res.max_temp_c.max(90.0), + }; crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?; - self.log(&format!("✓ Saved '{}'.", path.display()))?; - res.config_paths.insert("throttled".to_string(), path.clone()); + self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?; + res.config_paths.insert("throttled".to_string(), path); } - // Generate Fan configs via Agent Integrator - let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc")); - - let i8k_out = base_out.join("i8kmon.conf"); - if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out).is_ok() { - self.log(&format!("✓ Saved '{}'.", i8k_out.display()))?; - res.config_paths.insert("i8kmon".to_string(), i8k_out); - } - - let thinkfan_out = base_out.join("thinkfan.conf"); - if ServiceIntegrator::generate_thinkfan_config(&matrix, &thinkfan_out).is_ok() { - self.log(&format!("✓ Saved '{}'.", thinkfan_out.display()))?; - res.config_paths.insert("thinkfan".to_string(), thinkfan_out); - } - - let thermald_out = base_out.join("thermal-conf.xml"); - if ServiceIntegrator::generate_thermald_config(&matrix, &thermald_out).is_ok() { - self.log(&format!("✓ Saved '{}'.", thermald_out.display()))?; - res.config_paths.insert("thermald".to_string(), thermald_out); - } - - let script_out = base_out.join("ember-tune-neutralize.sh"); - if ServiceIntegrator::generate_conflict_resolution_script(&script_out).is_ok() { - self.log(&format!("✓ Saved conflict resolution script: '{}'", script_out.display()))?; - res.config_paths.insert("conflict_script".to_string(), script_out); - } - Ok(res) } - fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { - let abort = self.emergency_abort.clone(); - let reason_store = self.emergency_reason.clone(); - let sal = self.sal.clone(); - let tx = self.telemetry_tx.clone(); - - thread::spawn(move || { - while !abort.load(Ordering::SeqCst) { - let status = sal.get_safety_status(); - match status { - Ok(SafetyStatus::EmergencyAbort(reason)) => { - *reason_store.lock().unwrap() = Some(reason.clone()); - abort.store(true, Ordering::SeqCst); - break; - } - Ok(SafetyStatus::Warning(msg)) | Ok(SafetyStatus::Critical(msg)) => { - let state = TelemetryState { - cpu_model: String::new(), - total_ram_gb: 0, - tick: 0, - cpu_temp: 0.0, - power_w: 0.0, - current_freq: 0.0, - fans: Vec::new(), - governor: String::new(), - pl1_limit: 0.0, - pl2_limit: 0.0, - fan_tier: String::new(), - is_throttling: sal.get_throttling_status().unwrap_or(false), - phase: BenchmarkPhase::StressTesting, - history_watts: Vec::new(), - history_temp: Vec::new(), - history_mhz: Vec::new(), - log_event: Some(format!("WATCHDOG: {}", msg)), - metadata: std::collections::HashMap::new(), - is_emergency: false, - emergency_reason: None, - }; - let _ = tx.send(state); - } - Ok(SafetyStatus::Nominal) => {} - Err(e) => { - *reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e)); - abort.store(true, Ordering::SeqCst); - break; - } - } - thread::sleep(Duration::from_millis(100)); - } - }) - } - - pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { - let r_theta = self.engine.calculate_thermal_resistance(&self.profile); - let knee = self.engine.find_silicon_knee(&self.profile); - let max_t = self.engine.get_max_temp(&self.profile); - - OptimizationResult { - profile: self.profile.clone(), - silicon_knee_watts: knee, - thermal_resistance_kw: r_theta, - recommended_pl1: knee, - recommended_pl2: knee * 1.25, - max_temp_c: max_t, - is_partial, - config_paths: std::collections::HashMap::new(), - optimization_matrix: None, - } - } - - fn check_abort(&self) -> Result<()> { + /// Checks if the safety watchdog or user triggered an abort. + fn check_safety_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { - let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); - return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason)); + let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string()); + bail!("EMERGENCY_ABORT: {}", reason); } if let Ok(cmd) = self.command_rx.try_recv() { match cmd { - UiCommand::Abort => { - return Err(anyhow::anyhow!("ABORTED")); - } + UiCommand::Abort => bail!("ABORTED"), } } Ok(()) @@ -456,12 +364,12 @@ impl BenchmarkOrchestrator { power_w: self.sal.get_power_w().unwrap_or(0.0), current_freq: self.sal.get_freq_mhz().unwrap_or(0.0), fans: self.sal.get_fan_rpms().unwrap_or_default(), - governor: "unknown".to_string(), + governor: "performance".to_string(), pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(), is_throttling: self.sal.get_throttling_status().unwrap_or(false), - phase: self.phase, + phase: self.ui_phase, history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(), @@ -477,7 +385,6 @@ impl BenchmarkOrchestrator { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); let freq = self.sal.get_freq_mhz().unwrap_or(0.0); - let throttling = self.sal.get_throttling_status().unwrap_or(false); self.history_temp.push_back(temp); self.history_watts.push_back(pwr); @@ -501,8 +408,8 @@ impl BenchmarkOrchestrator { pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(), - is_throttling: throttling, - phase: self.phase, + is_throttling: self.sal.get_throttling_status().unwrap_or(false), + phase: self.ui_phase, history_watts: self.history_watts.iter().cloned().collect(), history_temp: self.history_temp.iter().cloned().collect(), history_mhz: self.history_mhz.iter().cloned().collect(), @@ -513,4 +420,22 @@ impl BenchmarkOrchestrator { }; self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } + + pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { + let r_theta = self.engine.calculate_thermal_resistance(&self.profile); + let knee = self.engine.find_silicon_knee(&self.profile); + let max_t = self.engine.get_max_temp(&self.profile); + + OptimizationResult { + profile: self.profile.clone(), + silicon_knee_watts: knee, + thermal_resistance_kw: r_theta, + recommended_pl1: knee, + recommended_pl2: knee * 1.25, + max_temp_c: max_t, + is_partial, + config_paths: std::collections::HashMap::new(), + optimization_matrix: None, + } + } } diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index be78de1..82fde52 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,11 +1,12 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; +use std::thread; use std::sync::Mutex; -use tracing::{debug, warn}; +use tracing::{info, debug}; use crate::sal::heuristic::discovery::SystemFactSheet; /// Implementation of the System Abstraction Layer for the Dell XPS 13 9380. @@ -15,30 +16,66 @@ pub struct DellXps9380Sal { temp_path: PathBuf, pwr_path: PathBuf, fan_paths: Vec, + pwm_paths: Vec, + pwm_enable_paths: Vec, + pl1_paths: Vec, + pl2_paths: Vec, freq_path: PathBuf, - pl1_path: PathBuf, - pl2_path: PathBuf, last_poll: Mutex, last_temp: Mutex, last_fans: Mutex>, - suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, last_watts: Mutex, - - // --- Original State for Restoration --- - original_pl1: Mutex>, - original_pl2: Mutex>, - original_fan_mode: Mutex>, } impl DellXps9380Sal { - /// Initializes the Dell SAL, opening the MSR interface and discovering sensors. + /// Initializes the Dell SAL, opening the MSR interface and discovering sensors and PWM nodes. pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result { let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?; let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?; let fan_paths = facts.fan_paths.clone(); + // 1. Discover PWM and Enable nodes associated with the fan paths + let mut pwm_paths = Vec::new(); + let mut pwm_enable_paths = Vec::new(); + for fan_p in &fan_paths { + if let Some(parent) = fan_p.parent() { + let fan_file = fan_p.file_name().and_then(|n| n.to_str()).unwrap_or(""); + let fan_idx = fan_file.chars().filter(|c| c.is_ascii_digit()).collect::(); + let idx = if fan_idx.is_empty() { "1".to_string() } else { fan_idx }; + + let pwm_p = parent.join(format!("pwm{}", idx)); + if pwm_p.exists() { pwm_paths.push(pwm_p); } + + let enable_p = parent.join(format!("pwm{}_enable", idx)); + if enable_p.exists() { pwm_enable_paths.push(enable_p); } + } + } + + // 2. Map all RAPL constraints + let mut pl1_paths = Vec::new(); + let mut pl2_paths = Vec::new(); + for rapl_p in &facts.rapl_paths { + pl1_paths.push(rapl_p.join("constraint_0_power_limit_uw")); + pl2_paths.push(rapl_p.join("constraint_1_power_limit_uw")); + } + + // 3. Physical Sensor Verification & Warm Cache Priming + let mut initial_fans = Vec::new(); + for fan_p in &fan_paths { + let mut rpm = 0; + for _ in 0..3 { + if let Ok(val) = fs::read_to_string(fan_p) { + rpm = val.trim().parse::().unwrap_or(0); + if rpm > 0 { break; } + } + thread::sleep(Duration::from_millis(100)); + } + info!("SAL Warm-Start: Fan sensor {:?} -> {} RPM", fan_p, rpm); + initial_fans.push(rpm); + } + let freq_path = ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); let msr_path = ctx.sysfs_base.join("dev/cpu/0/msr"); @@ -47,25 +84,26 @@ impl DellXps9380Sal { let initial_energy = fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0); + info!("SAL: Dell XPS 9380 Initialized. ({} fans, {} RAPL nodes found)", + fan_paths.len(), facts.rapl_paths.len()); + Ok(Self { temp_path, pwr_path: pwr_base.join("power1_average"), fan_paths, + pwm_paths, + pwm_enable_paths, + pl1_paths, + pl2_paths, freq_path, - pl1_path: pwr_base.join("constraint_0_power_limit_uw"), - pl2_path: pwr_base.join("constraint_1_power_limit_uw"), last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)), last_temp: Mutex::new(0.0), - last_fans: Mutex::new(Vec::new()), - suppressed_services: Mutex::new(Vec::new()), + last_fans: Mutex::new(initial_fans), msr_file: Mutex::new(msr_file), last_energy: Mutex::new((initial_energy, Instant::now())), last_watts: Mutex::new(0.0), fact_sheet: facts, ctx, - original_pl1: Mutex::new(None), - original_pl2: Mutex::new(None), - original_fan_mode: Mutex::new(None), }) } @@ -93,7 +131,6 @@ impl PreflightAuditor for DellXps9380Sal { outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) } }); - // RAPL Lock Check (MSR 0x610) let rapl_lock = match self.read_msr(0x610) { Ok(val) => { if (val & (1 << 63)) != 0 { @@ -104,19 +141,14 @@ impl PreflightAuditor for DellXps9380Sal { }, Err(e) => Err(AuditError::ToolMissing(format!("Cannot read MSR 0x610: {}", e))), }; - steps.push(AuditStep { - description: "MSR 0x610 RAPL Lock Status".to_string(), - outcome: rapl_lock, - }); + steps.push(AuditStep { description: "MSR 0x610 RAPL Lock Status".to_string(), outcome: rapl_lock }); let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"]; for mod_name in modules { let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name)); steps.push(AuditStep { description: format!("Kernel Module: {}", mod_name), - outcome: if path.exists() { Ok(()) } else { - Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) - } + outcome: if path.exists() { Ok(()) } else { Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) } }); } @@ -138,9 +170,7 @@ impl PreflightAuditor for DellXps9380Sal { let ac_status = fs::read_to_string(ac_status_path).unwrap_or_else(|_| "0".to_string()); steps.push(AuditStep { description: "AC Power Connection".to_string(), - outcome: if ac_status.trim() == "1" { Ok(()) } else { - Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) - } + outcome: if ac_status.trim() == "1" { Ok(()) } else { Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) } }); Box::new(steps.into_iter()) @@ -148,49 +178,16 @@ impl PreflightAuditor for DellXps9380Sal { } impl EnvironmentGuard for DellXps9380Sal { - fn suppress(&self) -> Result<()> { - if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { - *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); - } - if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { - *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); - } - *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); - - let services = ["tlp", "thermald", "i8kmon"]; - let mut suppressed = self.suppressed_services.lock().unwrap(); - for s in services { - if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { - let _ = self.ctx.runner.run("systemctl", &["stop", s]); - suppressed.push(s.to_string()); - } - } - Ok(()) - } - - fn restore(&self) -> Result<()> { - if let Some(pl1) = *self.original_pl1.lock().unwrap() { - let _ = fs::write(&self.pl1_path, pl1.to_string()); - } - if let Some(pl2) = *self.original_pl2.lock().unwrap() { - let _ = fs::write(&self.pl2_path, pl2.to_string()); - } - if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { - let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); - } - let mut suppressed = self.suppressed_services.lock().unwrap(); - for s in suppressed.drain(..) { - let _ = self.ctx.runner.run("systemctl", &["start", &s]); - } - Ok(()) - } + fn suppress(&self) -> Result<()> { Ok(()) } + fn restore(&self) -> Result<()> { Ok(()) } } impl SensorBus for DellXps9380Sal { fn get_temp(&self) -> Result { let mut last_poll = self.last_poll.lock().unwrap(); let now = Instant::now(); - if now.duration_since(*last_poll) < Duration::from_millis(1000) { + // # SAFETY: High frequency polling for watchdog + if now.duration_since(*last_poll) < Duration::from_millis(100) { return Ok(*self.last_temp.lock().unwrap()); } let s = fs::read_to_string(&self.temp_path)?; @@ -201,7 +198,7 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - let rapl_base = self.pl1_path.parent().context("RAPL path error")?; + let rapl_base = self.fact_sheet.rapl_paths.first().context("RAPL path error")?; let energy_path = rapl_base.join("energy_uj"); if energy_path.exists() { @@ -212,14 +209,9 @@ impl SensorBus for DellXps9380Sal { let e2 = e2_str.trim().parse::()?; let t2 = Instant::now(); let (e1, t1) = *last_energy; - let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); - - if delta_t < 0.1 { - return Ok(*last_watts); // Return cached if polled too fast - } - + if delta_t < 0.1 { return Ok(*last_watts); } let watts = (delta_e as f32 / 1_000_000.0) / delta_t; *last_energy = (e2, t2); *last_watts = watts; @@ -236,12 +228,27 @@ impl SensorBus for DellXps9380Sal { if now.duration_since(*last_poll) < Duration::from_millis(1000) { return Ok(self.last_fans.lock().unwrap().clone()); } + let mut fans = Vec::new(); for path in &self.fan_paths { - if let Ok(s) = fs::read_to_string(path) { - if let Ok(rpm) = s.trim().parse::() { fans.push(rpm); } + let mut val = 0; + for i in 0..5 { + match fs::read_to_string(path) { + Ok(s) => { + if let Ok(rpm) = s.trim().parse::() { + val = rpm; + if rpm > 0 { break; } + } + }, + Err(e) => { + debug!("SAL: Fan poll retry {} for {:?} failed: {}", i+1, path, e); + } + } + thread::sleep(Duration::from_millis(150)); } + fans.push(val); } + *self.last_fans.lock().unwrap() = fans.clone(); *last_poll = now; Ok(fans) @@ -253,7 +260,6 @@ impl SensorBus for DellXps9380Sal { } fn get_throttling_status(&self) -> Result { - // MSR 0x19C bit 0 is "Thermal Status", bit 1 is "Thermal Log" let val = self.read_msr(0x19C)?; Ok((val & 0x1) != 0) } @@ -266,24 +272,47 @@ impl ActuatorBus for DellXps9380Sal { let tool_str = tool_path.to_string_lossy(); match mode { - "max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; } + "max" | "Manual" => { + self.ctx.runner.run(&tool_str, &["0"])?; + // Disabling BIOS control requires immediate PWM override + self.set_fan_speed(FanSpeedPercent::new(100)?)?; + } "auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; } _ => {} } Ok(()) } - fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { + let pwm_val = ((speed.get() as u32 * 255) / 100) as u8; + for p in &self.pwm_enable_paths { let _ = fs::write(p, "1"); } + for path in &self.pwm_paths { let _ = fs::write(path, pwm_val.to_string()); } Ok(()) } - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - fs::write(&self.pl1_path, limit.as_u64().to_string())?; + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for path in &self.pl1_paths { + debug!("SAL: Applying PL1 ({:.1}W) to {:?}", limit.get(), path); + fs::write(path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL1 to {:?}", path))?; + if let Some(parent) = path.parent() { + let enable_p = parent.join("constraint_0_enabled"); + let _ = fs::write(&enable_p, "1"); + } + } Ok(()) } - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - fs::write(&self.pl2_path, limit.as_u64().to_string())?; + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for path in &self.pl2_paths { + debug!("SAL: Applying PL2 ({:.1}W) to {:?}", limit.get(), path); + fs::write(path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL2 to {:?}", path))?; + if let Some(parent) = path.parent() { + let enable_p = parent.join("constraint_1_enabled"); + let _ = fs::write(&enable_p, "1"); + } + } Ok(()) } } @@ -305,7 +334,5 @@ impl HardwareWatchdog for DellXps9380Sal { } impl Drop for DellXps9380Sal { - fn drop(&mut self) { - let _ = self.restore(); - } + fn drop(&mut self) { } } diff --git a/src/sal/discovery.rs b/src/sal/discovery.rs new file mode 100644 index 0000000..51c8df6 --- /dev/null +++ b/src/sal/discovery.rs @@ -0,0 +1,148 @@ +//! # Hardware Discovery Engine (Agent Sentinel) +//! +//! This module provides dynamic traversal of `/sys/class/hwmon` and `/sys/class/powercap` +//! to locate sensors and actuators without relying on hardcoded indices. + +use anyhow::{Result, Context, anyhow}; +use std::fs; +use std::path::{Path, PathBuf}; +use tracing::{debug, info, warn}; + +/// Result of a successful hardware discovery. +#[derive(Debug, Clone)] +pub struct DiscoveredHardware { + /// Path to the primary package temperature sensor input. + pub temp_input: PathBuf, + /// Paths to all detected fan RPM inputs. + pub fan_inputs: Vec, + /// Paths to all detected fan PWM control nodes. + pub pwm_controls: Vec, + /// Paths to all detected fan PWM enable nodes. + pub pwm_enables: Vec, + /// Paths to RAPL power limit constraint files. + pub rapl_paths: Vec, +} + +pub struct DiscoveryEngine; + +impl DiscoveryEngine { + /// Performs a full traversal of the sysfs hardware tree. + pub fn run(sysfs_root: &Path) -> Result { + info!("Sentinel: Starting dynamic hardware discovery..."); + + let hwmon_path = sysfs_root.join("sys/class/hwmon"); + let (temp_input, fan_info) = Self::discover_hwmon(&hwmon_path)?; + + let powercap_path = sysfs_root.join("sys/class/powercap"); + let rapl_paths = Self::discover_rapl(&powercap_path)?; + + let hardware = DiscoveredHardware { + temp_input, + fan_inputs: fan_info.rpm_inputs, + pwm_controls: fan_info.pwm_controls, + pwm_enables: fan_info.pwm_enables, + rapl_paths, + }; + + info!("Sentinel: Discovery complete. Found {} fans and {} RAPL nodes.", + hardware.fan_inputs.len(), hardware.rapl_paths.len()); + + Ok(hardware) + } + + fn discover_hwmon(base: &Path) -> Result<(PathBuf, FanHardware)> { + let mut best_temp: Option<(u32, PathBuf)> = None; + let mut fans = FanHardware::default(); + + let entries = fs::read_dir(base) + .with_context(|| format!("Failed to read hwmon base: {:?}", base))?; + + for entry in entries.flatten() { + let path = entry.path(); + let driver_name = fs::read_to_string(path.join("name")) + .map(|s| s.trim().to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + + debug!("Discovery: Probing hwmon node {:?} (driver: {})", path, driver_name); + + // 1. Temperature Discovery + let temp_priority = match driver_name.as_str() { + "coretemp" | "zenpower" => 10, + "k10temp" => 9, + "dell_smm" => 8, + "acpitz" => 1, + _ => 5, + }; + + if let Ok(hw_entries) = fs::read_dir(&path) { + for hw_entry in hw_entries.flatten() { + let file_name = hw_entry.file_name().to_string_lossy().to_string(); + + // Temperature Inputs + if file_name.starts_with("temp") && file_name.ends_with("_input") { + let label_path = path.join(file_name.replace("_input", "_label")); + let label = fs::read_to_string(label_path).unwrap_or_default().trim().to_string(); + + let label_priority = if label.contains("Package") || label.contains("Tdie") { + 2 + } else { + 0 + }; + + let total_priority = temp_priority + label_priority; + if best_temp.is_none() || total_priority > best_temp.as_ref().unwrap().0 { + best_temp = Some((total_priority, hw_entry.path())); + } + } + + // Fan Inputs + if file_name.starts_with("fan") && file_name.ends_with("_input") { + fans.rpm_inputs.push(hw_entry.path()); + } + + // PWM Controls + if file_name.starts_with("pwm") && !file_name.contains("_") { + fans.pwm_controls.push(hw_entry.path()); + } + + // PWM Enables + if file_name.starts_with("pwm") && file_name.ends_with("_enable") { + fans.pwm_enables.push(hw_entry.path()); + } + } + } + } + + let temp_input = best_temp.map(|(_, p)| p) + .ok_or_else(|| anyhow!("Failed to locate any valid temperature sensor in /sys/class/hwmon/"))?; + + Ok((temp_input, fans)) + } + + fn discover_rapl(base: &Path) -> Result> { + let mut paths = Vec::new(); + if !base.exists() { + warn!("Discovery: /sys/class/powercap does not exist."); + return Ok(paths); + } + + let entries = fs::read_dir(base)?; + for entry in entries.flatten() { + let path = entry.path(); + let name = fs::read_to_string(path.join("name")).unwrap_or_default().trim().to_string(); + + if name.contains("package") || name.contains("intel-rapl") { + paths.push(path); + } + } + + Ok(paths) + } +} + +#[derive(Default)] +struct FanHardware { + rpm_inputs: Vec, + pwm_controls: Vec, + pwm_enables: Vec, +} diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 767dbe7..3456794 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -1,11 +1,12 @@ -use anyhow::{Result, anyhow}; +use anyhow::{Result, anyhow, Context}; use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; -use std::sync::Mutex; +use std::sync::{Mutex, Arc}; +use tracing::{debug, warn, info}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; @@ -13,14 +14,9 @@ pub struct GenericLinuxSal { ctx: EnvironmentCtx, fact_sheet: SystemFactSheet, db: HardwareDb, - suppressed_services: Mutex>, last_valid_temp: Mutex<(f32, Instant)>, current_pl1: Mutex, last_energy: Mutex<(u64, Instant)>, - - // --- Original State for Restoration --- - original_pl1: Mutex>, - original_pl2: Mutex>, } impl GenericLinuxSal { @@ -33,14 +29,11 @@ impl GenericLinuxSal { Self { db, - suppressed_services: Mutex::new(Vec::new()), last_valid_temp: Mutex::new((0.0, Instant::now())), current_pl1: Mutex::new(15_000_000), last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, - original_pl1: Mutex::new(None), - original_pl2: Mutex::new(None), } } @@ -135,7 +128,6 @@ impl SensorBus for GenericLinuxSal { } fn get_throttling_status(&self) -> Result { - // Fallback: check if any cooling device is active (cur_state > 0) let cooling_base = self.ctx.sysfs_base.join("sys/class/thermal"); if let Ok(entries) = fs::read_dir(cooling_base) { for entry in entries.flatten() { @@ -168,68 +160,37 @@ impl ActuatorBus for GenericLinuxSal { } else { Ok(()) } } - fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) } - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; - fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?; - *self.current_pl1.lock().unwrap() = limit.as_u64(); + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for rapl_path in &self.fact_sheet.rapl_paths { + let limit_path = rapl_path.join("constraint_0_power_limit_uw"); + let enable_path = rapl_path.join("constraint_0_enabled"); + fs::write(&limit_path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL1 to {:?}", limit_path))?; + let _ = fs::write(&enable_path, "1"); + } + *self.current_pl1.lock().unwrap() = limit.as_microwatts(); Ok(()) } - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?; - fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?; + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for rapl_path in &self.fact_sheet.rapl_paths { + let limit_path = rapl_path.join("constraint_1_power_limit_uw"); + let enable_path = rapl_path.join("constraint_1_enabled"); + fs::write(&limit_path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL2 to {:?}", limit_path))?; + let _ = fs::write(&enable_path, "1"); + } Ok(()) } } impl EnvironmentGuard for GenericLinuxSal { - fn suppress(&self) -> Result<()> { - // Snapshot Power Limits - if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { - if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) { - *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); - } - if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) { - *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); - } - } - - let mut suppressed = self.suppressed_services.lock().unwrap(); - for conflict_id in &self.fact_sheet.active_conflicts { - if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { - for service in &conflict.services { - if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() { - let _ = self.ctx.runner.run("systemctl", &["stop", service]); - suppressed.push(service.clone()); - } - } - } - } - Ok(()) - } - - fn restore(&self) -> Result<()> { - // Restore Power Limits - if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { - if let Some(pl1) = *self.original_pl1.lock().unwrap() { - let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string()); - } - if let Some(pl2) = *self.original_pl2.lock().unwrap() { - let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string()); - } - } - - let mut suppressed = self.suppressed_services.lock().unwrap(); - for service in suppressed.drain(..) { - let _ = self.ctx.runner.run("systemctl", &["start", &service]); - } - if self.is_dell() { let _ = self.set_fan_mode("auto"); } - Ok(()) - } + fn suppress(&self) -> Result<()> { Ok(()) } + fn restore(&self) -> Result<()> { Ok(()) } } impl HardwareWatchdog for GenericLinuxSal { @@ -245,7 +206,3 @@ impl HardwareWatchdog for GenericLinuxSal { Ok(SafetyStatus::Nominal) } } - -impl Drop for GenericLinuxSal { - fn drop(&mut self) { let _ = self.restore(); } -} diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs index 3dce223..77803df 100644 --- a/src/sal/heuristic/discovery.rs +++ b/src/sal/heuristic/discovery.rs @@ -6,7 +6,7 @@ use std::sync::mpsc; use std::collections::HashMap; use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking}; use crate::sys::SyscallRunner; -use tracing::{debug, warn}; +use tracing::{debug, warn, info}; /// Registry of dynamically discovered paths for configs and tools. #[derive(Debug, Clone, Default)] @@ -24,6 +24,7 @@ pub struct SystemFactSheet { pub fan_paths: Vec, pub rapl_paths: Vec, pub active_conflicts: Vec, + pub conflict_services: Vec, pub paths: PathRegistry, pub bench_config: Option, } @@ -44,12 +45,17 @@ pub fn discover_facts( let rapl_paths = discover_rapl(base_path, &discovery.actuators); let mut active_conflicts = Vec::new(); + let mut conflict_services = Vec::new(); for conflict in conflicts { + let mut found_active = false; for service in &conflict.services { if is_service_active(runner, service) { - debug!("Detected active conflict: {} (Service: {})", conflict.id, service); - active_conflicts.push(conflict.id.clone()); - break; + if !found_active { + debug!("Detected active conflict: {} (Service: {})", conflict.id, service); + active_conflicts.push(conflict.id.clone()); + found_active = true; + } + conflict_services.push(service.clone()); } } } @@ -57,13 +63,7 @@ pub fn discover_facts( let paths = discover_paths(base_path, discovery); SystemFactSheet { - vendor, - model, - temp_path, - fan_paths, - rapl_paths, - active_conflicts, - paths, + vendor, model, temp_path, fan_paths, rapl_paths, active_conflicts, conflict_services, paths, bench_config: Some(bench_config), } } @@ -71,7 +71,6 @@ pub fn discover_facts( fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { let mut registry = PathRegistry::default(); - // 1. Discover Tools via PATH for (id, binary_name) in &discovery.tools { if let Ok(path) = which::which(binary_name) { debug!("Discovered tool: {} -> {:?}", id, path); @@ -79,7 +78,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { } } - // 2. Discover Configs via existence check for (id, candidates) in &discovery.configs { for candidate in candidates { let path = if candidate.starts_with('/') { @@ -104,12 +102,11 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { registry } -/// Reads DMI information from sysfs with a safety timeout. fn read_dmi_info(base_path: &Path) -> (String, String) { - let vendor = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/sys_vendor"), Duration::from_millis(100)) - .unwrap_or_else(|| "Unknown".to_string()); - let model = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/product_name"), Duration::from_millis(100)) - .unwrap_or_else(|| "Unknown".to_string()); + let vendor = fs::read_to_string(base_path.join("sys/class/dmi/id/sys_vendor")) + .map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string()); + let model = fs::read_to_string(base_path.join("sys/class/dmi/id/product_name")) + .map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string()); (vendor, model) } @@ -119,49 +116,62 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, let mut fan_candidates = Vec::new(); let hwmon_base = base_path.join("sys/class/hwmon"); - let entries = match fs::read_dir(&hwmon_base) { - Ok(e) => e, - Err(e) => { - warn!("Could not read {:?}: {}", hwmon_base, e); - return (None, Vec::new()); - } - }; + let entries = fs::read_dir(&hwmon_base).map_err(|e| { + warn!("Could not read {:?}: {}", hwmon_base, e); + e + }).ok(); - for entry in entries.flatten() { - let hwmon_path = entry.path(); - - let driver_name = read_sysfs_with_timeout(&hwmon_path.join("name"), Duration::from_millis(100)) - .unwrap_or_default(); + if let Some(entries) = entries { + for entry in entries.flatten() { + let hwmon_path = entry.path(); + + // # SAFETY: Read driver name directly. This file is virtual and never blocks. + // Using a timeout wrapper here was causing discovery to fail if the thread-pool lagged. + let driver_name = fs::read_to_string(hwmon_path.join("name")) + .map(|s| s.trim().to_string()).unwrap_or_default(); - let priority = cfg.hwmon_priority - .iter() - .position(|p| p == &driver_name) - .unwrap_or(usize::MAX); + let priority = cfg.hwmon_priority + .iter() + .position(|p| driver_name.contains(p)) + .unwrap_or(usize::MAX); - if let Ok(hw_entries) = fs::read_dir(&hwmon_path) { - for hw_entry in hw_entries.flatten() { - let file_name = hw_entry.file_name().into_string().unwrap_or_default(); - - if file_name.starts_with("temp") && file_name.ends_with("_label") { - if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { - if cfg.temp_labels.iter().any(|l| label.contains(l)) { - let input_path = hwmon_path.join(file_name.replace("_label", "_input")); - if input_path.exists() { - temp_candidates.push((priority, input_path)); + if let Ok(hw_entries) = fs::read_dir(&hwmon_path) { + for hw_entry in hw_entries.flatten() { + let file_name = hw_entry.file_name().into_string().unwrap_or_default(); + + // 1. Temperatures + if file_name.starts_with("temp") && file_name.ends_with("_label") { + if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) { + if cfg.temp_labels.iter().any(|l| label.contains(l)) { + let input_path = hwmon_path.join(file_name.replace("_label", "_input")); + if input_path.exists() { + temp_candidates.push((priority, input_path)); + } } } } - } - if file_name.starts_with("fan") && file_name.ends_with("_label") { - if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { - if cfg.fan_labels.iter().any(|l| label.contains(l)) { - let input_path = hwmon_path.join(file_name.replace("_label", "_input")); - if input_path.exists() { - fan_candidates.push((priority, input_path)); + // 2. Fans (Label Match) + if file_name.starts_with("fan") && file_name.ends_with("_label") { + if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) { + if cfg.fan_labels.iter().any(|l| label.contains(l)) { + let input_path = hwmon_path.join(file_name.replace("_label", "_input")); + if input_path.exists() { + debug!("Discovered fan by label: {:?} (priority {})", input_path, priority); + fan_candidates.push((priority, input_path)); + } } } } + + // 3. Fans (Priority Fallback - CRITICAL FOR DELL 9380) + // If we found a priority driver (e.g., dell_smm), we take every fan*_input we find. + if priority < usize::MAX && file_name.starts_with("fan") && file_name.ends_with("_input") { + if !fan_candidates.iter().any(|(_, p)| p == &hw_entry.path()) { + info!("Heuristic Discovery: Force-adding unlabeled fan sensor from priority driver '{}': {:?}", driver_name, hw_entry.path()); + fan_candidates.push((priority, hw_entry.path())); + } + } } } } @@ -171,45 +181,45 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, fan_candidates.sort_by_key(|(p, _)| *p); let best_temp = temp_candidates.first().map(|(_, p)| p.clone()); - let best_fans = fan_candidates.into_iter().map(|(_, p)| p).collect(); + let best_fans: Vec = fan_candidates.into_iter().map(|(_, p)| p).collect(); + + if best_fans.is_empty() { + warn!("Heuristic Discovery: No fan RPM sensors found."); + } else { + info!("Heuristic Discovery: Final registry contains {} fan sensors.", best_fans.len()); + } (best_temp, best_fans) } -/// Discovers RAPL powercap paths. fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec { let mut paths = Vec::new(); let powercap_base = base_path.join("sys/class/powercap"); - let entries = match fs::read_dir(&powercap_base) { - Ok(e) => e, - Err(_) => return Vec::new(), - }; - - for entry in entries.flatten() { - let path = entry.path(); - let dir_name = entry.file_name().into_string().unwrap_or_default(); - - if cfg.rapl_paths.contains(&dir_name) { - paths.push(path); - continue; - } - - if let Some(name) = read_sysfs_with_timeout(&path.join("name"), Duration::from_millis(100)) { - if cfg.rapl_paths.iter().any(|p| p == &name) { + if let Ok(entries) = fs::read_dir(&powercap_base) { + for entry in entries.flatten() { + let path = entry.path(); + let dir_name = entry.file_name().into_string().unwrap_or_default(); + + if cfg.rapl_paths.contains(&dir_name) { paths.push(path); + continue; + } + + if let Ok(name) = fs::read_to_string(path.join("name")) { + if cfg.rapl_paths.iter().any(|p| p == name.trim()) { + paths.push(path); + } } } } paths } -/// Checks if a systemd service is currently active using the injected runner. pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool { runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() } -/// Helper to read a sysfs file with a timeout. fn read_sysfs_with_timeout(path: &Path, timeout: Duration) -> Option { let (tx, rx) = mpsc::channel(); let path_buf = path.to_path_buf(); diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 079a982..6a9b3b1 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,6 +1,7 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use anyhow::Result; +use std::sync::Arc; pub struct MockSal { pub temperature_sequence: std::sync::atomic::AtomicUsize, @@ -17,65 +18,36 @@ impl MockSal { impl PreflightAuditor for MockSal { fn audit(&self) -> Box + '_> { let steps = vec![ - AuditStep { - description: "Mock Root Privileges".to_string(), - outcome: Ok(()), - }, - AuditStep { - description: "Mock AC Power Status".to_string(), - outcome: Ok(()), - }, + AuditStep { description: "Mock Root Privileges".to_string(), outcome: Ok(()) }, + AuditStep { description: "Mock AC Power Status".to_string(), outcome: Ok(()) }, ]; Box::new(steps.into_iter()) } } impl EnvironmentGuard for MockSal { - fn suppress(&self) -> Result<()> { - Ok(()) - } - fn restore(&self) -> Result<()> { - Ok(()) - } + fn suppress(&self) -> Result<()> { Ok(()) } + fn restore(&self) -> Result<()> { Ok(()) } } impl SensorBus for MockSal { fn get_temp(&self) -> Result { - // Support dynamic sequence for Step 5 let seq = self.temperature_sequence.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - Ok(40.0 + (seq as f32 * 0.5).min(50.0)) // Heats up from 40 to 90 - } - fn get_power_w(&self) -> Result { - Ok(15.0) - } - fn get_fan_rpms(&self) -> Result> { - Ok(vec![2500]) - } - fn get_freq_mhz(&self) -> Result { - Ok(3200.0) - } - fn get_throttling_status(&self) -> Result { - Ok(self.get_temp()? > 90.0) + Ok(40.0 + (seq as f32 * 0.5).min(55.0)) } + fn get_power_w(&self) -> Result { Ok(15.0) } + fn get_fan_rpms(&self) -> Result> { Ok(vec![2500, 2400]) } + fn get_freq_mhz(&self) -> Result { Ok(3200.0) } + fn get_throttling_status(&self) -> Result { Ok(false) } } impl ActuatorBus for MockSal { - fn set_fan_mode(&self, _mode: &str) -> Result<()> { - Ok(()) - } - fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { - Ok(()) - } - fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { - Ok(()) - } - fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { - Ok(()) - } + fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } + fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) } + fn set_sustained_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) } + fn set_burst_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) } } impl HardwareWatchdog for MockSal { - fn get_safety_status(&self) -> Result { - Ok(SafetyStatus::Nominal) - } + fn get_safety_status(&self) -> Result { Ok(SafetyStatus::Nominal) } } diff --git a/src/sal/mod.rs b/src/sal/mod.rs index d2f276f..a8cd205 100644 --- a/src/sal/mod.rs +++ b/src/sal/mod.rs @@ -4,3 +4,4 @@ pub mod dell_xps_9380; pub mod generic_linux; pub mod heuristic; pub mod safety; +pub mod discovery; diff --git a/src/sal/safety.rs b/src/sal/safety.rs index f33689d..88c641a 100644 --- a/src/sal/safety.rs +++ b/src/sal/safety.rs @@ -8,68 +8,81 @@ use anyhow::{Result, bail, Context}; use std::collections::HashMap; use std::fs; use std::path::{PathBuf}; -use tracing::{info, warn, error}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; +use std::thread; +use tracing::{info, warn, error, debug}; + +use crate::sal::traits::SensorBus; // --- 1. Type-Driven Bounds Checking --- -/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct TdpLimitMicroWatts(u64); +/// Represents a validated TDP limit in Watts. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] +pub struct PowerLimitWatts(f32); -impl TdpLimitMicroWatts { - /// # SAFETY: - /// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability. - pub const MIN_SAFE_UW: u64 = 5_000_000; - /// # SAFETY: - /// Values above 80W can exceed the thermal and electrical design limits of XPS chassis. - pub const MAX_SAFE_UW: u64 = 80_000_000; +impl PowerLimitWatts { + /// Absolute safety floor. Setting TDP below 3W can induce system-wide + /// CPU stalls and I/O deadlocks on certain Intel mobile chipsets. + pub const MIN: f32 = 3.0; + /// Safety ceiling for mobile thin-and-light chassis. + pub const MAX: f32 = 100.0; - /// Validates and constructs a new TDP limit. - pub fn new(microwatts: u64) -> Result { - if microwatts < Self::MIN_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts); + /// Validates and constructs a new PowerLimitWatts. + pub fn try_new(watts: f32) -> Result { + if watts < Self::MIN || watts > Self::MAX { + bail!("HardwareSafetyError: Requested TDP {:.1}W is outside safe bounds ({:.1}W - {:.1}W).", watts, Self::MIN, Self::MAX); } - if microwatts > Self::MAX_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts); - } - Ok(Self(microwatts)) + Ok(Self(watts)) } pub fn from_watts(watts: f32) -> Result { - Self::new((watts * 1_000_000.0) as u64) + Self::try_new(watts) } - pub fn as_u64(&self) -> u64 { self.0 } + pub fn get(&self) -> f32 { self.0 } + pub fn as_microwatts(&self) -> u64 { (self.0 * 1_000_000.0) as u64 } } -/// Represents a fan speed percentage (0-100%). +/// Represents a validated fan speed percentage. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct FanSpeedPercentage(u8); +pub struct FanSpeedPercent(u8); -impl FanSpeedPercentage { - pub fn new(percent: u8) -> Result { +impl FanSpeedPercent { + pub fn try_new(percent: u8) -> Result { if percent > 100 { bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent); } Ok(Self(percent)) } - pub fn as_u8(&self) -> u8 { self.0 } + + pub fn new(percent: u8) -> Result { + Self::try_new(percent) + } + + pub fn get(&self) -> u8 { self.0 } } -/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C). +/// Represents a thermal threshold in Celsius. #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] pub struct ThermalThresholdCelsius(f32); impl ThermalThresholdCelsius { pub const MAX_SAFE_C: f32 = 98.0; - pub fn new(celsius: f32) -> Result { + pub fn try_new(celsius: f32) -> Result { if celsius > Self::MAX_SAFE_C { - bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius); + bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit ({}C).", celsius, Self::MAX_SAFE_C); } Ok(Self(celsius)) } - pub fn as_f32(&self) -> f32 { self.0 } + + pub fn new(celsius: f32) -> Result { + Self::try_new(celsius) + } + + pub fn get(&self) -> f32 { self.0 } } // --- 2. The HardwareStateGuard (RAII Restorer) --- @@ -78,6 +91,7 @@ impl ThermalThresholdCelsius { pub type RollbackAction = Box; /// Holds a snapshot of the system state. Restores everything on Drop. +/// This is the primary safety mechanism for Project Iron-Ember. pub struct HardwareStateGuard { /// Maps sysfs paths to their original string contents. snapshots: HashMap, @@ -90,6 +104,9 @@ pub struct HardwareStateGuard { impl HardwareStateGuard { /// Snapshots the requested files and neutralizes competing services. + /// + /// # SAFETY: + /// This MUST be acquired before any hardware mutation occurs. pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result { let mut snapshots = HashMap::new(); let mut suppressed = Vec::new(); @@ -101,10 +118,13 @@ impl HardwareStateGuard { let content = fs::read_to_string(path) .with_context(|| format!("Failed to snapshot {:?}", path))?; snapshots.insert(path.clone(), content.trim().to_string()); + } else { + debug!("USA: Skipping snapshot for non-existent path {:?}", path); } } for svc in target_services { + // Check if service is active before stopping let status = std::process::Command::new("systemctl") .args(["is-active", "--quiet", svc]) .status(); @@ -168,7 +188,75 @@ impl Drop for HardwareStateGuard { } } -// --- 3. Transactional Configuration --- +// --- 3. The Active Watchdog --- + +/// A standalone monitor that polls hardware thermals at high frequency. +pub struct ThermalWatchdog { + cancel_token: Arc, + handle: Option>, +} + +impl ThermalWatchdog { + /// If temperature exceeds this ceiling, the watchdog triggers an emergency shutdown. + pub const CRITICAL_TEMP: f32 = 95.0; + /// High polling rate ensures we catch runaways before chassis saturation. + pub const POLL_INTERVAL: Duration = Duration::from_millis(250); + + /// Spawns the watchdog thread. + pub fn spawn(sensors: Arc, cancel_token: Arc) -> Self { + let ct = cancel_token.clone(); + let handle = thread::spawn(move || { + let mut last_temp = 0.0; + loop { + if ct.load(Ordering::SeqCst) { + debug!("Watchdog: Shutdown signal received."); + break; + } + + match sensors.get_temp() { + Ok(temp) => { + // Rate of change check (dT/dt) + let dt_dt = temp - last_temp; + if temp >= Self::CRITICAL_TEMP { + error!("WATCHDOG: CRITICAL THERMAL EVENT ({:.1}C). Triggering emergency abort!", temp); + ct.store(true, Ordering::SeqCst); + break; + } + + if dt_dt > 5.0 && temp > 85.0 { + warn!("WATCHDOG: Dangerous thermal ramp detected (+{:.1}C in 250ms).", dt_dt); + } + + last_temp = temp; + } + Err(e) => { + error!("WATCHDOG: Sensor read failure: {}. Aborting for safety!", e); + ct.store(true, Ordering::SeqCst); + break; + } + } + + thread::sleep(Self::POLL_INTERVAL); + } + }); + + Self { + cancel_token, + handle: Some(handle), + } + } +} + +impl Drop for ThermalWatchdog { + fn drop(&mut self) { + self.cancel_token.store(true, Ordering::SeqCst); + if let Some(h) = self.handle.take() { + let _ = h.join(); + } + } +} + +// --- 4. Transactional Configuration --- /// A staged set of changes to be applied to the hardware. #[derive(Default)] diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 996b4e6..7cd1367 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -115,30 +115,20 @@ impl EnvironmentGuard for Arc { } } +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; + /// Provides a read-only interface to system telemetry sensors. pub trait SensorBus: Send + Sync { /// Returns the current package temperature in degrees Celsius. - /// - /// # Errors - /// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read. fn get_temp(&self) -> Result; /// Returns the current package power consumption in Watts. - /// - /// # Errors - /// Returns an error if the underlying RAPL or power sensor cannot be read. fn get_power_w(&self) -> Result; /// Returns the current speed of all detected fans in RPM. - /// - /// # Errors - /// Returns an error if the fan sensor nodes cannot be read. fn get_fan_rpms(&self) -> Result>; /// Returns the current average CPU frequency in MHz. - /// - /// # Errors - /// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read. fn get_freq_mhz(&self) -> Result; /// Returns true if the system is currently thermally throttling. @@ -146,53 +136,33 @@ pub trait SensorBus: Send + Sync { } impl SensorBus for Arc { - fn get_temp(&self) -> Result { - (**self).get_temp() - } - fn get_power_w(&self) -> Result { - (**self).get_power_w() - } - fn get_fan_rpms(&self) -> Result> { - (**self).get_fan_rpms() - } - fn get_freq_mhz(&self) -> Result { - (**self).get_freq_mhz() - } - fn get_throttling_status(&self) -> Result { - (**self).get_throttling_status() - } + fn get_temp(&self) -> Result { (**self).get_temp() } + fn get_power_w(&self) -> Result { (**self).get_power_w() } + fn get_fan_rpms(&self) -> Result> { (**self).get_fan_rpms() } + fn get_freq_mhz(&self) -> Result { (**self).get_freq_mhz() } + fn get_throttling_status(&self) -> Result { (**self).get_throttling_status() } } -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; - /// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { /// Sets the fan control mode (e.g., "auto" or "max"). fn set_fan_mode(&self, mode: &str) -> Result<()>; /// Sets the fan speed directly using a validated percentage. - fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>; + fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()>; /// Sets the sustained power limit (PL1) using a validated wrapper. - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()>; /// Sets the burst power limit (PL2) using a validated wrapper. - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()>; } impl ActuatorBus for Arc { - fn set_fan_mode(&self, mode: &str) -> Result<()> { - (**self).set_fan_mode(mode) - } - fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { - (**self).set_fan_speed(speed) - } - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - (**self).set_sustained_power_limit(limit) - } - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - (**self).set_burst_power_limit(limit) - } + fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) } + fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { (**self).set_fan_speed(speed) } + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) } + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_burst_power_limit(limit) } } /// Represents the high-level safety status of the system. From 8d351c7bdee5ee59432c4b543130ebcc56e75af6 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Sat, 28 Feb 2026 18:55:18 +0100 Subject: [PATCH 13/13] updated safety measurements and benchmarking behavior for 9380 --- src/agent_analyst/mod.rs | 24 +-- src/agent_integrator/mod.rs | 97 +++++++---- src/engine/formatters/throttled.rs | 11 +- src/engine/mod.rs | 150 ++++++----------- src/orchestrator/mod.rs | 262 +++++++++++++---------------- src/sal/generic_linux.rs | 3 +- src/sal/mock.rs | 1 - src/sal/safety.rs | 2 +- tests/config_merge_test.rs | 90 +++++++--- tests/safety_test.rs | 23 ++- 10 files changed, 329 insertions(+), 334 deletions(-) diff --git a/src/agent_analyst/mod.rs b/src/agent_analyst/mod.rs index c5b3b33..47c0af9 100644 --- a/src/agent_analyst/mod.rs +++ b/src/agent_analyst/mod.rs @@ -28,6 +28,7 @@ pub struct OptimizationMatrix { pub balanced: SystemProfile, pub performance: SystemProfile, pub thermal_resistance_kw: f32, + pub ambient_temp: f32, } pub struct HeuristicAnalyst { @@ -43,16 +44,14 @@ impl HeuristicAnalyst { /// Analyzes the raw telemetry to generate the 3 optimal profiles. pub fn analyze(&self, profile: &ThermalProfile, max_soak_watts: f32) -> OptimizationMatrix { - let r_theta = self.engine.calculate_thermal_resistance(profile); + let r_theta = profile.r_theta; let silicon_knee = self.engine.find_silicon_knee(profile); + let ambient = profile.ambient_temp; // 1. State A: Silent / Battery (Scientific Passive Equilibrium) - // Objective: Find P where T_core = 60C with fans OFF. - // T_core = T_ambient + (P * R_theta_passive) - // Note: R_theta measured during benchmark was with fans MAX. - // Passive R_theta is typically 2-3x higher. + // Find P where T_core = 60C with fans OFF. let r_theta_passive = r_theta * 2.5; - let silent_watts = ((60.0 - profile.ambient_temp) / r_theta_passive.max(0.1)).clamp(5.0, 15.0); + let silent_watts = ((60.0 - ambient) / r_theta_passive.max(0.1)).clamp(3.0, 15.0); let silent_profile = SystemProfile { name: "Silent".to_string(), @@ -64,21 +63,21 @@ impl HeuristicAnalyst { ], }; - // 2. State B: Balanced - // The exact calculated Silicon Knee + // 2. State B: Balanced (The Silicon Knee) + // We use R_theta to predict where the knee will sit thermally. let balanced_profile = SystemProfile { name: "Balanced".to_string(), pl1_watts: silicon_knee, pl2_watts: silicon_knee * 1.25, fan_curve: vec![ - FanCurvePoint { temp_on: 60.0, temp_off: 55.0, pwm_percent: 0 }, - FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 40 }, - FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 70 }, + FanCurvePoint { temp_on: ambient + 15.0, temp_off: ambient + 10.0, pwm_percent: 0 }, + FanCurvePoint { temp_on: ambient + 25.0, temp_off: ambient + 20.0, pwm_percent: 30 }, + FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 50 }, + FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 80 }, ], }; // 3. State C: Sustained Heavy - // Based on the max soak watts from Phase 1. let performance_profile = SystemProfile { name: "Performance".to_string(), pl1_watts: max_soak_watts, @@ -95,6 +94,7 @@ impl HeuristicAnalyst { balanced: balanced_profile, performance: performance_profile, thermal_resistance_kw: r_theta, + ambient_temp: ambient, } } } diff --git a/src/agent_integrator/mod.rs b/src/agent_integrator/mod.rs index 8328498..dc69883 100644 --- a/src/agent_integrator/mod.rs +++ b/src/agent_integrator/mod.rs @@ -6,7 +6,7 @@ //! resolution strategies for overlapping daemons. use anyhow::Result; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::fs; use crate::agent_analyst::OptimizationMatrix; @@ -14,20 +14,42 @@ pub struct ServiceIntegrator; impl ServiceIntegrator { /// Generates and saves an i8kmon configuration based on the balanced profile. - pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> { let profile = &matrix.balanced; - - let mut conf = String::new(); - conf.push_str("# Auto-generated by ember-tune Integrator -"); - conf.push_str(&format!("# Profile: {} -", profile.name)); - + let mut conf = String::new(); + + // Read existing content to preserve daemon and other settings + let existing = if let Some(src) = source_path { + if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() } + } else if output_path.exists() { + fs::read_to_string(output_path).unwrap_or_default() + } else { + String::new() + }; + + if !existing.is_empty() { + for line in existing.lines() { + let trimmed = line.trim(); + // Filter out the old auto-generated config lines and fan configs + if !trimmed.starts_with("set config(0)") && + !trimmed.starts_with("set config(1)") && + !trimmed.starts_with("set config(2)") && + !trimmed.starts_with("set config(3)") && + !trimmed.starts_with("# Auto-generated") && + !trimmed.starts_with("# Profile:") && + !trimmed.is_empty() { + conf.push_str(line); + conf.push('\n'); + } + } + } + + conf.push_str("\n# Auto-generated by ember-tune Integrator\n"); + conf.push_str(&format!("# Profile: {}\n", profile.name)); + conf.push_str(&format!("# Thermal Resistance: {:.3} K/W\n\n", matrix.thermal_resistance_kw)); + for (i, p) in profile.fan_curve.iter().enumerate() { - // i8kmon syntax: set config(state) {left_fan right_fan temp_on temp_off} - // State 0, 1, 2, 3 correspond to BIOS fan states (off, low, high) - let state = match p.pwm_percent { 0..=20 => 0, 21..=50 => 1, @@ -35,31 +57,50 @@ impl ServiceIntegrator { _ => 2, }; - let off = if i == 0 { "-".to_string() } else { format!("{}", p.temp_off) }; - conf.push_str(&format!("set config({}) {{{} {} {} {}}} -", i, state, state, p.temp_on, off)); + let off = if i == 0 { "-".to_string() } else { format!("{:.0}", p.temp_off) }; + conf.push_str(&format!("set config({}) {{{} {} {:.0} {}}}\n", i, state, state, p.temp_on, off)); } fs::write(output_path, conf)?; Ok(()) } - /// Generates a thinkfan configuration. - pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + /// Generates a thinkfan configuration, merging with existing sensors if possible. + pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> { let profile = &matrix.balanced; let mut conf = String::new(); - conf.push_str("# Auto-generated by ember-tune Integrator -"); - conf.push_str("sensors: - - hwmon: /sys/class/hwmon/hwmon0/temp1_input + + let existing = if let Some(src) = source_path { + if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() } + } else if output_path.exists() { + fs::read_to_string(output_path).unwrap_or_default() + } else { + String::new() + }; -"); - conf.push_str("levels: -"); + if !existing.is_empty() { + let mut in_sensors = false; + for line in existing.lines() { + let trimmed = line.trim(); + if trimmed == "sensors:" { in_sensors = true; } + if trimmed == "levels:" { in_sensors = false; } + + if in_sensors { + conf.push_str(line); + conf.push('\n'); + } + } + } + + if conf.is_empty() { + conf.push_str("sensors:\n - hwmon: /sys/class/hwmon/hwmon0/temp1_input\n\n"); + } + + conf.push_str("\n# Auto-generated by ember-tune Integrator\n"); + conf.push_str("levels:\n"); for (i, p) in profile.fan_curve.iter().enumerate() { - // thinkfan syntax: - [level, temp_down, temp_up] let level = match p.pwm_percent { 0..=20 => 0, 21..=40 => 1, @@ -69,8 +110,7 @@ impl ServiceIntegrator { }; let down = if i == 0 { 0.0 } else { p.temp_off }; - conf.push_str(&format!(" - [{}, {}, {}] -", level, down, p.temp_on)); + conf.push_str(&format!(" - [{}, {:.0}, {:.0}]\n", level, down, p.temp_on)); } fs::write(output_path, conf)?; @@ -91,7 +131,6 @@ sed -i 's/^CPU_BOOST_ON_AC=.*/CPU_BOOST_ON_AC=""/' /etc/tlp.conf systemctl restart tlp # 3. Thermald Delegate (We provide the trips, it handles the rest) -# (Ensure your custom thermal-conf.xml is in /etc/thermald/) systemctl restart thermald "#; fs::write(output_path, script)?; @@ -99,7 +138,7 @@ systemctl restart thermald } /// Generates a thermald configuration XML. - pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path, _source_path: Option<&PathBuf>) -> Result<()> { let profile = &matrix.balanced; let mut xml = String::new(); xml.push_str("\n\n \n ember-tune Balanced\n Generic\n balanced\n \n \n cpu\n \n"); diff --git a/src/engine/formatters/throttled.rs b/src/engine/formatters/throttled.rs index 9febe7e..17bf284 100644 --- a/src/engine/formatters/throttled.rs +++ b/src/engine/formatters/throttled.rs @@ -118,8 +118,15 @@ Trip_Temp_C: {trip:.0} result_lines.join("\n") } - pub fn save(path: &Path, config: &ThrottledConfig) -> Result<()> { - let existing = if path.exists() { std::fs::read_to_string(path)? } else { String::new() }; + pub fn save(path: &Path, config: &ThrottledConfig, source_path: Option<&std::path::PathBuf>) -> Result<()> { + let existing = if let Some(src) = source_path { + if src.exists() { std::fs::read_to_string(src).unwrap_or_default() } else { String::new() } + } else if path.exists() { + std::fs::read_to_string(path).unwrap_or_default() + } else { + String::new() + }; + let content = if existing.is_empty() { Self::generate_conf(config) } else { Self::merge_conf(&existing, config) }; std::fs::write(path, content)?; Ok(()) diff --git a/src/engine/mod.rs b/src/engine/mod.rs index e65a992..3a31cd4 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -7,7 +7,7 @@ use serde::{Serialize, Deserialize}; use std::collections::HashMap; use std::path::PathBuf; -use tracing::warn; +use tracing::{warn, debug}; pub mod formatters; @@ -26,6 +26,7 @@ pub struct ThermalPoint { pub struct ThermalProfile { pub points: Vec, pub ambient_temp: f32, + pub r_theta: f32, } /// The final, recommended parameters derived from the thermal benchmark. @@ -52,24 +53,16 @@ pub struct OptimizationResult { } /// Pure mathematics engine for thermal optimization. -/// -/// Contains no hardware I/O and operates solely on the collected [ThermalProfile]. pub struct OptimizerEngine { - /// The size of the sliding window for the `smooth` function. window_size: usize, } impl OptimizerEngine { - /// Creates a new `OptimizerEngine`. pub fn new(window_size: usize) -> Self { Self { window_size } } - /// Applies a simple moving average (SMA) filter with outlier rejection. - /// - /// This function smooths noisy sensor data. It rejects any value in the - /// window that is more than 20.0 units away from the window's average - /// before calculating the final smoothed value. + /// Smoothes sensor jitter using a moving average with outlier rejection. pub fn smooth(&self, data: &[f32]) -> Vec { if data.is_empty() { return vec![]; } let mut smoothed = Vec::with_capacity(data.len()); @@ -81,7 +74,7 @@ impl OptimizerEngine { let window = &data[start..end]; let avg: f32 = window.iter().sum::() / window.len() as f32; let filtered: Vec = window.iter() - .filter(|&&v| (v - avg).abs() < 20.0) // Reject spikes > 20 units + .filter(|&&v| (v - avg).abs() < 10.0) .cloned().collect(); if filtered.is_empty() { @@ -93,108 +86,65 @@ impl OptimizerEngine { smoothed } - /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package. - /// - /// This function uses the data point with the highest power draw to ensure - /// the calculation reflects a system under maximum thermal load. - pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 { - profile.points.iter() - .filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data - .max_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal)) - .map(|p| (p.temp_c - profile.ambient_temp) / p.power_w) - .unwrap_or(0.0) + /// Evaluates if a series of temperature readings have reached thermal equilibrium. + /// Criteria: Standard deviation < 0.25C over the last 10 seconds. + pub fn is_stable(&self, temps: &[f32]) -> bool { + if temps.len() < 20 { return false; } // Need at least 10s of data (500ms intervals) + let window = &temps[temps.len() - 20..]; + + let avg = window.iter().sum::() / window.len() as f32; + let variance = window.iter().map(|&t| (t - avg).powi(2)).sum::() / window.len() as f32; + let std_dev = variance.sqrt(); + + debug!("Stability Check: StdDev={:.3}C (Target < 0.25C)", std_dev); + std_dev < 0.25 } - /// Returns the maximum temperature recorded in the profile. - pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 { - profile.points.iter() - .map(|p| p.temp_c) - .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) - .unwrap_or(0.0) + /// Predicts the steady-state temperature for a given target wattage. + /// Formula: T_pred = T_ambient + (P_target * R_theta) + pub fn predict_temp(&self, target_watts: f32, ambient: f32, r_theta: f32) -> f32 { + ambient + (target_watts * r_theta) } - /// Finds the "Silicon Knee" - the point where performance-per-watt (efficiency) - /// starts to diminish significantly and thermal density spikes. - /// - /// This heuristic scoring model balances several factors: - /// 1. **Efficiency Drop:** How quickly does performance-per-watt decrease as power increases? - /// 2. **Thermal Acceleration:** How quickly does temperature rise per additional Watt? - /// 3. **Throttling Penalty:** A large penalty is applied if absolute performance drops, indicating a thermal wall. - /// - /// The "Knee" is the power level with the highest score, representing the optimal - /// balance before thermal saturation causes diminishing returns. + /// Calculates Thermal Resistance (K/W) using the steady-state delta. + pub fn calculate_r_theta(&self, ambient: f32, steady_temp: f32, steady_power: f32) -> f32 { + if steady_power < 1.0 { return 0.0; } + (steady_temp - ambient) / steady_power + } + + /// Identifies the "Silicon Knee" by finding the point of maximum efficiency. pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 { - let valid_points: Vec<_> = profile.points.iter() - .filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise - .cloned() - .collect(); + if profile.points.is_empty() { return 15.0; } - if valid_points.len() < 3 { - return profile.points.last().map(|p| p.power_w).unwrap_or(15.0); - } - - let mut points = valid_points; + let mut points = profile.points.clone(); points.sort_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal)); - let mut best_pl = points[0].power_w; - let mut max_score = f32::MIN; + let efficiencies: Vec<(f32, f32)> = points.iter() + .map(|p| { + let perf = if p.throughput > 0.0 { p.throughput as f32 } else { p.freq_mhz }; + (p.power_w, perf / p.power_w.max(1.0)) + }) + .collect(); - // Use a sliding window (3 points) to calculate gradients more robustly - for i in 1..points.len() - 1 { - let prev = &points[i - 1]; - let curr = &points[i]; - let next = &points[i + 1]; + if efficiencies.is_empty() { return 15.0; } - // 1. Efficiency Metric (Throughput per Watt or Freq per Watt) - let efficiency_curr = if curr.throughput > 0.0 { - curr.throughput as f32 / curr.power_w.max(1.0) + let max_efficiency = efficiencies.iter() + .map(|(_, e)| *e) + .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) + .unwrap_or(1.0); + + let mut knee_watts = points[0].power_w; + for (watts, efficiency) in efficiencies { + if efficiency >= (max_efficiency * 0.85) { + knee_watts = watts; } else { - curr.freq_mhz / curr.power_w.max(1.0) - }; - - let efficiency_next = if next.throughput > 0.0 { - next.throughput as f32 / next.power_w.max(1.0) - } else { - next.freq_mhz / next.power_w.max(1.0) - }; - - let p_delta = (next.power_w - curr.power_w).max(0.5); - let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta; - - // 2. Thermal Acceleration (d2T/dW2) - let p_delta_prev = (curr.power_w - prev.power_w).max(0.5); - let p_delta_next = (next.power_w - curr.power_w).max(0.5); - - let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev; - let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next; - - let p_total_delta = (next.power_w - prev.power_w).max(1.0); - let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta; - - // 3. Wall Detection (Any drop in absolute performance is a hard wall) - let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput); - let penalty = if is_throttling { 5000.0 } else { 0.0 }; - - let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty; - - if score > max_score { - max_score = score; - best_pl = curr.power_w; + debug!("Efficiency drop at {:.1}W ({:.1}% of peak)", watts, (efficiency/max_efficiency)*100.0); + break; } } - let best_pl = if max_score > f32::MIN { - best_pl - } else { - profile.points.last().map(|p| p.power_w).unwrap_or(15.0) - }; - - // Safety Floor: Never recommend a TDP below 5W, as this bricks system performance. - if best_pl < 5.0 { - warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl); - return 15.0; - } - - best_pl + knee_watts.clamp(PowerLimitWatts::MIN, PowerLimitWatts::MAX) } } + +use crate::sal::safety::PowerLimitWatts; diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index bc426f3..38a5786 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -4,7 +4,7 @@ //! using a [Workload], and feeds telemetry to the frontend via MPSC channels. use anyhow::{Result, Context, bail}; -use tracing::{info, warn, error}; +use tracing::{info, warn, error, debug}; use std::sync::mpsc; use std::time::{Duration, Instant}; use std::thread; @@ -23,67 +23,40 @@ use crate::load::{Workload, IntensityProfile, StressVector}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; use crate::agent_analyst::HeuristicAnalyst; +use crate::agent_integrator::ServiceIntegrator; /// Represents the possible states of the benchmark orchestrator. pub enum OrchestratorState { - /// Performing pre-flight checks and snapshotting. PreFlight, - /// Acquiring idle baseline telemetry. IdleBaseline, - /// Actively sweeping through power limits. - StressSweep { current_wattage: f32 }, - /// Allowing hardware to cool down before releasing the guard. + ThermalCalibration, + StabilitySweep, Cooldown, - /// Benchmark complete, generating final results. Finalizing, } -/// The central state machine responsible for coordinating the thermal benchmark. pub struct BenchmarkOrchestrator { - /// Injected hardware abstraction layer. sal: Arc, - /// Discovered system facts and paths. facts: SystemFactSheet, - /// Heat generation workload. workload: Box, - /// Channel for sending telemetry updates to the UI. telemetry_tx: mpsc::Sender, - /// Channel for receiving commands from the UI. command_rx: mpsc::Receiver, - /// Current phase reported to the UI. ui_phase: BenchmarkPhase, - /// Accumulated thermal data points. profile: ThermalProfile, - /// Mathematics engine for data smoothing and optimization. engine: OptimizerEngine, - /// CLI override for the configuration output path. optional_config_out: Option, - - /// The safety membrane protecting the system. safeguard: Option, - /// Active thermal watchdog. watchdog: Option, - - /// Sliding window of power readings (Watts). history_watts: VecDeque, - /// Sliding window of temperature readings (Celsius). history_temp: VecDeque, - /// Sliding window of CPU frequency (MHz). history_mhz: VecDeque, - - /// Detected CPU model string. cpu_model: String, - /// Total system RAM in Gigabytes. total_ram_gb: u64, - - /// Atomic flag indicating a safety-triggered abort. emergency_abort: Arc, - /// Human-readable reason for the emergency abort. emergency_reason: Arc>>, } impl BenchmarkOrchestrator { - /// Creates a new orchestrator instance with injected dependencies. pub fn new( sal: Arc, facts: SystemFactSheet, @@ -122,14 +95,13 @@ impl BenchmarkOrchestrator { } } - /// Executes the full benchmark sequence. pub fn run(&mut self) -> Result { // Immediate Priming let _ = self.sal.get_temp(); let _ = self.sal.get_power_w(); let _ = self.sal.get_fan_rpms(); - info!("Orchestrator: Initializing Project Iron-Ember lifecycle."); + info!("Orchestrator: Initializing Project Iron-Ember PGC Protocol."); // Spawn safety watchdog immediately let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone()); @@ -147,24 +119,24 @@ impl BenchmarkOrchestrator { let _ = self.workload.stop_workload(); if let Some(mut sg) = self.safeguard.take() { - if let Err(e) = sg.release() { - error!("CRITICAL: State restoration failure: {}", e); - } + let _ = sg.release(); } - info!("✓ Hardware state restored to pre-flight defaults."); + if let Err(e) = self.sal.restore() { + warn!("Failed secondary SAL restoration: {}", e); + } + + info!("✓ Hardware state restored."); result } - /// Internal execution logic for the benchmark phases. fn execute_benchmark(&mut self) -> Result { - let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?; + let _bench_cfg = self.facts.bench_config.clone().context("Config missing.")?; // 1. Pre-Flight Phase self.ui_phase = BenchmarkPhase::Auditing; self.log("Phase: Pre-Flight Auditing & Sterilization")?; - // Snapshot and neutralise Brawl Matrix let mut target_files = self.facts.rapl_paths.iter() .map(|p| p.join("constraint_0_power_limit_uw")) .collect::>(); @@ -177,7 +149,6 @@ impl BenchmarkOrchestrator { let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?; self.safeguard = Some(sg); - // Run auditor for step in self.sal.audit() { if let Err(e) = step.outcome { return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e)); @@ -185,106 +156,117 @@ impl BenchmarkOrchestrator { } self.workload.initialize().context("Failed to initialize load generator.")?; + self.sal.suppress().context("Failed to suppress background services.")?; let tick = Cell::new(0u64); // 2. Idle Baseline Phase self.ui_phase = BenchmarkPhase::IdleCalibration; - self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?; - - // Wait for fan spin-up + self.log("Phase: Recording 30s Idle Baseline...")?; self.sal.set_fan_mode("auto")?; let mut idle_temps = Vec::new(); let start = Instant::now(); - while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) { + while start.elapsed() < Duration::from_secs(30) { self.check_safety_abort()?; self.send_telemetry(tick.get())?; idle_temps.push(self.sal.get_temp().unwrap_or(0.0)); tick.set(tick.get() + 1); thread::sleep(Duration::from_millis(500)); } - self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); + self.profile.ambient_temp = self.engine.smooth(&idle_temps).iter().sum::() / idle_temps.len() as f32; self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; - // 3. Stress Sweep Phase - self.ui_phase = BenchmarkPhase::StressTesting; - self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?; - - // Ensure fans are ramped to MAX before load - self.log("Metrology: Locking fans to MAX...")?; + // 3. Thermal Resistance Mapping (Phase 1) + self.log("Phase: Mapping Thermal Resistance (Rθ) at 10W...")?; self.sal.set_fan_mode("max")?; - let fan_lock_start = Instant::now(); - loop { - let fans = self.sal.get_fan_rpms().unwrap_or_default(); - let max_rpm = fans.iter().cloned().max().unwrap_or(0); - if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) { + + let pl_calib = PowerLimitWatts::try_new(10.0)?; + self.sal.set_sustained_power_limit(pl_calib)?; + self.sal.set_burst_power_limit(pl_calib)?; + + self.workload.run_workload( + Duration::from_secs(120), + IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix } + )?; + + let mut calib_temps = Vec::new(); + let calib_start = Instant::now(); + while calib_start.elapsed() < Duration::from_secs(90) { + self.check_safety_abort()?; + self.send_telemetry(tick.get())?; + let t = self.sal.get_temp().unwrap_or(0.0); + calib_temps.push(t); + tick.set(tick.get() + 1); + + if calib_start.elapsed() > Duration::from_secs(30) && self.engine.is_stable(&calib_temps) { break; } thread::sleep(Duration::from_millis(500)); - self.send_telemetry(tick.get())?; - tick.set(tick.get() + 1); } + + let steady_t = calib_temps.last().cloned().unwrap_or(0.0); + let steady_p = self.sal.get_power_w().unwrap_or(10.0); + self.profile.r_theta = self.engine.calculate_r_theta(self.profile.ambient_temp, steady_t, steady_p); + self.log(&format!("✓ Physical Model: Rθ = {:.3} K/W", self.profile.r_theta))?; - let physical_threads = num_cpus::get_physical(); + // 4. Physically-Aware Stability Sweep (Phase 2) + self.ui_phase = BenchmarkPhase::StressTesting; + self.log("Phase: Starting Physically-Aware Efficiency Sweep...")?; + + let mut current_w = 12.0_f32; let mut previous_ops = 0.0; - for &watts in &bench_cfg.power_steps_watts { - self.check_safety_abort()?; - self.log(&format!("Testing PL1 = {:.0}W", watts))?; - - // Apply limits safely - let pl1 = PowerLimitWatts::try_new(watts)?; - let pl2 = PowerLimitWatts::try_new(watts + 5.0)?; - - self.sal.set_sustained_power_limit(pl1)?; - self.sal.set_burst_power_limit(pl2)?; - - // Start workload + loop { + // Predict if this step is safe + let pred_t = self.engine.predict_temp(current_w, self.profile.ambient_temp, self.profile.r_theta); + if pred_t > 92.0 { + self.log(&format!("Prediction: {:.1}W would result in {:.1}C (Too Hot). Finalizing...", current_w, pred_t))?; + break; + } + + self.log(&format!("Step: {:.1}W (Predicted: {:.1}C)", current_w, pred_t))?; + let pl = PowerLimitWatts::try_new(current_w)?; + self.sal.set_sustained_power_limit(pl)?; + self.sal.set_burst_power_limit(PowerLimitWatts::try_new(current_w + 2.0)?)?; + self.workload.run_workload( - Duration::from_secs(bench_cfg.stress_duration_max_s), - IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix } + Duration::from_secs(60), + IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix } )?; let step_start = Instant::now(); - let mut step_temps = VecDeque::with_capacity(30); - let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0); + let mut step_temps = Vec::new(); + let mut previous_t = self.sal.get_temp().unwrap_or(0.0); - // Equilibrium Gating - while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) { + while step_start.elapsed() < Duration::from_secs(60) { self.check_safety_abort()?; - + self.send_telemetry(tick.get())?; + let t = self.sal.get_temp().unwrap_or(0.0); - let dt_dt = (t - previous_step_temp) / 0.5; - previous_step_temp = t; + let dt_dt = (t - previous_t) / 0.5; - // Redundant safety check during step - if t > 94.0 || dt_dt > 5.0 { - warn!("Thermal Spike Detected! Aborting current step."); - break; + // # SAFETY: predictive hard-quench threshold raised to 8C/s + if step_start.elapsed() > Duration::from_secs(2) && (t > 95.0 || dt_dt > 8.0) { + warn!("USA: Safety Break triggered! T={:.1}C, dT/dt={:.1}C/s", t, dt_dt); + let _ = self.sal.set_sustained_power_limit(PowerLimitWatts::try_new(3.0)?); + break; // Just break the sweep loop } - step_temps.push_back(t); - if step_temps.len() > 10 { step_temps.pop_front(); } - - self.send_telemetry(tick.get())?; + step_temps.push(t); tick.set(tick.get() + 1); - if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { - let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); - let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); - if (max - min) < 0.5 { - info!("Equilibrium reached at {:.1}°C", t); - break; - } + if step_start.elapsed() > Duration::from_secs(15) && self.engine.is_stable(&step_temps) { + self.log(&format!(" Equilibrium reached at {:.1}°C", t))?; + break; } + previous_t = t; thread::sleep(Duration::from_millis(500)); } - // Record data point let metrics = self.workload.get_current_metrics().unwrap_or_default(); self.profile.points.push(ThermalPoint { - power_w: self.sal.get_power_w().unwrap_or(watts), + power_w: self.sal.get_power_w().unwrap_or(current_w), temp_c: self.sal.get_temp().unwrap_or(0.0), freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0), fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0), @@ -293,64 +275,62 @@ impl BenchmarkOrchestrator { self.workload.stop_workload()?; - // Performance Halt Condition + // Efficiency Break if previous_ops > 0.0 { let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; if gain < 1.0 { - self.log("Diminishing returns reached. Stopping sweep.")?; + self.log("Silicon Knee identified (gain < 1%). Finalizing...")?; break; } } previous_ops = metrics.primary_ops_per_sec; + current_w += 2.0; + if current_w > 45.0 { break; } - self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?; - thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); + self.log(&format!("Cooling down ({}s)...", _bench_cfg.cool_down_s))?; + thread::sleep(Duration::from_secs(_bench_cfg.cool_down_s)); } - // 4. Physical Modeling Phase + // 5. Modeling Phase self.ui_phase = BenchmarkPhase::PhysicalModeling; - self.log("Phase: Silicon Physical Sweet Spot Calculation")?; - + let knee = self.engine.find_silicon_knee(&self.profile); let analyst = HeuristicAnalyst::new(); let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0)); let mut res = self.generate_result(false); res.optimization_matrix = Some(matrix.clone()); - - info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw); + res.silicon_knee_watts = knee; - // 5. Finalizing Phase + // 6. Finalizing Phase self.ui_phase = BenchmarkPhase::Finalizing; - self.log("Phase: Generation of Optimized Configuration Sets")?; - - let throttled_path = self.optional_config_out.clone() - .or_else(|| self.facts.paths.configs.get("throttled").cloned()); - - if let Some(path) = throttled_path { + let throttled_source = self.facts.paths.configs.get("throttled"); + if let Some(path) = self.optional_config_out.clone().or_else(|| throttled_source.cloned()) { let config = crate::engine::formatters::throttled::ThrottledConfig { pl1_limit: res.silicon_knee_watts, - pl2_limit: res.recommended_pl2, - trip_temp: res.max_temp_c.max(90.0), + pl2_limit: res.silicon_knee_watts * 1.25, + trip_temp: 90.0, }; - crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?; - self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?; + let _ = crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config, throttled_source); res.config_paths.insert("throttled".to_string(), path); } + let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc")); + let i8k_source = self.facts.paths.configs.get("i8kmon"); + let i8k_out = base_out.join("i8kmon.conf"); + if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out, i8k_source).is_ok() { + res.config_paths.insert("i8kmon".to_string(), i8k_out); + } + Ok(res) } - /// Checks if the safety watchdog or user triggered an abort. fn check_safety_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { - let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string()); + let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog".to_string()); bail!("EMERGENCY_ABORT: {}", reason); } - if let Ok(cmd) = self.command_rx.try_recv() { - match cmd { - UiCommand::Abort => bail!("ABORTED"), - } + if let UiCommand::Abort = cmd { bail!("ABORTED"); } } Ok(()) } @@ -365,49 +345,35 @@ impl BenchmarkOrchestrator { current_freq: self.sal.get_freq_mhz().unwrap_or(0.0), fans: self.sal.get_fan_rpms().unwrap_or_default(), governor: "performance".to_string(), - pl1_limit: 0.0, - pl2_limit: 0.0, - fan_tier: "auto".to_string(), + pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(), is_throttling: self.sal.get_throttling_status().unwrap_or(false), phase: self.ui_phase, - history_watts: Vec::new(), - history_temp: Vec::new(), - history_mhz: Vec::new(), + history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(), log_event: Some(msg.to_string()), metadata: std::collections::HashMap::new(), is_emergency: self.emergency_abort.load(Ordering::SeqCst), emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; - self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) + self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed")) } fn send_telemetry(&mut self, tick: u64) -> Result<()> { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); let freq = self.sal.get_freq_mhz().unwrap_or(0.0); - self.history_temp.push_back(temp); self.history_watts.push_back(pwr); self.history_mhz.push_back(freq); - - if self.history_temp.len() > 120 { - self.history_temp.pop_front(); - self.history_watts.pop_front(); - self.history_mhz.pop_front(); - } + if self.history_temp.len() > 120 { self.history_temp.pop_front(); self.history_watts.pop_front(); self.history_mhz.pop_front(); } let state = TelemetryState { cpu_model: self.cpu_model.clone(), total_ram_gb: self.total_ram_gb, tick, - cpu_temp: temp, - power_w: pwr, - current_freq: freq, + cpu_temp: temp, power_w: pwr, current_freq: freq, fans: self.sal.get_fan_rpms().unwrap_or_default(), governor: "performance".to_string(), - pl1_limit: 15.0, - pl2_limit: 25.0, - fan_tier: "max".to_string(), + pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(), is_throttling: self.sal.get_throttling_status().unwrap_or(false), phase: self.ui_phase, history_watts: self.history_watts.iter().cloned().collect(), @@ -418,21 +384,19 @@ impl BenchmarkOrchestrator { is_emergency: self.emergency_abort.load(Ordering::SeqCst), emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; - self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) + self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed")) } pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { - let r_theta = self.engine.calculate_thermal_resistance(&self.profile); + let r_theta = self.profile.r_theta; let knee = self.engine.find_silicon_knee(&self.profile); - let max_t = self.engine.get_max_temp(&self.profile); - OptimizationResult { profile: self.profile.clone(), silicon_knee_watts: knee, thermal_resistance_kw: r_theta, recommended_pl1: knee, recommended_pl2: knee * 1.25, - max_temp_c: max_t, + max_temp_c: self.profile.points.iter().map(|p| p.temp_c).max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)).unwrap_or(0.0), is_partial, config_paths: std::collections::HashMap::new(), optimization_matrix: None, diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 3456794..fbd7c48 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -2,8 +2,7 @@ use anyhow::{Result, anyhow, Context}; use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; -use std::sync::{Mutex, Arc}; -use tracing::{debug, warn, info}; +use std::sync::Mutex; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 6a9b3b1..e6e6a40 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,7 +1,6 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use anyhow::Result; -use std::sync::Arc; pub struct MockSal { pub temperature_sequence: std::sync::atomic::AtomicUsize, diff --git a/src/sal/safety.rs b/src/sal/safety.rs index 88c641a..863ea2e 100644 --- a/src/sal/safety.rs +++ b/src/sal/safety.rs @@ -10,7 +10,7 @@ use std::fs; use std::path::{PathBuf}; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; -use std::time::{Duration, Instant}; +use std::time::Duration; use std::thread; use tracing::{info, warn, error, debug}; diff --git a/tests/config_merge_test.rs b/tests/config_merge_test.rs index e2f1777..128eeff 100644 --- a/tests/config_merge_test.rs +++ b/tests/config_merge_test.rs @@ -1,35 +1,75 @@ -#[path = "../src/engine/formatters/throttled.rs"] -mod throttled; - -use throttled::{ThrottledTranslator, ThrottledConfig}; +use ember_tune_rs::engine::formatters::throttled::{ThrottledConfig, ThrottledTranslator}; +use ember_tune_rs::agent_analyst::{OptimizationMatrix, SystemProfile, FanCurvePoint}; +use ember_tune_rs::agent_integrator::ServiceIntegrator; use std::fs; +use tempfile::tempdir; #[test] -fn test_throttled_formatter_non_destructive() { - let fixture_path = "tests/fixtures/throttled.conf"; - let existing_content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); - +fn test_throttled_merge_preserves_undervolt() { + let existing = r#"[GENERAL] +Update_Interval_ms: 1000 + +[UNDERVOLT] +# CPU core undervolt +CORE: -100 +# GPU undervolt +GPU: -50 + +[AC] +PL1_Tdp_W: 15 +PL2_Tdp_W: 25 +"#; + let config = ThrottledConfig { - pl1_limit: 25.0, - pl2_limit: 35.0, - trip_temp: 90.0, + pl1_limit: 22.0, + pl2_limit: 28.0, + trip_temp: 95.0, }; - let merged = ThrottledTranslator::merge_conf(&existing_content, &config); + let merged = ThrottledTranslator::merge_conf(existing, &config); - // Assert updates - assert!(merged.contains("PL1_Tdp_W: 25")); - assert!(merged.contains("PL2_Tdp_W: 35")); - assert!(merged.contains("Trip_Temp_C: 90")); - - // Assert preservation - assert!(merged.contains("[UNDERVOLT]")); assert!(merged.contains("CORE: -100")); assert!(merged.contains("GPU: -50")); - assert!(merged.contains("# Important: Preserving undervolt offsets is critical!")); - assert!(merged.contains("Update_Interval_ms: 3000")); - - // Check that we didn't lose the [GENERAL] section - assert!(merged.contains("[GENERAL]")); - assert!(merged.contains("# This is a complex test fixture")); + assert!(merged.contains("PL1_Tdp_W: 22")); + assert!(merged.contains("PL2_Tdp_W: 28")); + assert!(merged.contains("Trip_Temp_C: 95")); + assert!(merged.contains("[UNDERVOLT]")); +} + +#[test] +fn test_i8kmon_merge_preserves_settings() { + let dir = tempdir().unwrap(); + let config_path = dir.path().join("i8kmon.conf"); + + let existing = r#"set config(gen_shadow) 1 +set config(i8k_ignore_dmi) 1 +set config(daemon) 1 + +set config(0) {0 0 60 50} +"#; + fs::write(&config_path, existing).unwrap(); + + let matrix = OptimizationMatrix { + silent: SystemProfile { name: "Silent".to_string(), pl1_watts: 10.0, pl2_watts: 12.0, fan_curve: vec![] }, + balanced: SystemProfile { + name: "Balanced".to_string(), + pl1_watts: 20.0, + pl2_watts: 25.0, + fan_curve: vec![ + FanCurvePoint { temp_on: 70.0, temp_off: 60.0, pwm_percent: 50 } + ] + }, + performance: SystemProfile { name: "Perf".to_string(), pl1_watts: 30.0, pl2_watts: 35.0, fan_curve: vec![] }, + thermal_resistance_kw: 1.5, + ambient_temp: 25.0, + }; + + ServiceIntegrator::generate_i8kmon_config(&matrix, &config_path, Some(&config_path)).unwrap(); + + let result = fs::read_to_string(&config_path).unwrap(); + + assert!(result.contains("set config(gen_shadow) 1")); + assert!(result.contains("set config(daemon) 1")); + assert!(result.contains("set config(0) {1 1 70 -}")); // New config + assert!(!result.contains("set config(0) {0 0 60 50}")); // Old config should be gone } diff --git a/tests/safety_test.rs b/tests/safety_test.rs index 2922019..53d71d2 100644 --- a/tests/safety_test.rs +++ b/tests/safety_test.rs @@ -1,8 +1,6 @@ -use anyhow::Result; -use std::fs; -use std::path::PathBuf; -use ember_tune_rs::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use ember_tune_rs::sal::safety::{HardwareStateGuard, PowerLimitWatts}; use crate::common::fakesys::FakeSysBuilder; +use std::fs; mod common; @@ -34,23 +32,22 @@ fn test_hardware_state_guard_panic_restoration() { #[test] fn test_tdp_limit_bounds_checking() { // 1. Valid value - assert!(TdpLimitMicroWatts::new(15_000_000).is_ok()); + assert!(PowerLimitWatts::try_new(15.0).is_ok()); - // 2. Too low (Dangerous 0W or below 5W) - let low_res = TdpLimitMicroWatts::new(1_000_000); + // 2. Too low (Dangerous 0W or below 3W) + let low_res = PowerLimitWatts::try_new(1.0); assert!(low_res.is_err()); - assert!(low_res.unwrap_err().to_string().contains("below safety floor")); + assert!(low_res.unwrap_err().to_string().contains("outside safe bounds")); - // 3. Too high (> 80W) - let high_res = TdpLimitMicroWatts::new(100_000_000); + // 3. Too high (> 100W) + let high_res = PowerLimitWatts::try_new(150.0); assert!(high_res.is_err()); - assert!(high_res.unwrap_err().to_string().contains("exceeds safety ceiling")); + assert!(high_res.unwrap_err().to_string().contains("outside safe bounds")); } #[test] fn test_0w_tdp_regression_prevention() { // The prime directive is to never set 0W. - // Ensure the new() constructor explicitly fails for 0. - let zero_res = TdpLimitMicroWatts::new(0); + let zero_res = PowerLimitWatts::try_new(0.0); assert!(zero_res.is_err()); }