diff --git a/Cargo.lock b/Cargo.lock index ac4e99b..b6bb0cc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -901,6 +901,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + [[package]] name = "memchr" version = "2.8.0" @@ -2000,10 +2009,14 @@ version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ + "matchers", "nu-ansi-term", + "once_cell", + "regex-automata", "sharded-slab", "smallvec", "thread_local", + "tracing", "tracing-core", "tracing-log", ] diff --git a/Cargo.toml b/Cargo.toml index c584f25..c118493 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ serde_json = "1.0.149" clap = { version = "4.5", features = ["derive", "string", "wrap_help"] } color-eyre = "0.6" tracing = "0.1" -tracing-subscriber = "0.3" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing-appender = "0.2" sysinfo = "0.38" libc = "0.2" diff --git a/assets/hardware_db.toml b/assets/hardware_db.toml index d695ebf..a275ab3 100644 --- a/assets/hardware_db.toml +++ b/assets/hardware_db.toml @@ -15,7 +15,7 @@ help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both [[conflicts]] id = "thermal_logic_collision" -services = ["thermald.service", "throttled.service"] +services = ["thermald.service", "throttled.service", "lenovo_fix.service", "lenovo-throttling-fix.service"] contention = "RAPL / MSR / BD-PROCHOT" severity = "High" fix_action = "SuspendService" diff --git a/src/agent_metrology/mod.rs b/src/agent_metrology/mod.rs deleted file mode 100644 index 7bc4946..0000000 --- a/src/agent_metrology/mod.rs +++ /dev/null @@ -1,66 +0,0 @@ -//! Telemetry & Benchmarking Methodology (Agent Metrology) -//! -//! This module defines the execution flow to extract flawless hardware telemetry. -//! It isolates specific subsystems (CPU Core, Memory) and executes the Sweep Protocol -//! and Thermal Soak to map the physical limits of the hardware. - -use anyhow::Result; -use std::time::{Duration, Instant}; -use std::thread; -use crate::sal::traits::PlatformSal; -use crate::load::{Workload, IntensityProfile, StressVector}; -use tracing::info; - -pub struct MetrologyAgent<'a> { - sal: &'a dyn PlatformSal, - workload: &'a mut Box, -} - -impl<'a> MetrologyAgent<'a> { - pub fn new(sal: &'a dyn PlatformSal, workload: &'a mut Box) -> Self { - Self { sal, workload } - } - - /// Performs a prolonged mixed-load test to achieve chassis thermal saturation. - /// Bypasses short-term PL2/boost metrics to find the true steady-state dissipation capacity. - pub fn perform_thermal_soak(&mut self, duration_minutes: u64) -> Result { - info!("Metrology: Starting {} minute Thermal Soak...", duration_minutes); - - self.sal.set_fan_mode("max")?; - - // Mixed load: matrix math + memory stressors to saturate entire SoC and Chassis. - self.workload.run_workload( - Duration::from_secs(duration_minutes * 60), - IntensityProfile { - threads: num_cpus::get(), - load_percentage: 100, - vector: StressVector::Mixed - } - )?; - - let start = Instant::now(); - let target = Duration::from_secs(duration_minutes * 60); - let mut max_sustained_watts = 0.0; - - while start.elapsed() < target { - thread::sleep(Duration::from_secs(5)); - let temp = self.sal.get_temp().unwrap_or(0.0); - let watts = self.sal.get_power_w().unwrap_or(0.0); - - if watts > max_sustained_watts { - max_sustained_watts = watts; - } - - // Abort if dangerously hot - if temp >= 98.0 { - info!("Metrology: Thermal ceiling hit during soak ({}C). Stopping early.", temp); - break; - } - } - - self.workload.stop_workload()?; - info!("Metrology: Thermal Soak complete. Max sustained: {:.1}W", max_sustained_watts); - - Ok(max_sustained_watts) - } -} diff --git a/src/lib.rs b/src/lib.rs index 99103a3..0ce6d3a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,6 +12,5 @@ pub mod ui; pub mod engine; pub mod cli; pub mod sys; -pub mod agent_metrology; pub mod agent_analyst; pub mod agent_integrator; diff --git a/src/load/mod.rs b/src/load/mod.rs index a19ed48..9253917 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -88,11 +88,11 @@ impl Workload for StressNg { let load = profile.load_percentage.to_string(); let mut cmd = Command::new("stress-ng"); - cmd.args(["--timeout", &timeout, "--metrics", "--quiet"]); + cmd.args(["--timeout", &timeout, "--metrics", "--quiet", "--cpu-load", &load]); match profile.vector { StressVector::CpuMatrix => { - cmd.args(["--matrix", &threads, "--cpu-load", &load]); + cmd.args(["--matrix", &threads]); }, StressVector::MemoryBandwidth => { cmd.args(["--vm", &threads, "--vm-bytes", "80%"]); diff --git a/src/main.rs b/src/main.rs index 8e09e73..786cb8b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,8 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::io; use clap::Parser; -use tracing::{info, debug, error}; +use tracing::error; +use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use crossterm::{ event::{self, Event, KeyCode}, @@ -68,27 +69,24 @@ fn print_summary_report(result: &OptimizationResult) { println!(); } -fn setup_logging(verbose: bool) -> tracing_appender::non_blocking::WorkerGuard { - let file_appender = tracing_appender::rolling::never("/var/log", "ember-tune.log"); - let (non_blocking, guard) = tracing_appender::non_blocking(file_appender); - - let level = if verbose { tracing::Level::DEBUG } else { tracing::Level::INFO }; - - tracing_subscriber::fmt() - .with_max_level(level) - .with_writer(non_blocking) - .with_ansi(false) - .init(); - - guard -} - fn main() -> Result<()> { - // 1. Diagnostics & CLI Initialization let args = Cli::parse(); - let _log_guard = setup_logging(args.verbose); + + // 1. Logging Setup (File-only by default, Stdout during Audit) + let file_appender = tracing_appender::rolling::never(".", "ember-tune.log"); + let (non_blocking, _guard) = tracing_appender::non_blocking(file_appender); + let level = if args.verbose { "debug" } else { "info" }; + + let file_layer = fmt::layer() + .with_writer(non_blocking) + .with_ansi(false); + + // We use a simple println for the audit to avoid complex reload handles + tracing_subscriber::registry() + .with(EnvFilter::new(level)) + .with(file_layer) + .init(); - // Set panic hook to restore terminal state std::panic::set_hook(Box::new(|panic_info| { let _ = disable_raw_mode(); let mut stdout = io::stdout(); @@ -99,11 +97,10 @@ fn main() -> Result<()> { eprintln!("----------------------------------------\n"); })); - info!("ember-tune starting with args: {:?}", args); + println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); let ctx = ember_tune_rs::sal::traits::EnvironmentCtx::production(); - // 2. Platform Detection & Audit let (sal_box, facts): (Box, SystemFactSheet) = if args.mock { (Box::new(MockSal::new()), SystemFactSheet::default()) } else { @@ -111,9 +108,7 @@ fn main() -> Result<()> { }; let sal: Arc = sal_box.into(); - println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); let mut audit_failures = Vec::new(); - for step in sal.audit() { print!(" Checking {:<40} ", step.description); io::Write::flush(&mut io::stdout()).into_diagnostic()?; @@ -137,15 +132,14 @@ fn main() -> Result<()> { return Ok(()); } - // 3. Terminal Setup + // Entering TUI Mode - STDOUT is now strictly for Ratatui enable_raw_mode().into_diagnostic()?; let mut stdout = io::stdout(); - execute!(stdout, EnterAlternateScreen).into_diagnostic()?; + execute!(stdout, EnterAlternateScreen, crossterm::cursor::Hide).into_diagnostic()?; let backend_stdout = io::stdout(); let backend_term = CrosstermBackend::new(backend_stdout); let mut terminal = Terminal::new(backend_term).into_diagnostic()?; - // 4. State & Communication Setup let running = Arc::new(AtomicBool::new(true)); let r = running.clone(); @@ -158,7 +152,6 @@ fn main() -> Result<()> { r.store(false, Ordering::SeqCst); }).expect("Error setting Ctrl-C handler"); - // 5. Spawn Backend Orchestrator let sal_backend = sal.clone(); let facts_backend = facts.clone(); let config_out = args.config_out.clone(); @@ -175,10 +168,9 @@ fn main() -> Result<()> { orchestrator.run() }); - // 6. Frontend Event Loop let mut ui_state = DashboardState::new(); let mut last_telemetry = TelemetryState { - cpu_model: "Loading...".to_string(), + cpu_model: facts.model.clone(), total_ram_gb: 0, tick: 0, cpu_temp: 0.0, @@ -227,7 +219,6 @@ fn main() -> Result<()> { while let Ok(new_state) = telemetry_rx.try_recv() { if let Some(log) = &new_state.log_event { ui_state.add_log(log.clone()); - debug!("Backend Log: {}", log); } else { ui_state.update(&new_state); last_telemetry = new_state; @@ -238,20 +229,11 @@ fn main() -> Result<()> { if backend_handle.is_finished() { break; } } - // 7. Terminal Restoration let _ = disable_raw_mode(); - let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen); - let _ = terminal.show_cursor(); + let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen, crossterm::cursor::Show); - // 8. Final Report & Hardware Restoration let join_res = backend_handle.join(); - // Explicit hardware restoration - info!("Restoring hardware state..."); - if let Err(e) = sal.restore() { - error!("Failed to restore hardware state: {}", e); - } - match join_res { Ok(Ok(result)) => { print_summary_report(&result); @@ -276,6 +258,5 @@ fn main() -> Result<()> { } } - info!("ember-tune exited gracefully."); Ok(()) } diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index b3f1071..bc426f3 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -3,8 +3,8 @@ //! It manages hardware interactions through the [PlatformSal], generates stress //! using a [Workload], and feeds telemetry to the frontend via MPSC channels. -use anyhow::{Result, Context}; -use tracing::warn; +use anyhow::{Result, Context, bail}; +use tracing::{info, warn, error}; use std::sync::mpsc; use std::time::{Duration, Instant}; use std::thread; @@ -14,16 +14,29 @@ use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; use std::path::PathBuf; +use std::cell::Cell; -use crate::sal::traits::{PlatformSal, SafetyStatus}; +use crate::sal::traits::{PlatformSal, SensorBus}; use crate::sal::heuristic::discovery::SystemFactSheet; -use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts, ConfigurationTransaction, ThermalThresholdCelsius}; +use crate::sal::safety::{HardwareStateGuard, PowerLimitWatts, ThermalWatchdog}; use crate::load::{Workload, IntensityProfile, StressVector}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; -use crate::agent_metrology::MetrologyAgent; -use crate::agent_analyst::{HeuristicAnalyst, OptimizationMatrix}; -use crate::agent_integrator::ServiceIntegrator; +use crate::agent_analyst::HeuristicAnalyst; + +/// Represents the possible states of the benchmark orchestrator. +pub enum OrchestratorState { + /// Performing pre-flight checks and snapshotting. + PreFlight, + /// Acquiring idle baseline telemetry. + IdleBaseline, + /// Actively sweeping through power limits. + StressSweep { current_wattage: f32 }, + /// Allowing hardware to cool down before releasing the guard. + Cooldown, + /// Benchmark complete, generating final results. + Finalizing, +} /// The central state machine responsible for coordinating the thermal benchmark. pub struct BenchmarkOrchestrator { @@ -37,8 +50,8 @@ pub struct BenchmarkOrchestrator { telemetry_tx: mpsc::Sender, /// Channel for receiving commands from the UI. command_rx: mpsc::Receiver, - /// Current phase of the benchmark. - phase: BenchmarkPhase, + /// Current phase reported to the UI. + ui_phase: BenchmarkPhase, /// Accumulated thermal data points. profile: ThermalProfile, /// Mathematics engine for data smoothing and optimization. @@ -48,6 +61,8 @@ pub struct BenchmarkOrchestrator { /// The safety membrane protecting the system. safeguard: Option, + /// Active thermal watchdog. + watchdog: Option, /// Sliding window of power readings (Watts). history_watts: VecDeque, @@ -91,7 +106,7 @@ impl BenchmarkOrchestrator { workload, telemetry_tx, command_rx, - phase: BenchmarkPhase::Auditing, + ui_phase: BenchmarkPhase::Auditing, profile: ThermalProfile::default(), engine: OptimizerEngine::new(5), history_watts: VecDeque::with_capacity(120), @@ -103,147 +118,163 @@ impl BenchmarkOrchestrator { emergency_reason: Arc::new(Mutex::new(None)), optional_config_out, safeguard: None, + watchdog: None, } } /// Executes the full benchmark sequence. pub fn run(&mut self) -> Result { - self.log("Starting ember-tune Benchmark Sequence.")?; + // Immediate Priming + let _ = self.sal.get_temp(); + let _ = self.sal.get_power_w(); + let _ = self.sal.get_fan_rpms(); - let _watchdog_handle = self.spawn_watchdog_monitor(); + info!("Orchestrator: Initializing Project Iron-Ember lifecycle."); + + // Spawn safety watchdog immediately + let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone()); + self.watchdog = Some(watchdog); - // Core execution wrapped in cleanup logic let result = self.execute_benchmark(); - // --- MANDATORY CLEANUP --- - self.log("Benchmark sequence finished. Restoring hardware defaults...")?; + if let Err(ref e) = result { + error!("Benchmark Lifecycle Failure: {}", e); + let _ = self.log(&format!("⚠ FAILURE: {}", e)); + } + + // --- MANDATORY RAII CLEANUP --- + info!("Benchmark sequence complete. Releasing safeguards..."); let _ = self.workload.stop_workload(); if let Some(mut sg) = self.safeguard.take() { if let Err(e) = sg.release() { - anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e); + error!("CRITICAL: State restoration failure: {}", e); } } - // SAL restore should only handle OEM-specific non-sysfs state not covered by guard - if let Err(e) = self.sal.restore() { - warn!("Failed to perform secondary SAL restoration: {}", e); - } - - self.log("✓ Hardware state restored.")?; - + info!("✓ Hardware state restored to pre-flight defaults."); result } /// Internal execution logic for the benchmark phases. fn execute_benchmark(&mut self) -> Result { - let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; + let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?; - // 1. Snapshot & Arm Safeguard + // 1. Pre-Flight Phase + self.ui_phase = BenchmarkPhase::Auditing; + self.log("Phase: Pre-Flight Auditing & Sterilization")?; + + // Snapshot and neutralise Brawl Matrix let mut target_files = self.facts.rapl_paths.iter() .map(|p| p.join("constraint_0_power_limit_uw")) .collect::>(); target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw"))); + if let Some(tp) = self.facts.paths.configs.get("throttled") { target_files.push(tp.clone()); } - let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()]; - let mut sg = HardwareStateGuard::acquire(&target_files, &target_services)?; - - // # SAFETY: Register fan restoration command if we are on Dell - if self.facts.vendor.to_lowercase().contains("dell") { - if let Some(tool_path) = self.facts.paths.tools.get("dell_fan_ctrl") { - let tool_str = tool_path.to_string_lossy().to_string(); - sg.on_rollback(Box::new(move || { - let _ = std::process::Command::new(tool_str).arg("1").status(); - })); - } - } - + let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?; self.safeguard = Some(sg); - // Phase 1: Audit & Baseline - self.phase = BenchmarkPhase::Auditing; + // Run auditor for step in self.sal.audit() { if let Err(e) = step.outcome { return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e)); } } - self.workload.initialize().context("Failed to initialize workload")?; - self.sal.suppress().context("Failed to suppress background services")?; + self.workload.initialize().context("Failed to initialize load generator.")?; - // Baseline (Idle Calibration) - self.phase = BenchmarkPhase::IdleCalibration; - self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; + let tick = Cell::new(0u64); + + // 2. Idle Baseline Phase + self.ui_phase = BenchmarkPhase::IdleCalibration; + self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?; + + // Wait for fan spin-up self.sal.set_fan_mode("auto")?; let mut idle_temps = Vec::new(); let start = Instant::now(); - let mut tick = 0; while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) { - self.check_abort()?; - self.send_telemetry(tick)?; + self.check_safety_abort()?; + self.send_telemetry(tick.get())?; idle_temps.push(self.sal.get_temp().unwrap_or(0.0)); - tick += 1; + tick.set(tick.get() + 1); thread::sleep(Duration::from_millis(500)); } self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; - // Phase 1.5: Thermal Soak (Agent Metrology) - self.log("Phase 1.5: Executing Thermal Soak to achieve chassis saturation...")?; - let soak_duration_minutes = 1; - let mut metrology = MetrologyAgent::new(self.sal.as_ref(), &mut self.workload); - let max_soak_watts = metrology.perform_thermal_soak(soak_duration_minutes)?; - self.log(&format!("✓ Max sustained wattage during soak: {:.1}W", max_soak_watts))?; + // 3. Stress Sweep Phase + self.ui_phase = BenchmarkPhase::StressTesting; + self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?; + + // Ensure fans are ramped to MAX before load + self.log("Metrology: Locking fans to MAX...")?; + self.sal.set_fan_mode("max")?; + let fan_lock_start = Instant::now(); + loop { + let fans = self.sal.get_fan_rpms().unwrap_or_default(); + let max_rpm = fans.iter().cloned().max().unwrap_or(0); + if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) { + break; + } + thread::sleep(Duration::from_millis(500)); + self.send_telemetry(tick.get())?; + tick.set(tick.get() + 1); + } - // Phase 2: Stress Stepping - self.phase = BenchmarkPhase::StressTesting; - self.log("Phase 2: Starting Synthetic Stress Matrix.")?; - self.sal.set_fan_mode("max")?; - - let mut current_pl = 10.0_f32; // Start at 10W + let physical_threads = num_cpus::get_physical(); let mut previous_ops = 0.0; - loop { - self.log(&format!("Testing PL1 = {:.0}W...", current_pl))?; + for &watts in &bench_cfg.power_steps_watts { + self.check_safety_abort()?; + self.log(&format!("Testing PL1 = {:.0}W", watts))?; - // # SAFETY: Transactional Commit for Power Limits - let pl1_uw = TdpLimitMicroWatts::from_watts(current_pl)?; - let pl2_uw = TdpLimitMicroWatts::from_watts(current_pl + 5.0)?; + // Apply limits safely + let pl1 = PowerLimitWatts::try_new(watts)?; + let pl2 = PowerLimitWatts::try_new(watts + 5.0)?; - let mut tx = ConfigurationTransaction::default(); - if let Some(p) = self.facts.rapl_paths.first() { - tx.add_change(p.join("constraint_0_power_limit_uw"), pl1_uw.as_u64().to_string()); - tx.add_change(p.join("constraint_1_power_limit_uw"), pl2_uw.as_u64().to_string()); - } - tx.commit().context("Failed to commit power limit transaction")?; + self.sal.set_sustained_power_limit(pl1)?; + self.sal.set_burst_power_limit(pl2)?; + // Start workload self.workload.run_workload( Duration::from_secs(bench_cfg.stress_duration_max_s), - IntensityProfile { threads: num_cpus::get(), load_percentage: 100, vector: StressVector::CpuMatrix } + IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix } )?; let step_start = Instant::now(); let mut step_temps = VecDeque::with_capacity(30); + let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0); + // Equilibrium Gating while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) { - self.check_abort()?; + self.check_safety_abort()?; let t = self.sal.get_temp().unwrap_or(0.0); + let dt_dt = (t - previous_step_temp) / 0.5; + previous_step_temp = t; + + // Redundant safety check during step + if t > 94.0 || dt_dt > 5.0 { + warn!("Thermal Spike Detected! Aborting current step."); + break; + } + step_temps.push_back(t); if step_temps.len() > 10 { step_temps.pop_front(); } - self.send_telemetry(tick)?; - tick += 1; + self.send_telemetry(tick.get())?; + tick.set(tick.get() + 1); if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); if (max - min) < 0.5 { - self.log(&format!(" Equilibrium reached at {:.1}°C", t))?; + info!("Equilibrium reached at {:.1}°C", t); break; } } @@ -251,197 +282,74 @@ impl BenchmarkOrchestrator { } // Record data point - let avg_p = self.sal.get_power_w().unwrap_or(0.0); - let avg_t = self.sal.get_temp().unwrap_or(0.0); - let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); - let fans = self.sal.get_fan_rpms().unwrap_or_default(); - let primary_fan = fans.first().cloned().unwrap_or(0); let metrics = self.workload.get_current_metrics().unwrap_or_default(); - self.profile.points.push(ThermalPoint { - power_w: avg_p, - temp_c: avg_t, - freq_mhz: avg_f, - fan_rpm: primary_fan, + power_w: self.sal.get_power_w().unwrap_or(watts), + temp_c: self.sal.get_temp().unwrap_or(0.0), + freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0), + fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0), throughput: metrics.primary_ops_per_sec, }); self.workload.stop_workload()?; - // 1. Check Thermal Ceiling Halt Condition - let max_safe_temp = ThermalThresholdCelsius::MAX_SAFE_C - 5.0; // Margin - if avg_t >= max_safe_temp { - self.log(&format!("Thermal ceiling reached ({:.1}°C). Terminating Identification phase.", avg_t))?; - break; - } - - // 2. Check Diminishing Returns Halt Condition (< 1% gain) + // Performance Halt Condition if previous_ops > 0.0 { - let gain_percent = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; - if gain_percent < 1.0 { - self.log(&format!("Performance gain ({:.1}%) fell below 1%. Terminating Identification phase.", gain_percent))?; + let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; + if gain < 1.0 { + self.log("Diminishing returns reached. Stopping sweep.")?; break; } } - - // 3. Absolute Maximum Power Check - if current_pl >= 60.0 { - self.log("Maximum theoretical power limit reached. Terminating Identification phase.")?; - break; - } - previous_ops = metrics.primary_ops_per_sec; - current_pl += 2.0; - self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?; + self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?; thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } - // Phase 4: Physical Modeling (Agent Analyst) - self.phase = BenchmarkPhase::PhysicalModeling; - self.log("Phase 3: Calculating Silicon Physical Sweet Spot & Profiles...")?; + // 4. Physical Modeling Phase + self.ui_phase = BenchmarkPhase::PhysicalModeling; + self.log("Phase: Silicon Physical Sweet Spot Calculation")?; let analyst = HeuristicAnalyst::new(); - let matrix = analyst.analyze(&self.profile, max_soak_watts); + let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0)); let mut res = self.generate_result(false); res.optimization_matrix = Some(matrix.clone()); - self.log(&format!("✓ Thermal Resistance (Rθ): {:.3} K/W", res.thermal_resistance_kw))?; - self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?; + info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw); - thread::sleep(Duration::from_secs(3)); - - // Phase 5: Finalizing (Agent Integrator) - self.phase = BenchmarkPhase::Finalizing; - self.log("Benchmark sequence complete. Generating configurations...")?; - - let config = crate::engine::formatters::throttled::ThrottledConfig { - pl1_limit: res.silicon_knee_watts, - pl2_limit: res.recommended_pl2, - trip_temp: res.max_temp_c.max(95.0), - }; + // 5. Finalizing Phase + self.ui_phase = BenchmarkPhase::Finalizing; + self.log("Phase: Generation of Optimized Configuration Sets")?; let throttled_path = self.optional_config_out.clone() .or_else(|| self.facts.paths.configs.get("throttled").cloned()); if let Some(path) = throttled_path { + let config = crate::engine::formatters::throttled::ThrottledConfig { + pl1_limit: res.silicon_knee_watts, + pl2_limit: res.recommended_pl2, + trip_temp: res.max_temp_c.max(90.0), + }; crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?; - self.log(&format!("✓ Saved '{}'.", path.display()))?; - res.config_paths.insert("throttled".to_string(), path.clone()); + self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?; + res.config_paths.insert("throttled".to_string(), path); } - // Generate Fan configs via Agent Integrator - let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc")); - - let i8k_out = base_out.join("i8kmon.conf"); - if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out).is_ok() { - self.log(&format!("✓ Saved '{}'.", i8k_out.display()))?; - res.config_paths.insert("i8kmon".to_string(), i8k_out); - } - - let thinkfan_out = base_out.join("thinkfan.conf"); - if ServiceIntegrator::generate_thinkfan_config(&matrix, &thinkfan_out).is_ok() { - self.log(&format!("✓ Saved '{}'.", thinkfan_out.display()))?; - res.config_paths.insert("thinkfan".to_string(), thinkfan_out); - } - - let thermald_out = base_out.join("thermal-conf.xml"); - if ServiceIntegrator::generate_thermald_config(&matrix, &thermald_out).is_ok() { - self.log(&format!("✓ Saved '{}'.", thermald_out.display()))?; - res.config_paths.insert("thermald".to_string(), thermald_out); - } - - let script_out = base_out.join("ember-tune-neutralize.sh"); - if ServiceIntegrator::generate_conflict_resolution_script(&script_out).is_ok() { - self.log(&format!("✓ Saved conflict resolution script: '{}'", script_out.display()))?; - res.config_paths.insert("conflict_script".to_string(), script_out); - } - Ok(res) } - fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { - let abort = self.emergency_abort.clone(); - let reason_store = self.emergency_reason.clone(); - let sal = self.sal.clone(); - let tx = self.telemetry_tx.clone(); - - thread::spawn(move || { - while !abort.load(Ordering::SeqCst) { - let status = sal.get_safety_status(); - match status { - Ok(SafetyStatus::EmergencyAbort(reason)) => { - *reason_store.lock().unwrap() = Some(reason.clone()); - abort.store(true, Ordering::SeqCst); - break; - } - Ok(SafetyStatus::Warning(msg)) | Ok(SafetyStatus::Critical(msg)) => { - let state = TelemetryState { - cpu_model: String::new(), - total_ram_gb: 0, - tick: 0, - cpu_temp: 0.0, - power_w: 0.0, - current_freq: 0.0, - fans: Vec::new(), - governor: String::new(), - pl1_limit: 0.0, - pl2_limit: 0.0, - fan_tier: String::new(), - is_throttling: sal.get_throttling_status().unwrap_or(false), - phase: BenchmarkPhase::StressTesting, - history_watts: Vec::new(), - history_temp: Vec::new(), - history_mhz: Vec::new(), - log_event: Some(format!("WATCHDOG: {}", msg)), - metadata: std::collections::HashMap::new(), - is_emergency: false, - emergency_reason: None, - }; - let _ = tx.send(state); - } - Ok(SafetyStatus::Nominal) => {} - Err(e) => { - *reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e)); - abort.store(true, Ordering::SeqCst); - break; - } - } - thread::sleep(Duration::from_millis(100)); - } - }) - } - - pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { - let r_theta = self.engine.calculate_thermal_resistance(&self.profile); - let knee = self.engine.find_silicon_knee(&self.profile); - let max_t = self.engine.get_max_temp(&self.profile); - - OptimizationResult { - profile: self.profile.clone(), - silicon_knee_watts: knee, - thermal_resistance_kw: r_theta, - recommended_pl1: knee, - recommended_pl2: knee * 1.25, - max_temp_c: max_t, - is_partial, - config_paths: std::collections::HashMap::new(), - optimization_matrix: None, - } - } - - fn check_abort(&self) -> Result<()> { + /// Checks if the safety watchdog or user triggered an abort. + fn check_safety_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { - let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); - return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason)); + let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string()); + bail!("EMERGENCY_ABORT: {}", reason); } if let Ok(cmd) = self.command_rx.try_recv() { match cmd { - UiCommand::Abort => { - return Err(anyhow::anyhow!("ABORTED")); - } + UiCommand::Abort => bail!("ABORTED"), } } Ok(()) @@ -456,12 +364,12 @@ impl BenchmarkOrchestrator { power_w: self.sal.get_power_w().unwrap_or(0.0), current_freq: self.sal.get_freq_mhz().unwrap_or(0.0), fans: self.sal.get_fan_rpms().unwrap_or_default(), - governor: "unknown".to_string(), + governor: "performance".to_string(), pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(), is_throttling: self.sal.get_throttling_status().unwrap_or(false), - phase: self.phase, + phase: self.ui_phase, history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(), @@ -477,7 +385,6 @@ impl BenchmarkOrchestrator { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); let freq = self.sal.get_freq_mhz().unwrap_or(0.0); - let throttling = self.sal.get_throttling_status().unwrap_or(false); self.history_temp.push_back(temp); self.history_watts.push_back(pwr); @@ -501,8 +408,8 @@ impl BenchmarkOrchestrator { pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(), - is_throttling: throttling, - phase: self.phase, + is_throttling: self.sal.get_throttling_status().unwrap_or(false), + phase: self.ui_phase, history_watts: self.history_watts.iter().cloned().collect(), history_temp: self.history_temp.iter().cloned().collect(), history_mhz: self.history_mhz.iter().cloned().collect(), @@ -513,4 +420,22 @@ impl BenchmarkOrchestrator { }; self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } + + pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { + let r_theta = self.engine.calculate_thermal_resistance(&self.profile); + let knee = self.engine.find_silicon_knee(&self.profile); + let max_t = self.engine.get_max_temp(&self.profile); + + OptimizationResult { + profile: self.profile.clone(), + silicon_knee_watts: knee, + thermal_resistance_kw: r_theta, + recommended_pl1: knee, + recommended_pl2: knee * 1.25, + max_temp_c: max_t, + is_partial, + config_paths: std::collections::HashMap::new(), + optimization_matrix: None, + } + } } diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index be78de1..82fde52 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,11 +1,12 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use anyhow::{Result, Context, anyhow}; use std::fs; use std::path::{PathBuf}; use std::time::{Duration, Instant}; +use std::thread; use std::sync::Mutex; -use tracing::{debug, warn}; +use tracing::{info, debug}; use crate::sal::heuristic::discovery::SystemFactSheet; /// Implementation of the System Abstraction Layer for the Dell XPS 13 9380. @@ -15,30 +16,66 @@ pub struct DellXps9380Sal { temp_path: PathBuf, pwr_path: PathBuf, fan_paths: Vec, + pwm_paths: Vec, + pwm_enable_paths: Vec, + pl1_paths: Vec, + pl2_paths: Vec, freq_path: PathBuf, - pl1_path: PathBuf, - pl2_path: PathBuf, last_poll: Mutex, last_temp: Mutex, last_fans: Mutex>, - suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, last_watts: Mutex, - - // --- Original State for Restoration --- - original_pl1: Mutex>, - original_pl2: Mutex>, - original_fan_mode: Mutex>, } impl DellXps9380Sal { - /// Initializes the Dell SAL, opening the MSR interface and discovering sensors. + /// Initializes the Dell SAL, opening the MSR interface and discovering sensors and PWM nodes. pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result { let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?; let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?; let fan_paths = facts.fan_paths.clone(); + // 1. Discover PWM and Enable nodes associated with the fan paths + let mut pwm_paths = Vec::new(); + let mut pwm_enable_paths = Vec::new(); + for fan_p in &fan_paths { + if let Some(parent) = fan_p.parent() { + let fan_file = fan_p.file_name().and_then(|n| n.to_str()).unwrap_or(""); + let fan_idx = fan_file.chars().filter(|c| c.is_ascii_digit()).collect::(); + let idx = if fan_idx.is_empty() { "1".to_string() } else { fan_idx }; + + let pwm_p = parent.join(format!("pwm{}", idx)); + if pwm_p.exists() { pwm_paths.push(pwm_p); } + + let enable_p = parent.join(format!("pwm{}_enable", idx)); + if enable_p.exists() { pwm_enable_paths.push(enable_p); } + } + } + + // 2. Map all RAPL constraints + let mut pl1_paths = Vec::new(); + let mut pl2_paths = Vec::new(); + for rapl_p in &facts.rapl_paths { + pl1_paths.push(rapl_p.join("constraint_0_power_limit_uw")); + pl2_paths.push(rapl_p.join("constraint_1_power_limit_uw")); + } + + // 3. Physical Sensor Verification & Warm Cache Priming + let mut initial_fans = Vec::new(); + for fan_p in &fan_paths { + let mut rpm = 0; + for _ in 0..3 { + if let Ok(val) = fs::read_to_string(fan_p) { + rpm = val.trim().parse::().unwrap_or(0); + if rpm > 0 { break; } + } + thread::sleep(Duration::from_millis(100)); + } + info!("SAL Warm-Start: Fan sensor {:?} -> {} RPM", fan_p, rpm); + initial_fans.push(rpm); + } + let freq_path = ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); let msr_path = ctx.sysfs_base.join("dev/cpu/0/msr"); @@ -47,25 +84,26 @@ impl DellXps9380Sal { let initial_energy = fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0); + info!("SAL: Dell XPS 9380 Initialized. ({} fans, {} RAPL nodes found)", + fan_paths.len(), facts.rapl_paths.len()); + Ok(Self { temp_path, pwr_path: pwr_base.join("power1_average"), fan_paths, + pwm_paths, + pwm_enable_paths, + pl1_paths, + pl2_paths, freq_path, - pl1_path: pwr_base.join("constraint_0_power_limit_uw"), - pl2_path: pwr_base.join("constraint_1_power_limit_uw"), last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)), last_temp: Mutex::new(0.0), - last_fans: Mutex::new(Vec::new()), - suppressed_services: Mutex::new(Vec::new()), + last_fans: Mutex::new(initial_fans), msr_file: Mutex::new(msr_file), last_energy: Mutex::new((initial_energy, Instant::now())), last_watts: Mutex::new(0.0), fact_sheet: facts, ctx, - original_pl1: Mutex::new(None), - original_pl2: Mutex::new(None), - original_fan_mode: Mutex::new(None), }) } @@ -93,7 +131,6 @@ impl PreflightAuditor for DellXps9380Sal { outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) } }); - // RAPL Lock Check (MSR 0x610) let rapl_lock = match self.read_msr(0x610) { Ok(val) => { if (val & (1 << 63)) != 0 { @@ -104,19 +141,14 @@ impl PreflightAuditor for DellXps9380Sal { }, Err(e) => Err(AuditError::ToolMissing(format!("Cannot read MSR 0x610: {}", e))), }; - steps.push(AuditStep { - description: "MSR 0x610 RAPL Lock Status".to_string(), - outcome: rapl_lock, - }); + steps.push(AuditStep { description: "MSR 0x610 RAPL Lock Status".to_string(), outcome: rapl_lock }); let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"]; for mod_name in modules { let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name)); steps.push(AuditStep { description: format!("Kernel Module: {}", mod_name), - outcome: if path.exists() { Ok(()) } else { - Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) - } + outcome: if path.exists() { Ok(()) } else { Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) } }); } @@ -138,9 +170,7 @@ impl PreflightAuditor for DellXps9380Sal { let ac_status = fs::read_to_string(ac_status_path).unwrap_or_else(|_| "0".to_string()); steps.push(AuditStep { description: "AC Power Connection".to_string(), - outcome: if ac_status.trim() == "1" { Ok(()) } else { - Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) - } + outcome: if ac_status.trim() == "1" { Ok(()) } else { Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) } }); Box::new(steps.into_iter()) @@ -148,49 +178,16 @@ impl PreflightAuditor for DellXps9380Sal { } impl EnvironmentGuard for DellXps9380Sal { - fn suppress(&self) -> Result<()> { - if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { - *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); - } - if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { - *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); - } - *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); - - let services = ["tlp", "thermald", "i8kmon"]; - let mut suppressed = self.suppressed_services.lock().unwrap(); - for s in services { - if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { - let _ = self.ctx.runner.run("systemctl", &["stop", s]); - suppressed.push(s.to_string()); - } - } - Ok(()) - } - - fn restore(&self) -> Result<()> { - if let Some(pl1) = *self.original_pl1.lock().unwrap() { - let _ = fs::write(&self.pl1_path, pl1.to_string()); - } - if let Some(pl2) = *self.original_pl2.lock().unwrap() { - let _ = fs::write(&self.pl2_path, pl2.to_string()); - } - if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { - let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); - } - let mut suppressed = self.suppressed_services.lock().unwrap(); - for s in suppressed.drain(..) { - let _ = self.ctx.runner.run("systemctl", &["start", &s]); - } - Ok(()) - } + fn suppress(&self) -> Result<()> { Ok(()) } + fn restore(&self) -> Result<()> { Ok(()) } } impl SensorBus for DellXps9380Sal { fn get_temp(&self) -> Result { let mut last_poll = self.last_poll.lock().unwrap(); let now = Instant::now(); - if now.duration_since(*last_poll) < Duration::from_millis(1000) { + // # SAFETY: High frequency polling for watchdog + if now.duration_since(*last_poll) < Duration::from_millis(100) { return Ok(*self.last_temp.lock().unwrap()); } let s = fs::read_to_string(&self.temp_path)?; @@ -201,7 +198,7 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - let rapl_base = self.pl1_path.parent().context("RAPL path error")?; + let rapl_base = self.fact_sheet.rapl_paths.first().context("RAPL path error")?; let energy_path = rapl_base.join("energy_uj"); if energy_path.exists() { @@ -212,14 +209,9 @@ impl SensorBus for DellXps9380Sal { let e2 = e2_str.trim().parse::()?; let t2 = Instant::now(); let (e1, t1) = *last_energy; - let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); - - if delta_t < 0.1 { - return Ok(*last_watts); // Return cached if polled too fast - } - + if delta_t < 0.1 { return Ok(*last_watts); } let watts = (delta_e as f32 / 1_000_000.0) / delta_t; *last_energy = (e2, t2); *last_watts = watts; @@ -236,12 +228,27 @@ impl SensorBus for DellXps9380Sal { if now.duration_since(*last_poll) < Duration::from_millis(1000) { return Ok(self.last_fans.lock().unwrap().clone()); } + let mut fans = Vec::new(); for path in &self.fan_paths { - if let Ok(s) = fs::read_to_string(path) { - if let Ok(rpm) = s.trim().parse::() { fans.push(rpm); } + let mut val = 0; + for i in 0..5 { + match fs::read_to_string(path) { + Ok(s) => { + if let Ok(rpm) = s.trim().parse::() { + val = rpm; + if rpm > 0 { break; } + } + }, + Err(e) => { + debug!("SAL: Fan poll retry {} for {:?} failed: {}", i+1, path, e); + } + } + thread::sleep(Duration::from_millis(150)); } + fans.push(val); } + *self.last_fans.lock().unwrap() = fans.clone(); *last_poll = now; Ok(fans) @@ -253,7 +260,6 @@ impl SensorBus for DellXps9380Sal { } fn get_throttling_status(&self) -> Result { - // MSR 0x19C bit 0 is "Thermal Status", bit 1 is "Thermal Log" let val = self.read_msr(0x19C)?; Ok((val & 0x1) != 0) } @@ -266,24 +272,47 @@ impl ActuatorBus for DellXps9380Sal { let tool_str = tool_path.to_string_lossy(); match mode { - "max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; } + "max" | "Manual" => { + self.ctx.runner.run(&tool_str, &["0"])?; + // Disabling BIOS control requires immediate PWM override + self.set_fan_speed(FanSpeedPercent::new(100)?)?; + } "auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; } _ => {} } Ok(()) } - fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { + let pwm_val = ((speed.get() as u32 * 255) / 100) as u8; + for p in &self.pwm_enable_paths { let _ = fs::write(p, "1"); } + for path in &self.pwm_paths { let _ = fs::write(path, pwm_val.to_string()); } Ok(()) } - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - fs::write(&self.pl1_path, limit.as_u64().to_string())?; + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for path in &self.pl1_paths { + debug!("SAL: Applying PL1 ({:.1}W) to {:?}", limit.get(), path); + fs::write(path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL1 to {:?}", path))?; + if let Some(parent) = path.parent() { + let enable_p = parent.join("constraint_0_enabled"); + let _ = fs::write(&enable_p, "1"); + } + } Ok(()) } - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - fs::write(&self.pl2_path, limit.as_u64().to_string())?; + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for path in &self.pl2_paths { + debug!("SAL: Applying PL2 ({:.1}W) to {:?}", limit.get(), path); + fs::write(path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL2 to {:?}", path))?; + if let Some(parent) = path.parent() { + let enable_p = parent.join("constraint_1_enabled"); + let _ = fs::write(&enable_p, "1"); + } + } Ok(()) } } @@ -305,7 +334,5 @@ impl HardwareWatchdog for DellXps9380Sal { } impl Drop for DellXps9380Sal { - fn drop(&mut self) { - let _ = self.restore(); - } + fn drop(&mut self) { } } diff --git a/src/sal/discovery.rs b/src/sal/discovery.rs new file mode 100644 index 0000000..51c8df6 --- /dev/null +++ b/src/sal/discovery.rs @@ -0,0 +1,148 @@ +//! # Hardware Discovery Engine (Agent Sentinel) +//! +//! This module provides dynamic traversal of `/sys/class/hwmon` and `/sys/class/powercap` +//! to locate sensors and actuators without relying on hardcoded indices. + +use anyhow::{Result, Context, anyhow}; +use std::fs; +use std::path::{Path, PathBuf}; +use tracing::{debug, info, warn}; + +/// Result of a successful hardware discovery. +#[derive(Debug, Clone)] +pub struct DiscoveredHardware { + /// Path to the primary package temperature sensor input. + pub temp_input: PathBuf, + /// Paths to all detected fan RPM inputs. + pub fan_inputs: Vec, + /// Paths to all detected fan PWM control nodes. + pub pwm_controls: Vec, + /// Paths to all detected fan PWM enable nodes. + pub pwm_enables: Vec, + /// Paths to RAPL power limit constraint files. + pub rapl_paths: Vec, +} + +pub struct DiscoveryEngine; + +impl DiscoveryEngine { + /// Performs a full traversal of the sysfs hardware tree. + pub fn run(sysfs_root: &Path) -> Result { + info!("Sentinel: Starting dynamic hardware discovery..."); + + let hwmon_path = sysfs_root.join("sys/class/hwmon"); + let (temp_input, fan_info) = Self::discover_hwmon(&hwmon_path)?; + + let powercap_path = sysfs_root.join("sys/class/powercap"); + let rapl_paths = Self::discover_rapl(&powercap_path)?; + + let hardware = DiscoveredHardware { + temp_input, + fan_inputs: fan_info.rpm_inputs, + pwm_controls: fan_info.pwm_controls, + pwm_enables: fan_info.pwm_enables, + rapl_paths, + }; + + info!("Sentinel: Discovery complete. Found {} fans and {} RAPL nodes.", + hardware.fan_inputs.len(), hardware.rapl_paths.len()); + + Ok(hardware) + } + + fn discover_hwmon(base: &Path) -> Result<(PathBuf, FanHardware)> { + let mut best_temp: Option<(u32, PathBuf)> = None; + let mut fans = FanHardware::default(); + + let entries = fs::read_dir(base) + .with_context(|| format!("Failed to read hwmon base: {:?}", base))?; + + for entry in entries.flatten() { + let path = entry.path(); + let driver_name = fs::read_to_string(path.join("name")) + .map(|s| s.trim().to_string()) + .unwrap_or_else(|_| "unknown".to_string()); + + debug!("Discovery: Probing hwmon node {:?} (driver: {})", path, driver_name); + + // 1. Temperature Discovery + let temp_priority = match driver_name.as_str() { + "coretemp" | "zenpower" => 10, + "k10temp" => 9, + "dell_smm" => 8, + "acpitz" => 1, + _ => 5, + }; + + if let Ok(hw_entries) = fs::read_dir(&path) { + for hw_entry in hw_entries.flatten() { + let file_name = hw_entry.file_name().to_string_lossy().to_string(); + + // Temperature Inputs + if file_name.starts_with("temp") && file_name.ends_with("_input") { + let label_path = path.join(file_name.replace("_input", "_label")); + let label = fs::read_to_string(label_path).unwrap_or_default().trim().to_string(); + + let label_priority = if label.contains("Package") || label.contains("Tdie") { + 2 + } else { + 0 + }; + + let total_priority = temp_priority + label_priority; + if best_temp.is_none() || total_priority > best_temp.as_ref().unwrap().0 { + best_temp = Some((total_priority, hw_entry.path())); + } + } + + // Fan Inputs + if file_name.starts_with("fan") && file_name.ends_with("_input") { + fans.rpm_inputs.push(hw_entry.path()); + } + + // PWM Controls + if file_name.starts_with("pwm") && !file_name.contains("_") { + fans.pwm_controls.push(hw_entry.path()); + } + + // PWM Enables + if file_name.starts_with("pwm") && file_name.ends_with("_enable") { + fans.pwm_enables.push(hw_entry.path()); + } + } + } + } + + let temp_input = best_temp.map(|(_, p)| p) + .ok_or_else(|| anyhow!("Failed to locate any valid temperature sensor in /sys/class/hwmon/"))?; + + Ok((temp_input, fans)) + } + + fn discover_rapl(base: &Path) -> Result> { + let mut paths = Vec::new(); + if !base.exists() { + warn!("Discovery: /sys/class/powercap does not exist."); + return Ok(paths); + } + + let entries = fs::read_dir(base)?; + for entry in entries.flatten() { + let path = entry.path(); + let name = fs::read_to_string(path.join("name")).unwrap_or_default().trim().to_string(); + + if name.contains("package") || name.contains("intel-rapl") { + paths.push(path); + } + } + + Ok(paths) + } +} + +#[derive(Default)] +struct FanHardware { + rpm_inputs: Vec, + pwm_controls: Vec, + pwm_enables: Vec, +} diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 767dbe7..3456794 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -1,11 +1,12 @@ -use anyhow::{Result, anyhow}; +use anyhow::{Result, anyhow, Context}; use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; -use std::sync::Mutex; +use std::sync::{Mutex, Arc}; +use tracing::{debug, warn, info}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; @@ -13,14 +14,9 @@ pub struct GenericLinuxSal { ctx: EnvironmentCtx, fact_sheet: SystemFactSheet, db: HardwareDb, - suppressed_services: Mutex>, last_valid_temp: Mutex<(f32, Instant)>, current_pl1: Mutex, last_energy: Mutex<(u64, Instant)>, - - // --- Original State for Restoration --- - original_pl1: Mutex>, - original_pl2: Mutex>, } impl GenericLinuxSal { @@ -33,14 +29,11 @@ impl GenericLinuxSal { Self { db, - suppressed_services: Mutex::new(Vec::new()), last_valid_temp: Mutex::new((0.0, Instant::now())), current_pl1: Mutex::new(15_000_000), last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, - original_pl1: Mutex::new(None), - original_pl2: Mutex::new(None), } } @@ -135,7 +128,6 @@ impl SensorBus for GenericLinuxSal { } fn get_throttling_status(&self) -> Result { - // Fallback: check if any cooling device is active (cur_state > 0) let cooling_base = self.ctx.sysfs_base.join("sys/class/thermal"); if let Ok(entries) = fs::read_dir(cooling_base) { for entry in entries.flatten() { @@ -168,68 +160,37 @@ impl ActuatorBus for GenericLinuxSal { } else { Ok(()) } } - fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { + fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) } - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; - fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?; - *self.current_pl1.lock().unwrap() = limit.as_u64(); + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for rapl_path in &self.fact_sheet.rapl_paths { + let limit_path = rapl_path.join("constraint_0_power_limit_uw"); + let enable_path = rapl_path.join("constraint_0_enabled"); + fs::write(&limit_path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL1 to {:?}", limit_path))?; + let _ = fs::write(&enable_path, "1"); + } + *self.current_pl1.lock().unwrap() = limit.as_microwatts(); Ok(()) } - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?; - fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?; + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { + for rapl_path in &self.fact_sheet.rapl_paths { + let limit_path = rapl_path.join("constraint_1_power_limit_uw"); + let enable_path = rapl_path.join("constraint_1_enabled"); + fs::write(&limit_path, limit.as_microwatts().to_string()) + .with_context(|| format!("Failed to write PL2 to {:?}", limit_path))?; + let _ = fs::write(&enable_path, "1"); + } Ok(()) } } impl EnvironmentGuard for GenericLinuxSal { - fn suppress(&self) -> Result<()> { - // Snapshot Power Limits - if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { - if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) { - *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); - } - if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) { - *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); - } - } - - let mut suppressed = self.suppressed_services.lock().unwrap(); - for conflict_id in &self.fact_sheet.active_conflicts { - if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { - for service in &conflict.services { - if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() { - let _ = self.ctx.runner.run("systemctl", &["stop", service]); - suppressed.push(service.clone()); - } - } - } - } - Ok(()) - } - - fn restore(&self) -> Result<()> { - // Restore Power Limits - if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { - if let Some(pl1) = *self.original_pl1.lock().unwrap() { - let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string()); - } - if let Some(pl2) = *self.original_pl2.lock().unwrap() { - let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string()); - } - } - - let mut suppressed = self.suppressed_services.lock().unwrap(); - for service in suppressed.drain(..) { - let _ = self.ctx.runner.run("systemctl", &["start", &service]); - } - if self.is_dell() { let _ = self.set_fan_mode("auto"); } - Ok(()) - } + fn suppress(&self) -> Result<()> { Ok(()) } + fn restore(&self) -> Result<()> { Ok(()) } } impl HardwareWatchdog for GenericLinuxSal { @@ -245,7 +206,3 @@ impl HardwareWatchdog for GenericLinuxSal { Ok(SafetyStatus::Nominal) } } - -impl Drop for GenericLinuxSal { - fn drop(&mut self) { let _ = self.restore(); } -} diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs index 3dce223..77803df 100644 --- a/src/sal/heuristic/discovery.rs +++ b/src/sal/heuristic/discovery.rs @@ -6,7 +6,7 @@ use std::sync::mpsc; use std::collections::HashMap; use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking}; use crate::sys::SyscallRunner; -use tracing::{debug, warn}; +use tracing::{debug, warn, info}; /// Registry of dynamically discovered paths for configs and tools. #[derive(Debug, Clone, Default)] @@ -24,6 +24,7 @@ pub struct SystemFactSheet { pub fan_paths: Vec, pub rapl_paths: Vec, pub active_conflicts: Vec, + pub conflict_services: Vec, pub paths: PathRegistry, pub bench_config: Option, } @@ -44,12 +45,17 @@ pub fn discover_facts( let rapl_paths = discover_rapl(base_path, &discovery.actuators); let mut active_conflicts = Vec::new(); + let mut conflict_services = Vec::new(); for conflict in conflicts { + let mut found_active = false; for service in &conflict.services { if is_service_active(runner, service) { - debug!("Detected active conflict: {} (Service: {})", conflict.id, service); - active_conflicts.push(conflict.id.clone()); - break; + if !found_active { + debug!("Detected active conflict: {} (Service: {})", conflict.id, service); + active_conflicts.push(conflict.id.clone()); + found_active = true; + } + conflict_services.push(service.clone()); } } } @@ -57,13 +63,7 @@ pub fn discover_facts( let paths = discover_paths(base_path, discovery); SystemFactSheet { - vendor, - model, - temp_path, - fan_paths, - rapl_paths, - active_conflicts, - paths, + vendor, model, temp_path, fan_paths, rapl_paths, active_conflicts, conflict_services, paths, bench_config: Some(bench_config), } } @@ -71,7 +71,6 @@ pub fn discover_facts( fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { let mut registry = PathRegistry::default(); - // 1. Discover Tools via PATH for (id, binary_name) in &discovery.tools { if let Ok(path) = which::which(binary_name) { debug!("Discovered tool: {} -> {:?}", id, path); @@ -79,7 +78,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { } } - // 2. Discover Configs via existence check for (id, candidates) in &discovery.configs { for candidate in candidates { let path = if candidate.starts_with('/') { @@ -104,12 +102,11 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { registry } -/// Reads DMI information from sysfs with a safety timeout. fn read_dmi_info(base_path: &Path) -> (String, String) { - let vendor = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/sys_vendor"), Duration::from_millis(100)) - .unwrap_or_else(|| "Unknown".to_string()); - let model = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/product_name"), Duration::from_millis(100)) - .unwrap_or_else(|| "Unknown".to_string()); + let vendor = fs::read_to_string(base_path.join("sys/class/dmi/id/sys_vendor")) + .map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string()); + let model = fs::read_to_string(base_path.join("sys/class/dmi/id/product_name")) + .map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string()); (vendor, model) } @@ -119,49 +116,62 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, let mut fan_candidates = Vec::new(); let hwmon_base = base_path.join("sys/class/hwmon"); - let entries = match fs::read_dir(&hwmon_base) { - Ok(e) => e, - Err(e) => { - warn!("Could not read {:?}: {}", hwmon_base, e); - return (None, Vec::new()); - } - }; + let entries = fs::read_dir(&hwmon_base).map_err(|e| { + warn!("Could not read {:?}: {}", hwmon_base, e); + e + }).ok(); - for entry in entries.flatten() { - let hwmon_path = entry.path(); - - let driver_name = read_sysfs_with_timeout(&hwmon_path.join("name"), Duration::from_millis(100)) - .unwrap_or_default(); + if let Some(entries) = entries { + for entry in entries.flatten() { + let hwmon_path = entry.path(); + + // # SAFETY: Read driver name directly. This file is virtual and never blocks. + // Using a timeout wrapper here was causing discovery to fail if the thread-pool lagged. + let driver_name = fs::read_to_string(hwmon_path.join("name")) + .map(|s| s.trim().to_string()).unwrap_or_default(); - let priority = cfg.hwmon_priority - .iter() - .position(|p| p == &driver_name) - .unwrap_or(usize::MAX); + let priority = cfg.hwmon_priority + .iter() + .position(|p| driver_name.contains(p)) + .unwrap_or(usize::MAX); - if let Ok(hw_entries) = fs::read_dir(&hwmon_path) { - for hw_entry in hw_entries.flatten() { - let file_name = hw_entry.file_name().into_string().unwrap_or_default(); - - if file_name.starts_with("temp") && file_name.ends_with("_label") { - if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { - if cfg.temp_labels.iter().any(|l| label.contains(l)) { - let input_path = hwmon_path.join(file_name.replace("_label", "_input")); - if input_path.exists() { - temp_candidates.push((priority, input_path)); + if let Ok(hw_entries) = fs::read_dir(&hwmon_path) { + for hw_entry in hw_entries.flatten() { + let file_name = hw_entry.file_name().into_string().unwrap_or_default(); + + // 1. Temperatures + if file_name.starts_with("temp") && file_name.ends_with("_label") { + if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) { + if cfg.temp_labels.iter().any(|l| label.contains(l)) { + let input_path = hwmon_path.join(file_name.replace("_label", "_input")); + if input_path.exists() { + temp_candidates.push((priority, input_path)); + } } } } - } - if file_name.starts_with("fan") && file_name.ends_with("_label") { - if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { - if cfg.fan_labels.iter().any(|l| label.contains(l)) { - let input_path = hwmon_path.join(file_name.replace("_label", "_input")); - if input_path.exists() { - fan_candidates.push((priority, input_path)); + // 2. Fans (Label Match) + if file_name.starts_with("fan") && file_name.ends_with("_label") { + if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) { + if cfg.fan_labels.iter().any(|l| label.contains(l)) { + let input_path = hwmon_path.join(file_name.replace("_label", "_input")); + if input_path.exists() { + debug!("Discovered fan by label: {:?} (priority {})", input_path, priority); + fan_candidates.push((priority, input_path)); + } } } } + + // 3. Fans (Priority Fallback - CRITICAL FOR DELL 9380) + // If we found a priority driver (e.g., dell_smm), we take every fan*_input we find. + if priority < usize::MAX && file_name.starts_with("fan") && file_name.ends_with("_input") { + if !fan_candidates.iter().any(|(_, p)| p == &hw_entry.path()) { + info!("Heuristic Discovery: Force-adding unlabeled fan sensor from priority driver '{}': {:?}", driver_name, hw_entry.path()); + fan_candidates.push((priority, hw_entry.path())); + } + } } } } @@ -171,45 +181,45 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, fan_candidates.sort_by_key(|(p, _)| *p); let best_temp = temp_candidates.first().map(|(_, p)| p.clone()); - let best_fans = fan_candidates.into_iter().map(|(_, p)| p).collect(); + let best_fans: Vec = fan_candidates.into_iter().map(|(_, p)| p).collect(); + + if best_fans.is_empty() { + warn!("Heuristic Discovery: No fan RPM sensors found."); + } else { + info!("Heuristic Discovery: Final registry contains {} fan sensors.", best_fans.len()); + } (best_temp, best_fans) } -/// Discovers RAPL powercap paths. fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec { let mut paths = Vec::new(); let powercap_base = base_path.join("sys/class/powercap"); - let entries = match fs::read_dir(&powercap_base) { - Ok(e) => e, - Err(_) => return Vec::new(), - }; - - for entry in entries.flatten() { - let path = entry.path(); - let dir_name = entry.file_name().into_string().unwrap_or_default(); - - if cfg.rapl_paths.contains(&dir_name) { - paths.push(path); - continue; - } - - if let Some(name) = read_sysfs_with_timeout(&path.join("name"), Duration::from_millis(100)) { - if cfg.rapl_paths.iter().any(|p| p == &name) { + if let Ok(entries) = fs::read_dir(&powercap_base) { + for entry in entries.flatten() { + let path = entry.path(); + let dir_name = entry.file_name().into_string().unwrap_or_default(); + + if cfg.rapl_paths.contains(&dir_name) { paths.push(path); + continue; + } + + if let Ok(name) = fs::read_to_string(path.join("name")) { + if cfg.rapl_paths.iter().any(|p| p == name.trim()) { + paths.push(path); + } } } } paths } -/// Checks if a systemd service is currently active using the injected runner. pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool { runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() } -/// Helper to read a sysfs file with a timeout. fn read_sysfs_with_timeout(path: &Path, timeout: Duration) -> Option { let (tx, rx) = mpsc::channel(); let path_buf = path.to_path_buf(); diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 079a982..6a9b3b1 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,6 +1,7 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use anyhow::Result; +use std::sync::Arc; pub struct MockSal { pub temperature_sequence: std::sync::atomic::AtomicUsize, @@ -17,65 +18,36 @@ impl MockSal { impl PreflightAuditor for MockSal { fn audit(&self) -> Box + '_> { let steps = vec![ - AuditStep { - description: "Mock Root Privileges".to_string(), - outcome: Ok(()), - }, - AuditStep { - description: "Mock AC Power Status".to_string(), - outcome: Ok(()), - }, + AuditStep { description: "Mock Root Privileges".to_string(), outcome: Ok(()) }, + AuditStep { description: "Mock AC Power Status".to_string(), outcome: Ok(()) }, ]; Box::new(steps.into_iter()) } } impl EnvironmentGuard for MockSal { - fn suppress(&self) -> Result<()> { - Ok(()) - } - fn restore(&self) -> Result<()> { - Ok(()) - } + fn suppress(&self) -> Result<()> { Ok(()) } + fn restore(&self) -> Result<()> { Ok(()) } } impl SensorBus for MockSal { fn get_temp(&self) -> Result { - // Support dynamic sequence for Step 5 let seq = self.temperature_sequence.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - Ok(40.0 + (seq as f32 * 0.5).min(50.0)) // Heats up from 40 to 90 - } - fn get_power_w(&self) -> Result { - Ok(15.0) - } - fn get_fan_rpms(&self) -> Result> { - Ok(vec![2500]) - } - fn get_freq_mhz(&self) -> Result { - Ok(3200.0) - } - fn get_throttling_status(&self) -> Result { - Ok(self.get_temp()? > 90.0) + Ok(40.0 + (seq as f32 * 0.5).min(55.0)) } + fn get_power_w(&self) -> Result { Ok(15.0) } + fn get_fan_rpms(&self) -> Result> { Ok(vec![2500, 2400]) } + fn get_freq_mhz(&self) -> Result { Ok(3200.0) } + fn get_throttling_status(&self) -> Result { Ok(false) } } impl ActuatorBus for MockSal { - fn set_fan_mode(&self, _mode: &str) -> Result<()> { - Ok(()) - } - fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> { - Ok(()) - } - fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { - Ok(()) - } - fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> { - Ok(()) - } + fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) } + fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) } + fn set_sustained_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) } + fn set_burst_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) } } impl HardwareWatchdog for MockSal { - fn get_safety_status(&self) -> Result { - Ok(SafetyStatus::Nominal) - } + fn get_safety_status(&self) -> Result { Ok(SafetyStatus::Nominal) } } diff --git a/src/sal/mod.rs b/src/sal/mod.rs index d2f276f..a8cd205 100644 --- a/src/sal/mod.rs +++ b/src/sal/mod.rs @@ -4,3 +4,4 @@ pub mod dell_xps_9380; pub mod generic_linux; pub mod heuristic; pub mod safety; +pub mod discovery; diff --git a/src/sal/safety.rs b/src/sal/safety.rs index f33689d..88c641a 100644 --- a/src/sal/safety.rs +++ b/src/sal/safety.rs @@ -8,68 +8,81 @@ use anyhow::{Result, bail, Context}; use std::collections::HashMap; use std::fs; use std::path::{PathBuf}; -use tracing::{info, warn, error}; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::{Duration, Instant}; +use std::thread; +use tracing::{info, warn, error, debug}; + +use crate::sal::traits::SensorBus; // --- 1. Type-Driven Bounds Checking --- -/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub struct TdpLimitMicroWatts(u64); +/// Represents a validated TDP limit in Watts. +#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] +pub struct PowerLimitWatts(f32); -impl TdpLimitMicroWatts { - /// # SAFETY: - /// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability. - pub const MIN_SAFE_UW: u64 = 5_000_000; - /// # SAFETY: - /// Values above 80W can exceed the thermal and electrical design limits of XPS chassis. - pub const MAX_SAFE_UW: u64 = 80_000_000; +impl PowerLimitWatts { + /// Absolute safety floor. Setting TDP below 3W can induce system-wide + /// CPU stalls and I/O deadlocks on certain Intel mobile chipsets. + pub const MIN: f32 = 3.0; + /// Safety ceiling for mobile thin-and-light chassis. + pub const MAX: f32 = 100.0; - /// Validates and constructs a new TDP limit. - pub fn new(microwatts: u64) -> Result { - if microwatts < Self::MIN_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts); + /// Validates and constructs a new PowerLimitWatts. + pub fn try_new(watts: f32) -> Result { + if watts < Self::MIN || watts > Self::MAX { + bail!("HardwareSafetyError: Requested TDP {:.1}W is outside safe bounds ({:.1}W - {:.1}W).", watts, Self::MIN, Self::MAX); } - if microwatts > Self::MAX_SAFE_UW { - bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts); - } - Ok(Self(microwatts)) + Ok(Self(watts)) } pub fn from_watts(watts: f32) -> Result { - Self::new((watts * 1_000_000.0) as u64) + Self::try_new(watts) } - pub fn as_u64(&self) -> u64 { self.0 } + pub fn get(&self) -> f32 { self.0 } + pub fn as_microwatts(&self) -> u64 { (self.0 * 1_000_000.0) as u64 } } -/// Represents a fan speed percentage (0-100%). +/// Represents a validated fan speed percentage. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct FanSpeedPercentage(u8); +pub struct FanSpeedPercent(u8); -impl FanSpeedPercentage { - pub fn new(percent: u8) -> Result { +impl FanSpeedPercent { + pub fn try_new(percent: u8) -> Result { if percent > 100 { bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent); } Ok(Self(percent)) } - pub fn as_u8(&self) -> u8 { self.0 } + + pub fn new(percent: u8) -> Result { + Self::try_new(percent) + } + + pub fn get(&self) -> u8 { self.0 } } -/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C). +/// Represents a thermal threshold in Celsius. #[derive(Debug, Clone, Copy, PartialEq, PartialOrd)] pub struct ThermalThresholdCelsius(f32); impl ThermalThresholdCelsius { pub const MAX_SAFE_C: f32 = 98.0; - pub fn new(celsius: f32) -> Result { + pub fn try_new(celsius: f32) -> Result { if celsius > Self::MAX_SAFE_C { - bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius); + bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit ({}C).", celsius, Self::MAX_SAFE_C); } Ok(Self(celsius)) } - pub fn as_f32(&self) -> f32 { self.0 } + + pub fn new(celsius: f32) -> Result { + Self::try_new(celsius) + } + + pub fn get(&self) -> f32 { self.0 } } // --- 2. The HardwareStateGuard (RAII Restorer) --- @@ -78,6 +91,7 @@ impl ThermalThresholdCelsius { pub type RollbackAction = Box; /// Holds a snapshot of the system state. Restores everything on Drop. +/// This is the primary safety mechanism for Project Iron-Ember. pub struct HardwareStateGuard { /// Maps sysfs paths to their original string contents. snapshots: HashMap, @@ -90,6 +104,9 @@ pub struct HardwareStateGuard { impl HardwareStateGuard { /// Snapshots the requested files and neutralizes competing services. + /// + /// # SAFETY: + /// This MUST be acquired before any hardware mutation occurs. pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result { let mut snapshots = HashMap::new(); let mut suppressed = Vec::new(); @@ -101,10 +118,13 @@ impl HardwareStateGuard { let content = fs::read_to_string(path) .with_context(|| format!("Failed to snapshot {:?}", path))?; snapshots.insert(path.clone(), content.trim().to_string()); + } else { + debug!("USA: Skipping snapshot for non-existent path {:?}", path); } } for svc in target_services { + // Check if service is active before stopping let status = std::process::Command::new("systemctl") .args(["is-active", "--quiet", svc]) .status(); @@ -168,7 +188,75 @@ impl Drop for HardwareStateGuard { } } -// --- 3. Transactional Configuration --- +// --- 3. The Active Watchdog --- + +/// A standalone monitor that polls hardware thermals at high frequency. +pub struct ThermalWatchdog { + cancel_token: Arc, + handle: Option>, +} + +impl ThermalWatchdog { + /// If temperature exceeds this ceiling, the watchdog triggers an emergency shutdown. + pub const CRITICAL_TEMP: f32 = 95.0; + /// High polling rate ensures we catch runaways before chassis saturation. + pub const POLL_INTERVAL: Duration = Duration::from_millis(250); + + /// Spawns the watchdog thread. + pub fn spawn(sensors: Arc, cancel_token: Arc) -> Self { + let ct = cancel_token.clone(); + let handle = thread::spawn(move || { + let mut last_temp = 0.0; + loop { + if ct.load(Ordering::SeqCst) { + debug!("Watchdog: Shutdown signal received."); + break; + } + + match sensors.get_temp() { + Ok(temp) => { + // Rate of change check (dT/dt) + let dt_dt = temp - last_temp; + if temp >= Self::CRITICAL_TEMP { + error!("WATCHDOG: CRITICAL THERMAL EVENT ({:.1}C). Triggering emergency abort!", temp); + ct.store(true, Ordering::SeqCst); + break; + } + + if dt_dt > 5.0 && temp > 85.0 { + warn!("WATCHDOG: Dangerous thermal ramp detected (+{:.1}C in 250ms).", dt_dt); + } + + last_temp = temp; + } + Err(e) => { + error!("WATCHDOG: Sensor read failure: {}. Aborting for safety!", e); + ct.store(true, Ordering::SeqCst); + break; + } + } + + thread::sleep(Self::POLL_INTERVAL); + } + }); + + Self { + cancel_token, + handle: Some(handle), + } + } +} + +impl Drop for ThermalWatchdog { + fn drop(&mut self) { + self.cancel_token.store(true, Ordering::SeqCst); + if let Some(h) = self.handle.take() { + let _ = h.join(); + } + } +} + +// --- 4. Transactional Configuration --- /// A staged set of changes to be applied to the hardware. #[derive(Default)] diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 996b4e6..7cd1367 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -115,30 +115,20 @@ impl EnvironmentGuard for Arc { } } +use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; + /// Provides a read-only interface to system telemetry sensors. pub trait SensorBus: Send + Sync { /// Returns the current package temperature in degrees Celsius. - /// - /// # Errors - /// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read. fn get_temp(&self) -> Result; /// Returns the current package power consumption in Watts. - /// - /// # Errors - /// Returns an error if the underlying RAPL or power sensor cannot be read. fn get_power_w(&self) -> Result; /// Returns the current speed of all detected fans in RPM. - /// - /// # Errors - /// Returns an error if the fan sensor nodes cannot be read. fn get_fan_rpms(&self) -> Result>; /// Returns the current average CPU frequency in MHz. - /// - /// # Errors - /// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read. fn get_freq_mhz(&self) -> Result; /// Returns true if the system is currently thermally throttling. @@ -146,53 +136,33 @@ pub trait SensorBus: Send + Sync { } impl SensorBus for Arc { - fn get_temp(&self) -> Result { - (**self).get_temp() - } - fn get_power_w(&self) -> Result { - (**self).get_power_w() - } - fn get_fan_rpms(&self) -> Result> { - (**self).get_fan_rpms() - } - fn get_freq_mhz(&self) -> Result { - (**self).get_freq_mhz() - } - fn get_throttling_status(&self) -> Result { - (**self).get_throttling_status() - } + fn get_temp(&self) -> Result { (**self).get_temp() } + fn get_power_w(&self) -> Result { (**self).get_power_w() } + fn get_fan_rpms(&self) -> Result> { (**self).get_fan_rpms() } + fn get_freq_mhz(&self) -> Result { (**self).get_freq_mhz() } + fn get_throttling_status(&self) -> Result { (**self).get_throttling_status() } } -use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage}; - /// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { /// Sets the fan control mode (e.g., "auto" or "max"). fn set_fan_mode(&self, mode: &str) -> Result<()>; /// Sets the fan speed directly using a validated percentage. - fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>; + fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()>; /// Sets the sustained power limit (PL1) using a validated wrapper. - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()>; /// Sets the burst power limit (PL2) using a validated wrapper. - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>; + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()>; } impl ActuatorBus for Arc { - fn set_fan_mode(&self, mode: &str) -> Result<()> { - (**self).set_fan_mode(mode) - } - fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> { - (**self).set_fan_speed(speed) - } - fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - (**self).set_sustained_power_limit(limit) - } - fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> { - (**self).set_burst_power_limit(limit) - } + fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) } + fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { (**self).set_fan_speed(speed) } + fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) } + fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_burst_power_limit(limit) } } /// Represents the high-level safety status of the system.