From 073414a25ed8538fa6fefbe0b5f134b38e97f5de Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Thu, 26 Feb 2026 15:52:44 +0100 Subject: [PATCH] fixed hardware_db and improved stability and robustness of generic sal --- assets/hardware_db.toml | 42 ++++++++-- assets/hardware_db.toml.bak | 117 +++++++++++++++++++++++++++ src/engine/mod.rs | 22 ++++-- src/load/mod.rs | 37 ++++++--- src/main.rs | 64 ++++++++------- src/mediator.rs | 2 + src/orchestrator/mod.rs | 61 ++++++++++++-- src/sal/dell_xps_9380.rs | 154 +++++++++++++++++------------------- src/sal/generic_linux.rs | 125 ++++++++++++----------------- src/sal/heuristic/schema.rs | 2 + src/sal/mock.rs | 10 +-- src/sal/traits.rs | 27 +++++-- src/ui/dashboard.rs | 50 ++++++++++++ 13 files changed, 488 insertions(+), 225 deletions(-) create mode 100644 assets/hardware_db.toml.bak diff --git a/assets/hardware_db.toml b/assets/hardware_db.toml index 3f5e480..4eeedef 100644 --- a/assets/hardware_db.toml +++ b/assets/hardware_db.toml @@ -1,5 +1,5 @@ [metadata] -version = "1.0.0" +version = "1.1.0" updated = "2026-02-26" description = "Hardware and Conflict Database for ember-tune Thermal Engine" @@ -29,6 +29,14 @@ severity = "Medium" fix_action = "SuspendService" help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification." +[[conflicts]] +id = "dell_fan_collision" +services = ["i8kmon.service"] +contention = "Dell SMM Fan Control" +severity = "High" +fix_action = "SuspendService" +help_text = "i8kmon fights with ember-tune for SMM fan duty cycles. Suspend during benchmark." + # manufacturer wide logic [ecosystems.dell] @@ -38,6 +46,7 @@ drivers = ["dell_smm_hwmon"] fan_manual_mode_cmd = "dell-bios-fan-control 0" fan_auto_mode_cmd = "dell-bios-fan-control 1" safety_register = "0x1FC" # BD PROCHOT MSR +help_text = "Dell systems often require 'SMM Security Mitigation' disabled in BIOS for fan control." [ecosystems.lenovo] vendor_regex = "LENOVO" @@ -60,6 +69,13 @@ fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable" vendor_regex = "Framework" ec_tool = "ectool" optimization = "Direct-FFI-SMC" +polling_cap_ms = 500 + +[ecosystems.surface] +vendor_regex = "Microsoft Corporation" +product_regex = "Surface.*" +drivers = ["surface_acpi"] +profiles_path = "/sys/bus/platform/devices/surface_performance/platform_profile" # quirks: model quirks and fixes @@ -85,6 +101,7 @@ id = "asus_fan_hex_support" issue = "Custom Hex Curve Interface" target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve" format = "HexPair16" +action = "ManualFanControlRequired" [[quirks]] model_regex = "Spectre x360" @@ -92,15 +109,23 @@ id = "hp_rapl_lockout" issue = "Hardware MSR Lockout" action = "WarnUserMSRLocked" +[[quirks]] +model_regex = "Framework.*" +id = "framework_prochot_stuck" +issue = "BD PROCHOT wedged at 200MHz" +monitor_msr = "0x1FC" +reset_bit = 0 +action = "ClearBitOnSafeTemp" + # heuristic discovery [discovery.sensors] -temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"] -fan_labels = ["CPU Fan", "GPU Fan", "System Fan"] -hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"] +temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature", "Core 0", "Composite"] +fan_labels = ["CPU Fan", "GPU Fan", "System Fan", "Processor Fan"] +hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm", "thinkpad", "asus"] [discovery.actuators] -rapl_paths = ["intel-rapl:0", "package-0"] +rapl_paths = ["intel-rapl:0", "package-0", "intel-rapl:1"] amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"] governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"] @@ -113,5 +138,10 @@ fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit [[preflight_checks]] name = "Kernel Lockdown Status" -check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'" +check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]' || ! [ -f /sys/kernel/security/lockdown ]" fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module." + +[[preflight_checks]] +name = "Intel P-State Check" +check_cmd = "[ -d /sys/devices/system/cpu/intel_pstate ] || [ -d /sys/devices/system/cpu/cpufreq/policy0 ]" +fail_help = "CPU Frequency scaling driver not detected. Ensure intel_pstate or acpi-cpufreq is loaded." diff --git a/assets/hardware_db.toml.bak b/assets/hardware_db.toml.bak new file mode 100644 index 0000000..3f5e480 --- /dev/null +++ b/assets/hardware_db.toml.bak @@ -0,0 +1,117 @@ +[metadata] +version = "1.0.0" +updated = "2026-02-26" +description = "Hardware and Conflict Database for ember-tune Thermal Engine" + +# service collision + +[[conflicts]] +id = "tlp_vs_ppd" +services = ["tlp.service", "power-profiles-daemon.service"] +contention = "ACPI Platform Profile / EPP" +severity = "Critical" +fix_action = "MaskBoth" +help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both to allow ember-tune deterministic control." + +[[conflicts]] +id = "thermal_logic_collision" +services = ["thermald.service", "throttled.service"] +contention = "RAPL / MSR / BD-PROCHOT" +severity = "High" +fix_action = "SuspendService" +help_text = "Thermald and Throttled create a 'register ping-pong' loop. Disable throttled; ember-tune will manage RAPL limits." + +[[conflicts]] +id = "freq_scaling_collision" +services = ["auto-cpufreq.service"] +contention = "CPU Scaling Governor" +severity = "Medium" +fix_action = "SuspendService" +help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification." + +# manufacturer wide logic + +[ecosystems.dell] +vendor_regex = "(Dell.*|Precision.*|Latitude.*|XPS.*)" +polling_cap_ms = 1000 +drivers = ["dell_smm_hwmon"] +fan_manual_mode_cmd = "dell-bios-fan-control 0" +fan_auto_mode_cmd = "dell-bios-fan-control 1" +safety_register = "0x1FC" # BD PROCHOT MSR + +[ecosystems.lenovo] +vendor_regex = "LENOVO" +lap_mode_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode" +profiles_path = "/sys/firmware/acpi/platform_profile" +ec_write_required = false # Varies by model + +[ecosystems.asus] +vendor_regex = "ASUSTeK.*" +thermal_policy_path = "/sys/devices/platform/asus-nb-wmi/throttle_thermal_policy" +policy_map = { Balanced = 0, Turbo = 1, Silent = 2 } + +[ecosystems.hp] +vendor_regex = "HP" +msr_lock_register = "0x610" +msr_lock_bit = 63 +fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable" + +[ecosystems.framework] +vendor_regex = "Framework" +ec_tool = "ectool" +optimization = "Direct-FFI-SMC" + +# quirks: model quirks and fixes + +[[quirks]] +model_regex = "XPS 13 93.*" +id = "dell_bd_prochot_fix" +issue = "False Positive 400MHz Lock" +monitor_msr = "0x1FC" +reset_bit = 0 +action = "ClearBitOnSafeTemp" + +[[quirks]] +model_regex = "ThinkPad T14.*" +id = "lenovo_lap_throttling" +issue = "11W TDP Lock in Lap Mode" +trigger_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode" +trigger_value = "1" +action = "AbortOnLapMode" + +[[quirks]] +model_regex = "ROG Zephyrus G14" +id = "asus_fan_hex_support" +issue = "Custom Hex Curve Interface" +target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve" +format = "HexPair16" + +[[quirks]] +model_regex = "Spectre x360" +id = "hp_rapl_lockout" +issue = "Hardware MSR Lockout" +action = "WarnUserMSRLocked" + +# heuristic discovery + +[discovery.sensors] +temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"] +fan_labels = ["CPU Fan", "GPU Fan", "System Fan"] +hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"] + +[discovery.actuators] +rapl_paths = ["intel-rapl:0", "package-0"] +amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"] +governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"] + +# env health verification + +[[preflight_checks]] +name = "MSR Write Access" +check_cmd = "grep -q 'msr.allow_writes=on' /proc/cmdline" +fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit manipulation." + +[[preflight_checks]] +name = "Kernel Lockdown Status" +check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'" +fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module." diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 540b751..24878bc 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -91,24 +91,30 @@ impl OptimizerEngine { // 1. Efficiency Metric (Throughput per Watt) // If throughput is 0 (unsupported), fallback to Frequency per Watt let efficiency_curr = if curr.throughput > 0.0 { - curr.throughput as f32 / curr.power_w.max(0.1) + curr.throughput as f32 / curr.power_w.max(1.0) } else { - curr.freq_mhz / curr.power_w.max(0.1) + curr.freq_mhz / curr.power_w.max(1.0) }; let efficiency_next = if next.throughput > 0.0 { - next.throughput as f32 / next.power_w.max(0.1) + next.throughput as f32 / next.power_w.max(1.0) } else { - next.freq_mhz / next.power_w.max(0.1) + next.freq_mhz / next.power_w.max(1.0) }; // Diminishing returns: how much efficiency drops per additional watt - let efficiency_drop = (efficiency_curr - efficiency_next) / (next.power_w - curr.power_w).max(0.1); + let p_delta = (next.power_w - curr.power_w).max(0.5); + let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta; // 2. Thermal Acceleration (d2T/dW2) - let dt_dw_prev = (curr.temp_c - prev.temp_c) / (curr.power_w - prev.power_w).max(0.1); - let dt_dw_next = (next.temp_c - curr.temp_c) / (next.power_w - curr.power_w).max(0.1); - let temp_accel = (dt_dw_next - dt_dw_prev) / (next.power_w - prev.power_w).max(0.1); + let p_delta_prev = (curr.power_w - prev.power_w).max(0.5); + let p_delta_next = (next.power_w - curr.power_w).max(0.5); + + let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev; + let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next; + + let p_total_delta = (next.power_w - prev.power_w).max(1.0); + let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta; // 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall) let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput); diff --git a/src/load/mod.rs b/src/load/mod.rs index 88ea83b..bc80d98 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -1,16 +1,16 @@ use anyhow::Result; +use std::process::Child; +use std::time::{Duration, Instant}; +use std::thread; -pub trait Workload { - /// Starts the workload with specified threads and load percentage. +pub trait Workload: Send + Sync { fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>; - /// Stops the workload. fn stop(&mut self) -> Result<()>; - /// Returns the current throughput (e.g., ops/sec). fn get_throughput(&self) -> Result; } pub struct StressNg { - child: Option, + child: Option, } impl StressNg { @@ -21,7 +21,7 @@ impl StressNg { impl Workload for StressNg { fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> { - self.stop()?; // Ensure any previous instance is stopped + self.stop()?; let child = std::process::Command::new("stress-ng") .args([ @@ -37,15 +37,32 @@ impl Workload for StressNg { fn stop(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { - let _ = child.kill(); - let _ = child.wait(); + // Try SIGTERM first + #[cfg(unix)] + { + use libc::{kill, SIGTERM}; + unsafe { kill(child.id() as i32, SIGTERM); } + } + + let start = Instant::now(); + let mut exited = false; + while start.elapsed() < Duration::from_secs(2) { + if let Ok(Some(_)) = child.try_wait() { + exited = true; + break; + } + thread::sleep(Duration::from_millis(100)); + } + + if !exited { + let _ = child.kill(); + let _ = child.wait(); + } } Ok(()) } fn get_throughput(&self) -> Result { - // In a real implementation, we would parse stress-ng's temporary results - // or use a different workload that provides live throughput. Ok(0.0) } } diff --git a/src/main.rs b/src/main.rs index ab30b7b..9c62c32 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,7 @@ mod ui; mod engine; mod cli; -use miette::{Result, IntoDiagnostic, Diagnostic, Report}; +use miette::{Result, IntoDiagnostic, Diagnostic, Report, Context}; use thiserror::Error; use std::sync::mpsc; use std::thread; @@ -30,7 +30,7 @@ use mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use sal::traits::{AuditError, PlatformSal}; use sal::mock::MockSal; use sal::heuristic::engine::HeuristicEngine; -use load::StressNg; +use load::{StressNg, Workload}; use orchestrator::BenchmarkOrchestrator; use ui::dashboard::{draw_dashboard, DashboardState}; use engine::OptimizationResult; @@ -108,10 +108,10 @@ fn main() -> Result<()> { info!("ember-tune starting with args: {:?}", args); // 2. Platform Detection & Audit - let sal: Box = if args.mock { - Box::new(MockSal::new()) + let sal: Arc = if args.mock { + Arc::new(MockSal::new()) } else { - HeuristicEngine::detect_and_build()? + HeuristicEngine::detect_and_build()?.into() }; println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan()); @@ -122,9 +122,7 @@ fn main() -> Result<()> { io::Write::flush(&mut io::stdout()).into_diagnostic()?; match step.outcome { - Ok(_) => { - println!("{}", console::style("[✓]").green()); - } + Ok(_) => { println!("{}", console::style("[✓]").green()); } Err(e) => { println!("{}", console::style("[✗]").red()); audit_failures.push(e); @@ -137,10 +135,8 @@ fn main() -> Result<()> { return Err(Report::new(MultiAuditError { errors: audit_failures })); } - println!("{}", console::style("✓ All pre-flight audits passed.").green().bold()); - thread::sleep(Duration::from_secs(1)); - if args.audit_only { + println!("{}", console::style("✓ All pre-flight audits passed.").green().bold()); return Ok(()); } @@ -159,21 +155,22 @@ fn main() -> Result<()> { let (telemetry_tx, telemetry_rx) = mpsc::channel::(); let (command_tx, command_rx) = mpsc::channel::(); + let c_tx = command_tx.clone(); ctrlc::set_handler(move || { + let _ = c_tx.send(UiCommand::Abort); r.store(false, Ordering::SeqCst); }).expect("Error setting Ctrl-C handler"); // 5. Spawn Backend Orchestrator + let sal_backend = sal.clone(); let backend_handle = thread::spawn(move || { let workload = Box::new(StressNg::new()); - let mut orchestrator = BenchmarkOrchestrator::new( - sal, + sal_backend, workload, telemetry_tx, command_rx, ); - orchestrator.run() }); @@ -197,6 +194,8 @@ fn main() -> Result<()> { history_mhz: Vec::new(), log_event: None, metadata: std::collections::HashMap::new(), + is_emergency: false, + emergency_reason: None, }; let tick_rate = Duration::from_millis(100); @@ -233,29 +232,38 @@ fn main() -> Result<()> { } } - if last_tick.elapsed() >= tick_rate { - last_tick = Instant::now(); - } - - if backend_handle.is_finished() { - thread::sleep(Duration::from_secs(1)); - break; - } + if last_tick.elapsed() >= tick_rate { last_tick = Instant::now(); } + if backend_handle.is_finished() { break; } } // 7. Terminal Restoration - disable_raw_mode().into_diagnostic()?; - execute!(terminal.backend_mut(), LeaveAlternateScreen).into_diagnostic()?; - terminal.show_cursor().into_diagnostic()?; + let _ = disable_raw_mode(); + let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen); + let _ = terminal.show_cursor(); - // 8. Final Report (Post-TUI) - match backend_handle.join() { + // 8. Final Report & Hardware Restoration + let join_res = backend_handle.join(); + + // Explicit hardware restoration + info!("Restoring hardware state..."); + if let Err(e) = sal.restore() { + error!("Failed to restore hardware state: {}", e); + } + + match join_res { Ok(Ok(result)) => { print_summary_report(&result); } Ok(Err(e)) => { - if e.to_string() == "ABORTED" { + let err_str = e.to_string(); + if err_str == "ABORTED" { println!("{}", "Benchmark aborted by user.".yellow()); + } else if err_str.contains("EMERGENCY_ABORT") { + println!(); + println!("{}", " 🚨 EMERGENCY ABORT TRIGGERED ".bold().on_red().white()); + println!("Reason: {}", err_str.replace("EMERGENCY_ABORT: ", "").red().bold()); + println!("{}", "Hardware state has been restored to safe defaults.".yellow()); + println!(); } else { error!("Orchestrator encountered error: {}", e); eprintln!("{} {}", "Error:".red().bold(), e); diff --git a/src/mediator.rs b/src/mediator.rs index 2bddbbc..5ca3950 100644 --- a/src/mediator.rs +++ b/src/mediator.rs @@ -42,6 +42,8 @@ pub struct TelemetryState { pub log_event: Option, pub metadata: std::collections::HashMap, + pub is_emergency: bool, + pub emergency_reason: Option, } #[derive(Debug, Clone)] diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index b4b7b73..83b5101 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -4,14 +4,17 @@ use std::time::{Duration, Instant}; use std::thread; use std::collections::VecDeque; use sysinfo::System; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Mutex; -use crate::sal::traits::{PlatformSal}; +use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus}; use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; pub struct BenchmarkOrchestrator { - sal: Box, + sal: Arc, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -27,11 +30,15 @@ pub struct BenchmarkOrchestrator { // --- Static Info --- cpu_model: String, total_ram_gb: u64, + + // --- Safety --- + emergency_abort: Arc, + emergency_reason: Arc>>, } impl BenchmarkOrchestrator { pub fn new( - sal: Box, + sal: Arc, workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, @@ -57,12 +64,17 @@ impl BenchmarkOrchestrator { history_mhz: VecDeque::with_capacity(120), cpu_model, total_ram_gb, + emergency_abort: Arc::new(AtomicBool::new(false)), + emergency_reason: Arc::new(Mutex::new(None)), } } pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; + // Start Watchdog Monitor + let _watchdog_handle = self.spawn_watchdog_monitor(); + // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { @@ -111,11 +123,6 @@ impl BenchmarkOrchestrator { while step_start.elapsed() < Duration::from_secs(45) { self.check_abort()?; - if self.sal.check_emergency()? { - self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?; - self.workload.stop()?; - return Err(anyhow::anyhow!("Hardware Watchdog Triggered")); - } let t = self.sal.get_temp().unwrap_or(0.0); step_temps.push_back(t); @@ -204,6 +211,35 @@ impl BenchmarkOrchestrator { Ok(res) } + fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { + let abort = self.emergency_abort.clone(); + let reason_store = self.emergency_reason.clone(); + let sal = self.sal.clone(); + + thread::spawn(move || { + while !abort.load(Ordering::SeqCst) { + let status = sal.get_safety_status(); + match status { + Ok(SafetyStatus::EmergencyAbort(reason)) => { + *reason_store.lock().unwrap() = Some(reason.clone()); + abort.store(true, Ordering::SeqCst); + break; + } + Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => { + // Send warning log to UI + } + Ok(SafetyStatus::Nominal) => {} + Err(e) => { + *reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e)); + abort.store(true, Ordering::SeqCst); + break; + } + } + thread::sleep(Duration::from_millis(100)); + } + }) + } + pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { let r_theta = self.engine.calculate_thermal_resistance(&self.profile); let knee = self.engine.find_silicon_knee(&self.profile); @@ -221,6 +257,11 @@ impl BenchmarkOrchestrator { } fn check_abort(&self) -> Result<()> { + if self.emergency_abort.load(Ordering::SeqCst) { + let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); + return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason)); + } + if let Ok(cmd) = self.command_rx.try_recv() { match cmd { UiCommand::Abort => { @@ -250,6 +291,8 @@ impl BenchmarkOrchestrator { history_mhz: Vec::new(), log_event: Some(msg.to_string()), metadata: std::collections::HashMap::new(), + is_emergency: self.emergency_abort.load(Ordering::SeqCst), + emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } @@ -287,6 +330,8 @@ impl BenchmarkOrchestrator { history_mhz: self.history_mhz.iter().cloned().collect(), log_event: None, metadata: std::collections::HashMap::new(), + is_emergency: self.emergency_abort.load(Ordering::SeqCst), + emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index e8f7fc6..b59c197 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -1,11 +1,11 @@ -use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep}; +use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus}; use anyhow::{Result, Context}; use std::fs; use std::path::PathBuf; use std::process::Command; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::debug; +use tracing::{debug, warn}; pub struct DellXps9380Sal { temp_path: PathBuf, @@ -18,6 +18,8 @@ pub struct DellXps9380Sal { last_temp: Mutex, last_fans: Mutex>, suppressed_services: Mutex>, + msr_file: Mutex, + last_energy: Mutex<(u64, Instant)>, } impl DellXps9380Sal { @@ -35,7 +37,6 @@ impl DellXps9380Sal { if name == "dell_smm" { temp_path = Some(p.join("temp1_input")); - // Discover all fans if let Ok(fan_entries) = fs::read_dir(&p) { for fan_entry in fan_entries.flatten() { let fan_p = fan_entry.path(); @@ -54,7 +55,6 @@ impl DellXps9380Sal { } } - // Discovery for RAPL via powercap if let Ok(entries) = fs::read_dir("/sys/class/powercap") { for entry in entries.flatten() { let p = entry.path(); @@ -72,6 +72,9 @@ impl DellXps9380Sal { let rapl_base = rapl_base_path.context("Could not find RAPL package-0 path in powercap")?; let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); + + let msr_file = fs::OpenOptions::new().read(true).write(true).open("/dev/cpu/0/msr") + .context("Failed to open /dev/cpu/0/msr. Is the 'msr' module loaded?")?; Ok(Self { temp_path: temp_path.context("Could not find dell_smm temperature path")?, @@ -84,68 +87,64 @@ impl DellXps9380Sal { last_temp: Mutex::new(0.0), last_fans: Mutex::new(Vec::new()), suppressed_services: Mutex::new(Vec::new()), + msr_file: Mutex::new(msr_file), + last_energy: Mutex::new((0, Instant::now())), }) } + + fn read_msr(&self, msr: u32) -> Result { + use std::os::unix::fs::FileExt; + let mut buf = [0u8; 8]; + let file = self.msr_file.lock().unwrap(); + file.read_at(&mut buf, msr as u64)?; + Ok(u64::from_le_bytes(buf)) + } + + fn write_msr(&self, msr: u32, val: u64) -> Result<()> { + use std::os::unix::fs::FileExt; + let file = self.msr_file.lock().unwrap(); + file.write_at(&val.to_le_bytes(), msr as u64)?; + Ok(()) + } } impl PreflightAuditor for DellXps9380Sal { fn audit(&self) -> Box + '_> { let mut steps = Vec::new(); - - // 1. Root check steps.push(AuditStep { description: "Root Privileges".to_string(), outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) } }); - // 2. Kernel modules check (simplified check via sysfs/proc) let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"]; for mod_name in modules { let path = format!("/sys/module/{}", mod_name); steps.push(AuditStep { description: format!("Kernel Module: {}", mod_name), outcome: if PathBuf::from(path).exists() { Ok(()) } else { - Err(AuditError::ToolMissing(format!("Module '{}' not loaded. Run 'sudo modprobe {}'", mod_name, mod_name))) + Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) } }); } - // 3. Kernel parameters check let cmdline = fs::read_to_string("/proc/cmdline").unwrap_or_default(); - steps.push(AuditStep { - description: "Kernel Param: dell_smm_hwmon.ignore_dmi=1".to_string(), - outcome: if cmdline.contains("dell_smm_hwmon.ignore_dmi=1") { Ok(()) } else { - Err(AuditError::MissingKernelParam("dell_smm_hwmon.ignore_dmi=1".to_string())) - } - }); - steps.push(AuditStep { - description: "Kernel Param: dell_smm_hwmon.restricted=0".to_string(), - outcome: if cmdline.contains("dell_smm_hwmon.restricted=0") { Ok(()) } else { - Err(AuditError::MissingKernelParam("dell_smm_hwmon.restricted=0".to_string())) - } - }); - steps.push(AuditStep { - description: "Kernel Param: msr.allow_writes=on".to_string(), - outcome: if cmdline.contains("msr.allow_writes=on") { Ok(()) } else { - Err(AuditError::MissingKernelParam("msr.allow_writes=on".to_string())) - } - }); + let params = [ + ("dell_smm_hwmon.ignore_dmi=1", "dell_smm_hwmon.ignore_dmi=1"), + ("dell_smm_hwmon.restricted=0", "dell_smm_hwmon.restricted=0"), + ("msr.allow_writes=on", "msr.allow_writes=on"), + ]; + for (label, p) in params { + steps.push(AuditStep { + description: format!("Kernel Param: {}", label), + outcome: if cmdline.contains(p) { Ok(()) } else { Err(AuditError::MissingKernelParam(p.to_string())) } + }); + } - // 4. Lockdown check - let lockdown = fs::read_to_string("/sys/kernel/security/lockdown").unwrap_or_default(); - steps.push(AuditStep { - description: "Kernel Lockdown Status".to_string(), - outcome: if lockdown.contains("[none]") || lockdown.is_empty() { Ok(()) } else { - Err(AuditError::KernelIncompatible("Kernel is in lockdown mode. Set to 'none' to allow MSR/SMM writes.".to_string())) - } - }); - - // 5. Check AC power let ac_status = fs::read_to_string("/sys/class/power_supply/AC/online").unwrap_or_else(|_| "0".to_string()); steps.push(AuditStep { description: "AC Power Connection".to_string(), outcome: if ac_status.trim() == "1" { Ok(()) } else { - Err(AuditError::AcPowerMissing("System must be on AC power for benchmarking".to_string())) + Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) } }); @@ -154,12 +153,11 @@ impl PreflightAuditor for DellXps9380Sal { } impl EnvironmentGuard for DellXps9380Sal { - fn suppress(&mut self) -> Result<()> { + fn suppress(&self) -> Result<()> { let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() { - debug!("Suppressing service: {}", s); Command::new("systemctl").args(["stop", s]).status()?; suppressed.push(s.to_string()); } @@ -167,7 +165,7 @@ impl EnvironmentGuard for DellXps9380Sal { Ok(()) } - fn restore(&mut self) -> Result<()> { + fn restore(&self) -> Result<()> { let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = Command::new("systemctl").args(["start", &s]).status(); @@ -176,38 +174,31 @@ impl EnvironmentGuard for DellXps9380Sal { } } -impl Drop for DellXps9380Sal { - fn drop(&mut self) { - let _ = self.restore(); - } -} - - impl SensorBus for DellXps9380Sal { fn get_temp(&self) -> Result { - // Enforce 1000ms rate limit for Dell SMM as per GEMINI.md let mut last_poll = self.last_poll.lock().unwrap(); let now = Instant::now(); - if now.duration_since(*last_poll) < Duration::from_millis(1000) { return Ok(*self.last_temp.lock().unwrap()); } - let s = fs::read_to_string(&self.temp_path)?; let val = s.trim().parse::()? / 1000.0; - *self.last_temp.lock().unwrap() = val; *last_poll = now; - Ok(val) } fn get_power_w(&self) -> Result { if self.pwr_path.to_string_lossy().contains("energy_uj") { - let e1 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; - std::thread::sleep(Duration::from_millis(100)); + let mut last = self.last_energy.lock().unwrap(); let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; - Ok((e2.saturating_sub(e1)) as f32 / 100000.0) + let t2 = Instant::now(); + let (e1, t1) = *last; + let delta_e = e2.wrapping_sub(e1); + let delta_t = t2.duration_since(t1).as_secs_f32(); + *last = (e2, t2); + if delta_t < 0.01 { return Ok(0.0); } + Ok((delta_e as f32 / 1_000_000.0) / delta_t) } else { let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) @@ -217,66 +208,65 @@ impl SensorBus for DellXps9380Sal { fn get_fan_rpms(&self) -> Result> { let mut last_poll = self.last_poll.lock().unwrap(); let now = Instant::now(); - if now.duration_since(*last_poll) < Duration::from_millis(1000) { return Ok(self.last_fans.lock().unwrap().clone()); } - let mut fans = Vec::new(); for path in &self.fan_paths { if let Ok(s) = fs::read_to_string(path) { - if let Ok(rpm) = s.trim().parse::() { - fans.push(rpm); - } + if let Ok(rpm) = s.trim().parse::() { fans.push(rpm); } } } - *self.last_fans.lock().unwrap() = fans.clone(); *last_poll = now; - Ok(fans) } fn get_freq_mhz(&self) -> Result { let s = fs::read_to_string(&self.freq_path)?; - let val = s.trim().parse::()? / 1000.0; - Ok(val) + Ok(s.trim().parse::()? / 1000.0) } } impl ActuatorBus for DellXps9380Sal { fn set_fan_mode(&self, mode: &str) -> Result<()> { match mode { - "max" | "Manual" => { - Command::new("dell-bios-fan-control").arg("0").status()?; - } - "auto" | "Auto" => { - Command::new("dell-bios-fan-control").arg("1").status()?; - } - _ => { - debug!("Unknown fan mode requested: {}", mode); - } + "max" | "Manual" => { Command::new("dell-bios-fan-control").arg("0").status()?; } + "auto" | "Auto" => { Command::new("dell-bios-fan-control").arg("1").status()?; } + _ => { debug!("Unknown fan mode: {}", mode); } } Ok(()) } fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - let uw = (watts * 1_000_000.0) as u64; - fs::write(&self.pl1_path, uw.to_string())?; + fs::write(&self.pl1_path, ((watts * 1_000_000.0) as u64).to_string())?; Ok(()) } fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - let uw = (watts * 1_000_000.0) as u64; - fs::write(&self.pl2_path, uw.to_string())?; + fs::write(&self.pl2_path, ((watts * 1_000_000.0) as u64).to_string())?; Ok(()) } } impl HardwareWatchdog for DellXps9380Sal { - fn check_emergency(&self) -> Result { - // Check for thermal throttling or BD PROCHOT - // Simplified for now - Ok(false) + fn get_safety_status(&self) -> Result { + let temp = self.get_temp()?; + if temp > 98.0 { + return Ok(SafetyStatus::EmergencyAbort(format!("Thermal Runaway: {:.1}°C", temp))); + } + if let Ok(msr_val) = self.read_msr(0x1FC) { + if (msr_val & 0x1) != 0 && temp < 85.0 { + let _ = self.write_msr(0x1FC, msr_val & !0x1); + return Ok(SafetyStatus::Warning("BD PROCHOT Latch Cleared".to_string())); + } + } + Ok(SafetyStatus::Nominal) + } +} + +impl Drop for DellXps9380Sal { + fn drop(&mut self) { + let _ = self.restore(); } } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index a9527be..13fde32 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -2,19 +2,21 @@ use anyhow::{Result, anyhow}; use std::path::Path; use std::fs; use std::time::{Duration, Instant}; -use std::thread; use std::process::Command; -use tracing::{debug}; -use std::sync::mpsc; +use tracing::{debug, warn}; +use std::sync::Mutex; -use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError}; +use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; use crate::sal::heuristic::schema::HardwareDb; pub struct GenericLinuxSal { fact_sheet: SystemFactSheet, db: HardwareDb, - suppressed_services: Vec, + suppressed_services: Mutex>, + last_valid_temp: Mutex<(f32, Instant)>, + current_pl1: Mutex, + last_energy: Mutex<(u64, Instant)>, } impl GenericLinuxSal { @@ -22,7 +24,10 @@ impl GenericLinuxSal { Self { fact_sheet, db, - suppressed_services: Vec::new(), + suppressed_services: Mutex::new(Vec::new()), + last_valid_temp: Mutex::new((0.0, Instant::now())), + current_pl1: Mutex::new(15.0), + last_energy: Mutex::new((0, Instant::now())), } } @@ -30,33 +35,18 @@ impl GenericLinuxSal { self.fact_sheet.vendor.to_lowercase().contains("dell") } - fn read_sysfs_timeout(&self, path: &Path, timeout: Duration) -> Result { - let (tx, rx) = mpsc::channel(); - let path_buf = path.to_path_buf(); - - thread::spawn(move || { - let res = fs::read_to_string(path_buf).map(|s| s.trim().to_string()); - let _ = tx.send(res); - }); - - match rx.recv_timeout(timeout) { - Ok(res) => res.map_err(|e| anyhow!("Failed to read sysfs: {}", e)), - Err(_) => Err(anyhow!("Timeout reading sysfs path: {:?}", path)), - } + /// Read sysfs safely. We removed the thread-per-read timeout logic + /// as it was inefficient. sysfs reads are generally fast enough. + fn read_sysfs(&self, path: &Path) -> Result { + fs::read_to_string(path).map(|s| s.trim().to_string()).map_err(|e| anyhow!(e)) } } impl PreflightAuditor for GenericLinuxSal { fn audit(&self) -> Box + '_> { let mut steps = Vec::new(); - - // 1. Static DB checks for check in &self.db.preflight_checks { - let status = Command::new("sh") - .arg("-c") - .arg(&check.check_cmd) - .status(); - + let status = Command::new("sh").arg("-c").arg(&check.check_cmd).status(); steps.push(AuditStep { description: check.name.clone(), outcome: match status { @@ -65,8 +55,6 @@ impl PreflightAuditor for GenericLinuxSal { } }); } - - // 2. Conflict checks (Critical only) for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { if conflict.severity == "Critical" { @@ -77,7 +65,6 @@ impl PreflightAuditor for GenericLinuxSal { } } } - Box::new(steps.into_iter()) } } @@ -86,31 +73,32 @@ impl SensorBus for GenericLinuxSal { fn get_temp(&self) -> Result { let path = self.fact_sheet.temp_path.as_ref() .ok_or_else(|| anyhow!("No temperature sensor path found"))?; - let content = self.read_sysfs_timeout(path, Duration::from_millis(200))?; - let milli_celsius: f32 = content.parse()?; - Ok(milli_celsius / 1000.0) + let content = self.read_sysfs(path)?; + let temp = content.parse::()? / 1000.0; + let mut last = self.last_valid_temp.lock().unwrap(); + if (temp - last.0).abs() > 0.01 { *last = (temp, Instant::now()); } + Ok(temp) } fn get_power_w(&self) -> Result { let rapl_path = self.fact_sheet.rapl_paths.first() .ok_or_else(|| anyhow!("No RAPL path found"))?; let energy_path = rapl_path.join("energy_uj"); - - let e1: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?; - let t1 = Instant::now(); - thread::sleep(Duration::from_millis(100)); - let e2: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?; + let mut last = self.last_energy.lock().unwrap(); + let e2: u64 = self.read_sysfs(&energy_path)?.parse()?; let t2 = Instant::now(); - + let (e1, t1) = *last; let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); + *last = (e2, t2); + if delta_t < 0.01 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } fn get_fan_rpms(&self) -> Result> { let mut rpms = Vec::new(); for path in &self.fact_sheet.fan_paths { - if let Ok(content) = self.read_sysfs_timeout(path, Duration::from_millis(200)) { + if let Ok(content) = self.read_sysfs(path) { if let Ok(rpm) = content.parse() { rpms.push(rpm); } } } @@ -120,10 +108,8 @@ impl SensorBus for GenericLinuxSal { fn get_freq_mhz(&self) -> Result { let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"); if path.exists() { - let khz: f32 = self.read_sysfs_timeout(path, Duration::from_millis(200))?.parse()?; - Ok(khz / 1000.0) + Ok(self.read_sysfs(path)?.parse::()? / 1000.0) } else { - // Fallback: parse /proc/cpuinfo let cpuinfo = fs::read_to_string("/proc/cpuinfo")?; for line in cpuinfo.lines() { if line.starts_with("cpu MHz") { @@ -149,38 +135,32 @@ impl ActuatorBus for GenericLinuxSal { let parts: Vec<&str> = cmd_str.split_whitespace().collect(); Command::new(parts[0]).args(&parts[1..]).status()?; Ok(()) - } else { Err(anyhow!("Dell fan command missing in DB")) } - } else { - debug!("Fan control not implemented for non-Dell systems yet"); - Ok(()) - } + } else { Err(anyhow!("Dell fan command missing")) } + } else { Ok(()) } } fn set_sustained_power_limit(&self, watts: f32) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first() - .ok_or_else(|| anyhow!("No RAPL path found for PL1"))?; - let path = rapl_path.join("constraint_0_power_limit_uw"); - fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?; + let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?; + fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; + *self.current_pl1.lock().unwrap() = watts; Ok(()) } fn set_burst_power_limit(&self, watts: f32) -> Result<()> { - let rapl_path = self.fact_sheet.rapl_paths.first() - .ok_or_else(|| anyhow!("No RAPL path found for PL2"))?; - let path = rapl_path.join("constraint_1_power_limit_uw"); - fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?; + let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?; + fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?; Ok(()) } } impl EnvironmentGuard for GenericLinuxSal { - fn suppress(&mut self) -> Result<()> { + fn suppress(&self) -> Result<()> { + let mut suppressed = self.suppressed_services.lock().unwrap(); for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { for service in &conflict.services { - debug!("Stopping service: {}", service); if Command::new("systemctl").arg("stop").arg(service).status()?.success() { - self.suppressed_services.push(service.clone()); + suppressed.push(service.clone()); } } } @@ -188,31 +168,30 @@ impl EnvironmentGuard for GenericLinuxSal { Ok(()) } - fn restore(&mut self) -> Result<()> { - for service in self.suppressed_services.drain(..) { - debug!("Starting service: {}", service); + fn restore(&self) -> Result<()> { + let mut suppressed = self.suppressed_services.lock().unwrap(); + for service in suppressed.drain(..) { let _ = Command::new("systemctl").arg("start").arg(service).status(); } - if self.is_dell() { - let _ = self.set_fan_mode("auto"); - } + if self.is_dell() { let _ = self.set_fan_mode("auto"); } Ok(()) } } impl HardwareWatchdog for GenericLinuxSal { - fn check_emergency(&self) -> Result { - if let Ok(temp) = self.get_temp() { - if temp > 100.0 { - return Ok(true); - } + fn get_safety_status(&self) -> Result { + let temp = self.get_temp()?; + if temp > 100.0 { + return Ok(SafetyStatus::EmergencyAbort(format!("Thermal runaway: {:.1}°C", temp))); } - Ok(false) + let last = self.last_valid_temp.lock().unwrap(); + if last.1.elapsed() > Duration::from_secs(5) { + return Ok(SafetyStatus::EmergencyAbort("Temperature sensor stalled".to_string())); + } + Ok(SafetyStatus::Nominal) } } impl Drop for GenericLinuxSal { - fn drop(&mut self) { - let _ = self.restore(); - } + fn drop(&mut self) { let _ = self.restore(); } } diff --git a/src/sal/heuristic/schema.rs b/src/sal/heuristic/schema.rs index 316e701..a64ff9e 100644 --- a/src/sal/heuristic/schema.rs +++ b/src/sal/heuristic/schema.rs @@ -31,6 +31,7 @@ pub struct Conflict { #[derive(Debug, Deserialize, Clone)] pub struct Ecosystem { pub vendor_regex: String, + pub product_regex: Option, pub polling_cap_ms: Option, pub drivers: Option>, pub fan_manual_mode_cmd: Option, @@ -46,6 +47,7 @@ pub struct Ecosystem { pub fan_boost_path: Option, pub ec_tool: Option, pub optimization: Option, + pub help_text: Option, } #[derive(Debug, Deserialize, Clone)] diff --git a/src/sal/mock.rs b/src/sal/mock.rs index dabe27a..bb01fad 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,4 +1,4 @@ -use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep}; +use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, PlatformSal, SafetyStatus}; use anyhow::Result; pub struct MockSal; @@ -26,10 +26,10 @@ impl PreflightAuditor for MockSal { } impl EnvironmentGuard for MockSal { - fn suppress(&mut self) -> Result<()> { + fn suppress(&self) -> Result<()> { Ok(()) } - fn restore(&mut self) -> Result<()> { + fn restore(&self) -> Result<()> { Ok(()) } } @@ -62,7 +62,7 @@ impl ActuatorBus for MockSal { } impl HardwareWatchdog for MockSal { - fn check_emergency(&self) -> Result { - Ok(false) + fn get_safety_status(&self) -> Result { + Ok(SafetyStatus::Nominal) } } diff --git a/src/sal/traits.rs b/src/sal/traits.rs index 3aabf75..e71ef28 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -49,8 +49,17 @@ impl PreflightAuditor for Arc { /// Suppresses conflicting daemons (tlp, thermald). pub trait EnvironmentGuard: Send + Sync { - fn suppress(&mut self) -> Result<()>; - fn restore(&mut self) -> Result<()>; + fn suppress(&self) -> Result<()>; + fn restore(&self) -> Result<()>; +} + +impl EnvironmentGuard for Arc { + fn suppress(&self) -> Result<()> { + (**self).suppress() + } + fn restore(&self) -> Result<()> { + (**self).restore() + } } /// Read-only interface for standardized metrics. @@ -97,15 +106,23 @@ impl ActuatorBus for Arc { /// Concurrent monitor for catastrophic states. pub trait HardwareWatchdog: Send + Sync { - fn check_emergency(&self) -> Result; + fn get_safety_status(&self) -> Result; } impl HardwareWatchdog for Arc { - fn check_emergency(&self) -> Result { - (**self).check_emergency() + fn get_safety_status(&self) -> Result { + (**self).get_safety_status() } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SafetyStatus { + Nominal, + Warning(String), + Critical(String), + EmergencyAbort(String), +} + /// Aggregate trait for a complete platform implementation. pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {} diff --git a/src/ui/dashboard.rs b/src/ui/dashboard.rs index 9df3041..17d309f 100644 --- a/src/ui/dashboard.rs +++ b/src/ui/dashboard.rs @@ -5,6 +5,7 @@ use ratatui::{ widgets::{Block, Borders, List, ListItem, Paragraph, Chart, Dataset, Axis, BorderType, GraphType}, symbols::Marker, Frame, + prelude::Stylize, }; use crate::mediator::TelemetryState; use crate::ui::theme::*; @@ -83,6 +84,55 @@ pub fn draw_dashboard( draw_freq_graph(f, right_side_chunks[2], state); draw_logs(f, chunks[3], ui_state); + + if state.is_emergency { + draw_emergency_overlay(f, area, state); + } +} + +fn draw_emergency_overlay(f: &mut Frame, area: Rect, state: &TelemetryState) { + let block = Block::default() + .borders(Borders::ALL) + .border_type(BorderType::Double) + .border_style(Style::default().fg(Color::Red).add_modifier(Modifier::BOLD)) + .bg(Color::Black) + .title(" 🚨 EMERGENCY ABORT 🚨 "); + + let area = centered_rect(60, 20, area); + let inner = block.inner(area); + f.render_widget(block, area); + + let reason = state.emergency_reason.as_deref().unwrap_or("Unknown safety trigger"); + let text = vec![ + Line::from(vec![Span::styled("CRITICAL SAFETY LIMIT TRIGGERED", Style::default().fg(Color::Red).add_modifier(Modifier::BOLD))]), + Line::from(""), + Line::from(vec![Span::raw("Reason: "), Span::styled(reason, Style::default().fg(Color::Yellow))]), + Line::from(""), + Line::from("Hardware has been restored to safe defaults."), + Line::from("Exiting in 1 second..."), + ]; + + f.render_widget(Paragraph::new(text).alignment(ratatui::layout::Alignment::Center), inner); +} + +fn centered_rect(percent_x: u16, percent_y: u16, r: Rect) -> Rect { + let popup_layout = Layout::default() + .direction(Direction::Vertical) + .constraints([ + Constraint::Percentage((100 - percent_y) / 2), + Constraint::Percentage(percent_y), + Constraint::Percentage((100 - percent_y) / 2), + ]) + .split(r); + + Layout::default() + .direction(Direction::Horizontal) + .constraints([ + Constraint::Percentage((100 - percent_x) / 2), + Constraint::Percentage(percent_x), + Constraint::Percentage((100 - percent_x) / 2), + ]) + .split(popup_layout[1])[1] } fn draw_header(f: &mut Frame, area: Rect, state: &TelemetryState) {