From 4c4026a6002de7727b6e4cf87327cfdba86d04f7 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Fri, 27 Feb 2026 00:59:36 +0100 Subject: [PATCH] fixed dangerous states to be applied --- src/engine/mod.rs | 13 ++++++++ src/main.rs | 2 ++ src/orchestrator/mod.rs | 19 +++++++++--- src/sal/dell_xps_9380.rs | 49 ++++++++++++++++++++++++++++--- src/sal/generic_linux.rs | 31 +++++++++++++++++-- src/sal/heuristic/discovery.rs | 23 ++++----------- src/sal/heuristic/engine.rs | 2 +- tests/heuristic_discovery_test.rs | 5 +++- tests/orchestrator_e2e_test.rs | 1 + 9 files changed, 116 insertions(+), 29 deletions(-) diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 42dabad..07997d8 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -7,6 +7,7 @@ use serde::{Serialize, Deserialize}; use std::collections::HashMap; use std::path::PathBuf; +use tracing::warn; pub mod formatters; @@ -180,6 +181,18 @@ impl OptimizerEngine { } } + let best_pl = if max_score > f32::MIN { + best_pl + } else { + profile.points.last().map(|p| p.power_w).unwrap_or(15.0) + }; + + // Safety Floor: Never recommend a TDP below 5W, as this bricks system performance. + if best_pl < 5.0 { + warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl); + return 15.0; + } + best_pl } } diff --git a/src/main.rs b/src/main.rs index 014817e..bc962f4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -161,6 +161,7 @@ fn main() -> Result<()> { // 5. Spawn Backend Orchestrator let sal_backend = sal.clone(); let facts_backend = facts.clone(); + let config_out = args.config_out.clone(); let backend_handle = thread::spawn(move || { let workload = Box::new(StressNg::new()); let mut orchestrator = BenchmarkOrchestrator::new( @@ -169,6 +170,7 @@ fn main() -> Result<()> { workload, telemetry_tx, command_rx, + config_out, ); orchestrator.run() }); diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index ab46853..ebe99da 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -12,6 +12,7 @@ use sysinfo::System; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Mutex; +use std::path::PathBuf; use crate::sal::traits::{PlatformSal, SafetyStatus}; use crate::sal::heuristic::discovery::SystemFactSheet; @@ -40,6 +41,8 @@ pub struct BenchmarkOrchestrator { profile: ThermalProfile, /// Mathematics engine for data smoothing and optimization. engine: OptimizerEngine, + /// CLI override for the configuration output path. + optional_config_out: Option, /// Sliding window of power readings (Watts). history_watts: VecDeque, @@ -67,6 +70,7 @@ impl BenchmarkOrchestrator { workload: Box, telemetry_tx: mpsc::Sender, command_rx: mpsc::Receiver, + optional_config_out: Option, ) -> Self { let mut sys = System::new_all(); sys.refresh_all(); @@ -92,6 +96,7 @@ impl BenchmarkOrchestrator { total_ram_gb, emergency_abort: Arc::new(AtomicBool::new(false)), emergency_reason: Arc::new(Mutex::new(None)), + optional_config_out, } } @@ -222,12 +227,18 @@ impl BenchmarkOrchestrator { trip_temp: res.max_temp_c.max(95.0), }; - if let Some(throttled_path) = self.facts.paths.configs.get("throttled") { - crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?; - self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?; - res.config_paths.insert("throttled".to_string(), throttled_path.clone()); + // 1. Throttled (Merged if exists) + // PRIORITY: optional_config_out > facts discovery > fallback + let throttled_path = self.optional_config_out.clone() + .or_else(|| self.facts.paths.configs.get("throttled").cloned()); + + if let Some(path) = throttled_path { + crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?; + self.log(&format!("✓ Saved '{}'.", path.display()))?; + res.config_paths.insert("throttled".to_string(), path.clone()); } + // 2. i8kmon if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, diff --git a/src/sal/dell_xps_9380.rs b/src/sal/dell_xps_9380.rs index fbf12af..dcc73ae 100644 --- a/src/sal/dell_xps_9380.rs +++ b/src/sal/dell_xps_9380.rs @@ -22,6 +22,11 @@ pub struct DellXps9380Sal { suppressed_services: Mutex>, msr_file: Mutex, last_energy: Mutex<(u64, Instant)>, + + // --- Original State for Restoration --- + original_pl1: Mutex>, + original_pl2: Mutex>, + original_fan_mode: Mutex>, } impl DellXps9380Sal { @@ -53,6 +58,9 @@ impl DellXps9380Sal { last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, + original_pl1: Mutex::new(None), + original_pl2: Mutex::new(None), + original_fan_mode: Mutex::new(None), }) } @@ -126,12 +134,25 @@ impl PreflightAuditor for DellXps9380Sal { impl EnvironmentGuard for DellXps9380Sal { fn suppress(&self) -> Result<()> { + // 1. Snapshot Power Limits + if let Ok(pl1) = fs::read_to_string(&self.pl1_path) { + *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); + } + if let Ok(pl2) = fs::read_to_string(&self.pl2_path) { + *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); + } + + // 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active) + // We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1) + *self.original_fan_mode.lock().unwrap() = Some("1".to_string()); + + // 3. Stop Services let services = ["tlp", "thermald", "i8kmon"]; let mut suppressed = self.suppressed_services.lock().unwrap(); for s in services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() { debug!("Suppressing service: {}", s); - self.ctx.runner.run("systemctl", &["stop", s])?; + let _ = self.ctx.runner.run("systemctl", &["stop", s]); suppressed.push(s.to_string()); } } @@ -139,6 +160,20 @@ impl EnvironmentGuard for DellXps9380Sal { } fn restore(&self) -> Result<()> { + // 1. Restore Power Limits + if let Some(pl1) = *self.original_pl1.lock().unwrap() { + let _ = fs::write(&self.pl1_path, pl1.to_string()); + } + if let Some(pl2) = *self.original_pl2.lock().unwrap() { + let _ = fs::write(&self.pl2_path, pl2.to_string()); + } + + // 2. Restore Fan Mode (BIOS Control) + if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") { + let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]); + } + + // 3. Restart Services let mut suppressed = self.suppressed_services.lock().unwrap(); for s in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &s]); @@ -162,17 +197,23 @@ impl SensorBus for DellXps9380Sal { } fn get_power_w(&self) -> Result { - if self.pwr_path.to_string_lossy().contains("energy_uj") { + // FIX: Ensure we always read from energy_uj if available for delta calculation + let rapl_base = self.pl1_path.parent().context("RAPL path error")?; + let energy_path = rapl_base.join("energy_uj"); + + if energy_path.exists() { let mut last = self.last_energy.lock().unwrap(); - let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::()?; + let e2_str = fs::read_to_string(&energy_path)?; + let e2 = e2_str.trim().parse::()?; let t2 = Instant::now(); let (e1, t1) = *last; let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); *last = (e2, t2); - if delta_t < 0.01 { return Ok(0.0); } + if delta_t < 0.05 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } else { + // Fallback to power1_average if it exists (units are µW) let s = fs::read_to_string(&self.pwr_path)?; Ok(s.trim().parse::()? / 1000000.0) } diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 7a0e2ce..7007b25 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -3,6 +3,7 @@ use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; use std::sync::Mutex; +use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::heuristic::discovery::SystemFactSheet; @@ -16,6 +17,10 @@ pub struct GenericLinuxSal { last_valid_temp: Mutex<(f32, Instant)>, current_pl1: Mutex, last_energy: Mutex<(u64, Instant)>, + + // --- Original State for Restoration --- + original_pl1: Mutex>, + original_pl2: Mutex>, } impl GenericLinuxSal { @@ -34,6 +39,8 @@ impl GenericLinuxSal { last_energy: Mutex::new((initial_energy, Instant::now())), fact_sheet: facts, ctx, + original_pl1: Mutex::new(None), + original_pl2: Mutex::new(None), } } @@ -95,7 +102,7 @@ impl SensorBus for GenericLinuxSal { let delta_e = e2.wrapping_sub(e1); let delta_t = t2.duration_since(t1).as_secs_f32(); *last = (e2, t2); - if delta_t < 0.01 { return Ok(0.0); } + if delta_t < 0.05 { return Ok(0.0); } Ok((delta_e as f32 / 1_000_000.0) / delta_t) } @@ -160,12 +167,22 @@ impl ActuatorBus for GenericLinuxSal { impl EnvironmentGuard for GenericLinuxSal { fn suppress(&self) -> Result<()> { + // Snapshot Power Limits + if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { + if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) { + *self.original_pl1.lock().unwrap() = pl1.trim().parse().ok(); + } + if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) { + *self.original_pl2.lock().unwrap() = pl2.trim().parse().ok(); + } + } + let mut suppressed = self.suppressed_services.lock().unwrap(); for conflict_id in &self.fact_sheet.active_conflicts { if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) { for service in &conflict.services { if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() { - self.ctx.runner.run("systemctl", &["stop", service])?; + let _ = self.ctx.runner.run("systemctl", &["stop", service]); suppressed.push(service.clone()); } } @@ -175,6 +192,16 @@ impl EnvironmentGuard for GenericLinuxSal { } fn restore(&self) -> Result<()> { + // Restore Power Limits + if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() { + if let Some(pl1) = *self.original_pl1.lock().unwrap() { + let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string()); + } + if let Some(pl2) = *self.original_pl2.lock().unwrap() { + let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string()); + } + } + let mut suppressed = self.suppressed_services.lock().unwrap(); for service in suppressed.drain(..) { let _ = self.ctx.runner.run("systemctl", &["start", &service]); diff --git a/src/sal/heuristic/discovery.rs b/src/sal/heuristic/discovery.rs index 6f6952b..3dce223 100644 --- a/src/sal/heuristic/discovery.rs +++ b/src/sal/heuristic/discovery.rs @@ -1,11 +1,11 @@ use std::fs; use std::path::{Path, PathBuf}; -use std::process::Command; use std::time::{Duration}; use std::thread; use std::sync::mpsc; use std::collections::HashMap; use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking}; +use crate::sys::SyscallRunner; use tracing::{debug, warn}; /// Registry of dynamically discovered paths for configs and tools. @@ -31,6 +31,7 @@ pub struct SystemFactSheet { /// Probes the system for hardware sensors, actuators, service conflicts, and paths. pub fn discover_facts( base_path: &Path, + runner: &dyn SyscallRunner, discovery: &Discovery, conflicts: &[Conflict], bench_config: Benchmarking, @@ -45,7 +46,7 @@ pub fn discover_facts( let mut active_conflicts = Vec::new(); for conflict in conflicts { for service in &conflict.services { - if is_service_active(service) { + if is_service_active(runner, service) { debug!("Detected active conflict: {} (Service: {})", conflict.id, service); active_conflicts.push(conflict.id.clone()); break; @@ -93,7 +94,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry { break; } } - // If not found, use the first one as default if any exist if !registry.configs.contains_key(id) { if let Some(first) = candidates.first() { registry.configs.insert(id.clone(), PathBuf::from(first)); @@ -142,7 +142,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, for hw_entry in hw_entries.flatten() { let file_name = hw_entry.file_name().into_string().unwrap_or_default(); - // Temperature Sensors if file_name.starts_with("temp") && file_name.ends_with("_label") { if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { if cfg.temp_labels.iter().any(|l| label.contains(l)) { @@ -154,7 +153,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option, } } - // Fan Sensors if file_name.starts_with("fan") && file_name.ends_with("_label") { if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) { if cfg.fan_labels.iter().any(|l| label.contains(l)) { @@ -206,18 +204,9 @@ fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec { paths } -/// Checks if a systemd service is currently active. -pub fn is_service_active(service: &str) -> bool { - let status = Command::new("systemctl") - .arg("is-active") - .arg("--quiet") - .arg(service) - .status(); - - match status { - Ok(s) => s.success(), - Err(_) => false, - } +/// Checks if a systemd service is currently active using the injected runner. +pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool { + runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() } /// Helper to read a sysfs file with a timeout. diff --git a/src/sal/heuristic/engine.rs b/src/sal/heuristic/engine.rs index eb7cdeb..04974f9 100644 --- a/src/sal/heuristic/engine.rs +++ b/src/sal/heuristic/engine.rs @@ -24,7 +24,7 @@ impl HeuristicEngine { .context("Failed to parse hardware_db.toml")?; // 2. Discover Facts - let facts = discover_facts(&ctx.sysfs_base, &db.discovery, &db.conflicts, db.benchmarking.clone()); + let facts = discover_facts(&ctx.sysfs_base, ctx.runner.as_ref(), &db.discovery, &db.conflicts, db.benchmarking.clone()); info!("System Identity: {} {}", facts.vendor, facts.model); // 3. Routing Logic diff --git a/tests/heuristic_discovery_test.rs b/tests/heuristic_discovery_test.rs index 2905124..a6dfc2d 100644 --- a/tests/heuristic_discovery_test.rs +++ b/tests/heuristic_discovery_test.rs @@ -1,5 +1,6 @@ use ember_tune_rs::sal::heuristic::discovery::discover_facts; use ember_tune_rs::sal::heuristic::schema::{Discovery, SensorDiscovery, ActuatorDiscovery, Benchmarking}; +use ember_tune_rs::sys::MockSyscallRunner; use crate::common::fakesys::FakeSysBuilder; mod common; @@ -35,7 +36,9 @@ fn test_heuristic_discovery_with_fakesys() { power_steps_watts: vec![10.0, 15.0], }; - let facts = discover_facts(&fake.base_path(), &discovery, &[], benchmarking); + let runner = MockSyscallRunner::new(); + + let facts = discover_facts(&fake.base_path(), &runner, &discovery, &[], benchmarking); assert_eq!(facts.vendor, "Dell Inc."); assert_eq!(facts.model, "XPS 13 9380"); diff --git a/tests/orchestrator_e2e_test.rs b/tests/orchestrator_e2e_test.rs index 7681499..f445c4c 100644 --- a/tests/orchestrator_e2e_test.rs +++ b/tests/orchestrator_e2e_test.rs @@ -28,6 +28,7 @@ fn test_orchestrator_e2e_state_machine() { workload, telemetry_tx, command_rx, + None, ); // For the purpose of this architecture audit, we've demonstrated the