This commit is contained in:
2026-02-28 14:54:48 +01:00
parent 4f54fd81ce
commit 1702e7d058
16 changed files with 713 additions and 688 deletions

13
Cargo.lock generated
View File

@@ -901,6 +901,15 @@ dependencies = [
"winapi",
]
[[package]]
name = "matchers"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
dependencies = [
"regex-automata",
]
[[package]]
name = "memchr"
version = "2.8.0"
@@ -2000,10 +2009,14 @@ version = "0.3.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex-automata",
"sharded-slab",
"smallvec",
"thread_local",
"tracing",
"tracing-core",
"tracing-log",
]

View File

@@ -23,7 +23,7 @@ serde_json = "1.0.149"
clap = { version = "4.5", features = ["derive", "string", "wrap_help"] }
color-eyre = "0.6"
tracing = "0.1"
tracing-subscriber = "0.3"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tracing-appender = "0.2"
sysinfo = "0.38"
libc = "0.2"

View File

@@ -15,7 +15,7 @@ help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both
[[conflicts]]
id = "thermal_logic_collision"
services = ["thermald.service", "throttled.service"]
services = ["thermald.service", "throttled.service", "lenovo_fix.service", "lenovo-throttling-fix.service"]
contention = "RAPL / MSR / BD-PROCHOT"
severity = "High"
fix_action = "SuspendService"

View File

@@ -1,66 +0,0 @@
//! Telemetry & Benchmarking Methodology (Agent Metrology)
//!
//! This module defines the execution flow used to collect reliable hardware telemetry.
//! It isolates specific subsystems (CPU core, memory) and is meant to drive the Sweep Protocol
//! and the Thermal Soak that map the physical limits of the hardware; only the Thermal Soak
//! is implemented in this file.
use anyhow::Result;
use std::time::{Duration, Instant};
use std::thread;
use crate::sal::traits::PlatformSal;
use crate::load::{Workload, IntensityProfile, StressVector};
use tracing::info;
/// Borrowed driver for metrology runs: pairs a platform SAL with a workload generator.
pub struct MetrologyAgent<'a> {
/// Platform abstraction used to set the fan mode and to read temperature and power.
sal: &'a dyn PlatformSal,
/// Load generator that is started and stopped around the measurement window.
workload: &'a mut Box<dyn Workload>,
}
impl<'a> MetrologyAgent<'a> {
    /// Creates an agent that borrows the platform SAL and the workload for one metrology run.
    pub fn new(sal: &'a dyn PlatformSal, workload: &'a mut Box<dyn Workload>) -> Self {
        Self { sal, workload }
    }

    /// Performs a prolonged mixed-load test to achieve chassis thermal saturation.
    /// Bypasses short-term PL2/boost metrics to find the true steady-state dissipation capacity.
    ///
    /// The fan mode is set to `"max"`, a mixed workload is started at 100% load on
    /// `num_cpus::get()` threads, and temperature and power are polled every 5 seconds until
    /// `duration_minutes` have elapsed. The soak stops early when the temperature reaches
    /// 98 °C or when the temperature sensor cannot be read.
    ///
    /// Returns the highest power reading (in watts) observed while polling.
    ///
    /// # Errors
    /// Returns an error if setting the fan mode, starting the workload, or stopping the
    /// workload fails.
    pub fn perform_thermal_soak(&mut self, duration_minutes: u64) -> Result<f32> {
        /// Delay between two consecutive sensor polls.
        const POLL_INTERVAL: Duration = Duration::from_secs(5);
        /// Temperature (°C) at which the soak is stopped early.
        const THERMAL_CEILING_C: f32 = 98.0;

        info!("Metrology: Starting {} minute Thermal Soak...", duration_minutes);
        self.sal.set_fan_mode("max")?;

        // Computed once and shared by the workload timeout and the polling loop;
        // saturating so an absurd duration cannot overflow.
        let target = Duration::from_secs(duration_minutes.saturating_mul(60));

        // Mixed load: matrix math + memory stressors to saturate entire SoC and Chassis.
        self.workload.run_workload(
            target,
            IntensityProfile {
                threads: num_cpus::get(),
                load_percentage: 100,
                vector: StressVector::Mixed,
            },
        )?;

        let start = Instant::now();
        let mut max_sustained_watts: f32 = 0.0;

        while start.elapsed() < target {
            thread::sleep(POLL_INTERVAL);

            let temp = self.sal.get_temp();
            // A failed power read counts as 0 W so it can never raise the maximum.
            let watts = self.sal.get_power_w().unwrap_or(0.0);
            if watts > max_sustained_watts {
                max_sustained_watts = watts;
            }

            match temp {
                // Abort if dangerously hot
                Ok(t) if t >= THERMAL_CEILING_C => {
                    info!("Metrology: Thermal ceiling hit during soak ({}C). Stopping early.", t);
                    break;
                }
                Ok(_) => {}
                // Fail safe: without a temperature reading the ceiling check is blind, so the
                // load must not keep running. Previously a failed read was treated as 0 °C.
                Err(_) => {
                    info!("Metrology: Temperature sensor unreadable during soak. Stopping early.");
                    break;
                }
            }
        }

        self.workload.stop_workload()?;
        info!("Metrology: Thermal Soak complete. Max sustained: {:.1}W", max_sustained_watts);

        Ok(max_sustained_watts)
    }
}

View File

@@ -12,6 +12,5 @@ pub mod ui;
pub mod engine;
pub mod cli;
pub mod sys;
pub mod agent_metrology;
pub mod agent_analyst;
pub mod agent_integrator;

View File

@@ -88,11 +88,11 @@ impl Workload for StressNg {
let load = profile.load_percentage.to_string();
let mut cmd = Command::new("stress-ng");
cmd.args(["--timeout", &timeout, "--metrics", "--quiet"]);
cmd.args(["--timeout", &timeout, "--metrics", "--quiet", "--cpu-load", &load]);
match profile.vector {
StressVector::CpuMatrix => {
cmd.args(["--matrix", &threads, "--cpu-load", &load]);
cmd.args(["--matrix", &threads]);
},
StressVector::MemoryBandwidth => {
cmd.args(["--vm", &threads, "--vm-bytes", "80%"]);

View File

@@ -8,7 +8,8 @@ use std::sync::atomic::{AtomicBool, Ordering};
use std::io;
use clap::Parser;
use tracing::{info, debug, error};
use tracing::error;
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
use crossterm::{
event::{self, Event, KeyCode},
@@ -68,27 +69,24 @@ fn print_summary_report(result: &OptimizationResult) {
println!();
}
/// Installs the global tracing subscriber, writing to `/var/log/ember-tune.log`.
///
/// `verbose` selects the maximum level: `DEBUG` when set, `INFO` otherwise. Output goes
/// through a non-blocking writer with ANSI colours disabled.
///
/// Returns the writer's `WorkerGuard`; the caller is expected to keep it alive for as long
/// as logging is needed.
fn setup_logging(verbose: bool) -> tracing_appender::non_blocking::WorkerGuard {
    let max_level = match verbose {
        true => tracing::Level::DEBUG,
        false => tracing::Level::INFO,
    };

    // Single, never-rotated log file.
    let log_file = tracing_appender::rolling::never("/var/log", "ember-tune.log");
    let (writer, worker_guard) = tracing_appender::non_blocking(log_file);

    let subscriber = tracing_subscriber::fmt()
        .with_max_level(max_level)
        .with_writer(writer)
        .with_ansi(false);
    subscriber.init();

    worker_guard
}
fn main() -> Result<()> {
// 1. Diagnostics & CLI Initialization
let args = Cli::parse();
let _log_guard = setup_logging(args.verbose);
// 1. Logging Setup (File-only by default, Stdout during Audit)
let file_appender = tracing_appender::rolling::never(".", "ember-tune.log");
let (non_blocking, _guard) = tracing_appender::non_blocking(file_appender);
let level = if args.verbose { "debug" } else { "info" };
let file_layer = fmt::layer()
.with_writer(non_blocking)
.with_ansi(false);
// We use a simple println for the audit to avoid complex reload handles
tracing_subscriber::registry()
.with(EnvFilter::new(level))
.with(file_layer)
.init();
// Set panic hook to restore terminal state
std::panic::set_hook(Box::new(|panic_info| {
let _ = disable_raw_mode();
let mut stdout = io::stdout();
@@ -99,11 +97,10 @@ fn main() -> Result<()> {
eprintln!("----------------------------------------\n");
}));
info!("ember-tune starting with args: {:?}", args);
println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan());
let ctx = ember_tune_rs::sal::traits::EnvironmentCtx::production();
// 2. Platform Detection & Audit
let (sal_box, facts): (Box<dyn PlatformSal>, SystemFactSheet) = if args.mock {
(Box::new(MockSal::new()), SystemFactSheet::default())
} else {
@@ -111,9 +108,7 @@ fn main() -> Result<()> {
};
let sal: Arc<dyn PlatformSal> = sal_box.into();
println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan());
let mut audit_failures = Vec::new();
for step in sal.audit() {
print!(" Checking {:<40} ", step.description);
io::Write::flush(&mut io::stdout()).into_diagnostic()?;
@@ -137,15 +132,14 @@ fn main() -> Result<()> {
return Ok(());
}
// 3. Terminal Setup
// Entering TUI Mode - STDOUT is now strictly for Ratatui
enable_raw_mode().into_diagnostic()?;
let mut stdout = io::stdout();
execute!(stdout, EnterAlternateScreen).into_diagnostic()?;
execute!(stdout, EnterAlternateScreen, crossterm::cursor::Hide).into_diagnostic()?;
let backend_stdout = io::stdout();
let backend_term = CrosstermBackend::new(backend_stdout);
let mut terminal = Terminal::new(backend_term).into_diagnostic()?;
// 4. State & Communication Setup
let running = Arc::new(AtomicBool::new(true));
let r = running.clone();
@@ -158,7 +152,6 @@ fn main() -> Result<()> {
r.store(false, Ordering::SeqCst);
}).expect("Error setting Ctrl-C handler");
// 5. Spawn Backend Orchestrator
let sal_backend = sal.clone();
let facts_backend = facts.clone();
let config_out = args.config_out.clone();
@@ -175,10 +168,9 @@ fn main() -> Result<()> {
orchestrator.run()
});
// 6. Frontend Event Loop
let mut ui_state = DashboardState::new();
let mut last_telemetry = TelemetryState {
cpu_model: "Loading...".to_string(),
cpu_model: facts.model.clone(),
total_ram_gb: 0,
tick: 0,
cpu_temp: 0.0,
@@ -227,7 +219,6 @@ fn main() -> Result<()> {
while let Ok(new_state) = telemetry_rx.try_recv() {
if let Some(log) = &new_state.log_event {
ui_state.add_log(log.clone());
debug!("Backend Log: {}", log);
} else {
ui_state.update(&new_state);
last_telemetry = new_state;
@@ -238,20 +229,11 @@ fn main() -> Result<()> {
if backend_handle.is_finished() { break; }
}
// 7. Terminal Restoration
let _ = disable_raw_mode();
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
let _ = terminal.show_cursor();
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen, crossterm::cursor::Show);
// 8. Final Report & Hardware Restoration
let join_res = backend_handle.join();
// Explicit hardware restoration
info!("Restoring hardware state...");
if let Err(e) = sal.restore() {
error!("Failed to restore hardware state: {}", e);
}
match join_res {
Ok(Ok(result)) => {
print_summary_report(&result);
@@ -276,6 +258,5 @@ fn main() -> Result<()> {
}
}
info!("ember-tune exited gracefully.");
Ok(())
}

View File

@@ -3,8 +3,8 @@
//! It manages hardware interactions through the [PlatformSal], generates stress
//! using a [Workload], and feeds telemetry to the frontend via MPSC channels.
use anyhow::{Result, Context};
use tracing::warn;
use anyhow::{Result, Context, bail};
use tracing::{info, warn, error};
use std::sync::mpsc;
use std::time::{Duration, Instant};
use std::thread;
@@ -14,16 +14,29 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::path::PathBuf;
use std::cell::Cell;
use crate::sal::traits::{PlatformSal, SafetyStatus};
use crate::sal::traits::{PlatformSal, SensorBus};
use crate::sal::heuristic::discovery::SystemFactSheet;
use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts, ConfigurationTransaction, ThermalThresholdCelsius};
use crate::sal::safety::{HardwareStateGuard, PowerLimitWatts, ThermalWatchdog};
use crate::load::{Workload, IntensityProfile, StressVector};
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
use crate::agent_metrology::MetrologyAgent;
use crate::agent_analyst::{HeuristicAnalyst, OptimizationMatrix};
use crate::agent_integrator::ServiceIntegrator;
use crate::agent_analyst::HeuristicAnalyst;
/// Represents the possible states of the benchmark orchestrator.
///
/// The type is plain data, so it derives `Debug` for diagnostics, `Clone`/`Copy` for cheap
/// hand-off between components, and `PartialEq` for comparisons. `Eq` is not derived because
/// `StressSweep` carries an `f32`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum OrchestratorState {
    /// Performing pre-flight checks and snapshotting.
    PreFlight,
    /// Acquiring idle baseline telemetry.
    IdleBaseline,
    /// Actively sweeping through power limits.
    StressSweep {
        /// Wattage associated with the current sweep step (presumably the power limit under test).
        current_wattage: f32,
    },
    /// Allowing hardware to cool down before releasing the guard.
    Cooldown,
    /// Benchmark complete, generating final results.
    Finalizing,
}
/// The central state machine responsible for coordinating the thermal benchmark.
pub struct BenchmarkOrchestrator {
@@ -37,8 +50,8 @@ pub struct BenchmarkOrchestrator {
telemetry_tx: mpsc::Sender<TelemetryState>,
/// Channel for receiving commands from the UI.
command_rx: mpsc::Receiver<UiCommand>,
/// Current phase of the benchmark.
phase: BenchmarkPhase,
/// Current phase reported to the UI.
ui_phase: BenchmarkPhase,
/// Accumulated thermal data points.
profile: ThermalProfile,
/// Mathematics engine for data smoothing and optimization.
@@ -48,6 +61,8 @@ pub struct BenchmarkOrchestrator {
/// The safety membrane protecting the system.
safeguard: Option<HardwareStateGuard>,
/// Active thermal watchdog.
watchdog: Option<ThermalWatchdog>,
/// Sliding window of power readings (Watts).
history_watts: VecDeque<f32>,
@@ -91,7 +106,7 @@ impl BenchmarkOrchestrator {
workload,
telemetry_tx,
command_rx,
phase: BenchmarkPhase::Auditing,
ui_phase: BenchmarkPhase::Auditing,
profile: ThermalProfile::default(),
engine: OptimizerEngine::new(5),
history_watts: VecDeque::with_capacity(120),
@@ -103,147 +118,163 @@ impl BenchmarkOrchestrator {
emergency_reason: Arc::new(Mutex::new(None)),
optional_config_out,
safeguard: None,
watchdog: None,
}
}
/// Executes the full benchmark sequence.
pub fn run(&mut self) -> Result<OptimizationResult> {
self.log("Starting ember-tune Benchmark Sequence.")?;
// Immediate Priming
let _ = self.sal.get_temp();
let _ = self.sal.get_power_w();
let _ = self.sal.get_fan_rpms();
let _watchdog_handle = self.spawn_watchdog_monitor();
info!("Orchestrator: Initializing Project Iron-Ember lifecycle.");
// Spawn safety watchdog immediately
let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone());
self.watchdog = Some(watchdog);
// Core execution wrapped in cleanup logic
let result = self.execute_benchmark();
// --- MANDATORY CLEANUP ---
self.log("Benchmark sequence finished. Restoring hardware defaults...")?;
if let Err(ref e) = result {
error!("Benchmark Lifecycle Failure: {}", e);
let _ = self.log(&format!("⚠ FAILURE: {}", e));
}
// --- MANDATORY RAII CLEANUP ---
info!("Benchmark sequence complete. Releasing safeguards...");
let _ = self.workload.stop_workload();
if let Some(mut sg) = self.safeguard.take() {
if let Err(e) = sg.release() {
anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e);
error!("CRITICAL: State restoration failure: {}", e);
}
}
// SAL restore should only handle OEM-specific non-sysfs state not covered by guard
if let Err(e) = self.sal.restore() {
warn!("Failed to perform secondary SAL restoration: {}", e);
}
self.log("✓ Hardware state restored.")?;
info!("✓ Hardware state restored to pre-flight defaults.");
result
}
/// Internal execution logic for the benchmark phases.
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?;
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?;
// 1. Snapshot & Arm Safeguard
// 1. Pre-Flight Phase
self.ui_phase = BenchmarkPhase::Auditing;
self.log("Phase: Pre-Flight Auditing & Sterilization")?;
// Snapshot and neutralise Brawl Matrix
let mut target_files = self.facts.rapl_paths.iter()
.map(|p| p.join("constraint_0_power_limit_uw"))
.collect::<Vec<_>>();
target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw")));
if let Some(tp) = self.facts.paths.configs.get("throttled") {
target_files.push(tp.clone());
}
let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()];
let mut sg = HardwareStateGuard::acquire(&target_files, &target_services)?;
// # SAFETY: Register fan restoration command if we are on Dell
if self.facts.vendor.to_lowercase().contains("dell") {
if let Some(tool_path) = self.facts.paths.tools.get("dell_fan_ctrl") {
let tool_str = tool_path.to_string_lossy().to_string();
sg.on_rollback(Box::new(move || {
let _ = std::process::Command::new(tool_str).arg("1").status();
}));
}
}
let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?;
self.safeguard = Some(sg);
// Phase 1: Audit & Baseline
self.phase = BenchmarkPhase::Auditing;
// Run auditor
for step in self.sal.audit() {
if let Err(e) = step.outcome {
return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e));
}
}
self.workload.initialize().context("Failed to initialize workload")?;
self.sal.suppress().context("Failed to suppress background services")?;
self.workload.initialize().context("Failed to initialize load generator.")?;
// Baseline (Idle Calibration)
self.phase = BenchmarkPhase::IdleCalibration;
self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?;
let tick = Cell::new(0u64);
// 2. Idle Baseline Phase
self.ui_phase = BenchmarkPhase::IdleCalibration;
self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?;
// Wait for fan spin-up
self.sal.set_fan_mode("auto")?;
let mut idle_temps = Vec::new();
let start = Instant::now();
let mut tick = 0;
while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) {
self.check_abort()?;
self.send_telemetry(tick)?;
self.check_safety_abort()?;
self.send_telemetry(tick.get())?;
idle_temps.push(self.sal.get_temp().unwrap_or(0.0));
tick += 1;
tick.set(tick.get() + 1);
thread::sleep(Duration::from_millis(500));
}
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
// Phase 1.5: Thermal Soak (Agent Metrology)
self.log("Phase 1.5: Executing Thermal Soak to achieve chassis saturation...")?;
let soak_duration_minutes = 1;
let mut metrology = MetrologyAgent::new(self.sal.as_ref(), &mut self.workload);
let max_soak_watts = metrology.perform_thermal_soak(soak_duration_minutes)?;
self.log(&format!("✓ Max sustained wattage during soak: {:.1}W", max_soak_watts))?;
// 3. Stress Sweep Phase
self.ui_phase = BenchmarkPhase::StressTesting;
self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?;
// Ensure fans are ramped to MAX before load
self.log("Metrology: Locking fans to MAX...")?;
self.sal.set_fan_mode("max")?;
let fan_lock_start = Instant::now();
loop {
let fans = self.sal.get_fan_rpms().unwrap_or_default();
let max_rpm = fans.iter().cloned().max().unwrap_or(0);
if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) {
break;
}
thread::sleep(Duration::from_millis(500));
self.send_telemetry(tick.get())?;
tick.set(tick.get() + 1);
}
// Phase 2: Stress Stepping
self.phase = BenchmarkPhase::StressTesting;
self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
self.sal.set_fan_mode("max")?;
let mut current_pl = 10.0_f32; // Start at 10W
let physical_threads = num_cpus::get_physical();
let mut previous_ops = 0.0;
loop {
self.log(&format!("Testing PL1 = {:.0}W...", current_pl))?;
for &watts in &bench_cfg.power_steps_watts {
self.check_safety_abort()?;
self.log(&format!("Testing PL1 = {:.0}W", watts))?;
// # SAFETY: Transactional Commit for Power Limits
let pl1_uw = TdpLimitMicroWatts::from_watts(current_pl)?;
let pl2_uw = TdpLimitMicroWatts::from_watts(current_pl + 5.0)?;
// Apply limits safely
let pl1 = PowerLimitWatts::try_new(watts)?;
let pl2 = PowerLimitWatts::try_new(watts + 5.0)?;
let mut tx = ConfigurationTransaction::default();
if let Some(p) = self.facts.rapl_paths.first() {
tx.add_change(p.join("constraint_0_power_limit_uw"), pl1_uw.as_u64().to_string());
tx.add_change(p.join("constraint_1_power_limit_uw"), pl2_uw.as_u64().to_string());
}
tx.commit().context("Failed to commit power limit transaction")?;
self.sal.set_sustained_power_limit(pl1)?;
self.sal.set_burst_power_limit(pl2)?;
// Start workload
self.workload.run_workload(
Duration::from_secs(bench_cfg.stress_duration_max_s),
IntensityProfile { threads: num_cpus::get(), load_percentage: 100, vector: StressVector::CpuMatrix }
IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix }
)?;
let step_start = Instant::now();
let mut step_temps = VecDeque::with_capacity(30);
let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0);
// Equilibrium Gating
while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) {
self.check_abort()?;
self.check_safety_abort()?;
let t = self.sal.get_temp().unwrap_or(0.0);
let dt_dt = (t - previous_step_temp) / 0.5;
previous_step_temp = t;
// Redundant safety check during step
if t > 94.0 || dt_dt > 5.0 {
warn!("Thermal Spike Detected! Aborting current step.");
break;
}
step_temps.push_back(t);
if step_temps.len() > 10 { step_temps.pop_front(); }
self.send_telemetry(tick)?;
tick += 1;
self.send_telemetry(tick.get())?;
tick.set(tick.get() + 1);
if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 {
let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b));
let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b));
if (max - min) < 0.5 {
self.log(&format!(" Equilibrium reached at {:.1}°C", t))?;
info!("Equilibrium reached at {:.1}°C", t);
break;
}
}
@@ -251,197 +282,74 @@ impl BenchmarkOrchestrator {
}
// Record data point
let avg_p = self.sal.get_power_w().unwrap_or(0.0);
let avg_t = self.sal.get_temp().unwrap_or(0.0);
let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0);
let fans = self.sal.get_fan_rpms().unwrap_or_default();
let primary_fan = fans.first().cloned().unwrap_or(0);
let metrics = self.workload.get_current_metrics().unwrap_or_default();
self.profile.points.push(ThermalPoint {
power_w: avg_p,
temp_c: avg_t,
freq_mhz: avg_f,
fan_rpm: primary_fan,
power_w: self.sal.get_power_w().unwrap_or(watts),
temp_c: self.sal.get_temp().unwrap_or(0.0),
freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0),
fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0),
throughput: metrics.primary_ops_per_sec,
});
self.workload.stop_workload()?;
// 1. Check Thermal Ceiling Halt Condition
let max_safe_temp = ThermalThresholdCelsius::MAX_SAFE_C - 5.0; // Margin
if avg_t >= max_safe_temp {
self.log(&format!("Thermal ceiling reached ({:.1}°C). Terminating Identification phase.", avg_t))?;
break;
}
// 2. Check Diminishing Returns Halt Condition (< 1% gain)
// Performance Halt Condition
if previous_ops > 0.0 {
let gain_percent = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0;
if gain_percent < 1.0 {
self.log(&format!("Performance gain ({:.1}%) fell below 1%. Terminating Identification phase.", gain_percent))?;
let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0;
if gain < 1.0 {
self.log("Diminishing returns reached. Stopping sweep.")?;
break;
}
}
// 3. Absolute Maximum Power Check
if current_pl >= 60.0 {
self.log("Maximum theoretical power limit reached. Terminating Identification phase.")?;
break;
}
previous_ops = metrics.primary_ops_per_sec;
current_pl += 2.0;
self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?;
self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?;
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
}
// Phase 4: Physical Modeling (Agent Analyst)
self.phase = BenchmarkPhase::PhysicalModeling;
self.log("Phase 3: Calculating Silicon Physical Sweet Spot & Profiles...")?;
// 4. Physical Modeling Phase
self.ui_phase = BenchmarkPhase::PhysicalModeling;
self.log("Phase: Silicon Physical Sweet Spot Calculation")?;
let analyst = HeuristicAnalyst::new();
let matrix = analyst.analyze(&self.profile, max_soak_watts);
let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0));
let mut res = self.generate_result(false);
res.optimization_matrix = Some(matrix.clone());
self.log(&format!("✓ Thermal Resistance (): {:.3} K/W", res.thermal_resistance_kw))?;
self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?;
info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw);
thread::sleep(Duration::from_secs(3));
// Phase 5: Finalizing (Agent Integrator)
self.phase = BenchmarkPhase::Finalizing;
self.log("Benchmark sequence complete. Generating configurations...")?;
let config = crate::engine::formatters::throttled::ThrottledConfig {
pl1_limit: res.silicon_knee_watts,
pl2_limit: res.recommended_pl2,
trip_temp: res.max_temp_c.max(95.0),
};
// 5. Finalizing Phase
self.ui_phase = BenchmarkPhase::Finalizing;
self.log("Phase: Generation of Optimized Configuration Sets")?;
let throttled_path = self.optional_config_out.clone()
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
if let Some(path) = throttled_path {
let config = crate::engine::formatters::throttled::ThrottledConfig {
pl1_limit: res.silicon_knee_watts,
pl2_limit: res.recommended_pl2,
trip_temp: res.max_temp_c.max(90.0),
};
crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?;
self.log(&format!("✓ Saved '{}'.", path.display()))?;
res.config_paths.insert("throttled".to_string(), path.clone());
self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?;
res.config_paths.insert("throttled".to_string(), path);
}
// Generate Fan configs via Agent Integrator
let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc"));
let i8k_out = base_out.join("i8kmon.conf");
if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out).is_ok() {
self.log(&format!("✓ Saved '{}'.", i8k_out.display()))?;
res.config_paths.insert("i8kmon".to_string(), i8k_out);
}
let thinkfan_out = base_out.join("thinkfan.conf");
if ServiceIntegrator::generate_thinkfan_config(&matrix, &thinkfan_out).is_ok() {
self.log(&format!("✓ Saved '{}'.", thinkfan_out.display()))?;
res.config_paths.insert("thinkfan".to_string(), thinkfan_out);
}
let thermald_out = base_out.join("thermal-conf.xml");
if ServiceIntegrator::generate_thermald_config(&matrix, &thermald_out).is_ok() {
self.log(&format!("✓ Saved '{}'.", thermald_out.display()))?;
res.config_paths.insert("thermald".to_string(), thermald_out);
}
let script_out = base_out.join("ember-tune-neutralize.sh");
if ServiceIntegrator::generate_conflict_resolution_script(&script_out).is_ok() {
self.log(&format!("✓ Saved conflict resolution script: '{}'", script_out.display()))?;
res.config_paths.insert("conflict_script".to_string(), script_out);
}
Ok(res)
}
/// Spawns the background safety watchdog thread and returns its join handle.
///
/// The thread polls `get_safety_status()` on the SAL every 100 ms until the shared
/// emergency-abort flag is set:
/// - `EmergencyAbort(reason)` stores the reason, raises the abort flag and exits.
/// - `Warning(msg)` / `Critical(msg)` send a log-only `TelemetryState` carrying
///   `"WATCHDOG: {msg}"` over the telemetry channel; send failures are ignored.
/// - `Nominal` does nothing.
/// - A sensor error is treated like an emergency abort with a
///   `"Watchdog Sensor Failure: …"` reason.
///
/// A poisoned reason mutex is recovered rather than unwrapped, so a panic in another
/// thread can never stop the watchdog from raising the abort flag.
fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> {
    let abort = self.emergency_abort.clone();
    let reason_store = self.emergency_reason.clone();
    let sal = self.sal.clone();
    let tx = self.telemetry_tx.clone();

    thread::spawn(move || {
        // Store the reason first, then raise the flag, so a reader that observes the flag
        // also finds the reason.
        let trigger_abort = |reason: String| {
            *reason_store
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner()) = Some(reason);
            abort.store(true, Ordering::SeqCst);
        };

        while !abort.load(Ordering::SeqCst) {
            match sal.get_safety_status() {
                Ok(SafetyStatus::EmergencyAbort(reason)) => {
                    trigger_abort(reason);
                    break;
                }
                Ok(SafetyStatus::Warning(msg)) | Ok(SafetyStatus::Critical(msg)) => {
                    // Log-only telemetry frame: every field except `log_event` and the
                    // throttling flag is a placeholder.
                    let state = TelemetryState {
                        cpu_model: String::new(),
                        total_ram_gb: 0,
                        tick: 0,
                        cpu_temp: 0.0,
                        power_w: 0.0,
                        current_freq: 0.0,
                        fans: Vec::new(),
                        governor: String::new(),
                        pl1_limit: 0.0,
                        pl2_limit: 0.0,
                        fan_tier: String::new(),
                        is_throttling: sal.get_throttling_status().unwrap_or(false),
                        phase: BenchmarkPhase::StressTesting,
                        history_watts: Vec::new(),
                        history_temp: Vec::new(),
                        history_mhz: Vec::new(),
                        log_event: Some(format!("WATCHDOG: {}", msg)),
                        metadata: std::collections::HashMap::new(),
                        is_emergency: false,
                        emergency_reason: None,
                    };
                    // The receiver may already be gone; the watchdog keeps running regardless.
                    let _ = tx.send(state);
                }
                Ok(SafetyStatus::Nominal) => {}
                Err(e) => {
                    // Losing the safety sensor is itself an emergency.
                    trigger_abort(format!("Watchdog Sensor Failure: {}", e));
                    break;
                }
            }
            thread::sleep(Duration::from_millis(100));
        }
    })
}
/// Builds an `OptimizationResult` from the thermal profile collected so far.
///
/// The engine supplies the thermal resistance, the silicon knee and the maximum
/// temperature. The knee is reported as the recommended PL1 and 1.25x the knee as the
/// recommended PL2. `is_partial` is copied into the result unchanged; `config_paths`
/// starts empty and `optimization_matrix` is left as `None`.
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
    let profile = &self.profile;

    let thermal_resistance_kw = self.engine.calculate_thermal_resistance(profile);
    let silicon_knee_watts = self.engine.find_silicon_knee(profile);
    let max_temp_c = self.engine.get_max_temp(profile);

    OptimizationResult {
        profile: profile.clone(),
        silicon_knee_watts,
        thermal_resistance_kw,
        recommended_pl1: silicon_knee_watts,
        recommended_pl2: silicon_knee_watts * 1.25,
        max_temp_c,
        is_partial,
        config_paths: std::collections::HashMap::new(),
        optimization_matrix: None,
    }
}
fn check_abort(&self) -> Result<()> {
/// Checks if the safety watchdog or user triggered an abort.
fn check_safety_abort(&self) -> Result<()> {
if self.emergency_abort.load(Ordering::SeqCst) {
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string());
return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason));
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string());
bail!("EMERGENCY_ABORT: {}", reason);
}
if let Ok(cmd) = self.command_rx.try_recv() {
match cmd {
UiCommand::Abort => {
return Err(anyhow::anyhow!("ABORTED"));
}
UiCommand::Abort => bail!("ABORTED"),
}
}
Ok(())
@@ -456,12 +364,12 @@ impl BenchmarkOrchestrator {
power_w: self.sal.get_power_w().unwrap_or(0.0),
current_freq: self.sal.get_freq_mhz().unwrap_or(0.0),
fans: self.sal.get_fan_rpms().unwrap_or_default(),
governor: "unknown".to_string(),
governor: "performance".to_string(),
pl1_limit: 0.0,
pl2_limit: 0.0,
fan_tier: "auto".to_string(),
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
phase: self.phase,
phase: self.ui_phase,
history_watts: Vec::new(),
history_temp: Vec::new(),
history_mhz: Vec::new(),
@@ -477,7 +385,6 @@ impl BenchmarkOrchestrator {
let temp = self.sal.get_temp().unwrap_or(0.0);
let pwr = self.sal.get_power_w().unwrap_or(0.0);
let freq = self.sal.get_freq_mhz().unwrap_or(0.0);
let throttling = self.sal.get_throttling_status().unwrap_or(false);
self.history_temp.push_back(temp);
self.history_watts.push_back(pwr);
@@ -501,8 +408,8 @@ impl BenchmarkOrchestrator {
pl1_limit: 15.0,
pl2_limit: 25.0,
fan_tier: "max".to_string(),
is_throttling: throttling,
phase: self.phase,
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
phase: self.ui_phase,
history_watts: self.history_watts.iter().cloned().collect(),
history_temp: self.history_temp.iter().cloned().collect(),
history_mhz: self.history_mhz.iter().cloned().collect(),
@@ -513,4 +420,22 @@ impl BenchmarkOrchestrator {
};
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
}
/// Builds an `OptimizationResult` from the thermal profile collected so far.
///
/// The engine supplies the thermal resistance, the silicon knee and the maximum
/// temperature. The knee is reported as the recommended PL1 and 1.25x the knee as the
/// recommended PL2. `is_partial` is copied into the result unchanged; `config_paths`
/// starts empty and `optimization_matrix` is left as `None`.
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
    let profile = &self.profile;

    let thermal_resistance_kw = self.engine.calculate_thermal_resistance(profile);
    let silicon_knee_watts = self.engine.find_silicon_knee(profile);
    let max_temp_c = self.engine.get_max_temp(profile);

    OptimizationResult {
        profile: profile.clone(),
        silicon_knee_watts,
        thermal_resistance_kw,
        recommended_pl1: silicon_knee_watts,
        recommended_pl2: silicon_knee_watts * 1.25,
        max_temp_c,
        is_partial,
        config_paths: std::collections::HashMap::new(),
        optimization_matrix: None,
    }
}
}

View File

@@ -1,11 +1,12 @@
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx};
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
use anyhow::{Result, Context, anyhow};
use std::fs;
use std::path::{PathBuf};
use std::time::{Duration, Instant};
use std::thread;
use std::sync::Mutex;
use tracing::{debug, warn};
use tracing::{info, debug};
use crate::sal::heuristic::discovery::SystemFactSheet;
/// Implementation of the System Abstraction Layer for the Dell XPS 13 9380.
@@ -15,30 +16,66 @@ pub struct DellXps9380Sal {
temp_path: PathBuf,
pwr_path: PathBuf,
fan_paths: Vec<PathBuf>,
pwm_paths: Vec<PathBuf>,
pwm_enable_paths: Vec<PathBuf>,
pl1_paths: Vec<PathBuf>,
pl2_paths: Vec<PathBuf>,
freq_path: PathBuf,
pl1_path: PathBuf,
pl2_path: PathBuf,
last_poll: Mutex<Instant>,
last_temp: Mutex<f32>,
last_fans: Mutex<Vec<u32>>,
suppressed_services: Mutex<Vec<String>>,
msr_file: Mutex<fs::File>,
last_energy: Mutex<(u64, Instant)>,
last_watts: Mutex<f32>,
// --- Original State for Restoration ---
original_pl1: Mutex<Option<u64>>,
original_pl2: Mutex<Option<u64>>,
original_fan_mode: Mutex<Option<String>>,
}
impl DellXps9380Sal {
/// Initializes the Dell SAL, opening the MSR interface and discovering sensors.
/// Initializes the Dell SAL, opening the MSR interface and discovering sensors and PWM nodes.
pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result<Self> {
let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?;
let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?;
let fan_paths = facts.fan_paths.clone();
// 1. Discover PWM and Enable nodes associated with the fan paths
let mut pwm_paths = Vec::new();
let mut pwm_enable_paths = Vec::new();
for fan_p in &fan_paths {
if let Some(parent) = fan_p.parent() {
let fan_file = fan_p.file_name().and_then(|n| n.to_str()).unwrap_or("");
let fan_idx = fan_file.chars().filter(|c| c.is_ascii_digit()).collect::<String>();
let idx = if fan_idx.is_empty() { "1".to_string() } else { fan_idx };
let pwm_p = parent.join(format!("pwm{}", idx));
if pwm_p.exists() { pwm_paths.push(pwm_p); }
let enable_p = parent.join(format!("pwm{}_enable", idx));
if enable_p.exists() { pwm_enable_paths.push(enable_p); }
}
}
// 2. Map all RAPL constraints
let mut pl1_paths = Vec::new();
let mut pl2_paths = Vec::new();
for rapl_p in &facts.rapl_paths {
pl1_paths.push(rapl_p.join("constraint_0_power_limit_uw"));
pl2_paths.push(rapl_p.join("constraint_1_power_limit_uw"));
}
// 3. Physical Sensor Verification & Warm Cache Priming
let mut initial_fans = Vec::new();
for fan_p in &fan_paths {
let mut rpm = 0;
for _ in 0..3 {
if let Ok(val) = fs::read_to_string(fan_p) {
rpm = val.trim().parse::<u32>().unwrap_or(0);
if rpm > 0 { break; }
}
thread::sleep(Duration::from_millis(100));
}
info!("SAL Warm-Start: Fan sensor {:?} -> {} RPM", fan_p, rpm);
initial_fans.push(rpm);
}
let freq_path = ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
let msr_path = ctx.sysfs_base.join("dev/cpu/0/msr");
@@ -47,25 +84,26 @@ impl DellXps9380Sal {
let initial_energy = fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0);
info!("SAL: Dell XPS 9380 Initialized. ({} fans, {} RAPL nodes found)",
fan_paths.len(), facts.rapl_paths.len());
Ok(Self {
temp_path,
pwr_path: pwr_base.join("power1_average"),
fan_paths,
pwm_paths,
pwm_enable_paths,
pl1_paths,
pl2_paths,
freq_path,
pl1_path: pwr_base.join("constraint_0_power_limit_uw"),
pl2_path: pwr_base.join("constraint_1_power_limit_uw"),
last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)),
last_temp: Mutex::new(0.0),
last_fans: Mutex::new(Vec::new()),
suppressed_services: Mutex::new(Vec::new()),
last_fans: Mutex::new(initial_fans),
msr_file: Mutex::new(msr_file),
last_energy: Mutex::new((initial_energy, Instant::now())),
last_watts: Mutex::new(0.0),
fact_sheet: facts,
ctx,
original_pl1: Mutex::new(None),
original_pl2: Mutex::new(None),
original_fan_mode: Mutex::new(None),
})
}
@@ -93,7 +131,6 @@ impl PreflightAuditor for DellXps9380Sal {
outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) }
});
// RAPL Lock Check (MSR 0x610)
let rapl_lock = match self.read_msr(0x610) {
Ok(val) => {
if (val & (1 << 63)) != 0 {
@@ -104,19 +141,14 @@ impl PreflightAuditor for DellXps9380Sal {
},
Err(e) => Err(AuditError::ToolMissing(format!("Cannot read MSR 0x610: {}", e))),
};
steps.push(AuditStep {
description: "MSR 0x610 RAPL Lock Status".to_string(),
outcome: rapl_lock,
});
steps.push(AuditStep { description: "MSR 0x610 RAPL Lock Status".to_string(), outcome: rapl_lock });
let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"];
for mod_name in modules {
let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name));
steps.push(AuditStep {
description: format!("Kernel Module: {}", mod_name),
outcome: if path.exists() { Ok(()) } else {
Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name)))
}
outcome: if path.exists() { Ok(()) } else { Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) }
});
}
@@ -138,9 +170,7 @@ impl PreflightAuditor for DellXps9380Sal {
let ac_status = fs::read_to_string(ac_status_path).unwrap_or_else(|_| "0".to_string());
steps.push(AuditStep {
description: "AC Power Connection".to_string(),
outcome: if ac_status.trim() == "1" { Ok(()) } else {
Err(AuditError::AcPowerMissing("System must be on AC power".to_string()))
}
outcome: if ac_status.trim() == "1" { Ok(()) } else { Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) }
});
Box::new(steps.into_iter())
@@ -148,49 +178,16 @@ impl PreflightAuditor for DellXps9380Sal {
}
impl EnvironmentGuard for DellXps9380Sal {
fn suppress(&self) -> Result<()> {
if let Ok(pl1) = fs::read_to_string(&self.pl1_path) {
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
}
if let Ok(pl2) = fs::read_to_string(&self.pl2_path) {
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
}
*self.original_fan_mode.lock().unwrap() = Some("1".to_string());
let services = ["tlp", "thermald", "i8kmon"];
let mut suppressed = self.suppressed_services.lock().unwrap();
for s in services {
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
suppressed.push(s.to_string());
}
}
Ok(())
}
fn restore(&self) -> Result<()> {
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
let _ = fs::write(&self.pl1_path, pl1.to_string());
}
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
let _ = fs::write(&self.pl2_path, pl2.to_string());
}
if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") {
let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]);
}
let mut suppressed = self.suppressed_services.lock().unwrap();
for s in suppressed.drain(..) {
let _ = self.ctx.runner.run("systemctl", &["start", &s]);
}
Ok(())
}
fn suppress(&self) -> Result<()> { Ok(()) }
fn restore(&self) -> Result<()> { Ok(()) }
}
impl SensorBus for DellXps9380Sal {
fn get_temp(&self) -> Result<f32> {
let mut last_poll = self.last_poll.lock().unwrap();
let now = Instant::now();
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
// # SAFETY: High frequency polling for watchdog
if now.duration_since(*last_poll) < Duration::from_millis(100) {
return Ok(*self.last_temp.lock().unwrap());
}
let s = fs::read_to_string(&self.temp_path)?;
@@ -201,7 +198,7 @@ impl SensorBus for DellXps9380Sal {
}
fn get_power_w(&self) -> Result<f32> {
let rapl_base = self.pl1_path.parent().context("RAPL path error")?;
let rapl_base = self.fact_sheet.rapl_paths.first().context("RAPL path error")?;
let energy_path = rapl_base.join("energy_uj");
if energy_path.exists() {
@@ -212,14 +209,9 @@ impl SensorBus for DellXps9380Sal {
let e2 = e2_str.trim().parse::<u64>()?;
let t2 = Instant::now();
let (e1, t1) = *last_energy;
let delta_e = e2.wrapping_sub(e1);
let delta_t = t2.duration_since(t1).as_secs_f32();
if delta_t < 0.1 {
return Ok(*last_watts); // Return cached if polled too fast
}
if delta_t < 0.1 { return Ok(*last_watts); }
let watts = (delta_e as f32 / 1_000_000.0) / delta_t;
*last_energy = (e2, t2);
*last_watts = watts;
@@ -236,12 +228,27 @@ impl SensorBus for DellXps9380Sal {
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
return Ok(self.last_fans.lock().unwrap().clone());
}
let mut fans = Vec::new();
for path in &self.fan_paths {
if let Ok(s) = fs::read_to_string(path) {
if let Ok(rpm) = s.trim().parse::<u32>() { fans.push(rpm); }
let mut val = 0;
for i in 0..5 {
match fs::read_to_string(path) {
Ok(s) => {
if let Ok(rpm) = s.trim().parse::<u32>() {
val = rpm;
if rpm > 0 { break; }
}
},
Err(e) => {
debug!("SAL: Fan poll retry {} for {:?} failed: {}", i+1, path, e);
}
}
thread::sleep(Duration::from_millis(150));
}
fans.push(val);
}
*self.last_fans.lock().unwrap() = fans.clone();
*last_poll = now;
Ok(fans)
@@ -253,7 +260,6 @@ impl SensorBus for DellXps9380Sal {
}
fn get_throttling_status(&self) -> Result<bool> {
// MSR 0x19C bit 0 is "Thermal Status", bit 1 is "Thermal Log"
let val = self.read_msr(0x19C)?;
Ok((val & 0x1) != 0)
}
@@ -266,24 +272,47 @@ impl ActuatorBus for DellXps9380Sal {
let tool_str = tool_path.to_string_lossy();
match mode {
"max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; }
"max" | "Manual" => {
self.ctx.runner.run(&tool_str, &["0"])?;
// Disabling BIOS control requires immediate PWM override
self.set_fan_speed(FanSpeedPercent::new(100)?)?;
}
"auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; }
_ => {}
}
Ok(())
}
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> {
let pwm_val = ((speed.get() as u32 * 255) / 100) as u8;
for p in &self.pwm_enable_paths { let _ = fs::write(p, "1"); }
for path in &self.pwm_paths { let _ = fs::write(path, pwm_val.to_string()); }
Ok(())
}
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
fs::write(&self.pl1_path, limit.as_u64().to_string())?;
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
for path in &self.pl1_paths {
debug!("SAL: Applying PL1 ({:.1}W) to {:?}", limit.get(), path);
fs::write(path, limit.as_microwatts().to_string())
.with_context(|| format!("Failed to write PL1 to {:?}", path))?;
if let Some(parent) = path.parent() {
let enable_p = parent.join("constraint_0_enabled");
let _ = fs::write(&enable_p, "1");
}
}
Ok(())
}
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
fs::write(&self.pl2_path, limit.as_u64().to_string())?;
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
for path in &self.pl2_paths {
debug!("SAL: Applying PL2 ({:.1}W) to {:?}", limit.get(), path);
fs::write(path, limit.as_microwatts().to_string())
.with_context(|| format!("Failed to write PL2 to {:?}", path))?;
if let Some(parent) = path.parent() {
let enable_p = parent.join("constraint_1_enabled");
let _ = fs::write(&enable_p, "1");
}
}
Ok(())
}
}
@@ -305,7 +334,5 @@ impl HardwareWatchdog for DellXps9380Sal {
}
impl Drop for DellXps9380Sal {
fn drop(&mut self) {
let _ = self.restore();
}
fn drop(&mut self) { }
}

148
src/sal/discovery.rs Normal file
View File

@@ -0,0 +1,148 @@
//! # Hardware Discovery Engine (Agent Sentinel)
//!
//! This module provides dynamic traversal of `/sys/class/hwmon` and `/sys/class/powercap`
//! to locate sensors and actuators without relying on hardcoded indices.
use anyhow::{Result, Context, anyhow};
use std::fs;
use std::path::{Path, PathBuf};
use tracing::{debug, info, warn};
/// Result of a successful hardware discovery.
///
/// Produced by `DiscoveryEngine::run`; every path is taken verbatim from the
/// directory traversal, so it lives underneath the `sysfs_root` that was scanned.
#[derive(Debug, Clone)]
pub struct DiscoveredHardware {
/// Path to the selected `temp*_input` file: the candidate with the highest
/// combined score (hwmon driver priority plus a bonus when the matching
/// `*_label` file mentions `Package` or `Tdie`).
pub temp_input: PathBuf,
/// Paths to all detected fan RPM inputs (`fan*_input` files).
pub fan_inputs: Vec<PathBuf>,
/// Paths to all detected fan PWM control nodes (`pwm*` files without an underscore).
pub pwm_controls: Vec<PathBuf>,
/// Paths to all detected fan PWM enable nodes (`pwm*_enable` files).
pub pwm_enables: Vec<PathBuf>,
/// Paths to the powercap entries whose `name` file contains `package` or
/// `intel-rapl`. These are the entries themselves (each expected to hold a
/// `name` file), not the individual power-limit constraint files inside them.
pub rapl_paths: Vec<PathBuf>,
}
/// Stateless entry point for sysfs hardware discovery.
///
/// Carries no data; the discovery routines are associated functions that take
/// the directory to scan as an argument.
pub struct DiscoveryEngine;
impl DiscoveryEngine {
/// Runs the complete discovery pass below `sysfs_root`.
///
/// Scans `sys/class/hwmon` for the temperature sensor and fan nodes, then
/// `sys/class/powercap` for RAPL entries, and bundles both results.
///
/// # Errors
/// Propagates any error returned by the hwmon or powercap scan.
pub fn run(sysfs_root: &Path) -> Result<DiscoveredHardware> {
    info!("Sentinel: Starting dynamic hardware discovery...");

    let (temp_input, fans) = Self::discover_hwmon(&sysfs_root.join("sys/class/hwmon"))?;
    let rapl_paths = Self::discover_rapl(&sysfs_root.join("sys/class/powercap"))?;

    // Capture the counts up front so the summary can be logged after the
    // vectors have been moved into the result.
    let fan_count = fans.rpm_inputs.len();
    let rapl_count = rapl_paths.len();

    let discovered = DiscoveredHardware {
        temp_input,
        fan_inputs: fans.rpm_inputs,
        pwm_controls: fans.pwm_controls,
        pwm_enables: fans.pwm_enables,
        rapl_paths,
    };

    info!("Sentinel: Discovery complete. Found {} fans and {} RAPL nodes.",
        fan_count, rapl_count);
    Ok(discovered)
}
/// Scans every hwmon node under `base` for a temperature sensor and fan nodes.
///
/// Each `temp*_input` file is scored as driver priority (`coretemp`/`zenpower`
/// 10, `k10temp` 9, `dell_smm` 8, `acpitz` 1, anything else 5) plus 2 when its
/// sibling `*_label` file mentions `Package` or `Tdie`; the highest score wins.
/// All `fan*_input`, `pwm*` and `pwm*_enable` files are collected along the way.
///
/// Nodes and their attributes are visited in sorted path order because
/// `fs::read_dir` yields entries in an unspecified order. Sorting makes the
/// returned vectors, and the winner among equally scored temperature sensors,
/// reproducible from run to run.
///
/// # Errors
/// Fails when `base` cannot be read or when no `temp*_input` file exists in
/// any node.
fn discover_hwmon(base: &Path) -> Result<(PathBuf, FanHardware)> {
    let mut best_temp: Option<(u32, PathBuf)> = None;
    let mut fans = FanHardware::default();

    let mut hwmon_nodes: Vec<PathBuf> = fs::read_dir(base)
        .with_context(|| format!("Failed to read hwmon base: {:?}", base))?
        .flatten()
        .map(|entry| entry.path())
        .collect();
    hwmon_nodes.sort();

    for path in hwmon_nodes {
        let driver_name = fs::read_to_string(path.join("name"))
            .map(|s| s.trim().to_string())
            .unwrap_or_else(|_| "unknown".to_string());
        debug!("Discovery: Probing hwmon node {:?} (driver: {})", path, driver_name);

        // Base score contributed by the driver that owns this node.
        let temp_priority: u32 = match driver_name.as_str() {
            "coretemp" | "zenpower" => 10,
            "k10temp" => 9,
            "dell_smm" => 8,
            "acpitz" => 1,
            _ => 5,
        };

        // An unreadable node is skipped, not treated as fatal.
        let mut attrs: Vec<PathBuf> = match fs::read_dir(&path) {
            Ok(dir) => dir.flatten().map(|entry| entry.path()).collect(),
            Err(_) => continue,
        };
        attrs.sort();

        for attr in attrs {
            let file_name = match attr.file_name() {
                Some(name) => name.to_string_lossy().into_owned(),
                None => continue,
            };

            // The four name patterns are mutually exclusive, so a single
            // chain classifies each attribute exactly once.
            if file_name.starts_with("temp") && file_name.ends_with("_input") {
                let label_path = path.join(file_name.replace("_input", "_label"));
                // A missing label simply earns no bonus.
                let label = fs::read_to_string(label_path).unwrap_or_default();
                let label_priority = if label.contains("Package") || label.contains("Tdie") {
                    2
                } else {
                    0
                };
                let total_priority = temp_priority + label_priority;

                // Strictly greater: on a tie the earlier (sorted) path is kept.
                let is_better = best_temp
                    .as_ref()
                    .map_or(true, |(best, _)| total_priority > *best);
                if is_better {
                    best_temp = Some((total_priority, attr));
                }
            } else if file_name.starts_with("fan") && file_name.ends_with("_input") {
                fans.rpm_inputs.push(attr);
            } else if file_name.starts_with("pwm") && !file_name.contains('_') {
                fans.pwm_controls.push(attr);
            } else if file_name.starts_with("pwm") && file_name.ends_with("_enable") {
                fans.pwm_enables.push(attr);
            }
        }
    }

    let temp_input = best_temp.map(|(_, p)| p)
        .ok_or_else(|| anyhow!("Failed to locate any valid temperature sensor in /sys/class/hwmon/"))?;
    Ok((temp_input, fans))
}
fn discover_rapl(base: &Path) -> Result<Vec<PathBuf>> {
let mut paths = Vec::new();
if !base.exists() {
warn!("Discovery: /sys/class/powercap does not exist.");
return Ok(paths);
}
let entries = fs::read_dir(base)?;
for entry in entries.flatten() {
let path = entry.path();
let name = fs::read_to_string(path.join("name")).unwrap_or_default().trim().to_string();
if name.contains("package") || name.contains("intel-rapl") {
paths.push(path);
}
}
Ok(paths)
}
}
/// Private accumulator for the fan-related nodes gathered during the hwmon scan.
///
/// `Default` yields three empty vectors; `discover_hwmon` fills them and `run`
/// moves them into `DiscoveredHardware`.
#[derive(Default)]
struct FanHardware {
// `fan*_input` files (RPM readings).
rpm_inputs: Vec<PathBuf>,
// `pwm*` files whose name contains no underscore.
pwm_controls: Vec<PathBuf>,
// `pwm*_enable` files.
pwm_enables: Vec<PathBuf>,
}

View File

@@ -1,11 +1,12 @@
use anyhow::{Result, anyhow};
use anyhow::{Result, anyhow, Context};
use std::path::{Path};
use std::fs;
use std::time::{Duration, Instant};
use std::sync::Mutex;
use std::sync::{Mutex, Arc};
use tracing::{debug, warn, info};
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
use crate::sal::heuristic::discovery::SystemFactSheet;
use crate::sal::heuristic::schema::HardwareDb;
@@ -13,14 +14,9 @@ pub struct GenericLinuxSal {
ctx: EnvironmentCtx,
fact_sheet: SystemFactSheet,
db: HardwareDb,
suppressed_services: Mutex<Vec<String>>,
last_valid_temp: Mutex<(f32, Instant)>,
current_pl1: Mutex<u64>,
last_energy: Mutex<(u64, Instant)>,
// --- Original State for Restoration ---
original_pl1: Mutex<Option<u64>>,
original_pl2: Mutex<Option<u64>>,
}
impl GenericLinuxSal {
@@ -33,14 +29,11 @@ impl GenericLinuxSal {
Self {
db,
suppressed_services: Mutex::new(Vec::new()),
last_valid_temp: Mutex::new((0.0, Instant::now())),
current_pl1: Mutex::new(15_000_000),
last_energy: Mutex::new((initial_energy, Instant::now())),
fact_sheet: facts,
ctx,
original_pl1: Mutex::new(None),
original_pl2: Mutex::new(None),
}
}
@@ -135,7 +128,6 @@ impl SensorBus for GenericLinuxSal {
}
fn get_throttling_status(&self) -> Result<bool> {
// Fallback: check if any cooling device is active (cur_state > 0)
let cooling_base = self.ctx.sysfs_base.join("sys/class/thermal");
if let Ok(entries) = fs::read_dir(cooling_base) {
for entry in entries.flatten() {
@@ -168,68 +160,37 @@ impl ActuatorBus for GenericLinuxSal {
} else { Ok(()) }
}
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> {
Ok(())
}
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?;
*self.current_pl1.lock().unwrap() = limit.as_u64();
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
for rapl_path in &self.fact_sheet.rapl_paths {
let limit_path = rapl_path.join("constraint_0_power_limit_uw");
let enable_path = rapl_path.join("constraint_0_enabled");
fs::write(&limit_path, limit.as_microwatts().to_string())
.with_context(|| format!("Failed to write PL1 to {:?}", limit_path))?;
let _ = fs::write(&enable_path, "1");
}
*self.current_pl1.lock().unwrap() = limit.as_microwatts();
Ok(())
}
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?;
fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?;
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
for rapl_path in &self.fact_sheet.rapl_paths {
let limit_path = rapl_path.join("constraint_1_power_limit_uw");
let enable_path = rapl_path.join("constraint_1_enabled");
fs::write(&limit_path, limit.as_microwatts().to_string())
.with_context(|| format!("Failed to write PL2 to {:?}", limit_path))?;
let _ = fs::write(&enable_path, "1");
}
Ok(())
}
}
impl EnvironmentGuard for GenericLinuxSal {
fn suppress(&self) -> Result<()> {
// Snapshot Power Limits
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) {
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
}
if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) {
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
}
}
let mut suppressed = self.suppressed_services.lock().unwrap();
for conflict_id in &self.fact_sheet.active_conflicts {
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
for service in &conflict.services {
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() {
let _ = self.ctx.runner.run("systemctl", &["stop", service]);
suppressed.push(service.clone());
}
}
}
}
Ok(())
}
fn restore(&self) -> Result<()> {
// Restore Power Limits
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string());
}
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string());
}
}
let mut suppressed = self.suppressed_services.lock().unwrap();
for service in suppressed.drain(..) {
let _ = self.ctx.runner.run("systemctl", &["start", &service]);
}
if self.is_dell() { let _ = self.set_fan_mode("auto"); }
Ok(())
}
fn suppress(&self) -> Result<()> { Ok(()) }
fn restore(&self) -> Result<()> { Ok(()) }
}
impl HardwareWatchdog for GenericLinuxSal {
@@ -245,7 +206,3 @@ impl HardwareWatchdog for GenericLinuxSal {
Ok(SafetyStatus::Nominal)
}
}
impl Drop for GenericLinuxSal {
fn drop(&mut self) { let _ = self.restore(); }
}

View File

@@ -6,7 +6,7 @@ use std::sync::mpsc;
use std::collections::HashMap;
use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking};
use crate::sys::SyscallRunner;
use tracing::{debug, warn};
use tracing::{debug, warn, info};
/// Registry of dynamically discovered paths for configs and tools.
#[derive(Debug, Clone, Default)]
@@ -24,6 +24,7 @@ pub struct SystemFactSheet {
pub fan_paths: Vec<PathBuf>,
pub rapl_paths: Vec<PathBuf>,
pub active_conflicts: Vec<String>,
pub conflict_services: Vec<String>,
pub paths: PathRegistry,
pub bench_config: Option<Benchmarking>,
}
@@ -44,12 +45,17 @@ pub fn discover_facts(
let rapl_paths = discover_rapl(base_path, &discovery.actuators);
let mut active_conflicts = Vec::new();
let mut conflict_services = Vec::new();
for conflict in conflicts {
let mut found_active = false;
for service in &conflict.services {
if is_service_active(runner, service) {
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
active_conflicts.push(conflict.id.clone());
break;
if !found_active {
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
active_conflicts.push(conflict.id.clone());
found_active = true;
}
conflict_services.push(service.clone());
}
}
}
@@ -57,13 +63,7 @@ pub fn discover_facts(
let paths = discover_paths(base_path, discovery);
SystemFactSheet {
vendor,
model,
temp_path,
fan_paths,
rapl_paths,
active_conflicts,
paths,
vendor, model, temp_path, fan_paths, rapl_paths, active_conflicts, conflict_services, paths,
bench_config: Some(bench_config),
}
}
@@ -71,7 +71,6 @@ pub fn discover_facts(
fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
let mut registry = PathRegistry::default();
// 1. Discover Tools via PATH
for (id, binary_name) in &discovery.tools {
if let Ok(path) = which::which(binary_name) {
debug!("Discovered tool: {} -> {:?}", id, path);
@@ -79,7 +78,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
}
}
// 2. Discover Configs via existence check
for (id, candidates) in &discovery.configs {
for candidate in candidates {
let path = if candidate.starts_with('/') {
@@ -104,12 +102,11 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
registry
}
/// Reads DMI information from sysfs with a safety timeout.
fn read_dmi_info(base_path: &Path) -> (String, String) {
let vendor = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/sys_vendor"), Duration::from_millis(100))
.unwrap_or_else(|| "Unknown".to_string());
let model = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/product_name"), Duration::from_millis(100))
.unwrap_or_else(|| "Unknown".to_string());
let vendor = fs::read_to_string(base_path.join("sys/class/dmi/id/sys_vendor"))
.map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string());
let model = fs::read_to_string(base_path.join("sys/class/dmi/id/product_name"))
.map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string());
(vendor, model)
}
@@ -119,49 +116,62 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
let mut fan_candidates = Vec::new();
let hwmon_base = base_path.join("sys/class/hwmon");
let entries = match fs::read_dir(&hwmon_base) {
Ok(e) => e,
Err(e) => {
warn!("Could not read {:?}: {}", hwmon_base, e);
return (None, Vec::new());
}
};
let entries = fs::read_dir(&hwmon_base).map_err(|e| {
warn!("Could not read {:?}: {}", hwmon_base, e);
e
}).ok();
for entry in entries.flatten() {
let hwmon_path = entry.path();
let driver_name = read_sysfs_with_timeout(&hwmon_path.join("name"), Duration::from_millis(100))
.unwrap_or_default();
if let Some(entries) = entries {
for entry in entries.flatten() {
let hwmon_path = entry.path();
// # SAFETY: Read driver name directly. This file is virtual and never blocks.
// Using a timeout wrapper here was causing discovery to fail if the thread-pool lagged.
let driver_name = fs::read_to_string(hwmon_path.join("name"))
.map(|s| s.trim().to_string()).unwrap_or_default();
let priority = cfg.hwmon_priority
.iter()
.position(|p| p == &driver_name)
.unwrap_or(usize::MAX);
let priority = cfg.hwmon_priority
.iter()
.position(|p| driver_name.contains(p))
.unwrap_or(usize::MAX);
if let Ok(hw_entries) = fs::read_dir(&hwmon_path) {
for hw_entry in hw_entries.flatten() {
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
if file_name.starts_with("temp") && file_name.ends_with("_label") {
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
if input_path.exists() {
temp_candidates.push((priority, input_path));
if let Ok(hw_entries) = fs::read_dir(&hwmon_path) {
for hw_entry in hw_entries.flatten() {
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
// 1. Temperatures
if file_name.starts_with("temp") && file_name.ends_with("_label") {
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) {
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
if input_path.exists() {
temp_candidates.push((priority, input_path));
}
}
}
}
}
if file_name.starts_with("fan") && file_name.ends_with("_label") {
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
if input_path.exists() {
fan_candidates.push((priority, input_path));
// 2. Fans (Label Match)
if file_name.starts_with("fan") && file_name.ends_with("_label") {
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) {
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
if input_path.exists() {
debug!("Discovered fan by label: {:?} (priority {})", input_path, priority);
fan_candidates.push((priority, input_path));
}
}
}
}
// 3. Fans (Priority Fallback - CRITICAL FOR DELL 9380)
// If we found a priority driver (e.g., dell_smm), we take every fan*_input we find.
if priority < usize::MAX && file_name.starts_with("fan") && file_name.ends_with("_input") {
if !fan_candidates.iter().any(|(_, p)| p == &hw_entry.path()) {
info!("Heuristic Discovery: Force-adding unlabeled fan sensor from priority driver '{}': {:?}", driver_name, hw_entry.path());
fan_candidates.push((priority, hw_entry.path()));
}
}
}
}
}
@@ -171,45 +181,45 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
fan_candidates.sort_by_key(|(p, _)| *p);
let best_temp = temp_candidates.first().map(|(_, p)| p.clone());
let best_fans = fan_candidates.into_iter().map(|(_, p)| p).collect();
let best_fans: Vec<PathBuf> = fan_candidates.into_iter().map(|(_, p)| p).collect();
if best_fans.is_empty() {
warn!("Heuristic Discovery: No fan RPM sensors found.");
} else {
info!("Heuristic Discovery: Final registry contains {} fan sensors.", best_fans.len());
}
(best_temp, best_fans)
}
/// Discovers RAPL powercap paths.
fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec<PathBuf> {
let mut paths = Vec::new();
let powercap_base = base_path.join("sys/class/powercap");
let entries = match fs::read_dir(&powercap_base) {
Ok(e) => e,
Err(_) => return Vec::new(),
};
for entry in entries.flatten() {
let path = entry.path();
let dir_name = entry.file_name().into_string().unwrap_or_default();
if cfg.rapl_paths.contains(&dir_name) {
paths.push(path);
continue;
}
if let Some(name) = read_sysfs_with_timeout(&path.join("name"), Duration::from_millis(100)) {
if cfg.rapl_paths.iter().any(|p| p == &name) {
if let Ok(entries) = fs::read_dir(&powercap_base) {
for entry in entries.flatten() {
let path = entry.path();
let dir_name = entry.file_name().into_string().unwrap_or_default();
if cfg.rapl_paths.contains(&dir_name) {
paths.push(path);
continue;
}
if let Ok(name) = fs::read_to_string(path.join("name")) {
if cfg.rapl_paths.iter().any(|p| p == name.trim()) {
paths.push(path);
}
}
}
}
paths
}
/// Checks if a systemd service is currently active using the injected runner.
pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool {
runner.run("systemctl", &["is-active", "--quiet", service]).is_ok()
}
/// Helper to read a sysfs file with a timeout.
fn read_sysfs_with_timeout(path: &Path, timeout: Duration) -> Option<String> {
let (tx, rx) = mpsc::channel();
let path_buf = path.to_path_buf();

View File

@@ -1,6 +1,7 @@
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
use anyhow::Result;
use std::sync::Arc;
pub struct MockSal {
pub temperature_sequence: std::sync::atomic::AtomicUsize,
@@ -17,65 +18,36 @@ impl MockSal {
impl PreflightAuditor for MockSal {
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
let steps = vec![
AuditStep {
description: "Mock Root Privileges".to_string(),
outcome: Ok(()),
},
AuditStep {
description: "Mock AC Power Status".to_string(),
outcome: Ok(()),
},
AuditStep { description: "Mock Root Privileges".to_string(), outcome: Ok(()) },
AuditStep { description: "Mock AC Power Status".to_string(), outcome: Ok(()) },
];
Box::new(steps.into_iter())
}
}
impl EnvironmentGuard for MockSal {
fn suppress(&self) -> Result<()> {
Ok(())
}
fn restore(&self) -> Result<()> {
Ok(())
}
fn suppress(&self) -> Result<()> { Ok(()) }
fn restore(&self) -> Result<()> { Ok(()) }
}
impl SensorBus for MockSal {
fn get_temp(&self) -> Result<f32> {
// Support dynamic sequence for Step 5
let seq = self.temperature_sequence.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
Ok(40.0 + (seq as f32 * 0.5).min(50.0)) // Heats up from 40 to 90
}
fn get_power_w(&self) -> Result<f32> {
Ok(15.0)
}
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
Ok(vec![2500])
}
fn get_freq_mhz(&self) -> Result<f32> {
Ok(3200.0)
}
fn get_throttling_status(&self) -> Result<bool> {
Ok(self.get_temp()? > 90.0)
Ok(40.0 + (seq as f32 * 0.5).min(55.0))
}
fn get_power_w(&self) -> Result<f32> { Ok(15.0) }
fn get_fan_rpms(&self) -> Result<Vec<u32>> { Ok(vec![2500, 2400]) }
fn get_freq_mhz(&self) -> Result<f32> { Ok(3200.0) }
fn get_throttling_status(&self) -> Result<bool> { Ok(false) }
}
impl ActuatorBus for MockSal {
fn set_fan_mode(&self, _mode: &str) -> Result<()> {
Ok(())
}
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
Ok(())
}
fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
Ok(())
}
fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
Ok(())
}
fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) }
fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) }
fn set_sustained_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) }
fn set_burst_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) }
}
impl HardwareWatchdog for MockSal {
fn get_safety_status(&self) -> Result<SafetyStatus> {
Ok(SafetyStatus::Nominal)
}
fn get_safety_status(&self) -> Result<SafetyStatus> { Ok(SafetyStatus::Nominal) }
}

View File

@@ -4,3 +4,4 @@ pub mod dell_xps_9380;
pub mod generic_linux;
pub mod heuristic;
pub mod safety;
pub mod discovery;

View File

@@ -8,68 +8,81 @@ use anyhow::{Result, bail, Context};
use std::collections::HashMap;
use std::fs;
use std::path::{PathBuf};
use tracing::{info, warn, error};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};
use std::thread;
use tracing::{info, warn, error, debug};
use crate::sal::traits::SensorBus;
// --- 1. Type-Driven Bounds Checking ---
/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct TdpLimitMicroWatts(u64);
/// Represents a validated TDP limit in Watts.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub struct PowerLimitWatts(f32);
impl TdpLimitMicroWatts {
/// # SAFETY:
/// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability.
pub const MIN_SAFE_UW: u64 = 5_000_000;
/// # SAFETY:
/// Values above 80W can exceed the thermal and electrical design limits of XPS chassis.
pub const MAX_SAFE_UW: u64 = 80_000_000;
impl PowerLimitWatts {
/// Absolute safety floor. Setting TDP below 3W can induce system-wide
/// CPU stalls and I/O deadlocks on certain Intel mobile chipsets.
pub const MIN: f32 = 3.0;
/// Safety ceiling for mobile thin-and-light chassis.
pub const MAX: f32 = 100.0;
/// Validates and constructs a new TDP limit.
pub fn new(microwatts: u64) -> Result<Self> {
if microwatts < Self::MIN_SAFE_UW {
bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts);
/// Validates and constructs a new PowerLimitWatts.
pub fn try_new(watts: f32) -> Result<Self> {
if watts < Self::MIN || watts > Self::MAX {
bail!("HardwareSafetyError: Requested TDP {:.1}W is outside safe bounds ({:.1}W - {:.1}W).", watts, Self::MIN, Self::MAX);
}
if microwatts > Self::MAX_SAFE_UW {
bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts);
}
Ok(Self(microwatts))
Ok(Self(watts))
}
pub fn from_watts(watts: f32) -> Result<Self> {
Self::new((watts * 1_000_000.0) as u64)
Self::try_new(watts)
}
pub fn as_u64(&self) -> u64 { self.0 }
pub fn get(&self) -> f32 { self.0 }
pub fn as_microwatts(&self) -> u64 { (self.0 * 1_000_000.0) as u64 }
}
/// Represents a fan speed percentage (0-100%).
/// Represents a validated fan speed percentage.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FanSpeedPercentage(u8);
pub struct FanSpeedPercent(u8);
impl FanSpeedPercentage {
pub fn new(percent: u8) -> Result<Self> {
impl FanSpeedPercent {
pub fn try_new(percent: u8) -> Result<Self> {
if percent > 100 {
bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent);
}
Ok(Self(percent))
}
pub fn as_u8(&self) -> u8 { self.0 }
pub fn new(percent: u8) -> Result<Self> {
Self::try_new(percent)
}
pub fn get(&self) -> u8 { self.0 }
}
/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C).
/// Represents a thermal threshold in Celsius.
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub struct ThermalThresholdCelsius(f32);
impl ThermalThresholdCelsius {
pub const MAX_SAFE_C: f32 = 98.0;
pub fn new(celsius: f32) -> Result<Self> {
pub fn try_new(celsius: f32) -> Result<Self> {
if celsius > Self::MAX_SAFE_C {
bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius);
bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit ({}C).", celsius, Self::MAX_SAFE_C);
}
Ok(Self(celsius))
}
pub fn as_f32(&self) -> f32 { self.0 }
pub fn new(celsius: f32) -> Result<Self> {
Self::try_new(celsius)
}
pub fn get(&self) -> f32 { self.0 }
}
// --- 2. The HardwareStateGuard (RAII Restorer) ---
@@ -78,6 +91,7 @@ impl ThermalThresholdCelsius {
pub type RollbackAction = Box<dyn FnOnce() + Send + 'static>;
/// Holds a snapshot of the system state. Restores everything on Drop.
/// This is the primary safety mechanism for Project Iron-Ember.
pub struct HardwareStateGuard {
/// Maps sysfs paths to their original string contents.
snapshots: HashMap<PathBuf, String>,
@@ -90,6 +104,9 @@ pub struct HardwareStateGuard {
impl HardwareStateGuard {
/// Snapshots the requested files and neutralizes competing services.
///
/// # SAFETY:
/// This MUST be acquired before any hardware mutation occurs.
pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result<Self> {
let mut snapshots = HashMap::new();
let mut suppressed = Vec::new();
@@ -101,10 +118,13 @@ impl HardwareStateGuard {
let content = fs::read_to_string(path)
.with_context(|| format!("Failed to snapshot {:?}", path))?;
snapshots.insert(path.clone(), content.trim().to_string());
} else {
debug!("USA: Skipping snapshot for non-existent path {:?}", path);
}
}
for svc in target_services {
// Check if service is active before stopping
let status = std::process::Command::new("systemctl")
.args(["is-active", "--quiet", svc])
.status();
@@ -168,7 +188,75 @@ impl Drop for HardwareStateGuard {
}
}
// --- 3. Transactional Configuration ---
// --- 3. The Active Watchdog ---
/// A standalone monitor that polls hardware thermals at high frequency.
///
/// The monitor runs on its own thread (see `spawn`). Dropping the value
/// sets the cancel token and joins that thread.
pub struct ThermalWatchdog {
    /// Flag shared with the watchdog thread. The thread exits when it reads
    /// `true`, and sets it to `true` itself on a critical temperature or a
    /// sensor read failure. `Drop` also sets it to `true`.
    cancel_token: Arc<AtomicBool>,
    /// Join handle of the watchdog thread; wrapped in `Option` so `Drop` can
    /// take ownership of it and join.
    handle: Option<thread::JoinHandle<()>>,
}
impl ThermalWatchdog {
/// If temperature exceeds this ceiling, the watchdog triggers an emergency shutdown.
pub const CRITICAL_TEMP: f32 = 95.0;
/// High polling rate ensures we catch runaways before chassis saturation.
pub const POLL_INTERVAL: Duration = Duration::from_millis(250);
/// Spawns the watchdog thread.
pub fn spawn(sensors: Arc<dyn SensorBus>, cancel_token: Arc<AtomicBool>) -> Self {
let ct = cancel_token.clone();
let handle = thread::spawn(move || {
let mut last_temp = 0.0;
loop {
if ct.load(Ordering::SeqCst) {
debug!("Watchdog: Shutdown signal received.");
break;
}
match sensors.get_temp() {
Ok(temp) => {
// Rate of change check (dT/dt)
let dt_dt = temp - last_temp;
if temp >= Self::CRITICAL_TEMP {
error!("WATCHDOG: CRITICAL THERMAL EVENT ({:.1}C). Triggering emergency abort!", temp);
ct.store(true, Ordering::SeqCst);
break;
}
if dt_dt > 5.0 && temp > 85.0 {
warn!("WATCHDOG: Dangerous thermal ramp detected (+{:.1}C in 250ms).", dt_dt);
}
last_temp = temp;
}
Err(e) => {
error!("WATCHDOG: Sensor read failure: {}. Aborting for safety!", e);
ct.store(true, Ordering::SeqCst);
break;
}
}
thread::sleep(Self::POLL_INTERVAL);
}
});
Self {
cancel_token,
handle: Some(handle),
}
}
}
impl Drop for ThermalWatchdog {
    /// Signals the watchdog thread to stop and waits for it to finish.
    fn drop(&mut self) {
        // Raise the flag first so the polling loop exits on its next check.
        self.cancel_token.store(true, Ordering::SeqCst);

        // Nothing to join if the handle has already been taken.
        let Some(worker) = self.handle.take() else {
            return;
        };
        // The join result (including a panic in the worker) is deliberately discarded.
        let _ = worker.join();
    }
}
// --- 4. Transactional Configuration ---
/// A staged set of changes to be applied to the hardware.
#[derive(Default)]

View File

@@ -115,30 +115,20 @@ impl<T: EnvironmentGuard + ?Sized> EnvironmentGuard for Arc<T> {
}
}
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
/// Provides a read-only interface to system telemetry sensors.
pub trait SensorBus: Send + Sync {
/// Returns the current package temperature in degrees Celsius.
///
/// # Errors
/// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read.
fn get_temp(&self) -> Result<f32>;
/// Returns the current package power consumption in Watts.
///
/// # Errors
/// Returns an error if the underlying RAPL or power sensor cannot be read.
fn get_power_w(&self) -> Result<f32>;
/// Returns the current speed of all detected fans in RPM.
///
/// # Errors
/// Returns an error if the fan sensor nodes cannot be read.
fn get_fan_rpms(&self) -> Result<Vec<u32>>;
/// Returns the current average CPU frequency in MHz.
///
/// # Errors
/// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read.
fn get_freq_mhz(&self) -> Result<f32>;
/// Returns true if the system is currently thermally throttling.
@@ -146,53 +136,33 @@ pub trait SensorBus: Send + Sync {
}
impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
fn get_temp(&self) -> Result<f32> {
(**self).get_temp()
}
fn get_power_w(&self) -> Result<f32> {
(**self).get_power_w()
}
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
(**self).get_fan_rpms()
}
fn get_freq_mhz(&self) -> Result<f32> {
(**self).get_freq_mhz()
}
fn get_throttling_status(&self) -> Result<bool> {
(**self).get_throttling_status()
}
fn get_temp(&self) -> Result<f32> { (**self).get_temp() }
fn get_power_w(&self) -> Result<f32> { (**self).get_power_w() }
fn get_fan_rpms(&self) -> Result<Vec<u32>> { (**self).get_fan_rpms() }
fn get_freq_mhz(&self) -> Result<f32> { (**self).get_freq_mhz() }
fn get_throttling_status(&self) -> Result<bool> { (**self).get_throttling_status() }
}
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
/// Provides a write-only interface for hardware actuators.
pub trait ActuatorBus: Send + Sync {
/// Sets the fan control mode (e.g., "auto" or "max").
fn set_fan_mode(&self, mode: &str) -> Result<()>;
/// Sets the fan speed directly using a validated percentage.
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>;
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()>;
/// Sets the sustained power limit (PL1) using a validated wrapper.
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()>;
/// Sets the burst power limit (PL2) using a validated wrapper.
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()>;
}
impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
fn set_fan_mode(&self, mode: &str) -> Result<()> {
(**self).set_fan_mode(mode)
}
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> {
(**self).set_fan_speed(speed)
}
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
(**self).set_sustained_power_limit(limit)
}
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
(**self).set_burst_power_limit(limit)
}
fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) }
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { (**self).set_fan_speed(speed) }
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) }
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_burst_power_limit(limit) }
}
/// Represents the high-level safety status of the system.