diff --git a/Cargo.toml b/Cargo.toml index 9a37bc6..c584f25 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ember-tune-rs" -version = "1.1.0" +version = "1.2.0" edition = "2024" authors = ["Nils Pukropp "] readme = "README.md" diff --git a/README.md b/README.md new file mode 100644 index 0000000..eeb4b38 --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +## ⚙️ Development Setup + +`ember-tune` is a standard Cargo project. You will need a recent Rust toolchain and common build utilities. + +**Prerequisites:** +- `rustup` +- `build-essential` (or equivalent for your distribution) +- `libudev-dev` + +```bash +# 1. Clone the repository +git clone https://gitea.com/narl/ember-tune.git +cd ember-tune + +# 2. Build the release binary +cargo build --release + +# 3. Run the test suite (safe, uses a virtual environment) +# This requires no special permissions and does not touch your hardware. +cargo test +``` + +**Running:** +Due to its direct hardware access, `ember-tune` requires root privileges. + +```bash +# Run a full benchmark and generate optimized configs +sudo ./target/release/ember-tune + +# Run a mock benchmark for UI/logic testing +sudo ./target/release/ember-tune --mock +``` + +--- + +## 🤝 Contributing Quirk Data (`hardware_db.toml`) + +**This is the most impactful way to contribute.** `ember-tune`'s strength comes from its `assets/hardware_db.toml`, which encodes community knowledge about how to manage specific laptops. If your hardware isn't working perfectly, you can likely fix it by adding a new entry here. + +The database is composed of four key sections: `conflicts`, `ecosystems`, `quirks`, and `discovery`. + +### A. Reporting a Service Conflict +If a background service on your system interferes with `ember-tune`, add it to `[[conflicts]]`. + +**Example:** Adding `laptop-mode-tools`. 
+```toml +[[conflicts]] +id = "laptop_mode_conflict" +services = ["laptop-mode.service"] +contention = "Multiple - I/O schedulers, Power limits" +severity = "Medium" +fix_action = "SuspendService" # Orchestrator will stop/start this service +help_text = "laptop-mode-tools can override power-related sysfs settings." +``` + +### B. Adding a New Hardware Ecosystem +If your laptop manufacturer (e.g., Razer) has a unique fan control tool or ACPI platform profile path, define it in `[ecosystems]`. + +**Example:** A hypothetical "Razer" ecosystem. +```toml +[ecosystems.razer] +vendor_regex = "Razer" +# Path to the sysfs node that controls performance profiles +profiles_path = "/sys/bus/platform/drivers/razer_acpi/power_mode" +# Map human-readable names to the values the driver expects +policy_map = { Balanced = 0, Boost = 1, Silent = 2 } +``` + +### C. Defining a Model-Specific Quirk +If a specific laptop model has a bug (like a stuck sensor or incorrect fan reporting), define a `[[quirks]]` entry. + +**Example:** A laptop whose fans report 0 RPM even when spinning. +```toml +[[quirks]] +model_regex = "HP Envy 15-ep.*" +id = "hp_fan_stuck_sensor" +issue = "Fan sensor reports 0 RPM when active." +# The 'action' tells the SAL to use a different method for fan detection. +action = "UseThermalVelocityFallback" +``` + +After adding your changes, run the test suite and then submit a Pull Request! diff --git a/src/cli.rs b/src/cli.rs index dcc3e5b..b64a168 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,3 +1,8 @@ +//! Defines the command-line interface for `ember-tune`. +//! +//! This module uses the `clap` crate to define the CLI arguments, subcommands, +//! and help text. 
+ use clap::{Parser, builder::styling}; use std::path::PathBuf; @@ -7,27 +12,28 @@ const STYLES: styling::Styles = styling::Styles::styled() .literal(styling::AnsiColor::Cyan.on_default().bold()) .placeholder(styling::AnsiColor::Cyan.on_default()); +/// Scientifically-driven hardware power and thermal optimizer. #[derive(Parser, Debug)] #[command( name = "ember-tune", author = "Nils Pukropp ", - version = "1.0.0", - about = "ember-tune: Scientifically-driven hardware power and thermal optimizer.", - long_about = "ember-tune transforms manual laptop tuning into a rigorous, automated engineering workflow. \nIt executes a state machine to find the 'Physical Sweet Spot' of your specific hardware by measuring \nthe Silicon Knee, Thermal Resistance (Rθ), and Thermal Inertia, then outputs optimal \nconfigurations for tools like 'throttled' or 'ryzenadj'.", + version = "1.1.0", + about = "ember-tune: A physically-grounded thermal and power optimizer for Linux.", + long_about = "ember-tune transforms manual laptop tuning into a rigorous, automated engineering workflow. \nIt executes a state machine to find the 'Physical Sweet Spot' of your specific hardware by measuring \nthe Silicon Knee, Thermal Resistance (Rθ), and Thermal Inertia, then outputs optimal \nconfigurations for tools like 'throttled' or 'i8kmon'.", styles = STYLES, - after_help = "EXAMPLES:\n sudo ember-tune run # Run standard optimization\n sudo ember-tune run --dry-run # Audit and simulate without changes\n sudo ember-tune run --mock # Safe demo with fake hardware" + after_help = "EXAMPLES:\n sudo ember-tune # Run standard optimization\n sudo ember-tune --audit-only # Validate system requirements only\n sudo ember-tune --mock # Safe demo with fake hardware" )] pub struct Cli { - /// Path to output the optimized configuration file + /// Path to output the final `throttled.conf` file. #[arg( short, long, - default_value = "throttled.conf", - help = "Destination for the generated configuration file (e.g. 
/etc/throttled.conf)" + value_name = "THROTTLED_PATH", + help = "Optional: Overrides the discovered or default path for throttled.conf." )] - pub config_out: PathBuf, + pub config_out: Option, - /// Maximum safe temperature (Celsius) for the benchmark + /// Maximum safe temperature (Celsius) for the benchmark. #[arg( short, long, @@ -36,7 +42,7 @@ pub struct Cli { )] pub max_temp: f32, - /// Enable verbose debug logging + /// Enable verbose debug logging. #[arg( short, long, @@ -44,17 +50,17 @@ pub struct Cli { )] pub verbose: bool, - /// Use a mock hardware layer for safe testing + /// Use a mock hardware layer for safe testing. #[arg( long, help = "Emulates hardware responses. Ideal for testing UI/Logic on unsupported systems." )] pub mock: bool, - /// Run pre-flight audit only + /// Run pre-flight audit only, then exit. #[arg( long, - help = "Validate system requirements and conflict management without starting the benchmark." + help = "Validate system requirements and conflicts without starting the benchmark." )] pub audit_only: bool, } diff --git a/src/engine/mod.rs b/src/engine/mod.rs index 99d094f..42dabad 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -1,7 +1,16 @@ +//! The core mathematics and physics engine for `ember-tune`. +//! +//! This module contains the `OptimizerEngine`, which is responsible for all +//! data smoothing, thermal resistance calculations, and the heuristic scoring +//! used to identify the "Silicon Knee". + use serde::{Serialize, Deserialize}; +use std::collections::HashMap; +use std::path::PathBuf; pub mod formatters; +/// A single, atomic data point captured during the benchmark. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ThermalPoint { pub power_w: f32, @@ -11,34 +20,53 @@ pub struct ThermalPoint { pub throughput: f64, } +/// A complete thermal profile containing all data points for a benchmark run. 
#[derive(Debug, Default, Serialize, Deserialize, Clone)] pub struct ThermalProfile { pub points: Vec, pub ambient_temp: f32, } +/// The final, recommended parameters derived from the thermal benchmark. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct OptimizationResult { + /// The full thermal profile used for calculations. pub profile: ThermalProfile, + /// The power level (in Watts) where performance-per-watt plateaus. pub silicon_knee_watts: f32, + /// The measured thermal resistance of the system (Kelvin/Watt). pub thermal_resistance_kw: f32, + /// The recommended sustained power limit (PL1). pub recommended_pl1: f32, + /// The recommended burst power limit (PL2). pub recommended_pl2: f32, + /// The maximum temperature reached during the test. pub max_temp_c: f32, + /// Indicates if the benchmark was aborted before completion. pub is_partial: bool, - pub config_paths: std::collections::HashMap, + /// A map of configuration files that were written to. + pub config_paths: HashMap, } +/// Pure mathematics engine for thermal optimization. +/// +/// Contains no hardware I/O and operates solely on the collected [ThermalProfile]. pub struct OptimizerEngine { + /// The size of the sliding window for the `smooth` function. window_size: usize, } impl OptimizerEngine { + /// Creates a new `OptimizerEngine`. pub fn new(window_size: usize) -> Self { Self { window_size } } /// Applies a simple moving average (SMA) filter with outlier rejection. + /// + /// This function smooths noisy sensor data. It rejects any value in the + /// window that is more than 20.0 units away from the window's average + /// before calculating the final smoothed value. 
pub fn smooth(&self, data: &[f32]) -> Vec { if data.is_empty() { return vec![]; } let mut smoothed = Vec::with_capacity(data.len()); @@ -47,7 +75,6 @@ impl OptimizerEngine { let start = if i < self.window_size { 0 } else { i - self.window_size + 1 }; let end = i + 1; - // Outlier rejection: only average values within a reasonable range let window = &data[start..end]; let avg: f32 = window.iter().sum::() / window.len() as f32; let filtered: Vec = window.iter() @@ -63,7 +90,10 @@ impl OptimizerEngine { smoothed } - /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package + /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package. + /// + /// This function uses the data point with the highest power draw to ensure + /// the calculation reflects a system under maximum thermal load. pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 { profile.points.iter() .filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data @@ -72,6 +102,7 @@ impl OptimizerEngine { .unwrap_or(0.0) } + /// Returns the maximum temperature recorded in the profile. pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 { profile.points.iter() .map(|p| p.temp_c) @@ -79,8 +110,16 @@ impl OptimizerEngine { .unwrap_or(0.0) } - /// Finds the "Silicon Knee" - the point where performance per watt (efficiency) + /// Finds the "Silicon Knee" - the point where performance-per-watt (efficiency) /// starts to diminish significantly and thermal density spikes. + /// + /// This heuristic scoring model balances several factors: + /// 1. **Efficiency Drop:** How quickly does performance-per-watt decrease as power increases? + /// 2. **Thermal Acceleration:** How quickly does temperature rise per additional Watt? + /// 3. **Throttling Penalty:** A large penalty is applied if absolute performance drops, indicating a thermal wall. 
+ /// + /// The "Knee" is the power level with the highest score, representing the optimal + /// balance before thermal saturation causes diminishing returns. pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 { let valid_points: Vec<_> = profile.points.iter() .filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise @@ -103,8 +142,7 @@ impl OptimizerEngine { let curr = &points[i]; let next = &points[i + 1]; - // 1. Efficiency Metric (Throughput per Watt) - // If throughput is 0 (unsupported), fallback to Frequency per Watt + // 1. Efficiency Metric (Throughput per Watt or Freq per Watt) let efficiency_curr = if curr.throughput > 0.0 { curr.throughput as f32 / curr.power_w.max(1.0) } else { @@ -117,7 +155,6 @@ impl OptimizerEngine { next.freq_mhz / next.power_w.max(1.0) }; - // Diminishing returns: how much efficiency drops per additional watt let p_delta = (next.power_w - curr.power_w).max(0.5); let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta; @@ -131,13 +168,10 @@ impl OptimizerEngine { let p_total_delta = (next.power_w - prev.power_w).max(1.0); let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta; - // 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall) + // 3. Wall Detection (Any drop in absolute performance is a hard wall) let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput); let penalty = if is_throttling { 5000.0 } else { 0.0 }; - // Heuristic scoring: - // - Higher score is "Better" (The Knee is the peak of this curve) - // - We want high efficiency (low drop) and low thermal acceleration. let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty; if score > max_score { diff --git a/src/lib.rs b/src/lib.rs index 07dcb24..0f4aa6a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,9 @@ +//! # ember-tune: A physically-grounded thermal and power optimizer for Linux. +//! +//! 
This crate provides the core library for `ember-tune`, a tool that +//! scientifically determines the optimal power and thermal settings for laptops +//! by measuring physical properties like Thermal Resistance and the "Silicon Knee". + pub mod mediator; pub mod sal; pub mod load; diff --git a/src/load/mod.rs b/src/load/mod.rs index bc80d98..501e5fd 100644 --- a/src/load/mod.rs +++ b/src/load/mod.rs @@ -1,14 +1,33 @@ +//! Defines the `Workload` trait for generating synthetic CPU/GPU load. + use anyhow::Result; use std::process::Child; use std::time::{Duration, Instant}; use std::thread; +/// A trait for objects that can generate a measurable system load. pub trait Workload: Send + Sync { + /// Starts the workload with the specified number of threads and load percentage. + /// + /// # Errors + /// Returns an error if the underlying stress test process fails to spawn. fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>; + + /// Stops the workload gracefully. + /// + /// # Errors + /// Implementations should avoid failing, but may return an error if + /// forcefully killing the child process fails. fn stop(&mut self) -> Result<()>; + + /// Returns the current throughput of the workload (e.g., ops/sec). + /// + /// # Errors + /// Returns an error if throughput cannot be measured. fn get_throughput(&self) -> Result; } +/// An implementation of `Workload` that uses the `stress-ng` utility. pub struct StressNg { child: Option, } @@ -37,7 +56,6 @@ impl Workload for StressNg { fn stop(&mut self) -> Result<()> { if let Some(mut child) = self.child.take() { - // Try SIGTERM first #[cfg(unix)] { use libc::{kill, SIGTERM}; @@ -62,6 +80,9 @@ impl Workload for StressNg { Ok(()) } + /// Returns the current throughput of the workload (e.g., ops/sec). + /// + /// This is currently a stub and does not parse `stress-ng` output. 
fn get_throughput(&self) -> Result { Ok(0.0) } diff --git a/src/mediator.rs b/src/mediator.rs index 5ca3950..a2d4266 100644 --- a/src/mediator.rs +++ b/src/mediator.rs @@ -1,5 +1,13 @@ -use serde::{Serialize, Deserialize}; +//! Defines the data structures used for communication between the frontend and backend. +//! +//! This module acts as the "Mediator" in the Mediator Pattern, providing the +//! message-passing interface for the MPSC channels that connect the TUI thread +//! with the `BenchmarkOrchestrator` thread. +use serde::{Serialize, Deserialize}; +use std::collections::HashMap; + +/// Defines the current high-level phase of the benchmark. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum BenchmarkPhase { Auditing, @@ -9,44 +17,41 @@ pub enum BenchmarkPhase { Finalizing, } -impl Default for BenchmarkPhase { - fn default() -> Self { - Self::Auditing - } -} - -#[derive(Debug, Clone)] +/// A complete snapshot of system telemetry at a single point in time. +/// This struct is sent from the backend to the frontend on every tick and with each log message. 
+#[derive(Debug, Clone, Serialize, Deserialize)] pub struct TelemetryState { - // --- Static Info --- + // --- Static System Info --- pub cpu_model: String, pub total_ram_gb: u64, - - // --- Dynamic States --- - pub tick: u64, - pub phase: BenchmarkPhase, - pub governor: String, - pub pl1_limit: f32, - pub pl2_limit: f32, - pub fan_tier: String, - // --- Instantaneous Metrics --- + // --- Dynamic Metrics --- + pub tick: u64, pub cpu_temp: f32, pub power_w: f32, pub current_freq: f32, pub fans: Vec, - - // --- High-res History (Last 60s @ 500ms = 120 points) --- + pub governor: String, + pub pl1_limit: f32, + pub pl2_limit: f32, + pub fan_tier: String, + pub phase: BenchmarkPhase, + + // --- High-res History --- pub history_watts: Vec, pub history_temp: Vec, pub history_mhz: Vec, + // --- Events & Metadata --- pub log_event: Option, - pub metadata: std::collections::HashMap, + pub metadata: HashMap, pub is_emergency: bool, pub emergency_reason: Option, } +/// Commands sent from the frontend (UI) to the backend (`BenchmarkOrchestrator`). #[derive(Debug, Clone)] pub enum UiCommand { + /// Signals the orchestrator to gracefully abort the benchmark. Abort, } diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index 41cdc49..ab46853 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -1,3 +1,8 @@ +//! The central state machine responsible for coordinating the thermal benchmark. +//! +//! It manages hardware interactions through the [PlatformSal], generates stress +//! using a [Workload], and feeds telemetry to the frontend via MPSC channels. + use anyhow::{Result, Context}; use std::sync::mpsc; use std::time::{Duration, Instant}; @@ -14,31 +19,48 @@ use crate::load::Workload; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; +/// The central state machine responsible for coordinating the thermal benchmark. 
+/// +/// It manages hardware interactions through the [PlatformSal], generates stress +/// using a [Workload], and feeds telemetry to the frontend via MPSC channels. pub struct BenchmarkOrchestrator { + /// Injected hardware abstraction layer. sal: Arc, + /// Discovered system facts and paths. facts: SystemFactSheet, + /// Heat generation workload. workload: Box, + /// Channel for sending telemetry updates to the UI. telemetry_tx: mpsc::Sender, + /// Channel for receiving commands from the UI. command_rx: mpsc::Receiver, + /// Current phase of the benchmark. phase: BenchmarkPhase, + /// Accumulated thermal data points. profile: ThermalProfile, + /// Mathematics engine for data smoothing and optimization. engine: OptimizerEngine, - // --- History Buffers (120 points for 60s @ 500ms) --- + /// Sliding window of power readings (Watts). history_watts: VecDeque, + /// Sliding window of temperature readings (Celsius). history_temp: VecDeque, + /// Sliding window of CPU frequency (MHz). history_mhz: VecDeque, - // --- Static Info --- + /// Detected CPU model string. cpu_model: String, + /// Total system RAM in Gigabytes. total_ram_gb: u64, - // --- Safety --- + /// Atomic flag indicating a safety-triggered abort. emergency_abort: Arc, + /// Human-readable reason for the emergency abort. emergency_reason: Arc>>, } impl BenchmarkOrchestrator { + /// Creates a new orchestrator instance with injected dependencies. pub fn new( sal: Arc, facts: SystemFactSheet, @@ -73,16 +95,17 @@ impl BenchmarkOrchestrator { } } + /// Executes the full benchmark sequence. + /// + /// After the benchmark phases return (with success or error), [Workload::stop] and [crate::sal::traits::EnvironmentGuard::restore] + /// are called. Cleanup is skipped if the log call that precedes it fails (e.g. the telemetry channel is closed). 
pub fn run(&mut self) -> Result { self.log("Starting ember-tune Benchmark Sequence.")?; - // Start Watchdog Monitor let _watchdog_handle = self.spawn_watchdog_monitor(); - // Use a closure to ensure cleanup always runs let result = self.execute_benchmark(); - // --- MANDATORY CLEANUP --- self.log("Benchmark sequence finished. Restoring hardware defaults...")?; let _ = self.workload.stop(); if let Err(e) = self.sal.restore() { @@ -93,10 +116,10 @@ impl BenchmarkOrchestrator { result } + /// Internal execution logic for the benchmark phases. fn execute_benchmark(&mut self) -> Result { let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?; - // Phase 1: Audit & Baseline self.phase = BenchmarkPhase::Auditing; for step in self.sal.audit() { if let Err(e) = step.outcome { @@ -107,10 +130,9 @@ impl BenchmarkOrchestrator { self.log("Suppressing background services (tlp, thermald)...")?; self.sal.suppress().context("Failed to suppress background services")?; - // Baseline (Idle Calibration) self.phase = BenchmarkPhase::IdleCalibration; self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?; - self.sal.set_fan_mode("auto")?; // Use auto for idle + self.sal.set_fan_mode("auto")?; let mut idle_temps = Vec::new(); let start = Instant::now(); @@ -125,10 +147,9 @@ impl BenchmarkOrchestrator { self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; - // Phase 2: Stress Stepping self.phase = BenchmarkPhase::StressTesting; self.log("Phase 2: Starting Synthetic Stress Matrix.")?; - self.sal.set_fan_mode("max")?; // Lock fans for consistent resistance + self.sal.set_fan_mode("max")?; let steps = bench_cfg.power_steps_watts.clone(); for &pl in &steps { @@ -138,7 +159,6 @@ impl BenchmarkOrchestrator { self.workload.start(num_cpus::get(), 100)?; - // Wait for equilibrium let step_start = 
Instant::now(); let mut step_temps = VecDeque::with_capacity(30); @@ -152,7 +172,6 @@ impl BenchmarkOrchestrator { self.send_telemetry(tick)?; tick += 1; - // Check for stability: Range < 0.5C over last 5s (10 ticks) if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); @@ -164,7 +183,6 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_millis(500)); } - // Record data point let avg_p = self.sal.get_power_w().unwrap_or(0.0); let avg_t = self.sal.get_temp().unwrap_or(0.0); let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0); @@ -185,7 +203,6 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); } - // Phase 4: Physical Modeling self.phase = BenchmarkPhase::PhysicalModeling; self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?; @@ -196,7 +213,6 @@ impl BenchmarkOrchestrator { thread::sleep(Duration::from_secs(3)); - // Phase 5: Finalizing self.phase = BenchmarkPhase::Finalizing; self.log("Benchmark sequence complete. Generating configurations...")?; @@ -206,14 +222,12 @@ impl BenchmarkOrchestrator { trip_temp: res.max_temp_c.max(95.0), }; - // 1. Throttled (Merged if exists) if let Some(throttled_path) = self.facts.paths.configs.get("throttled") { crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?; self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?; res.config_paths.insert("throttled".to_string(), throttled_path.clone()); } - // 2. i8kmon if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") { let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig { t_ambient: self.profile.ambient_temp, @@ -228,6 +242,7 @@ impl BenchmarkOrchestrator { Ok(res) } + /// Spawns a concurrent monitor that polls safety sensors every 100ms. 
fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> { let abort = self.emergency_abort.clone(); let reason_store = self.emergency_reason.clone(); @@ -279,6 +294,7 @@ impl BenchmarkOrchestrator { }) } + /// Generates the final [OptimizationResult] based on current measurements. pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { let r_theta = self.engine.calculate_thermal_resistance(&self.profile); let knee = self.engine.find_silicon_knee(&self.profile); @@ -296,6 +312,7 @@ impl BenchmarkOrchestrator { } } + /// Checks if the benchmark has been aborted by the user or the watchdog. fn check_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string()); @@ -312,6 +329,7 @@ impl BenchmarkOrchestrator { Ok(()) } + /// Helper to send log messages to the frontend. fn log(&self, msg: &str) -> Result<()> { let state = TelemetryState { cpu_model: self.cpu_model.clone(), @@ -337,6 +355,7 @@ impl BenchmarkOrchestrator { self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) } + /// Collects current sensors and sends a complete [TelemetryState] to the frontend. 
fn send_telemetry(&mut self, tick: u64) -> Result<()> { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index d234354..7a0e2ce 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -3,7 +3,6 @@ use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; use std::sync::Mutex; -use tracing::{debug}; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::heuristic::discovery::SystemFactSheet; diff --git a/src/sal/traits.rs b/src/sal/traits.rs index a88ebcf..704ce5c 100644 --- a/src/sal/traits.rs +++ b/src/sal/traits.rs @@ -1,11 +1,21 @@ -use anyhow::Result; -use thiserror::Error; +//! Core traits defining the System Abstraction Layer (SAL). +//! +//! This module provides a set of hardware-agnostic interfaces that the +//! `BenchmarkOrchestrator` uses to interact with the underlying system. +//! These traits allow `ember-tune` to support diverse hardware by abstracting +//! away platform-specific details. + use miette::Diagnostic; use std::sync::Arc; use std::path::PathBuf; use crate::sys::SyscallRunner; +use anyhow::Result; +use thiserror::Error; /// Context holding OS abstractions (filesystem base and syscall runner). +/// +/// This is injected into SAL implementations to allow for a mocked "virtual" +/// environment during testing, preventing `cargo test` from mutating the host system. #[derive(Clone)] pub struct EnvironmentCtx { pub sysfs_base: PathBuf, @@ -13,6 +23,7 @@ pub struct EnvironmentCtx { } impl EnvironmentCtx { + /// Creates a production-ready context pointing to the real filesystem root. pub fn production() -> Self { Self { sysfs_base: PathBuf::from("/"), @@ -21,41 +32,52 @@ impl EnvironmentCtx { } } +/// Errors that can occur during the pre-flight system audit. 
#[derive(Error, Diagnostic, Debug, Clone)] pub enum AuditError { + /// The process is not running as root (`uid=0` is required). #[error("Missing root privileges.")] #[diagnostic(code(ember_tune::root_required), severity(error))] #[help("ember-tune requires direct hardware access (MSRs, sysfs). Please run with 'sudo'.")] RootRequired, + /// A required kernel parameter is missing from the boot command line. #[error("Missing kernel parameter: {0}")] #[diagnostic(code(ember_tune::missing_kernel_param), severity(error))] #[help("Add '{0}' to your GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub, then run 'sudo update-grub' and reboot.")] MissingKernelParam(String), + /// The system is running on battery power. #[error("System is running on battery: {0}")] #[diagnostic(code(ember_tune::ac_power_missing), severity(error))] #[help("Thermal benchmarking requires a stable AC power source to ensure consistent PL limits. Please plug in your charger.")] AcPowerMissing(String), + /// The running Linux kernel version may not support the required interfaces. #[error("Incompatible kernel version: {0}")] #[diagnostic(code(ember_tune::kernel_incompatible), severity(error))] #[help("Your kernel version '{0}' may not support the required RAPL or SMM interfaces. Please upgrade to a recent LTS kernel (6.1+).")] KernelIncompatible(String), + /// A required kernel module or CLI tool is not available. #[error("Required tool missing: {0}")] #[diagnostic(code(ember_tune::tool_missing), severity(error))] #[help("The utility '{0}' is required for this SAL. Please install it using your package manager (e.g., 'sudo apt install {0}').")] ToolMissing(String), } +/// A single, verifiable step in the pre-flight audit process. pub struct AuditStep { + /// Human-readable description of the check. pub description: String, + /// The outcome of the check. pub outcome: Result<(), AuditError>, } -/// Evaluates immutable system states (e.g., kernel bootline parameters, AC power status). 
+/// Evaluates immutable system states before the benchmark begins. pub trait PreflightAuditor: Send + Sync { + /// Returns an iterator of [AuditStep] results. + /// This allows the UI to show a live checklist of system verification steps. fn audit(&self) -> Box + '_>; } @@ -65,9 +87,22 @@ impl PreflightAuditor for Arc { } } -/// Suppresses conflicting daemons (tlp, thermald). +/// Manages system services that conflict with the benchmark. +/// +/// # Invariants +/// The `Drop` trait is *not* used for guaranteed cleanup. The orchestrator must +/// explicitly call `restore()` to ensure hardware state is reset. pub trait EnvironmentGuard: Send + Sync { + /// Stops any conflicting system daemons (e.g., `tlp`, `thermald`). + /// + /// # Errors + /// Returns an error if the `systemctl` command fails. fn suppress(&self) -> Result<()>; + + /// Restarts any services that were stopped by `suppress`. + /// + /// # Errors + /// Returns an error if the `systemctl` command fails. fn restore(&self) -> Result<()>; } @@ -80,11 +115,30 @@ impl EnvironmentGuard for Arc { } } -/// Read-only interface for standardized metrics. +/// Provides a read-only interface to system telemetry sensors. pub trait SensorBus: Send + Sync { + /// Returns the current package temperature in degrees Celsius. + /// + /// # Errors + /// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read. fn get_temp(&self) -> Result; + + /// Returns the current package power consumption in Watts. + /// + /// # Errors + /// Returns an error if the underlying RAPL or power sensor cannot be read. fn get_power_w(&self) -> Result; + + /// Returns the current speed of all detected fans in RPM. + /// + /// # Errors + /// Returns an error if the fan sensor nodes cannot be read. fn get_fan_rpms(&self) -> Result>; + + /// Returns the current average CPU frequency in MHz. + /// + /// # Errors + /// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read. 
fn get_freq_mhz(&self) -> Result; } @@ -103,10 +157,24 @@ impl SensorBus for Arc { } } -/// Write-only interface for hardware commands. +/// Provides a write-only interface for hardware actuators. pub trait ActuatorBus: Send + Sync { + /// Sets the fan control mode (e.g., "auto" or "max"). + /// + /// # Errors + /// Returns an error if the fan control command or `sysfs` write fails. fn set_fan_mode(&self, mode: &str) -> Result<()>; + + /// Sets the sustained power limit (PL1) in Watts. + /// + /// # Errors + /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_sustained_power_limit(&self, watts: f32) -> Result<()>; + + /// Sets the burst power limit (PL2) in Watts. + /// + /// # Errors + /// Returns an error if the RAPL `sysfs` node cannot be written to. fn set_burst_power_limit(&self, watts: f32) -> Result<()>; } @@ -122,8 +190,27 @@ impl ActuatorBus for Arc { } } -/// Concurrent monitor for catastrophic states. +/// Represents the high-level safety status of the system. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SafetyStatus { + /// The system is operating within normal parameters. + Nominal, + /// A non-critical issue was detected and may have been auto-corrected. + Warning(String), + /// A potentially dangerous state was detected, but is not yet an emergency. + Critical(String), + /// A critical failure has occurred, requiring an immediate shutdown of the benchmark. + EmergencyAbort(String), +} + +/// A high-frequency monitor for catastrophic hardware states. pub trait HardwareWatchdog: Send + Sync { + /// Returns the current [SafetyStatus] of the system. + /// + /// # Errors + /// This method can return an error if a sensor required for a safety check + /// (e.g., the thermal sensor) fails to read. The orchestrator must treat + /// this as an `EmergencyAbort` condition. 
fn get_safety_status(&self) -> Result; } @@ -133,15 +220,10 @@ impl HardwareWatchdog for Arc { } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum SafetyStatus { - Nominal, - Warning(String), - Critical(String), - EmergencyAbort(String), -} - /// Aggregate trait for a complete platform implementation. +/// +/// This "super-trait" combines all SAL interfaces into a single object-safe +/// trait, simplifying dependency injection into the `BenchmarkOrchestrator`. pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {} impl PlatformSal for T {}