From 8d351c7bdee5ee59432c4b543130ebcc56e75af6 Mon Sep 17 00:00:00 2001 From: Nils Pukropp Date: Sat, 28 Feb 2026 18:55:18 +0100 Subject: [PATCH] updated safety measurements and benchmarking behavior for 9380 --- src/agent_analyst/mod.rs | 24 +-- src/agent_integrator/mod.rs | 97 +++++++---- src/engine/formatters/throttled.rs | 11 +- src/engine/mod.rs | 150 ++++++----------- src/orchestrator/mod.rs | 262 +++++++++++++---------------- src/sal/generic_linux.rs | 3 +- src/sal/mock.rs | 1 - src/sal/safety.rs | 2 +- tests/config_merge_test.rs | 90 +++++++--- tests/safety_test.rs | 23 ++- 10 files changed, 329 insertions(+), 334 deletions(-) diff --git a/src/agent_analyst/mod.rs b/src/agent_analyst/mod.rs index c5b3b33..47c0af9 100644 --- a/src/agent_analyst/mod.rs +++ b/src/agent_analyst/mod.rs @@ -28,6 +28,7 @@ pub struct OptimizationMatrix { pub balanced: SystemProfile, pub performance: SystemProfile, pub thermal_resistance_kw: f32, + pub ambient_temp: f32, } pub struct HeuristicAnalyst { @@ -43,16 +44,14 @@ impl HeuristicAnalyst { /// Analyzes the raw telemetry to generate the 3 optimal profiles. pub fn analyze(&self, profile: &ThermalProfile, max_soak_watts: f32) -> OptimizationMatrix { - let r_theta = self.engine.calculate_thermal_resistance(profile); + let r_theta = profile.r_theta; let silicon_knee = self.engine.find_silicon_knee(profile); + let ambient = profile.ambient_temp; // 1. State A: Silent / Battery (Scientific Passive Equilibrium) - // Objective: Find P where T_core = 60C with fans OFF. - // T_core = T_ambient + (P * R_theta_passive) - // Note: R_theta measured during benchmark was with fans MAX. - // Passive R_theta is typically 2-3x higher. + // Find P where T_core = 60C with fans OFF. let r_theta_passive = r_theta * 2.5; - let silent_watts = ((60.0 - profile.ambient_temp) / r_theta_passive.max(0.1)).clamp(5.0, 15.0); + let silent_watts = ((60.0 - ambient) / r_theta_passive.max(0.1)).clamp(3.0, 15.0); let silent_profile = SystemProfile { name: "Silent".to_string(), @@ -64,21 +63,21 @@ impl HeuristicAnalyst { ], }; - // 2. State B: Balanced - // The exact calculated Silicon Knee + // 2. State B: Balanced (The Silicon Knee) + // We use R_theta to predict where the knee will sit thermally. let balanced_profile = SystemProfile { name: "Balanced".to_string(), pl1_watts: silicon_knee, pl2_watts: silicon_knee * 1.25, fan_curve: vec![ - FanCurvePoint { temp_on: 60.0, temp_off: 55.0, pwm_percent: 0 }, - FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 40 }, - FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 70 }, + FanCurvePoint { temp_on: ambient + 15.0, temp_off: ambient + 10.0, pwm_percent: 0 }, + FanCurvePoint { temp_on: ambient + 25.0, temp_off: ambient + 20.0, pwm_percent: 30 }, + FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 50 }, + FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 80 }, ], }; // 3. State C: Sustained Heavy - // Based on the max soak watts from Phase 1. let performance_profile = SystemProfile { name: "Performance".to_string(), pl1_watts: max_soak_watts, @@ -95,6 +94,7 @@ impl HeuristicAnalyst { balanced: balanced_profile, performance: performance_profile, thermal_resistance_kw: r_theta, + ambient_temp: ambient, } } } diff --git a/src/agent_integrator/mod.rs b/src/agent_integrator/mod.rs index 8328498..dc69883 100644 --- a/src/agent_integrator/mod.rs +++ b/src/agent_integrator/mod.rs @@ -6,7 +6,7 @@ //! resolution strategies for overlapping daemons. use anyhow::Result; -use std::path::Path; +use std::path::{Path, PathBuf}; use std::fs; use crate::agent_analyst::OptimizationMatrix; @@ -14,20 +14,42 @@ pub struct ServiceIntegrator; impl ServiceIntegrator { /// Generates and saves an i8kmon configuration based on the balanced profile. - pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> { let profile = &matrix.balanced; - - let mut conf = String::new(); - conf.push_str("# Auto-generated by ember-tune Integrator -"); - conf.push_str(&format!("# Profile: {} -", profile.name)); - + let mut conf = String::new(); + + // Read existing content to preserve daemon and other settings + let existing = if let Some(src) = source_path { + if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() } + } else if output_path.exists() { + fs::read_to_string(output_path).unwrap_or_default() + } else { + String::new() + }; + + if !existing.is_empty() { + for line in existing.lines() { + let trimmed = line.trim(); + // Filter out the old auto-generated config lines and fan configs + if !trimmed.starts_with("set config(0)") && + !trimmed.starts_with("set config(1)") && + !trimmed.starts_with("set config(2)") && + !trimmed.starts_with("set config(3)") && + !trimmed.starts_with("# Auto-generated") && + !trimmed.starts_with("# Profile:") && + !trimmed.is_empty() { + conf.push_str(line); + conf.push('\n'); + } + } + } + + conf.push_str("\n# Auto-generated by ember-tune Integrator\n"); + conf.push_str(&format!("# Profile: {}\n", profile.name)); + conf.push_str(&format!("# Thermal Resistance: {:.3} K/W\n\n", matrix.thermal_resistance_kw)); + for (i, p) in profile.fan_curve.iter().enumerate() { - // i8kmon syntax: set config(state) {left_fan right_fan temp_on temp_off} - // State 0, 1, 2, 3 correspond to BIOS fan states (off, low, high) - let state = match p.pwm_percent { 0..=20 => 0, 21..=50 => 1, @@ -35,31 +57,50 @@ impl ServiceIntegrator { _ => 2, }; - let off = if i == 0 { "-".to_string() } else { format!("{}", p.temp_off) }; - conf.push_str(&format!("set config({}) {{{} {} {} {}}} -", i, state, state, p.temp_on, off)); + let off = if i == 0 { "-".to_string() } else { format!("{:.0}", p.temp_off) }; + conf.push_str(&format!("set config({}) {{{} {} {:.0} {}}}\n", i, state, state, p.temp_on, off)); } fs::write(output_path, conf)?; Ok(()) } - /// Generates a thinkfan configuration. - pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + /// Generates a thinkfan configuration, merging with existing sensors if possible. + pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> { let profile = &matrix.balanced; let mut conf = String::new(); - conf.push_str("# Auto-generated by ember-tune Integrator -"); - conf.push_str("sensors: - - hwmon: /sys/class/hwmon/hwmon0/temp1_input + + let existing = if let Some(src) = source_path { + if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() } + } else if output_path.exists() { + fs::read_to_string(output_path).unwrap_or_default() + } else { + String::new() + }; -"); - conf.push_str("levels: -"); + if !existing.is_empty() { + let mut in_sensors = false; + for line in existing.lines() { + let trimmed = line.trim(); + if trimmed == "sensors:" { in_sensors = true; } + if trimmed == "levels:" { in_sensors = false; } + + if in_sensors { + conf.push_str(line); + conf.push('\n'); + } + } + } + + if conf.is_empty() { + conf.push_str("sensors:\n - hwmon: /sys/class/hwmon/hwmon0/temp1_input\n\n"); + } + + conf.push_str("\n# Auto-generated by ember-tune Integrator\n"); + conf.push_str("levels:\n"); for (i, p) in profile.fan_curve.iter().enumerate() { - // thinkfan syntax: - [level, temp_down, temp_up] let level = match p.pwm_percent { 0..=20 => 0, 21..=40 => 1, @@ -69,8 +110,7 @@ impl ServiceIntegrator { }; let down = if i == 0 { 0.0 } else { p.temp_off }; - conf.push_str(&format!(" - [{}, {}, {}] -", level, down, p.temp_on)); + conf.push_str(&format!(" - [{}, {:.0}, {:.0}]\n", level, down, p.temp_on)); } fs::write(output_path, conf)?; @@ -91,7 +131,6 @@ sed -i 's/^CPU_BOOST_ON_AC=.*/CPU_BOOST_ON_AC=""/' /etc/tlp.conf systemctl restart tlp # 3. Thermald Delegate (We provide the trips, it handles the rest) -# (Ensure your custom thermal-conf.xml is in /etc/thermald/) systemctl restart thermald "#; fs::write(output_path, script)?; @@ -99,7 +138,7 @@ systemctl restart thermald } /// Generates a thermald configuration XML. - pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> { + pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path, _source_path: Option<&PathBuf>) -> Result<()> { let profile = &matrix.balanced; let mut xml = String::new(); xml.push_str("\n\n \n ember-tune Balanced\n Generic\n balanced\n \n \n cpu\n \n"); diff --git a/src/engine/formatters/throttled.rs b/src/engine/formatters/throttled.rs index 9febe7e..17bf284 100644 --- a/src/engine/formatters/throttled.rs +++ b/src/engine/formatters/throttled.rs @@ -118,8 +118,15 @@ Trip_Temp_C: {trip:.0} result_lines.join("\n") } - pub fn save(path: &Path, config: &ThrottledConfig) -> Result<()> { - let existing = if path.exists() { std::fs::read_to_string(path)? } else { String::new() }; + pub fn save(path: &Path, config: &ThrottledConfig, source_path: Option<&std::path::PathBuf>) -> Result<()> { + let existing = if let Some(src) = source_path { + if src.exists() { std::fs::read_to_string(src).unwrap_or_default() } else { String::new() } + } else if path.exists() { + std::fs::read_to_string(path).unwrap_or_default() + } else { + String::new() + }; + let content = if existing.is_empty() { Self::generate_conf(config) } else { Self::merge_conf(&existing, config) }; std::fs::write(path, content)?; Ok(()) diff --git a/src/engine/mod.rs b/src/engine/mod.rs index e65a992..3a31cd4 100644 --- a/src/engine/mod.rs +++ b/src/engine/mod.rs @@ -7,7 +7,7 @@ use serde::{Serialize, Deserialize}; use std::collections::HashMap; use std::path::PathBuf; -use tracing::warn; +use tracing::{warn, debug}; pub mod formatters; @@ -26,6 +26,7 @@ pub struct ThermalPoint { pub struct ThermalProfile { pub points: Vec, pub ambient_temp: f32, + pub r_theta: f32, } /// The final, recommended parameters derived from the thermal benchmark. @@ -52,24 +53,16 @@ pub struct OptimizationResult { } /// Pure mathematics engine for thermal optimization. -/// -/// Contains no hardware I/O and operates solely on the collected [ThermalProfile]. pub struct OptimizerEngine { - /// The size of the sliding window for the `smooth` function. window_size: usize, } impl OptimizerEngine { - /// Creates a new `OptimizerEngine`. pub fn new(window_size: usize) -> Self { Self { window_size } } - /// Applies a simple moving average (SMA) filter with outlier rejection. - /// - /// This function smooths noisy sensor data. It rejects any value in the - /// window that is more than 20.0 units away from the window's average - /// before calculating the final smoothed value. + /// Smoothes sensor jitter using a moving average with outlier rejection. pub fn smooth(&self, data: &[f32]) -> Vec { if data.is_empty() { return vec![]; } let mut smoothed = Vec::with_capacity(data.len()); @@ -81,7 +74,7 @@ impl OptimizerEngine { let window = &data[start..end]; let avg: f32 = window.iter().sum::() / window.len() as f32; let filtered: Vec = window.iter() - .filter(|&&v| (v - avg).abs() < 20.0) // Reject spikes > 20 units + .filter(|&&v| (v - avg).abs() < 10.0) .cloned().collect(); if filtered.is_empty() { @@ -93,108 +86,65 @@ impl OptimizerEngine { smoothed } - /// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package. - /// - /// This function uses the data point with the highest power draw to ensure - /// the calculation reflects a system under maximum thermal load. - pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 { - profile.points.iter() - .filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data - .max_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal)) - .map(|p| (p.temp_c - profile.ambient_temp) / p.power_w) - .unwrap_or(0.0) + /// Evaluates if a series of temperature readings have reached thermal equilibrium. + /// Criteria: Standard deviation < 0.25C over the last 10 seconds. + pub fn is_stable(&self, temps: &[f32]) -> bool { + if temps.len() < 20 { return false; } // Need at least 10s of data (500ms intervals) + let window = &temps[temps.len() - 20..]; + + let avg = window.iter().sum::() / window.len() as f32; + let variance = window.iter().map(|&t| (t - avg).powi(2)).sum::() / window.len() as f32; + let std_dev = variance.sqrt(); + + debug!("Stability Check: StdDev={:.3}C (Target < 0.25C)", std_dev); + std_dev < 0.25 } - /// Returns the maximum temperature recorded in the profile. - pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 { - profile.points.iter() - .map(|p| p.temp_c) - .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) - .unwrap_or(0.0) + /// Predicts the steady-state temperature for a given target wattage. + /// Formula: T_pred = T_ambient + (P_target * R_theta) + pub fn predict_temp(&self, target_watts: f32, ambient: f32, r_theta: f32) -> f32 { + ambient + (target_watts * r_theta) } - /// Finds the "Silicon Knee" - the point where performance-per-watt (efficiency) - /// starts to diminish significantly and thermal density spikes. - /// - /// This heuristic scoring model balances several factors: - /// 1. **Efficiency Drop:** How quickly does performance-per-watt decrease as power increases? - /// 2. **Thermal Acceleration:** How quickly does temperature rise per additional Watt? - /// 3. **Throttling Penalty:** A large penalty is applied if absolute performance drops, indicating a thermal wall. - /// - /// The "Knee" is the power level with the highest score, representing the optimal - /// balance before thermal saturation causes diminishing returns. + /// Calculates Thermal Resistance (K/W) using the steady-state delta. + pub fn calculate_r_theta(&self, ambient: f32, steady_temp: f32, steady_power: f32) -> f32 { + if steady_power < 1.0 { return 0.0; } + (steady_temp - ambient) / steady_power + } + + /// Identifies the "Silicon Knee" by finding the point of maximum efficiency. pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 { - let valid_points: Vec<_> = profile.points.iter() - .filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise - .cloned() - .collect(); + if profile.points.is_empty() { return 15.0; } - if valid_points.len() < 3 { - return profile.points.last().map(|p| p.power_w).unwrap_or(15.0); - } - - let mut points = valid_points; + let mut points = profile.points.clone(); points.sort_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal)); - let mut best_pl = points[0].power_w; - let mut max_score = f32::MIN; + let efficiencies: Vec<(f32, f32)> = points.iter() + .map(|p| { + let perf = if p.throughput > 0.0 { p.throughput as f32 } else { p.freq_mhz }; + (p.power_w, perf / p.power_w.max(1.0)) + }) + .collect(); - // Use a sliding window (3 points) to calculate gradients more robustly - for i in 1..points.len() - 1 { - let prev = &points[i - 1]; - let curr = &points[i]; - let next = &points[i + 1]; + if efficiencies.is_empty() { return 15.0; } - // 1. Efficiency Metric (Throughput per Watt or Freq per Watt) - let efficiency_curr = if curr.throughput > 0.0 { - curr.throughput as f32 / curr.power_w.max(1.0) + let max_efficiency = efficiencies.iter() + .map(|(_, e)| *e) + .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)) + .unwrap_or(1.0); + + let mut knee_watts = points[0].power_w; + for (watts, efficiency) in efficiencies { + if efficiency >= (max_efficiency * 0.85) { + knee_watts = watts; } else { - curr.freq_mhz / curr.power_w.max(1.0) - }; - - let efficiency_next = if next.throughput > 0.0 { - next.throughput as f32 / next.power_w.max(1.0) - } else { - next.freq_mhz / next.power_w.max(1.0) - }; - - let p_delta = (next.power_w - curr.power_w).max(0.5); - let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta; - - // 2. Thermal Acceleration (d2T/dW2) - let p_delta_prev = (curr.power_w - prev.power_w).max(0.5); - let p_delta_next = (next.power_w - curr.power_w).max(0.5); - - let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev; - let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next; - - let p_total_delta = (next.power_w - prev.power_w).max(1.0); - let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta; - - // 3. Wall Detection (Any drop in absolute performance is a hard wall) - let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput); - let penalty = if is_throttling { 5000.0 } else { 0.0 }; - - let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty; - - if score > max_score { - max_score = score; - best_pl = curr.power_w; + debug!("Efficiency drop at {:.1}W ({:.1}% of peak)", watts, (efficiency/max_efficiency)*100.0); + break; } } - let best_pl = if max_score > f32::MIN { - best_pl - } else { - profile.points.last().map(|p| p.power_w).unwrap_or(15.0) - }; - - // Safety Floor: Never recommend a TDP below 5W, as this bricks system performance. - if best_pl < 5.0 { - warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl); - return 15.0; - } - - best_pl + knee_watts.clamp(PowerLimitWatts::MIN, PowerLimitWatts::MAX) } } + +use crate::sal::safety::PowerLimitWatts; diff --git a/src/orchestrator/mod.rs b/src/orchestrator/mod.rs index bc426f3..38a5786 100644 --- a/src/orchestrator/mod.rs +++ b/src/orchestrator/mod.rs @@ -4,7 +4,7 @@ //! using a [Workload], and feeds telemetry to the frontend via MPSC channels. use anyhow::{Result, Context, bail}; -use tracing::{info, warn, error}; +use tracing::{info, warn, error, debug}; use std::sync::mpsc; use std::time::{Duration, Instant}; use std::thread; @@ -23,67 +23,40 @@ use crate::load::{Workload, IntensityProfile, StressVector}; use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase}; use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult}; use crate::agent_analyst::HeuristicAnalyst; +use crate::agent_integrator::ServiceIntegrator; /// Represents the possible states of the benchmark orchestrator. pub enum OrchestratorState { - /// Performing pre-flight checks and snapshotting. PreFlight, - /// Acquiring idle baseline telemetry. IdleBaseline, - /// Actively sweeping through power limits. - StressSweep { current_wattage: f32 }, - /// Allowing hardware to cool down before releasing the guard. + ThermalCalibration, + StabilitySweep, Cooldown, - /// Benchmark complete, generating final results. Finalizing, } -/// The central state machine responsible for coordinating the thermal benchmark. pub struct BenchmarkOrchestrator { - /// Injected hardware abstraction layer. sal: Arc, - /// Discovered system facts and paths. facts: SystemFactSheet, - /// Heat generation workload. workload: Box, - /// Channel for sending telemetry updates to the UI. telemetry_tx: mpsc::Sender, - /// Channel for receiving commands from the UI. command_rx: mpsc::Receiver, - /// Current phase reported to the UI. ui_phase: BenchmarkPhase, - /// Accumulated thermal data points. profile: ThermalProfile, - /// Mathematics engine for data smoothing and optimization. engine: OptimizerEngine, - /// CLI override for the configuration output path. optional_config_out: Option, - - /// The safety membrane protecting the system. safeguard: Option, - /// Active thermal watchdog. watchdog: Option, - - /// Sliding window of power readings (Watts). history_watts: VecDeque, - /// Sliding window of temperature readings (Celsius). history_temp: VecDeque, - /// Sliding window of CPU frequency (MHz). history_mhz: VecDeque, - - /// Detected CPU model string. cpu_model: String, - /// Total system RAM in Gigabytes. total_ram_gb: u64, - - /// Atomic flag indicating a safety-triggered abort. emergency_abort: Arc, - /// Human-readable reason for the emergency abort. emergency_reason: Arc>>, } impl BenchmarkOrchestrator { - /// Creates a new orchestrator instance with injected dependencies. pub fn new( sal: Arc, facts: SystemFactSheet, @@ -122,14 +95,13 @@ impl BenchmarkOrchestrator { } } - /// Executes the full benchmark sequence. pub fn run(&mut self) -> Result { // Immediate Priming let _ = self.sal.get_temp(); let _ = self.sal.get_power_w(); let _ = self.sal.get_fan_rpms(); - info!("Orchestrator: Initializing Project Iron-Ember lifecycle."); + info!("Orchestrator: Initializing Project Iron-Ember PGC Protocol."); // Spawn safety watchdog immediately let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone()); @@ -147,24 +119,24 @@ impl BenchmarkOrchestrator { let _ = self.workload.stop_workload(); if let Some(mut sg) = self.safeguard.take() { - if let Err(e) = sg.release() { - error!("CRITICAL: State restoration failure: {}", e); - } + let _ = sg.release(); } - info!("✓ Hardware state restored to pre-flight defaults."); + if let Err(e) = self.sal.restore() { + warn!("Failed secondary SAL restoration: {}", e); + } + + info!("✓ Hardware state restored."); result } - /// Internal execution logic for the benchmark phases. fn execute_benchmark(&mut self) -> Result { - let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?; + let _bench_cfg = self.facts.bench_config.clone().context("Config missing.")?; // 1. Pre-Flight Phase self.ui_phase = BenchmarkPhase::Auditing; self.log("Phase: Pre-Flight Auditing & Sterilization")?; - // Snapshot and neutralise Brawl Matrix let mut target_files = self.facts.rapl_paths.iter() .map(|p| p.join("constraint_0_power_limit_uw")) .collect::>(); @@ -177,7 +149,6 @@ impl BenchmarkOrchestrator { let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?; self.safeguard = Some(sg); - // Run auditor for step in self.sal.audit() { if let Err(e) = step.outcome { return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e)); @@ -185,106 +156,117 @@ impl BenchmarkOrchestrator { } self.workload.initialize().context("Failed to initialize load generator.")?; + self.sal.suppress().context("Failed to suppress background services.")?; let tick = Cell::new(0u64); // 2. Idle Baseline Phase self.ui_phase = BenchmarkPhase::IdleCalibration; - self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?; - - // Wait for fan spin-up + self.log("Phase: Recording 30s Idle Baseline...")?; self.sal.set_fan_mode("auto")?; let mut idle_temps = Vec::new(); let start = Instant::now(); - while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) { + while start.elapsed() < Duration::from_secs(30) { self.check_safety_abort()?; self.send_telemetry(tick.get())?; idle_temps.push(self.sal.get_temp().unwrap_or(0.0)); tick.set(tick.get() + 1); thread::sleep(Duration::from_millis(500)); } - self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0); + self.profile.ambient_temp = self.engine.smooth(&idle_temps).iter().sum::() / idle_temps.len() as f32; self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?; - // 3. Stress Sweep Phase - self.ui_phase = BenchmarkPhase::StressTesting; - self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?; - - // Ensure fans are ramped to MAX before load - self.log("Metrology: Locking fans to MAX...")?; + // 3. Thermal Resistance Mapping (Phase 1) + self.log("Phase: Mapping Thermal Resistance (Rθ) at 10W...")?; self.sal.set_fan_mode("max")?; - let fan_lock_start = Instant::now(); - loop { - let fans = self.sal.get_fan_rpms().unwrap_or_default(); - let max_rpm = fans.iter().cloned().max().unwrap_or(0); - if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) { + + let pl_calib = PowerLimitWatts::try_new(10.0)?; + self.sal.set_sustained_power_limit(pl_calib)?; + self.sal.set_burst_power_limit(pl_calib)?; + + self.workload.run_workload( + Duration::from_secs(120), + IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix } + )?; + + let mut calib_temps = Vec::new(); + let calib_start = Instant::now(); + while calib_start.elapsed() < Duration::from_secs(90) { + self.check_safety_abort()?; + self.send_telemetry(tick.get())?; + let t = self.sal.get_temp().unwrap_or(0.0); + calib_temps.push(t); + tick.set(tick.get() + 1); + + if calib_start.elapsed() > Duration::from_secs(30) && self.engine.is_stable(&calib_temps) { break; } thread::sleep(Duration::from_millis(500)); - self.send_telemetry(tick.get())?; - tick.set(tick.get() + 1); } + + let steady_t = calib_temps.last().cloned().unwrap_or(0.0); + let steady_p = self.sal.get_power_w().unwrap_or(10.0); + self.profile.r_theta = self.engine.calculate_r_theta(self.profile.ambient_temp, steady_t, steady_p); + self.log(&format!("✓ Physical Model: Rθ = {:.3} K/W", self.profile.r_theta))?; - let physical_threads = num_cpus::get_physical(); + // 4. Physically-Aware Stability Sweep (Phase 2) + self.ui_phase = BenchmarkPhase::StressTesting; + self.log("Phase: Starting Physically-Aware Efficiency Sweep...")?; + + let mut current_w = 12.0_f32; let mut previous_ops = 0.0; - for &watts in &bench_cfg.power_steps_watts { - self.check_safety_abort()?; - self.log(&format!("Testing PL1 = {:.0}W", watts))?; - - // Apply limits safely - let pl1 = PowerLimitWatts::try_new(watts)?; - let pl2 = PowerLimitWatts::try_new(watts + 5.0)?; - - self.sal.set_sustained_power_limit(pl1)?; - self.sal.set_burst_power_limit(pl2)?; - - // Start workload + loop { + // Predict if this step is safe + let pred_t = self.engine.predict_temp(current_w, self.profile.ambient_temp, self.profile.r_theta); + if pred_t > 92.0 { + self.log(&format!("Prediction: {:.1}W would result in {:.1}C (Too Hot). Finalizing...", current_w, pred_t))?; + break; + } + + self.log(&format!("Step: {:.1}W (Predicted: {:.1}C)", current_w, pred_t))?; + let pl = PowerLimitWatts::try_new(current_w)?; + self.sal.set_sustained_power_limit(pl)?; + self.sal.set_burst_power_limit(PowerLimitWatts::try_new(current_w + 2.0)?)?; + self.workload.run_workload( - Duration::from_secs(bench_cfg.stress_duration_max_s), - IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix } + Duration::from_secs(60), + IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix } )?; let step_start = Instant::now(); - let mut step_temps = VecDeque::with_capacity(30); - let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0); + let mut step_temps = Vec::new(); + let mut previous_t = self.sal.get_temp().unwrap_or(0.0); - // Equilibrium Gating - while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) { + while step_start.elapsed() < Duration::from_secs(60) { self.check_safety_abort()?; - + self.send_telemetry(tick.get())?; + let t = self.sal.get_temp().unwrap_or(0.0); - let dt_dt = (t - previous_step_temp) / 0.5; - previous_step_temp = t; + let dt_dt = (t - previous_t) / 0.5; - // Redundant safety check during step - if t > 94.0 || dt_dt > 5.0 { - warn!("Thermal Spike Detected! Aborting current step."); - break; + // # SAFETY: predictive hard-quench threshold raised to 8C/s + if step_start.elapsed() > Duration::from_secs(2) && (t > 95.0 || dt_dt > 8.0) { + warn!("USA: Safety Break triggered! T={:.1}C, dT/dt={:.1}C/s", t, dt_dt); + let _ = self.sal.set_sustained_power_limit(PowerLimitWatts::try_new(3.0)?); + break; // Just break the sweep loop } - step_temps.push_back(t); - if step_temps.len() > 10 { step_temps.pop_front(); } - - self.send_telemetry(tick.get())?; + step_temps.push(t); tick.set(tick.get() + 1); - if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 { - let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b)); - let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b)); - if (max - min) < 0.5 { - info!("Equilibrium reached at {:.1}°C", t); - break; - } + if step_start.elapsed() > Duration::from_secs(15) && self.engine.is_stable(&step_temps) { + self.log(&format!(" Equilibrium reached at {:.1}°C", t))?; + break; } + previous_t = t; thread::sleep(Duration::from_millis(500)); } - // Record data point let metrics = self.workload.get_current_metrics().unwrap_or_default(); self.profile.points.push(ThermalPoint { - power_w: self.sal.get_power_w().unwrap_or(watts), + power_w: self.sal.get_power_w().unwrap_or(current_w), temp_c: self.sal.get_temp().unwrap_or(0.0), freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0), fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0), @@ -293,64 +275,62 @@ impl BenchmarkOrchestrator { self.workload.stop_workload()?; - // Performance Halt Condition + // Efficiency Break if previous_ops > 0.0 { let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0; if gain < 1.0 { - self.log("Diminishing returns reached. Stopping sweep.")?; + self.log("Silicon Knee identified (gain < 1%). Finalizing...")?; break; } } previous_ops = metrics.primary_ops_per_sec; + current_w += 2.0; + if current_w > 45.0 { break; } - self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?; - thread::sleep(Duration::from_secs(bench_cfg.cool_down_s)); + self.log(&format!("Cooling down ({}s)...", _bench_cfg.cool_down_s))?; + thread::sleep(Duration::from_secs(_bench_cfg.cool_down_s)); } - // 4. Physical Modeling Phase + // 5. Modeling Phase self.ui_phase = BenchmarkPhase::PhysicalModeling; - self.log("Phase: Silicon Physical Sweet Spot Calculation")?; - + let knee = self.engine.find_silicon_knee(&self.profile); let analyst = HeuristicAnalyst::new(); let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0)); let mut res = self.generate_result(false); res.optimization_matrix = Some(matrix.clone()); - - info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw); + res.silicon_knee_watts = knee; - // 5. Finalizing Phase + // 6. Finalizing Phase self.ui_phase = BenchmarkPhase::Finalizing; - self.log("Phase: Generation of Optimized Configuration Sets")?; - - let throttled_path = self.optional_config_out.clone() - .or_else(|| self.facts.paths.configs.get("throttled").cloned()); - - if let Some(path) = throttled_path { + let throttled_source = self.facts.paths.configs.get("throttled"); + if let Some(path) = self.optional_config_out.clone().or_else(|| throttled_source.cloned()) { let config = crate::engine::formatters::throttled::ThrottledConfig { pl1_limit: res.silicon_knee_watts, - pl2_limit: res.recommended_pl2, - trip_temp: res.max_temp_c.max(90.0), + pl2_limit: res.silicon_knee_watts * 1.25, + trip_temp: 90.0, }; - crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?; - self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?; + let _ = crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config, throttled_source); res.config_paths.insert("throttled".to_string(), path); } + let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc")); + let i8k_source = self.facts.paths.configs.get("i8kmon"); + let i8k_out = base_out.join("i8kmon.conf"); + if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out, i8k_source).is_ok() { + res.config_paths.insert("i8kmon".to_string(), i8k_out); + } + Ok(res) } - /// Checks if the safety watchdog or user triggered an abort. fn check_safety_abort(&self) -> Result<()> { if self.emergency_abort.load(Ordering::SeqCst) { - let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string()); + let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog".to_string()); bail!("EMERGENCY_ABORT: {}", reason); } - if let Ok(cmd) = self.command_rx.try_recv() { - match cmd { - UiCommand::Abort => bail!("ABORTED"), - } + if let UiCommand::Abort = cmd { bail!("ABORTED"); } } Ok(()) } @@ -365,49 +345,35 @@ impl BenchmarkOrchestrator { current_freq: self.sal.get_freq_mhz().unwrap_or(0.0), fans: self.sal.get_fan_rpms().unwrap_or_default(), governor: "performance".to_string(), - pl1_limit: 0.0, - pl2_limit: 0.0, - fan_tier: "auto".to_string(), + pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(), is_throttling: self.sal.get_throttling_status().unwrap_or(false), phase: self.ui_phase, - history_watts: Vec::new(), - history_temp: Vec::new(), - history_mhz: Vec::new(), + history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(), log_event: Some(msg.to_string()), metadata: std::collections::HashMap::new(), is_emergency: self.emergency_abort.load(Ordering::SeqCst), emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; - self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) + self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed")) } fn send_telemetry(&mut self, tick: u64) -> Result<()> { let temp = self.sal.get_temp().unwrap_or(0.0); let pwr = self.sal.get_power_w().unwrap_or(0.0); let freq = self.sal.get_freq_mhz().unwrap_or(0.0); - self.history_temp.push_back(temp); self.history_watts.push_back(pwr); self.history_mhz.push_back(freq); - - if self.history_temp.len() > 120 { - self.history_temp.pop_front(); - self.history_watts.pop_front(); - self.history_mhz.pop_front(); - } + if self.history_temp.len() > 120 { self.history_temp.pop_front(); self.history_watts.pop_front(); self.history_mhz.pop_front(); } let state = TelemetryState { cpu_model: self.cpu_model.clone(), total_ram_gb: self.total_ram_gb, tick, - cpu_temp: temp, - power_w: pwr, - current_freq: freq, + cpu_temp: temp, power_w: pwr, current_freq: freq, fans: self.sal.get_fan_rpms().unwrap_or_default(), governor: "performance".to_string(), - pl1_limit: 15.0, - pl2_limit: 25.0, - fan_tier: "max".to_string(), + pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(), is_throttling: self.sal.get_throttling_status().unwrap_or(false), phase: self.ui_phase, history_watts: self.history_watts.iter().cloned().collect(), @@ -418,21 +384,19 @@ impl BenchmarkOrchestrator { is_emergency: self.emergency_abort.load(Ordering::SeqCst), emergency_reason: self.emergency_reason.lock().unwrap().clone(), }; - self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed")) + self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed")) } pub fn generate_result(&self, is_partial: bool) -> OptimizationResult { - let r_theta = self.engine.calculate_thermal_resistance(&self.profile); + let r_theta = self.profile.r_theta; let knee = self.engine.find_silicon_knee(&self.profile); - let max_t = self.engine.get_max_temp(&self.profile); - OptimizationResult { profile: self.profile.clone(), silicon_knee_watts: knee, thermal_resistance_kw: r_theta, recommended_pl1: knee, recommended_pl2: knee * 1.25, - max_temp_c: max_t, + max_temp_c: self.profile.points.iter().map(|p| p.temp_c).max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)).unwrap_or(0.0), is_partial, config_paths: std::collections::HashMap::new(), optimization_matrix: None, diff --git a/src/sal/generic_linux.rs b/src/sal/generic_linux.rs index 3456794..fbd7c48 100644 --- a/src/sal/generic_linux.rs +++ b/src/sal/generic_linux.rs @@ -2,8 +2,7 @@ use anyhow::{Result, anyhow, Context}; use std::path::{Path}; use std::fs; use std::time::{Duration, Instant}; -use std::sync::{Mutex, Arc}; -use tracing::{debug, warn, info}; +use std::sync::Mutex; use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx}; use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; diff --git a/src/sal/mock.rs b/src/sal/mock.rs index 6a9b3b1..e6e6a40 100644 --- a/src/sal/mock.rs +++ b/src/sal/mock.rs @@ -1,7 +1,6 @@ use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus}; use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent}; use anyhow::Result; -use std::sync::Arc; pub struct MockSal { pub temperature_sequence: std::sync::atomic::AtomicUsize, diff --git a/src/sal/safety.rs b/src/sal/safety.rs index 88c641a..863ea2e 100644 --- a/src/sal/safety.rs +++ b/src/sal/safety.rs @@ -10,7 +10,7 @@ use std::fs; use std::path::{PathBuf}; use std::sync::Arc; use std::sync::atomic::{AtomicBool, Ordering}; -use std::time::{Duration, Instant}; +use std::time::Duration; use std::thread; use tracing::{info, warn, error, debug}; diff --git a/tests/config_merge_test.rs b/tests/config_merge_test.rs index e2f1777..128eeff 100644 --- a/tests/config_merge_test.rs +++ b/tests/config_merge_test.rs @@ -1,35 +1,75 @@ -#[path = "../src/engine/formatters/throttled.rs"] -mod throttled; - -use throttled::{ThrottledTranslator, ThrottledConfig}; +use ember_tune_rs::engine::formatters::throttled::{ThrottledConfig, ThrottledTranslator}; +use ember_tune_rs::agent_analyst::{OptimizationMatrix, SystemProfile, FanCurvePoint}; +use ember_tune_rs::agent_integrator::ServiceIntegrator; use std::fs; +use tempfile::tempdir; #[test] -fn test_throttled_formatter_non_destructive() { - let fixture_path = "tests/fixtures/throttled.conf"; - let existing_content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); - +fn test_throttled_merge_preserves_undervolt() { + let existing = r#"[GENERAL] +Update_Interval_ms: 1000 + +[UNDERVOLT] +# CPU core undervolt +CORE: -100 +# GPU undervolt +GPU: -50 + +[AC] +PL1_Tdp_W: 15 +PL2_Tdp_W: 25 +"#; + let config = ThrottledConfig { - pl1_limit: 25.0, - pl2_limit: 35.0, - trip_temp: 90.0, + pl1_limit: 22.0, + pl2_limit: 28.0, + trip_temp: 95.0, }; - let merged = ThrottledTranslator::merge_conf(&existing_content, &config); + let merged = ThrottledTranslator::merge_conf(existing, &config); - // Assert updates - assert!(merged.contains("PL1_Tdp_W: 25")); - assert!(merged.contains("PL2_Tdp_W: 35")); - assert!(merged.contains("Trip_Temp_C: 90")); - - // Assert preservation - assert!(merged.contains("[UNDERVOLT]")); assert!(merged.contains("CORE: -100")); assert!(merged.contains("GPU: -50")); - assert!(merged.contains("# Important: Preserving undervolt offsets is critical!")); - assert!(merged.contains("Update_Interval_ms: 3000")); - - // Check that we didn't lose the [GENERAL] section - assert!(merged.contains("[GENERAL]")); - assert!(merged.contains("# This is a complex test fixture")); + assert!(merged.contains("PL1_Tdp_W: 22")); + assert!(merged.contains("PL2_Tdp_W: 28")); + assert!(merged.contains("Trip_Temp_C: 95")); + assert!(merged.contains("[UNDERVOLT]")); +} + +#[test] +fn test_i8kmon_merge_preserves_settings() { + let dir = tempdir().unwrap(); + let config_path = dir.path().join("i8kmon.conf"); + + let existing = r#"set config(gen_shadow) 1 +set config(i8k_ignore_dmi) 1 +set config(daemon) 1 + +set config(0) {0 0 60 50} +"#; + fs::write(&config_path, existing).unwrap(); + + let matrix = OptimizationMatrix { + silent: SystemProfile { name: "Silent".to_string(), pl1_watts: 10.0, pl2_watts: 12.0, fan_curve: vec![] }, + balanced: SystemProfile { + name: "Balanced".to_string(), + pl1_watts: 20.0, + pl2_watts: 25.0, + fan_curve: vec![ + FanCurvePoint { temp_on: 70.0, temp_off: 60.0, pwm_percent: 50 } + ] + }, + performance: SystemProfile { name: "Perf".to_string(), pl1_watts: 30.0, pl2_watts: 35.0, fan_curve: vec![] }, + thermal_resistance_kw: 1.5, + ambient_temp: 25.0, + }; + + ServiceIntegrator::generate_i8kmon_config(&matrix, &config_path, Some(&config_path)).unwrap(); + + let result = fs::read_to_string(&config_path).unwrap(); + + assert!(result.contains("set config(gen_shadow) 1")); + assert!(result.contains("set config(daemon) 1")); + assert!(result.contains("set config(0) {1 1 70 -}")); // New config + assert!(!result.contains("set config(0) {0 0 60 50}")); // Old config should be gone } diff --git a/tests/safety_test.rs b/tests/safety_test.rs index 2922019..53d71d2 100644 --- a/tests/safety_test.rs +++ b/tests/safety_test.rs @@ -1,8 +1,6 @@ -use anyhow::Result; -use std::fs; -use std::path::PathBuf; -use ember_tune_rs::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts}; +use ember_tune_rs::sal::safety::{HardwareStateGuard, PowerLimitWatts}; use crate::common::fakesys::FakeSysBuilder; +use std::fs; mod common; @@ -34,23 +32,22 @@ fn test_hardware_state_guard_panic_restoration() { #[test] fn test_tdp_limit_bounds_checking() { // 1. Valid value - assert!(TdpLimitMicroWatts::new(15_000_000).is_ok()); + assert!(PowerLimitWatts::try_new(15.0).is_ok()); - // 2. Too low (Dangerous 0W or below 5W) - let low_res = TdpLimitMicroWatts::new(1_000_000); + // 2. Too low (Dangerous 0W or below 3W) + let low_res = PowerLimitWatts::try_new(1.0); assert!(low_res.is_err()); - assert!(low_res.unwrap_err().to_string().contains("below safety floor")); + assert!(low_res.unwrap_err().to_string().contains("outside safe bounds")); - // 3. Too high (> 80W) - let high_res = TdpLimitMicroWatts::new(100_000_000); + // 3. Too high (> 100W) + let high_res = PowerLimitWatts::try_new(150.0); assert!(high_res.is_err()); - assert!(high_res.unwrap_err().to_string().contains("exceeds safety ceiling")); + assert!(high_res.unwrap_err().to_string().contains("outside safe bounds")); } #[test] fn test_0w_tdp_regression_prevention() { // The prime directive is to never set 0W. - // Ensure the new() constructor explicitly fails for 0. - let zero_res = TdpLimitMicroWatts::new(0); + let zero_res = PowerLimitWatts::try_new(0.0); assert!(zero_res.is_err()); }