updated safety measurements and benchmarking behavior for 9380

This commit is contained in:
2026-02-28 18:55:18 +01:00
parent 1702e7d058
commit 8d351c7bde
10 changed files with 329 additions and 334 deletions

View File

@@ -28,6 +28,7 @@ pub struct OptimizationMatrix {
pub balanced: SystemProfile,
pub performance: SystemProfile,
pub thermal_resistance_kw: f32,
pub ambient_temp: f32,
}
pub struct HeuristicAnalyst {
@@ -43,16 +44,14 @@ impl HeuristicAnalyst {
/// Analyzes the raw telemetry to generate the 3 optimal profiles.
pub fn analyze(&self, profile: &ThermalProfile, max_soak_watts: f32) -> OptimizationMatrix {
let r_theta = self.engine.calculate_thermal_resistance(profile);
let r_theta = profile.r_theta;
let silicon_knee = self.engine.find_silicon_knee(profile);
let ambient = profile.ambient_temp;
// 1. State A: Silent / Battery (Scientific Passive Equilibrium)
// Objective: Find P where T_core = 60C with fans OFF.
// T_core = T_ambient + (P * R_theta_passive)
// Note: R_theta measured during benchmark was with fans MAX.
// Passive R_theta is typically 2-3x higher.
// Find P where T_core = 60C with fans OFF.
let r_theta_passive = r_theta * 2.5;
let silent_watts = ((60.0 - profile.ambient_temp) / r_theta_passive.max(0.1)).clamp(5.0, 15.0);
let silent_watts = ((60.0 - ambient) / r_theta_passive.max(0.1)).clamp(3.0, 15.0);
let silent_profile = SystemProfile {
name: "Silent".to_string(),
@@ -64,21 +63,21 @@ impl HeuristicAnalyst {
],
};
// 2. State B: Balanced
// The exact calculated Silicon Knee
// 2. State B: Balanced (The Silicon Knee)
// We use R_theta to predict where the knee will sit thermally.
let balanced_profile = SystemProfile {
name: "Balanced".to_string(),
pl1_watts: silicon_knee,
pl2_watts: silicon_knee * 1.25,
fan_curve: vec![
FanCurvePoint { temp_on: 60.0, temp_off: 55.0, pwm_percent: 0 },
FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 40 },
FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 70 },
FanCurvePoint { temp_on: ambient + 15.0, temp_off: ambient + 10.0, pwm_percent: 0 },
FanCurvePoint { temp_on: ambient + 25.0, temp_off: ambient + 20.0, pwm_percent: 30 },
FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 50 },
FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 80 },
],
};
// 3. State C: Sustained Heavy
// Based on the max soak watts from Phase 1.
let performance_profile = SystemProfile {
name: "Performance".to_string(),
pl1_watts: max_soak_watts,
@@ -95,6 +94,7 @@ impl HeuristicAnalyst {
balanced: balanced_profile,
performance: performance_profile,
thermal_resistance_kw: r_theta,
ambient_temp: ambient,
}
}
}

View File

@@ -6,7 +6,7 @@
//! resolution strategies for overlapping daemons.
use anyhow::Result;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::fs;
use crate::agent_analyst::OptimizationMatrix;
@@ -14,20 +14,42 @@ pub struct ServiceIntegrator;
impl ServiceIntegrator {
/// Generates and saves an i8kmon configuration based on the balanced profile.
pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> {
pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> {
let profile = &matrix.balanced;
let mut conf = String::new();
conf.push_str("# Auto-generated by ember-tune Integrator
");
conf.push_str(&format!("# Profile: {}
", profile.name));
let mut conf = String::new();
// Read existing content to preserve daemon and other settings
let existing = if let Some(src) = source_path {
if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() }
} else if output_path.exists() {
fs::read_to_string(output_path).unwrap_or_default()
} else {
String::new()
};
if !existing.is_empty() {
for line in existing.lines() {
let trimmed = line.trim();
// Filter out the old auto-generated config lines and fan configs
if !trimmed.starts_with("set config(0)") &&
!trimmed.starts_with("set config(1)") &&
!trimmed.starts_with("set config(2)") &&
!trimmed.starts_with("set config(3)") &&
!trimmed.starts_with("# Auto-generated") &&
!trimmed.starts_with("# Profile:") &&
!trimmed.is_empty() {
conf.push_str(line);
conf.push('\n');
}
}
}
conf.push_str("\n# Auto-generated by ember-tune Integrator\n");
conf.push_str(&format!("# Profile: {}\n", profile.name));
conf.push_str(&format!("# Thermal Resistance: {:.3} K/W\n\n", matrix.thermal_resistance_kw));
for (i, p) in profile.fan_curve.iter().enumerate() {
// i8kmon syntax: set config(state) {left_fan right_fan temp_on temp_off}
// State 0, 1, 2, 3 correspond to BIOS fan states (off, low, high)
let state = match p.pwm_percent {
0..=20 => 0,
21..=50 => 1,
@@ -35,31 +57,50 @@ impl ServiceIntegrator {
_ => 2,
};
let off = if i == 0 { "-".to_string() } else { format!("{}", p.temp_off) };
conf.push_str(&format!("set config({}) {{{} {} {} {}}}
", i, state, state, p.temp_on, off));
let off = if i == 0 { "-".to_string() } else { format!("{:.0}", p.temp_off) };
conf.push_str(&format!("set config({}) {{{} {} {:.0} {}}}\n", i, state, state, p.temp_on, off));
}
fs::write(output_path, conf)?;
Ok(())
}
/// Generates a thinkfan configuration.
pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> {
/// Generates a thinkfan configuration, merging with existing sensors if possible.
pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> {
let profile = &matrix.balanced;
let mut conf = String::new();
conf.push_str("# Auto-generated by ember-tune Integrator
");
conf.push_str("sensors:
- hwmon: /sys/class/hwmon/hwmon0/temp1_input
let existing = if let Some(src) = source_path {
if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() }
} else if output_path.exists() {
fs::read_to_string(output_path).unwrap_or_default()
} else {
String::new()
};
");
conf.push_str("levels:
");
if !existing.is_empty() {
let mut in_sensors = false;
for line in existing.lines() {
let trimmed = line.trim();
if trimmed == "sensors:" { in_sensors = true; }
if trimmed == "levels:" { in_sensors = false; }
if in_sensors {
conf.push_str(line);
conf.push('\n');
}
}
}
if conf.is_empty() {
conf.push_str("sensors:\n - hwmon: /sys/class/hwmon/hwmon0/temp1_input\n\n");
}
conf.push_str("\n# Auto-generated by ember-tune Integrator\n");
conf.push_str("levels:\n");
for (i, p) in profile.fan_curve.iter().enumerate() {
// thinkfan syntax: - [level, temp_down, temp_up]
let level = match p.pwm_percent {
0..=20 => 0,
21..=40 => 1,
@@ -69,8 +110,7 @@ impl ServiceIntegrator {
};
let down = if i == 0 { 0.0 } else { p.temp_off };
conf.push_str(&format!(" - [{}, {}, {}]
", level, down, p.temp_on));
conf.push_str(&format!(" - [{}, {:.0}, {:.0}]\n", level, down, p.temp_on));
}
fs::write(output_path, conf)?;
@@ -91,7 +131,6 @@ sed -i 's/^CPU_BOOST_ON_AC=.*/CPU_BOOST_ON_AC=""/' /etc/tlp.conf
systemctl restart tlp
# 3. Thermald Delegate (We provide the trips, it handles the rest)
# (Ensure your custom thermal-conf.xml is in /etc/thermald/)
systemctl restart thermald
"#;
fs::write(output_path, script)?;
@@ -99,7 +138,7 @@ systemctl restart thermald
}
/// Generates a thermald configuration XML.
pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> {
pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path, _source_path: Option<&PathBuf>) -> Result<()> {
let profile = &matrix.balanced;
let mut xml = String::new();
xml.push_str("<?xml version=\"1.0\"?>\n<ThermalConfiguration>\n <Platform>\n <Name>ember-tune Balanced</Name>\n <ProductName>Generic</ProductName>\n <Preference>balanced</Preference>\n <ThermalZones>\n <ThermalZone>\n <Type>cpu</Type>\n <TripPoints>\n");

View File

@@ -118,8 +118,15 @@ Trip_Temp_C: {trip:.0}
result_lines.join("\n")
}
pub fn save(path: &Path, config: &ThrottledConfig) -> Result<()> {
let existing = if path.exists() { std::fs::read_to_string(path)? } else { String::new() };
pub fn save(path: &Path, config: &ThrottledConfig, source_path: Option<&std::path::PathBuf>) -> Result<()> {
let existing = if let Some(src) = source_path {
if src.exists() { std::fs::read_to_string(src).unwrap_or_default() } else { String::new() }
} else if path.exists() {
std::fs::read_to_string(path).unwrap_or_default()
} else {
String::new()
};
let content = if existing.is_empty() { Self::generate_conf(config) } else { Self::merge_conf(&existing, config) };
std::fs::write(path, content)?;
Ok(())

View File

@@ -7,7 +7,7 @@
use serde::{Serialize, Deserialize};
use std::collections::HashMap;
use std::path::PathBuf;
use tracing::warn;
use tracing::{warn, debug};
pub mod formatters;
@@ -26,6 +26,7 @@ pub struct ThermalPoint {
/// Telemetry collected during a benchmark run; the input to the optimizer and the analyst.
pub struct ThermalProfile {
/// Data points recorded by the benchmark sweep (one is pushed after each power step).
pub points: Vec<ThermalPoint>,
/// Idle baseline temperature in °C, taken from the smoothed idle-phase readings.
pub ambient_temp: f32,
/// Thermal resistance in K/W, filled in from the steady-state calibration measurement.
pub r_theta: f32,
}
/// The final, recommended parameters derived from the thermal benchmark.
@@ -52,24 +53,16 @@ pub struct OptimizationResult {
}
/// Pure mathematics engine for thermal optimization.
///
/// Contains no hardware I/O and operates solely on the collected [ThermalProfile].
pub struct OptimizerEngine {
/// The size of the sliding window for the `smooth` function.
window_size: usize,
}
impl OptimizerEngine {
/// Creates a new `OptimizerEngine`.
pub fn new(window_size: usize) -> Self {
Self { window_size }
}
/// Applies a simple moving average (SMA) filter with outlier rejection.
///
/// This function smooths noisy sensor data. It rejects any value in the
/// window that is more than 20.0 units away from the window's average
/// before calculating the final smoothed value.
/// Smoothes sensor jitter using a moving average with outlier rejection.
pub fn smooth(&self, data: &[f32]) -> Vec<f32> {
if data.is_empty() { return vec![]; }
let mut smoothed = Vec::with_capacity(data.len());
@@ -81,7 +74,7 @@ impl OptimizerEngine {
let window = &data[start..end];
let avg: f32 = window.iter().sum::<f32>() / window.len() as f32;
let filtered: Vec<f32> = window.iter()
.filter(|&&v| (v - avg).abs() < 20.0) // Reject spikes > 20 units
.filter(|&&v| (v - avg).abs() < 10.0)
.cloned().collect();
if filtered.is_empty() {
@@ -93,108 +86,65 @@ impl OptimizerEngine {
smoothed
}
/// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package.
///
/// This function uses the data point with the highest power draw to ensure
/// the calculation reflects a system under maximum thermal load.
pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 {
profile.points.iter()
.filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data
.max_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal))
.map(|p| (p.temp_c - profile.ambient_temp) / p.power_w)
.unwrap_or(0.0)
/// Reports whether the most recent temperature readings have settled.
///
/// Only the last 20 samples are inspected (10 s of data at the 500 ms sampling
/// interval). The series counts as stable when the population standard deviation
/// of that window is below 0.25 °C. Fewer than 20 samples is never stable.
pub fn is_stable(&self, temps: &[f32]) -> bool {
    const WINDOW_SAMPLES: usize = 20;
    const MAX_STD_DEV_C: f32 = 0.25;

    // Not enough history yet to judge equilibrium.
    let tail_start = match temps.len().checked_sub(WINDOW_SAMPLES) {
        Some(start) => start,
        None => return false,
    };
    let recent = &temps[tail_start..];
    let sample_count = recent.len() as f32;

    let mean = recent.iter().sum::<f32>() / sample_count;
    let squared_error_sum = recent
        .iter()
        .map(|&reading| (reading - mean).powi(2))
        .sum::<f32>();
    let sigma = (squared_error_sum / sample_count).sqrt();

    debug!("Stability Check: StdDev={:.3}C (Target < 0.25C)", sigma);
    sigma < MAX_STD_DEV_C
}
/// Returns the maximum temperature recorded in the profile.
pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 {
profile.points.iter()
.map(|p| p.temp_c)
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or(0.0)
/// Estimates the steady-state temperature reached at `target_watts`.
///
/// Linear thermal model: `T_pred = ambient + target_watts * r_theta`, where
/// `r_theta` is the thermal resistance in K/W.
pub fn predict_temp(&self, target_watts: f32, ambient: f32, r_theta: f32) -> f32 {
    // Temperature rise above ambient caused by dissipating `target_watts`.
    let rise_above_ambient = target_watts * r_theta;
    ambient + rise_above_ambient
}
/// Finds the "Silicon Knee" - the point where performance-per-watt (efficiency)
/// starts to diminish significantly and thermal density spikes.
///
/// This heuristic scoring model balances several factors:
/// 1. **Efficiency Drop:** How quickly does performance-per-watt decrease as power increases?
/// 2. **Thermal Acceleration:** How quickly does temperature rise per additional Watt?
/// 3. **Throttling Penalty:** A large penalty is applied if absolute performance drops, indicating a thermal wall.
///
/// The "Knee" is the power level with the highest score, representing the optimal
/// balance before thermal saturation causes diminishing returns.
/// Calculates thermal resistance (K/W) from a steady-state measurement.
///
/// Formula: `R_theta = (steady_temp - ambient) / steady_power`.
///
/// Returns `0.0` (the "no valid measurement" value) when `steady_power` is
/// below 1 W or NaN, or when the computed quotient is not finite (for example
/// because a temperature reading was NaN or infinite). For every finite input
/// with `steady_power >= 1.0` the result is unchanged from the plain formula.
pub fn calculate_r_theta(&self, ambient: f32, steady_temp: f32, steady_power: f32) -> f32 {
    // `NaN < 1.0` is false, so NaN must be rejected explicitly; otherwise it
    // would slip past the guard and poison the result.
    if steady_power.is_nan() || steady_power < 1.0 {
        return 0.0;
    }

    let r_theta = (steady_temp - ambient) / steady_power;

    // A NaN/infinite resistance makes every later `>` threshold comparison on a
    // predicted temperature evaluate to false, so never hand one to callers.
    if r_theta.is_finite() { r_theta } else { 0.0 }
}
/// Identifies the "Silicon Knee" by finding the point of maximum efficiency.
pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 {
let valid_points: Vec<_> = profile.points.iter()
.filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise
.cloned()
.collect();
if profile.points.is_empty() { return 15.0; }
if valid_points.len() < 3 {
return profile.points.last().map(|p| p.power_w).unwrap_or(15.0);
}
let mut points = valid_points;
let mut points = profile.points.clone();
points.sort_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal));
let mut best_pl = points[0].power_w;
let mut max_score = f32::MIN;
let efficiencies: Vec<(f32, f32)> = points.iter()
.map(|p| {
let perf = if p.throughput > 0.0 { p.throughput as f32 } else { p.freq_mhz };
(p.power_w, perf / p.power_w.max(1.0))
})
.collect();
// Use a sliding window (3 points) to calculate gradients more robustly
for i in 1..points.len() - 1 {
let prev = &points[i - 1];
let curr = &points[i];
let next = &points[i + 1];
if efficiencies.is_empty() { return 15.0; }
// 1. Efficiency Metric (Throughput per Watt or Freq per Watt)
let efficiency_curr = if curr.throughput > 0.0 {
curr.throughput as f32 / curr.power_w.max(1.0)
let max_efficiency = efficiencies.iter()
.map(|(_, e)| *e)
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or(1.0);
let mut knee_watts = points[0].power_w;
for (watts, efficiency) in efficiencies {
if efficiency >= (max_efficiency * 0.85) {
knee_watts = watts;
} else {
curr.freq_mhz / curr.power_w.max(1.0)
};
let efficiency_next = if next.throughput > 0.0 {
next.throughput as f32 / next.power_w.max(1.0)
} else {
next.freq_mhz / next.power_w.max(1.0)
};
let p_delta = (next.power_w - curr.power_w).max(0.5);
let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta;
// 2. Thermal Acceleration (d2T/dW2)
let p_delta_prev = (curr.power_w - prev.power_w).max(0.5);
let p_delta_next = (next.power_w - curr.power_w).max(0.5);
let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev;
let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next;
let p_total_delta = (next.power_w - prev.power_w).max(1.0);
let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta;
// 3. Wall Detection (Any drop in absolute performance is a hard wall)
let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput);
let penalty = if is_throttling { 5000.0 } else { 0.0 };
let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty;
if score > max_score {
max_score = score;
best_pl = curr.power_w;
debug!("Efficiency drop at {:.1}W ({:.1}% of peak)", watts, (efficiency/max_efficiency)*100.0);
break;
}
}
let best_pl = if max_score > f32::MIN {
best_pl
} else {
profile.points.last().map(|p| p.power_w).unwrap_or(15.0)
};
// Safety Floor: Never recommend a TDP below 5W, as this bricks system performance.
if best_pl < 5.0 {
warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl);
return 15.0;
}
best_pl
knee_watts.clamp(PowerLimitWatts::MIN, PowerLimitWatts::MAX)
}
}
use crate::sal::safety::PowerLimitWatts;

View File

@@ -4,7 +4,7 @@
//! using a [Workload], and feeds telemetry to the frontend via MPSC channels.
use anyhow::{Result, Context, bail};
use tracing::{info, warn, error};
use tracing::{info, warn, error, debug};
use std::sync::mpsc;
use std::time::{Duration, Instant};
use std::thread;
@@ -23,67 +23,40 @@ use crate::load::{Workload, IntensityProfile, StressVector};
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
use crate::agent_analyst::HeuristicAnalyst;
use crate::agent_integrator::ServiceIntegrator;
/// Represents the possible states of the benchmark orchestrator.
pub enum OrchestratorState {
/// Performing pre-flight checks and snapshotting.
PreFlight,
/// Acquiring idle baseline telemetry.
IdleBaseline,
/// Actively sweeping through power limits.
StressSweep { current_wattage: f32 },
/// Allowing hardware to cool down before releasing the guard.
ThermalCalibration,
StabilitySweep,
Cooldown,
/// Benchmark complete, generating final results.
Finalizing,
}
/// The central state machine responsible for coordinating the thermal benchmark.
pub struct BenchmarkOrchestrator {
/// Injected hardware abstraction layer.
sal: Arc<dyn PlatformSal>,
/// Discovered system facts and paths.
facts: SystemFactSheet,
/// Heat generation workload.
workload: Box<dyn Workload>,
/// Channel for sending telemetry updates to the UI.
telemetry_tx: mpsc::Sender<TelemetryState>,
/// Channel for receiving commands from the UI.
command_rx: mpsc::Receiver<UiCommand>,
/// Current phase reported to the UI.
ui_phase: BenchmarkPhase,
/// Accumulated thermal data points.
profile: ThermalProfile,
/// Mathematics engine for data smoothing and optimization.
engine: OptimizerEngine,
/// CLI override for the configuration output path.
optional_config_out: Option<PathBuf>,
/// The safety membrane protecting the system.
safeguard: Option<HardwareStateGuard>,
/// Active thermal watchdog.
watchdog: Option<ThermalWatchdog>,
/// Sliding window of power readings (Watts).
history_watts: VecDeque<f32>,
/// Sliding window of temperature readings (Celsius).
history_temp: VecDeque<f32>,
/// Sliding window of CPU frequency (MHz).
history_mhz: VecDeque<f32>,
/// Detected CPU model string.
cpu_model: String,
/// Total system RAM in Gigabytes.
total_ram_gb: u64,
/// Atomic flag indicating a safety-triggered abort.
emergency_abort: Arc<AtomicBool>,
/// Human-readable reason for the emergency abort.
emergency_reason: Arc<Mutex<Option<String>>>,
}
impl BenchmarkOrchestrator {
/// Creates a new orchestrator instance with injected dependencies.
pub fn new(
sal: Arc<dyn PlatformSal>,
facts: SystemFactSheet,
@@ -122,14 +95,13 @@ impl BenchmarkOrchestrator {
}
}
/// Executes the full benchmark sequence.
pub fn run(&mut self) -> Result<OptimizationResult> {
// Immediate Priming
let _ = self.sal.get_temp();
let _ = self.sal.get_power_w();
let _ = self.sal.get_fan_rpms();
info!("Orchestrator: Initializing Project Iron-Ember lifecycle.");
info!("Orchestrator: Initializing Project Iron-Ember PGC Protocol.");
// Spawn safety watchdog immediately
let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone());
@@ -147,24 +119,24 @@ impl BenchmarkOrchestrator {
let _ = self.workload.stop_workload();
if let Some(mut sg) = self.safeguard.take() {
if let Err(e) = sg.release() {
error!("CRITICAL: State restoration failure: {}", e);
}
let _ = sg.release();
}
info!("✓ Hardware state restored to pre-flight defaults.");
if let Err(e) = self.sal.restore() {
warn!("Failed secondary SAL restoration: {}", e);
}
info!("✓ Hardware state restored.");
result
}
/// Internal execution logic for the benchmark phases.
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?;
let _bench_cfg = self.facts.bench_config.clone().context("Config missing.")?;
// 1. Pre-Flight Phase
self.ui_phase = BenchmarkPhase::Auditing;
self.log("Phase: Pre-Flight Auditing & Sterilization")?;
// Snapshot and neutralise Brawl Matrix
let mut target_files = self.facts.rapl_paths.iter()
.map(|p| p.join("constraint_0_power_limit_uw"))
.collect::<Vec<_>>();
@@ -177,7 +149,6 @@ impl BenchmarkOrchestrator {
let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?;
self.safeguard = Some(sg);
// Run auditor
for step in self.sal.audit() {
if let Err(e) = step.outcome {
return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e));
@@ -185,106 +156,117 @@ impl BenchmarkOrchestrator {
}
self.workload.initialize().context("Failed to initialize load generator.")?;
self.sal.suppress().context("Failed to suppress background services.")?;
let tick = Cell::new(0u64);
// 2. Idle Baseline Phase
self.ui_phase = BenchmarkPhase::IdleCalibration;
self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?;
// Wait for fan spin-up
self.log("Phase: Recording 30s Idle Baseline...")?;
self.sal.set_fan_mode("auto")?;
let mut idle_temps = Vec::new();
let start = Instant::now();
while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) {
while start.elapsed() < Duration::from_secs(30) {
self.check_safety_abort()?;
self.send_telemetry(tick.get())?;
idle_temps.push(self.sal.get_temp().unwrap_or(0.0));
tick.set(tick.get() + 1);
thread::sleep(Duration::from_millis(500));
}
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
self.profile.ambient_temp = self.engine.smooth(&idle_temps).iter().sum::<f32>() / idle_temps.len() as f32;
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
// 3. Stress Sweep Phase
self.ui_phase = BenchmarkPhase::StressTesting;
self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?;
// Ensure fans are ramped to MAX before load
self.log("Metrology: Locking fans to MAX...")?;
// 3. Thermal Resistance Mapping (Phase 1)
self.log("Phase: Mapping Thermal Resistance (Rθ) at 10W...")?;
self.sal.set_fan_mode("max")?;
let fan_lock_start = Instant::now();
loop {
let fans = self.sal.get_fan_rpms().unwrap_or_default();
let max_rpm = fans.iter().cloned().max().unwrap_or(0);
if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) {
let pl_calib = PowerLimitWatts::try_new(10.0)?;
self.sal.set_sustained_power_limit(pl_calib)?;
self.sal.set_burst_power_limit(pl_calib)?;
self.workload.run_workload(
Duration::from_secs(120),
IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix }
)?;
let mut calib_temps = Vec::new();
let calib_start = Instant::now();
while calib_start.elapsed() < Duration::from_secs(90) {
self.check_safety_abort()?;
self.send_telemetry(tick.get())?;
let t = self.sal.get_temp().unwrap_or(0.0);
calib_temps.push(t);
tick.set(tick.get() + 1);
if calib_start.elapsed() > Duration::from_secs(30) && self.engine.is_stable(&calib_temps) {
break;
}
thread::sleep(Duration::from_millis(500));
self.send_telemetry(tick.get())?;
tick.set(tick.get() + 1);
}
let steady_t = calib_temps.last().cloned().unwrap_or(0.0);
let steady_p = self.sal.get_power_w().unwrap_or(10.0);
self.profile.r_theta = self.engine.calculate_r_theta(self.profile.ambient_temp, steady_t, steady_p);
self.log(&format!("✓ Physical Model: Rθ = {:.3} K/W", self.profile.r_theta))?;
let physical_threads = num_cpus::get_physical();
// 4. Physically-Aware Stability Sweep (Phase 2)
self.ui_phase = BenchmarkPhase::StressTesting;
self.log("Phase: Starting Physically-Aware Efficiency Sweep...")?;
let mut current_w = 12.0_f32;
let mut previous_ops = 0.0;
for &watts in &bench_cfg.power_steps_watts {
self.check_safety_abort()?;
self.log(&format!("Testing PL1 = {:.0}W", watts))?;
// Apply limits safely
let pl1 = PowerLimitWatts::try_new(watts)?;
let pl2 = PowerLimitWatts::try_new(watts + 5.0)?;
self.sal.set_sustained_power_limit(pl1)?;
self.sal.set_burst_power_limit(pl2)?;
// Start workload
loop {
// Predict if this step is safe
let pred_t = self.engine.predict_temp(current_w, self.profile.ambient_temp, self.profile.r_theta);
if pred_t > 92.0 {
self.log(&format!("Prediction: {:.1}W would result in {:.1}C (Too Hot). Finalizing...", current_w, pred_t))?;
break;
}
self.log(&format!("Step: {:.1}W (Predicted: {:.1}C)", current_w, pred_t))?;
let pl = PowerLimitWatts::try_new(current_w)?;
self.sal.set_sustained_power_limit(pl)?;
self.sal.set_burst_power_limit(PowerLimitWatts::try_new(current_w + 2.0)?)?;
self.workload.run_workload(
Duration::from_secs(bench_cfg.stress_duration_max_s),
IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix }
Duration::from_secs(60),
IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix }
)?;
let step_start = Instant::now();
let mut step_temps = VecDeque::with_capacity(30);
let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0);
let mut step_temps = Vec::new();
let mut previous_t = self.sal.get_temp().unwrap_or(0.0);
// Equilibrium Gating
while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) {
while step_start.elapsed() < Duration::from_secs(60) {
self.check_safety_abort()?;
self.send_telemetry(tick.get())?;
let t = self.sal.get_temp().unwrap_or(0.0);
let dt_dt = (t - previous_step_temp) / 0.5;
previous_step_temp = t;
let dt_dt = (t - previous_t) / 0.5;
// Redundant safety check during step
if t > 94.0 || dt_dt > 5.0 {
warn!("Thermal Spike Detected! Aborting current step.");
break;
// # SAFETY: predictive hard-quench threshold raised to 8C/s
if step_start.elapsed() > Duration::from_secs(2) && (t > 95.0 || dt_dt > 8.0) {
warn!("USA: Safety Break triggered! T={:.1}C, dT/dt={:.1}C/s", t, dt_dt);
let _ = self.sal.set_sustained_power_limit(PowerLimitWatts::try_new(3.0)?);
break; // Just break the sweep loop
}
step_temps.push_back(t);
if step_temps.len() > 10 { step_temps.pop_front(); }
self.send_telemetry(tick.get())?;
step_temps.push(t);
tick.set(tick.get() + 1);
if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 {
let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b));
let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b));
if (max - min) < 0.5 {
info!("Equilibrium reached at {:.1}°C", t);
break;
}
if step_start.elapsed() > Duration::from_secs(15) && self.engine.is_stable(&step_temps) {
self.log(&format!(" Equilibrium reached at {:.1}°C", t))?;
break;
}
previous_t = t;
thread::sleep(Duration::from_millis(500));
}
// Record data point
let metrics = self.workload.get_current_metrics().unwrap_or_default();
self.profile.points.push(ThermalPoint {
power_w: self.sal.get_power_w().unwrap_or(watts),
power_w: self.sal.get_power_w().unwrap_or(current_w),
temp_c: self.sal.get_temp().unwrap_or(0.0),
freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0),
fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0),
@@ -293,64 +275,62 @@ impl BenchmarkOrchestrator {
self.workload.stop_workload()?;
// Performance Halt Condition
// Efficiency Break
if previous_ops > 0.0 {
let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0;
if gain < 1.0 {
self.log("Diminishing returns reached. Stopping sweep.")?;
self.log("Silicon Knee identified (gain < 1%). Finalizing...")?;
break;
}
}
previous_ops = metrics.primary_ops_per_sec;
current_w += 2.0;
if current_w > 45.0 { break; }
self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?;
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
self.log(&format!("Cooling down ({}s)...", _bench_cfg.cool_down_s))?;
thread::sleep(Duration::from_secs(_bench_cfg.cool_down_s));
}
// 4. Physical Modeling Phase
// 5. Modeling Phase
self.ui_phase = BenchmarkPhase::PhysicalModeling;
self.log("Phase: Silicon Physical Sweet Spot Calculation")?;
let knee = self.engine.find_silicon_knee(&self.profile);
let analyst = HeuristicAnalyst::new();
let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0));
let mut res = self.generate_result(false);
res.optimization_matrix = Some(matrix.clone());
info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw);
res.silicon_knee_watts = knee;
// 5. Finalizing Phase
// 6. Finalizing Phase
self.ui_phase = BenchmarkPhase::Finalizing;
self.log("Phase: Generation of Optimized Configuration Sets")?;
let throttled_path = self.optional_config_out.clone()
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
if let Some(path) = throttled_path {
let throttled_source = self.facts.paths.configs.get("throttled");
if let Some(path) = self.optional_config_out.clone().or_else(|| throttled_source.cloned()) {
let config = crate::engine::formatters::throttled::ThrottledConfig {
pl1_limit: res.silicon_knee_watts,
pl2_limit: res.recommended_pl2,
trip_temp: res.max_temp_c.max(90.0),
pl2_limit: res.silicon_knee_watts * 1.25,
trip_temp: 90.0,
};
crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?;
self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?;
let _ = crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config, throttled_source);
res.config_paths.insert("throttled".to_string(), path);
}
let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc"));
let i8k_source = self.facts.paths.configs.get("i8kmon");
let i8k_out = base_out.join("i8kmon.conf");
if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out, i8k_source).is_ok() {
res.config_paths.insert("i8kmon".to_string(), i8k_out);
}
Ok(res)
}
/// Checks if the safety watchdog or user triggered an abort.
fn check_safety_abort(&self) -> Result<()> {
if self.emergency_abort.load(Ordering::SeqCst) {
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string());
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog".to_string());
bail!("EMERGENCY_ABORT: {}", reason);
}
if let Ok(cmd) = self.command_rx.try_recv() {
match cmd {
UiCommand::Abort => bail!("ABORTED"),
}
if let UiCommand::Abort = cmd { bail!("ABORTED"); }
}
Ok(())
}
@@ -365,49 +345,35 @@ impl BenchmarkOrchestrator {
current_freq: self.sal.get_freq_mhz().unwrap_or(0.0),
fans: self.sal.get_fan_rpms().unwrap_or_default(),
governor: "performance".to_string(),
pl1_limit: 0.0,
pl2_limit: 0.0,
fan_tier: "auto".to_string(),
pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(),
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
phase: self.ui_phase,
history_watts: Vec::new(),
history_temp: Vec::new(),
history_mhz: Vec::new(),
history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(),
log_event: Some(msg.to_string()),
metadata: std::collections::HashMap::new(),
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
};
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed"))
}
/// Samples temperature, power and frequency from `self.sal`, appends the
/// readings to the rolling history buffers, and sends a `TelemetryState`
/// snapshot for `tick` over `telemetry_tx`.
///
/// Failed sensor reads fall back to `0.0` (or the default value for fan RPMs
/// and `false` for the throttling flag) instead of aborting the tick. Once
/// `history_temp` holds more than 120 entries the oldest entry of each history
/// buffer is dropped.
///
/// Returns an `anyhow` error when the telemetry send fails.
// NOTE(review): this span is a rendered diff with the +/- markers stripped, so
// removed and added lines are interleaved (duplicate field initialisers, two
// history-trim statements, two `send` calls) and a hunk header sits mid-body.
// Confirm against the checked-out file before editing.
fn send_telemetry(&mut self, tick: u64) -> Result<()> {
let temp = self.sal.get_temp().unwrap_or(0.0);
let pwr = self.sal.get_power_w().unwrap_or(0.0);
let freq = self.sal.get_freq_mhz().unwrap_or(0.0);
self.history_temp.push_back(temp);
self.history_watts.push_back(pwr);
self.history_mhz.push_back(freq);
if self.history_temp.len() > 120 {
self.history_temp.pop_front();
self.history_watts.pop_front();
self.history_mhz.pop_front();
}
if self.history_temp.len() > 120 { self.history_temp.pop_front(); self.history_watts.pop_front(); self.history_mhz.pop_front(); }
let state = TelemetryState {
cpu_model: self.cpu_model.clone(),
total_ram_gb: self.total_ram_gb,
tick,
cpu_temp: temp,
power_w: pwr,
current_freq: freq,
cpu_temp: temp, power_w: pwr, current_freq: freq,
fans: self.sal.get_fan_rpms().unwrap_or_default(),
governor: "performance".to_string(),
pl1_limit: 15.0,
pl2_limit: 25.0,
fan_tier: "max".to_string(),
pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(),
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
phase: self.ui_phase,
history_watts: self.history_watts.iter().cloned().collect(),
@@ -418,21 +384,19 @@ impl BenchmarkOrchestrator {
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
};
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed"))
}
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
let r_theta = self.profile.r_theta;
let knee = self.engine.find_silicon_knee(&self.profile);
let max_t = self.engine.get_max_temp(&self.profile);
OptimizationResult {
profile: self.profile.clone(),
silicon_knee_watts: knee,
thermal_resistance_kw: r_theta,
recommended_pl1: knee,
recommended_pl2: knee * 1.25,
max_temp_c: max_t,
max_temp_c: self.profile.points.iter().map(|p| p.temp_c).max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)).unwrap_or(0.0),
is_partial,
config_paths: std::collections::HashMap::new(),
optimization_matrix: None,

View File

@@ -2,8 +2,7 @@ use anyhow::{Result, anyhow, Context};
use std::path::{Path};
use std::fs;
use std::time::{Duration, Instant};
use std::sync::{Mutex, Arc};
use tracing::{debug, warn, info};
use std::sync::Mutex;
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};

View File

@@ -1,7 +1,6 @@
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
use anyhow::Result;
use std::sync::Arc;
pub struct MockSal {
pub temperature_sequence: std::sync::atomic::AtomicUsize,

View File

@@ -10,7 +10,7 @@ use std::fs;
use std::path::{PathBuf};
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::{Duration, Instant};
use std::time::Duration;
use std::thread;
use tracing::{info, warn, error, debug};

View File

@@ -1,35 +1,75 @@
#[path = "../src/engine/formatters/throttled.rs"]
mod throttled;
use throttled::{ThrottledTranslator, ThrottledConfig};
use ember_tune_rs::engine::formatters::throttled::{ThrottledConfig, ThrottledTranslator};
use ember_tune_rs::agent_analyst::{OptimizationMatrix, SystemProfile, FanCurvePoint};
use ember_tune_rs::agent_integrator::ServiceIntegrator;
use std::fs;
use tempfile::tempdir;
// Exercises `ThrottledTranslator::merge_conf`: an existing throttled config is
// merged with a `ThrottledConfig`, and the merged text is checked for the
// updated PL1/PL2/trip-temperature values and for the `[UNDERVOLT]` section.
// NOTE(review): this span is a rendered diff with the +/- markers stripped, so
// the earlier fixture-based test (25/35/90 expectations) and the inline-config
// test (22/28/95 expectations) are interleaved. Confirm against the
// checked-out file before editing.
#[test]
fn test_throttled_formatter_non_destructive() {
let fixture_path = "tests/fixtures/throttled.conf";
let existing_content = fs::read_to_string(fixture_path).expect("Failed to read fixture");
fn test_throttled_merge_preserves_undervolt() {
let existing = r#"[GENERAL]
Update_Interval_ms: 1000
[UNDERVOLT]
# CPU core undervolt
CORE: -100
# GPU undervolt
GPU: -50
[AC]
PL1_Tdp_W: 15
PL2_Tdp_W: 25
"#;
let config = ThrottledConfig {
pl1_limit: 25.0,
pl2_limit: 35.0,
trip_temp: 90.0,
pl1_limit: 22.0,
pl2_limit: 28.0,
trip_temp: 95.0,
};
let merged = ThrottledTranslator::merge_conf(&existing_content, &config);
let merged = ThrottledTranslator::merge_conf(existing, &config);
// Assert updates
assert!(merged.contains("PL1_Tdp_W: 25"));
assert!(merged.contains("PL2_Tdp_W: 35"));
assert!(merged.contains("Trip_Temp_C: 90"));
// Assert preservation
assert!(merged.contains("[UNDERVOLT]"));
assert!(merged.contains("CORE: -100"));
assert!(merged.contains("GPU: -50"));
assert!(merged.contains("# Important: Preserving undervolt offsets is critical!"));
assert!(merged.contains("Update_Interval_ms: 3000"));
// Check that we didn't lose the [GENERAL] section
assert!(merged.contains("[GENERAL]"));
assert!(merged.contains("# This is a complex test fixture"));
assert!(merged.contains("PL1_Tdp_W: 22"));
assert!(merged.contains("PL2_Tdp_W: 28"));
assert!(merged.contains("Trip_Temp_C: 95"));
assert!(merged.contains("[UNDERVOLT]"));
}
/// Regenerating the i8kmon config on top of an existing file must keep the
/// unrelated `set config(...)` directives and replace the old threshold entry.
#[test]
fn test_i8kmon_merge_preserves_settings() {
    let workdir = tempdir().unwrap();
    let conf_file = workdir.path().join("i8kmon.conf");

    // Seed the target with user settings plus a threshold line that the
    // generator is expected to overwrite.
    let seed = r#"set config(gen_shadow) 1
set config(i8k_ignore_dmi) 1
set config(daemon) 1
set config(0) {0 0 60 50}
"#;
    fs::write(&conf_file, seed).unwrap();

    // Only the balanced profile carries a fan curve point.
    let silent = SystemProfile {
        name: "Silent".to_string(),
        pl1_watts: 10.0,
        pl2_watts: 12.0,
        fan_curve: vec![],
    };
    let balanced = SystemProfile {
        name: "Balanced".to_string(),
        pl1_watts: 20.0,
        pl2_watts: 25.0,
        fan_curve: vec![FanCurvePoint { temp_on: 70.0, temp_off: 60.0, pwm_percent: 50 }],
    };
    let performance = SystemProfile {
        name: "Perf".to_string(),
        pl1_watts: 30.0,
        pl2_watts: 35.0,
        fan_curve: vec![],
    };
    let profiles = OptimizationMatrix {
        silent,
        balanced,
        performance,
        thermal_resistance_kw: 1.5,
        ambient_temp: 25.0,
    };

    // The same path serves as both the output and the pre-existing config.
    ServiceIntegrator::generate_i8kmon_config(&profiles, &conf_file, Some(&conf_file)).unwrap();

    let rewritten = fs::read_to_string(&conf_file).unwrap();
    // Pre-existing general settings survive the rewrite.
    assert!(rewritten.contains("set config(gen_shadow) 1"));
    assert!(rewritten.contains("set config(daemon) 1"));
    // The freshly generated threshold entry is present...
    assert!(rewritten.contains("set config(0) {1 1 70 -}"));
    // ...and the seeded one has been dropped.
    assert!(!rewritten.contains("set config(0) {0 0 60 50}"));
}

View File

@@ -1,8 +1,6 @@
use anyhow::Result;
use std::fs;
use std::path::PathBuf;
use ember_tune_rs::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts};
use ember_tune_rs::sal::safety::{HardwareStateGuard, PowerLimitWatts};
use crate::common::fakesys::FakeSysBuilder;
use std::fs;
mod common;
@@ -34,23 +32,22 @@ fn test_hardware_state_guard_panic_restoration() {
// Checks the power-limit constructor's bounds: a mid-range value is accepted,
// while a too-small and a too-large value are rejected with an error whose
// text is asserted on.
// NOTE(review): this span is a rendered diff with the +/- markers stripped, so
// the `TdpLimitMicroWatts::new` lines and the `PowerLimitWatts::try_new` lines
// (and their differing expected messages) are interleaved. Confirm against the
// checked-out file before editing.
#[test]
fn test_tdp_limit_bounds_checking() {
// 1. Valid value
assert!(TdpLimitMicroWatts::new(15_000_000).is_ok());
assert!(PowerLimitWatts::try_new(15.0).is_ok());
// 2. Too low (Dangerous 0W or below 5W)
let low_res = TdpLimitMicroWatts::new(1_000_000);
// 2. Too low (Dangerous 0W or below 3W)
let low_res = PowerLimitWatts::try_new(1.0);
assert!(low_res.is_err());
assert!(low_res.unwrap_err().to_string().contains("below safety floor"));
assert!(low_res.unwrap_err().to_string().contains("outside safe bounds"));
// 3. Too high (> 80W)
let high_res = TdpLimitMicroWatts::new(100_000_000);
// 3. Too high (> 100W)
let high_res = PowerLimitWatts::try_new(150.0);
assert!(high_res.is_err());
assert!(high_res.unwrap_err().to_string().contains("exceeds safety ceiling"));
assert!(high_res.unwrap_err().to_string().contains("outside safe bounds"));
}
// Regression guard: constructing a power limit of zero must yield an error.
// NOTE(review): this span is a rendered diff with the +/- markers stripped, so
// the `TdpLimitMicroWatts::new(0)` line and the `PowerLimitWatts::try_new(0.0)`
// line both appear. Confirm against the checked-out file before editing.
#[test]
fn test_0w_tdp_regression_prevention() {
// The prime directive is to never set 0W.
// Ensure the constructor explicitly rejects a zero limit.
let zero_res = TdpLimitMicroWatts::new(0);
let zero_res = PowerLimitWatts::try_new(0.0);
assert!(zero_res.is_err());
}