updated safety measurements and benchmarking behavior for 9380
This commit is contained in:
@@ -28,6 +28,7 @@ pub struct OptimizationMatrix {
|
||||
pub balanced: SystemProfile,
|
||||
pub performance: SystemProfile,
|
||||
pub thermal_resistance_kw: f32,
|
||||
pub ambient_temp: f32,
|
||||
}
|
||||
|
||||
pub struct HeuristicAnalyst {
|
||||
@@ -43,16 +44,14 @@ impl HeuristicAnalyst {
|
||||
|
||||
/// Analyzes the raw telemetry to generate the 3 optimal profiles.
|
||||
pub fn analyze(&self, profile: &ThermalProfile, max_soak_watts: f32) -> OptimizationMatrix {
|
||||
let r_theta = self.engine.calculate_thermal_resistance(profile);
|
||||
let r_theta = profile.r_theta;
|
||||
let silicon_knee = self.engine.find_silicon_knee(profile);
|
||||
let ambient = profile.ambient_temp;
|
||||
|
||||
// 1. State A: Silent / Battery (Scientific Passive Equilibrium)
|
||||
// Objective: Find P where T_core = 60C with fans OFF.
|
||||
// T_core = T_ambient + (P * R_theta_passive)
|
||||
// Note: R_theta measured during benchmark was with fans MAX.
|
||||
// Passive R_theta is typically 2-3x higher.
|
||||
// Find P where T_core = 60C with fans OFF.
|
||||
let r_theta_passive = r_theta * 2.5;
|
||||
let silent_watts = ((60.0 - profile.ambient_temp) / r_theta_passive.max(0.1)).clamp(5.0, 15.0);
|
||||
let silent_watts = ((60.0 - ambient) / r_theta_passive.max(0.1)).clamp(3.0, 15.0);
|
||||
|
||||
let silent_profile = SystemProfile {
|
||||
name: "Silent".to_string(),
|
||||
@@ -64,21 +63,21 @@ impl HeuristicAnalyst {
|
||||
],
|
||||
};
|
||||
|
||||
// 2. State B: Balanced
|
||||
// The exact calculated Silicon Knee
|
||||
// 2. State B: Balanced (The Silicon Knee)
|
||||
// We use R_theta to predict where the knee will sit thermally.
|
||||
let balanced_profile = SystemProfile {
|
||||
name: "Balanced".to_string(),
|
||||
pl1_watts: silicon_knee,
|
||||
pl2_watts: silicon_knee * 1.25,
|
||||
fan_curve: vec![
|
||||
FanCurvePoint { temp_on: 60.0, temp_off: 55.0, pwm_percent: 0 },
|
||||
FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 40 },
|
||||
FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 70 },
|
||||
FanCurvePoint { temp_on: ambient + 15.0, temp_off: ambient + 10.0, pwm_percent: 0 },
|
||||
FanCurvePoint { temp_on: ambient + 25.0, temp_off: ambient + 20.0, pwm_percent: 30 },
|
||||
FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 50 },
|
||||
FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 80 },
|
||||
],
|
||||
};
|
||||
|
||||
// 3. State C: Sustained Heavy
|
||||
// Based on the max soak watts from Phase 1.
|
||||
let performance_profile = SystemProfile {
|
||||
name: "Performance".to_string(),
|
||||
pl1_watts: max_soak_watts,
|
||||
@@ -95,6 +94,7 @@ impl HeuristicAnalyst {
|
||||
balanced: balanced_profile,
|
||||
performance: performance_profile,
|
||||
thermal_resistance_kw: r_theta,
|
||||
ambient_temp: ambient,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
//! resolution strategies for overlapping daemons.
|
||||
|
||||
use anyhow::Result;
|
||||
use std::path::Path;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::fs;
|
||||
use crate::agent_analyst::OptimizationMatrix;
|
||||
|
||||
@@ -14,20 +14,42 @@ pub struct ServiceIntegrator;
|
||||
|
||||
impl ServiceIntegrator {
|
||||
/// Generates and saves an i8kmon configuration based on the balanced profile.
|
||||
pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> {
|
||||
pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> {
|
||||
let profile = &matrix.balanced;
|
||||
|
||||
let mut conf = String::new();
|
||||
conf.push_str("# Auto-generated by ember-tune Integrator
|
||||
");
|
||||
conf.push_str(&format!("# Profile: {}
|
||||
|
||||
", profile.name));
|
||||
|
||||
let mut conf = String::new();
|
||||
|
||||
// Read existing content to preserve daemon and other settings
|
||||
let existing = if let Some(src) = source_path {
|
||||
if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() }
|
||||
} else if output_path.exists() {
|
||||
fs::read_to_string(output_path).unwrap_or_default()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
if !existing.is_empty() {
|
||||
for line in existing.lines() {
|
||||
let trimmed = line.trim();
|
||||
// Filter out the old auto-generated config lines and fan configs
|
||||
if !trimmed.starts_with("set config(0)") &&
|
||||
!trimmed.starts_with("set config(1)") &&
|
||||
!trimmed.starts_with("set config(2)") &&
|
||||
!trimmed.starts_with("set config(3)") &&
|
||||
!trimmed.starts_with("# Auto-generated") &&
|
||||
!trimmed.starts_with("# Profile:") &&
|
||||
!trimmed.is_empty() {
|
||||
conf.push_str(line);
|
||||
conf.push('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
conf.push_str("\n# Auto-generated by ember-tune Integrator\n");
|
||||
conf.push_str(&format!("# Profile: {}\n", profile.name));
|
||||
conf.push_str(&format!("# Thermal Resistance: {:.3} K/W\n\n", matrix.thermal_resistance_kw));
|
||||
|
||||
for (i, p) in profile.fan_curve.iter().enumerate() {
|
||||
// i8kmon syntax: set config(state) {left_fan right_fan temp_on temp_off}
|
||||
// State 0, 1, 2, 3 correspond to BIOS fan states (off, low, high)
|
||||
|
||||
let state = match p.pwm_percent {
|
||||
0..=20 => 0,
|
||||
21..=50 => 1,
|
||||
@@ -35,31 +57,50 @@ impl ServiceIntegrator {
|
||||
_ => 2,
|
||||
};
|
||||
|
||||
let off = if i == 0 { "-".to_string() } else { format!("{}", p.temp_off) };
|
||||
conf.push_str(&format!("set config({}) {{{} {} {} {}}}
|
||||
", i, state, state, p.temp_on, off));
|
||||
let off = if i == 0 { "-".to_string() } else { format!("{:.0}", p.temp_off) };
|
||||
conf.push_str(&format!("set config({}) {{{} {} {:.0} {}}}\n", i, state, state, p.temp_on, off));
|
||||
}
|
||||
|
||||
fs::write(output_path, conf)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generates a thinkfan configuration.
|
||||
pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> {
|
||||
/// Generates a thinkfan configuration, merging with existing sensors if possible.
|
||||
pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> {
|
||||
let profile = &matrix.balanced;
|
||||
|
||||
let mut conf = String::new();
|
||||
conf.push_str("# Auto-generated by ember-tune Integrator
|
||||
");
|
||||
conf.push_str("sensors:
|
||||
- hwmon: /sys/class/hwmon/hwmon0/temp1_input
|
||||
|
||||
let existing = if let Some(src) = source_path {
|
||||
if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() }
|
||||
} else if output_path.exists() {
|
||||
fs::read_to_string(output_path).unwrap_or_default()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
");
|
||||
conf.push_str("levels:
|
||||
");
|
||||
if !existing.is_empty() {
|
||||
let mut in_sensors = false;
|
||||
for line in existing.lines() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed == "sensors:" { in_sensors = true; }
|
||||
if trimmed == "levels:" { in_sensors = false; }
|
||||
|
||||
if in_sensors {
|
||||
conf.push_str(line);
|
||||
conf.push('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if conf.is_empty() {
|
||||
conf.push_str("sensors:\n - hwmon: /sys/class/hwmon/hwmon0/temp1_input\n\n");
|
||||
}
|
||||
|
||||
conf.push_str("\n# Auto-generated by ember-tune Integrator\n");
|
||||
conf.push_str("levels:\n");
|
||||
|
||||
for (i, p) in profile.fan_curve.iter().enumerate() {
|
||||
// thinkfan syntax: - [level, temp_down, temp_up]
|
||||
let level = match p.pwm_percent {
|
||||
0..=20 => 0,
|
||||
21..=40 => 1,
|
||||
@@ -69,8 +110,7 @@ impl ServiceIntegrator {
|
||||
};
|
||||
|
||||
let down = if i == 0 { 0.0 } else { p.temp_off };
|
||||
conf.push_str(&format!(" - [{}, {}, {}]
|
||||
", level, down, p.temp_on));
|
||||
conf.push_str(&format!(" - [{}, {:.0}, {:.0}]\n", level, down, p.temp_on));
|
||||
}
|
||||
|
||||
fs::write(output_path, conf)?;
|
||||
@@ -91,7 +131,6 @@ sed -i 's/^CPU_BOOST_ON_AC=.*/CPU_BOOST_ON_AC=""/' /etc/tlp.conf
|
||||
systemctl restart tlp
|
||||
|
||||
# 3. Thermald Delegate (We provide the trips, it handles the rest)
|
||||
# (Ensure your custom thermal-conf.xml is in /etc/thermald/)
|
||||
systemctl restart thermald
|
||||
"#;
|
||||
fs::write(output_path, script)?;
|
||||
@@ -99,7 +138,7 @@ systemctl restart thermald
|
||||
}
|
||||
|
||||
/// Generates a thermald configuration XML.
|
||||
pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path) -> Result<()> {
|
||||
pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path, _source_path: Option<&PathBuf>) -> Result<()> {
|
||||
let profile = &matrix.balanced;
|
||||
let mut xml = String::new();
|
||||
xml.push_str("<?xml version=\"1.0\"?>\n<ThermalConfiguration>\n <Platform>\n <Name>ember-tune Balanced</Name>\n <ProductName>Generic</ProductName>\n <Preference>balanced</Preference>\n <ThermalZones>\n <ThermalZone>\n <Type>cpu</Type>\n <TripPoints>\n");
|
||||
|
||||
@@ -118,8 +118,15 @@ Trip_Temp_C: {trip:.0}
|
||||
result_lines.join("\n")
|
||||
}
|
||||
|
||||
pub fn save(path: &Path, config: &ThrottledConfig) -> Result<()> {
|
||||
let existing = if path.exists() { std::fs::read_to_string(path)? } else { String::new() };
|
||||
pub fn save(path: &Path, config: &ThrottledConfig, source_path: Option<&std::path::PathBuf>) -> Result<()> {
|
||||
let existing = if let Some(src) = source_path {
|
||||
if src.exists() { std::fs::read_to_string(src).unwrap_or_default() } else { String::new() }
|
||||
} else if path.exists() {
|
||||
std::fs::read_to_string(path).unwrap_or_default()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let content = if existing.is_empty() { Self::generate_conf(config) } else { Self::merge_conf(&existing, config) };
|
||||
std::fs::write(path, content)?;
|
||||
Ok(())
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use serde::{Serialize, Deserialize};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use tracing::warn;
|
||||
use tracing::{warn, debug};
|
||||
|
||||
pub mod formatters;
|
||||
|
||||
@@ -26,6 +26,7 @@ pub struct ThermalPoint {
|
||||
pub struct ThermalProfile {
|
||||
pub points: Vec<ThermalPoint>,
|
||||
pub ambient_temp: f32,
|
||||
pub r_theta: f32,
|
||||
}
|
||||
|
||||
/// The final, recommended parameters derived from the thermal benchmark.
|
||||
@@ -52,24 +53,16 @@ pub struct OptimizationResult {
|
||||
}
|
||||
|
||||
/// Pure mathematics engine for thermal optimization.
|
||||
///
|
||||
/// Contains no hardware I/O and operates solely on the collected [ThermalProfile].
|
||||
pub struct OptimizerEngine {
|
||||
/// The size of the sliding window for the `smooth` function.
|
||||
window_size: usize,
|
||||
}
|
||||
|
||||
impl OptimizerEngine {
|
||||
/// Creates a new `OptimizerEngine`.
|
||||
pub fn new(window_size: usize) -> Self {
|
||||
Self { window_size }
|
||||
}
|
||||
|
||||
/// Applies a simple moving average (SMA) filter with outlier rejection.
|
||||
///
|
||||
/// This function smooths noisy sensor data. It rejects any value in the
|
||||
/// window that is more than 20.0 units away from the window's average
|
||||
/// before calculating the final smoothed value.
|
||||
/// Smoothes sensor jitter using a moving average with outlier rejection.
|
||||
pub fn smooth(&self, data: &[f32]) -> Vec<f32> {
|
||||
if data.is_empty() { return vec![]; }
|
||||
let mut smoothed = Vec::with_capacity(data.len());
|
||||
@@ -81,7 +74,7 @@ impl OptimizerEngine {
|
||||
let window = &data[start..end];
|
||||
let avg: f32 = window.iter().sum::<f32>() / window.len() as f32;
|
||||
let filtered: Vec<f32> = window.iter()
|
||||
.filter(|&&v| (v - avg).abs() < 20.0) // Reject spikes > 20 units
|
||||
.filter(|&&v| (v - avg).abs() < 10.0)
|
||||
.cloned().collect();
|
||||
|
||||
if filtered.is_empty() {
|
||||
@@ -93,108 +86,65 @@ impl OptimizerEngine {
|
||||
smoothed
|
||||
}
|
||||
|
||||
/// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package.
|
||||
///
|
||||
/// This function uses the data point with the highest power draw to ensure
|
||||
/// the calculation reflects a system under maximum thermal load.
|
||||
pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 {
|
||||
profile.points.iter()
|
||||
.filter(|p| p.power_w > 1.0 && p.temp_c > 30.0) // Filter invalid data
|
||||
.max_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.map(|p| (p.temp_c - profile.ambient_temp) / p.power_w)
|
||||
.unwrap_or(0.0)
|
||||
/// Evaluates if a series of temperature readings have reached thermal equilibrium.
|
||||
/// Criteria: Standard deviation < 0.25C over the last 10 seconds.
|
||||
pub fn is_stable(&self, temps: &[f32]) -> bool {
|
||||
if temps.len() < 20 { return false; } // Need at least 10s of data (500ms intervals)
|
||||
let window = &temps[temps.len() - 20..];
|
||||
|
||||
let avg = window.iter().sum::<f32>() / window.len() as f32;
|
||||
let variance = window.iter().map(|&t| (t - avg).powi(2)).sum::<f32>() / window.len() as f32;
|
||||
let std_dev = variance.sqrt();
|
||||
|
||||
debug!("Stability Check: StdDev={:.3}C (Target < 0.25C)", std_dev);
|
||||
std_dev < 0.25
|
||||
}
|
||||
|
||||
/// Returns the maximum temperature recorded in the profile.
|
||||
pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 {
|
||||
profile.points.iter()
|
||||
.map(|p| p.temp_c)
|
||||
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.unwrap_or(0.0)
|
||||
/// Predicts the steady-state temperature for a given target wattage.
|
||||
/// Formula: T_pred = T_ambient + (P_target * R_theta)
|
||||
pub fn predict_temp(&self, target_watts: f32, ambient: f32, r_theta: f32) -> f32 {
|
||||
ambient + (target_watts * r_theta)
|
||||
}
|
||||
|
||||
/// Finds the "Silicon Knee" - the point where performance-per-watt (efficiency)
|
||||
/// starts to diminish significantly and thermal density spikes.
|
||||
///
|
||||
/// This heuristic scoring model balances several factors:
|
||||
/// 1. **Efficiency Drop:** How quickly does performance-per-watt decrease as power increases?
|
||||
/// 2. **Thermal Acceleration:** How quickly does temperature rise per additional Watt?
|
||||
/// 3. **Throttling Penalty:** A large penalty is applied if absolute performance drops, indicating a thermal wall.
|
||||
///
|
||||
/// The "Knee" is the power level with the highest score, representing the optimal
|
||||
/// balance before thermal saturation causes diminishing returns.
|
||||
/// Calculates Thermal Resistance (K/W) using the steady-state delta.
|
||||
pub fn calculate_r_theta(&self, ambient: f32, steady_temp: f32, steady_power: f32) -> f32 {
|
||||
if steady_power < 1.0 { return 0.0; }
|
||||
(steady_temp - ambient) / steady_power
|
||||
}
|
||||
|
||||
/// Identifies the "Silicon Knee" by finding the point of maximum efficiency.
|
||||
pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 {
|
||||
let valid_points: Vec<_> = profile.points.iter()
|
||||
.filter(|p| p.power_w > 5.0 && p.temp_c > 40.0) // Filter idle/noise
|
||||
.cloned()
|
||||
.collect();
|
||||
if profile.points.is_empty() { return 15.0; }
|
||||
|
||||
if valid_points.len() < 3 {
|
||||
return profile.points.last().map(|p| p.power_w).unwrap_or(15.0);
|
||||
}
|
||||
|
||||
let mut points = valid_points;
|
||||
let mut points = profile.points.clone();
|
||||
points.sort_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
let mut best_pl = points[0].power_w;
|
||||
let mut max_score = f32::MIN;
|
||||
let efficiencies: Vec<(f32, f32)> = points.iter()
|
||||
.map(|p| {
|
||||
let perf = if p.throughput > 0.0 { p.throughput as f32 } else { p.freq_mhz };
|
||||
(p.power_w, perf / p.power_w.max(1.0))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Use a sliding window (3 points) to calculate gradients more robustly
|
||||
for i in 1..points.len() - 1 {
|
||||
let prev = &points[i - 1];
|
||||
let curr = &points[i];
|
||||
let next = &points[i + 1];
|
||||
if efficiencies.is_empty() { return 15.0; }
|
||||
|
||||
// 1. Efficiency Metric (Throughput per Watt or Freq per Watt)
|
||||
let efficiency_curr = if curr.throughput > 0.0 {
|
||||
curr.throughput as f32 / curr.power_w.max(1.0)
|
||||
let max_efficiency = efficiencies.iter()
|
||||
.map(|(_, e)| *e)
|
||||
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.unwrap_or(1.0);
|
||||
|
||||
let mut knee_watts = points[0].power_w;
|
||||
for (watts, efficiency) in efficiencies {
|
||||
if efficiency >= (max_efficiency * 0.85) {
|
||||
knee_watts = watts;
|
||||
} else {
|
||||
curr.freq_mhz / curr.power_w.max(1.0)
|
||||
};
|
||||
|
||||
let efficiency_next = if next.throughput > 0.0 {
|
||||
next.throughput as f32 / next.power_w.max(1.0)
|
||||
} else {
|
||||
next.freq_mhz / next.power_w.max(1.0)
|
||||
};
|
||||
|
||||
let p_delta = (next.power_w - curr.power_w).max(0.5);
|
||||
let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta;
|
||||
|
||||
// 2. Thermal Acceleration (d2T/dW2)
|
||||
let p_delta_prev = (curr.power_w - prev.power_w).max(0.5);
|
||||
let p_delta_next = (next.power_w - curr.power_w).max(0.5);
|
||||
|
||||
let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev;
|
||||
let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next;
|
||||
|
||||
let p_total_delta = (next.power_w - prev.power_w).max(1.0);
|
||||
let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta;
|
||||
|
||||
// 3. Wall Detection (Any drop in absolute performance is a hard wall)
|
||||
let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput);
|
||||
let penalty = if is_throttling { 5000.0 } else { 0.0 };
|
||||
|
||||
let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty;
|
||||
|
||||
if score > max_score {
|
||||
max_score = score;
|
||||
best_pl = curr.power_w;
|
||||
debug!("Efficiency drop at {:.1}W ({:.1}% of peak)", watts, (efficiency/max_efficiency)*100.0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let best_pl = if max_score > f32::MIN {
|
||||
best_pl
|
||||
} else {
|
||||
profile.points.last().map(|p| p.power_w).unwrap_or(15.0)
|
||||
};
|
||||
|
||||
// Safety Floor: Never recommend a TDP below 5W, as this bricks system performance.
|
||||
if best_pl < 5.0 {
|
||||
warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl);
|
||||
return 15.0;
|
||||
}
|
||||
|
||||
best_pl
|
||||
knee_watts.clamp(PowerLimitWatts::MIN, PowerLimitWatts::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
use crate::sal::safety::PowerLimitWatts;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//! using a [Workload], and feeds telemetry to the frontend via MPSC channels.
|
||||
|
||||
use anyhow::{Result, Context, bail};
|
||||
use tracing::{info, warn, error};
|
||||
use tracing::{info, warn, error, debug};
|
||||
use std::sync::mpsc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
@@ -23,67 +23,40 @@ use crate::load::{Workload, IntensityProfile, StressVector};
|
||||
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
||||
use crate::agent_analyst::HeuristicAnalyst;
|
||||
use crate::agent_integrator::ServiceIntegrator;
|
||||
|
||||
/// Represents the possible states of the benchmark orchestrator.
|
||||
pub enum OrchestratorState {
|
||||
/// Performing pre-flight checks and snapshotting.
|
||||
PreFlight,
|
||||
/// Acquiring idle baseline telemetry.
|
||||
IdleBaseline,
|
||||
/// Actively sweeping through power limits.
|
||||
StressSweep { current_wattage: f32 },
|
||||
/// Allowing hardware to cool down before releasing the guard.
|
||||
ThermalCalibration,
|
||||
StabilitySweep,
|
||||
Cooldown,
|
||||
/// Benchmark complete, generating final results.
|
||||
Finalizing,
|
||||
}
|
||||
|
||||
/// The central state machine responsible for coordinating the thermal benchmark.
|
||||
pub struct BenchmarkOrchestrator {
|
||||
/// Injected hardware abstraction layer.
|
||||
sal: Arc<dyn PlatformSal>,
|
||||
/// Discovered system facts and paths.
|
||||
facts: SystemFactSheet,
|
||||
/// Heat generation workload.
|
||||
workload: Box<dyn Workload>,
|
||||
/// Channel for sending telemetry updates to the UI.
|
||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||
/// Channel for receiving commands from the UI.
|
||||
command_rx: mpsc::Receiver<UiCommand>,
|
||||
/// Current phase reported to the UI.
|
||||
ui_phase: BenchmarkPhase,
|
||||
/// Accumulated thermal data points.
|
||||
profile: ThermalProfile,
|
||||
/// Mathematics engine for data smoothing and optimization.
|
||||
engine: OptimizerEngine,
|
||||
/// CLI override for the configuration output path.
|
||||
optional_config_out: Option<PathBuf>,
|
||||
|
||||
/// The safety membrane protecting the system.
|
||||
safeguard: Option<HardwareStateGuard>,
|
||||
/// Active thermal watchdog.
|
||||
watchdog: Option<ThermalWatchdog>,
|
||||
|
||||
/// Sliding window of power readings (Watts).
|
||||
history_watts: VecDeque<f32>,
|
||||
/// Sliding window of temperature readings (Celsius).
|
||||
history_temp: VecDeque<f32>,
|
||||
/// Sliding window of CPU frequency (MHz).
|
||||
history_mhz: VecDeque<f32>,
|
||||
|
||||
/// Detected CPU model string.
|
||||
cpu_model: String,
|
||||
/// Total system RAM in Gigabytes.
|
||||
total_ram_gb: u64,
|
||||
|
||||
/// Atomic flag indicating a safety-triggered abort.
|
||||
emergency_abort: Arc<AtomicBool>,
|
||||
/// Human-readable reason for the emergency abort.
|
||||
emergency_reason: Arc<Mutex<Option<String>>>,
|
||||
}
|
||||
|
||||
impl BenchmarkOrchestrator {
|
||||
/// Creates a new orchestrator instance with injected dependencies.
|
||||
pub fn new(
|
||||
sal: Arc<dyn PlatformSal>,
|
||||
facts: SystemFactSheet,
|
||||
@@ -122,14 +95,13 @@ impl BenchmarkOrchestrator {
|
||||
}
|
||||
}
|
||||
|
||||
/// Executes the full benchmark sequence.
|
||||
pub fn run(&mut self) -> Result<OptimizationResult> {
|
||||
// Immediate Priming
|
||||
let _ = self.sal.get_temp();
|
||||
let _ = self.sal.get_power_w();
|
||||
let _ = self.sal.get_fan_rpms();
|
||||
|
||||
info!("Orchestrator: Initializing Project Iron-Ember lifecycle.");
|
||||
info!("Orchestrator: Initializing Project Iron-Ember PGC Protocol.");
|
||||
|
||||
// Spawn safety watchdog immediately
|
||||
let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone());
|
||||
@@ -147,24 +119,24 @@ impl BenchmarkOrchestrator {
|
||||
let _ = self.workload.stop_workload();
|
||||
|
||||
if let Some(mut sg) = self.safeguard.take() {
|
||||
if let Err(e) = sg.release() {
|
||||
error!("CRITICAL: State restoration failure: {}", e);
|
||||
}
|
||||
let _ = sg.release();
|
||||
}
|
||||
|
||||
info!("✓ Hardware state restored to pre-flight defaults.");
|
||||
if let Err(e) = self.sal.restore() {
|
||||
warn!("Failed secondary SAL restoration: {}", e);
|
||||
}
|
||||
|
||||
info!("✓ Hardware state restored.");
|
||||
result
|
||||
}
|
||||
|
||||
/// Internal execution logic for the benchmark phases.
|
||||
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
|
||||
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking configuration missing.")?;
|
||||
let _bench_cfg = self.facts.bench_config.clone().context("Config missing.")?;
|
||||
|
||||
// 1. Pre-Flight Phase
|
||||
self.ui_phase = BenchmarkPhase::Auditing;
|
||||
self.log("Phase: Pre-Flight Auditing & Sterilization")?;
|
||||
|
||||
// Snapshot and neutralise Brawl Matrix
|
||||
let mut target_files = self.facts.rapl_paths.iter()
|
||||
.map(|p| p.join("constraint_0_power_limit_uw"))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -177,7 +149,6 @@ impl BenchmarkOrchestrator {
|
||||
let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?;
|
||||
self.safeguard = Some(sg);
|
||||
|
||||
// Run auditor
|
||||
for step in self.sal.audit() {
|
||||
if let Err(e) = step.outcome {
|
||||
return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e));
|
||||
@@ -185,106 +156,117 @@ impl BenchmarkOrchestrator {
|
||||
}
|
||||
|
||||
self.workload.initialize().context("Failed to initialize load generator.")?;
|
||||
self.sal.suppress().context("Failed to suppress background services.")?;
|
||||
|
||||
let tick = Cell::new(0u64);
|
||||
|
||||
// 2. Idle Baseline Phase
|
||||
self.ui_phase = BenchmarkPhase::IdleCalibration;
|
||||
self.log(&format!("Phase: Recording Idle Baseline ({}s)", bench_cfg.idle_duration_s))?;
|
||||
|
||||
// Wait for fan spin-up
|
||||
self.log("Phase: Recording 30s Idle Baseline...")?;
|
||||
self.sal.set_fan_mode("auto")?;
|
||||
|
||||
let mut idle_temps = Vec::new();
|
||||
let start = Instant::now();
|
||||
while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) {
|
||||
while start.elapsed() < Duration::from_secs(30) {
|
||||
self.check_safety_abort()?;
|
||||
self.send_telemetry(tick.get())?;
|
||||
idle_temps.push(self.sal.get_temp().unwrap_or(0.0));
|
||||
tick.set(tick.get() + 1);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
|
||||
self.profile.ambient_temp = self.engine.smooth(&idle_temps).iter().sum::<f32>() / idle_temps.len() as f32;
|
||||
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
|
||||
|
||||
// 3. Stress Sweep Phase
|
||||
self.ui_phase = BenchmarkPhase::StressTesting;
|
||||
self.log("Phase: Synthetic Stress Matrix (Gradual Ramp)")?;
|
||||
|
||||
// Ensure fans are ramped to MAX before load
|
||||
self.log("Metrology: Locking fans to MAX...")?;
|
||||
// 3. Thermal Resistance Mapping (Phase 1)
|
||||
self.log("Phase: Mapping Thermal Resistance (Rθ) at 10W...")?;
|
||||
self.sal.set_fan_mode("max")?;
|
||||
let fan_lock_start = Instant::now();
|
||||
loop {
|
||||
let fans = self.sal.get_fan_rpms().unwrap_or_default();
|
||||
let max_rpm = fans.iter().cloned().max().unwrap_or(0);
|
||||
if max_rpm >= 3000 || fan_lock_start.elapsed() > Duration::from_secs(15) {
|
||||
|
||||
let pl_calib = PowerLimitWatts::try_new(10.0)?;
|
||||
self.sal.set_sustained_power_limit(pl_calib)?;
|
||||
self.sal.set_burst_power_limit(pl_calib)?;
|
||||
|
||||
self.workload.run_workload(
|
||||
Duration::from_secs(120),
|
||||
IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix }
|
||||
)?;
|
||||
|
||||
let mut calib_temps = Vec::new();
|
||||
let calib_start = Instant::now();
|
||||
while calib_start.elapsed() < Duration::from_secs(90) {
|
||||
self.check_safety_abort()?;
|
||||
self.send_telemetry(tick.get())?;
|
||||
let t = self.sal.get_temp().unwrap_or(0.0);
|
||||
calib_temps.push(t);
|
||||
tick.set(tick.get() + 1);
|
||||
|
||||
if calib_start.elapsed() > Duration::from_secs(30) && self.engine.is_stable(&calib_temps) {
|
||||
break;
|
||||
}
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
self.send_telemetry(tick.get())?;
|
||||
tick.set(tick.get() + 1);
|
||||
}
|
||||
|
||||
let steady_t = calib_temps.last().cloned().unwrap_or(0.0);
|
||||
let steady_p = self.sal.get_power_w().unwrap_or(10.0);
|
||||
self.profile.r_theta = self.engine.calculate_r_theta(self.profile.ambient_temp, steady_t, steady_p);
|
||||
self.log(&format!("✓ Physical Model: Rθ = {:.3} K/W", self.profile.r_theta))?;
|
||||
|
||||
let physical_threads = num_cpus::get_physical();
|
||||
// 4. Physically-Aware Stability Sweep (Phase 2)
|
||||
self.ui_phase = BenchmarkPhase::StressTesting;
|
||||
self.log("Phase: Starting Physically-Aware Efficiency Sweep...")?;
|
||||
|
||||
let mut current_w = 12.0_f32;
|
||||
let mut previous_ops = 0.0;
|
||||
|
||||
for &watts in &bench_cfg.power_steps_watts {
|
||||
self.check_safety_abort()?;
|
||||
self.log(&format!("Testing PL1 = {:.0}W", watts))?;
|
||||
|
||||
// Apply limits safely
|
||||
let pl1 = PowerLimitWatts::try_new(watts)?;
|
||||
let pl2 = PowerLimitWatts::try_new(watts + 5.0)?;
|
||||
|
||||
self.sal.set_sustained_power_limit(pl1)?;
|
||||
self.sal.set_burst_power_limit(pl2)?;
|
||||
|
||||
// Start workload
|
||||
loop {
|
||||
// Predict if this step is safe
|
||||
let pred_t = self.engine.predict_temp(current_w, self.profile.ambient_temp, self.profile.r_theta);
|
||||
if pred_t > 92.0 {
|
||||
self.log(&format!("Prediction: {:.1}W would result in {:.1}C (Too Hot). Finalizing...", current_w, pred_t))?;
|
||||
break;
|
||||
}
|
||||
|
||||
self.log(&format!("Step: {:.1}W (Predicted: {:.1}C)", current_w, pred_t))?;
|
||||
let pl = PowerLimitWatts::try_new(current_w)?;
|
||||
self.sal.set_sustained_power_limit(pl)?;
|
||||
self.sal.set_burst_power_limit(PowerLimitWatts::try_new(current_w + 2.0)?)?;
|
||||
|
||||
self.workload.run_workload(
|
||||
Duration::from_secs(bench_cfg.stress_duration_max_s),
|
||||
IntensityProfile { threads: physical_threads, load_percentage: 100, vector: StressVector::CpuMatrix }
|
||||
Duration::from_secs(60),
|
||||
IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix }
|
||||
)?;
|
||||
|
||||
let step_start = Instant::now();
|
||||
let mut step_temps = VecDeque::with_capacity(30);
|
||||
let mut previous_step_temp = self.sal.get_temp().unwrap_or(0.0);
|
||||
let mut step_temps = Vec::new();
|
||||
let mut previous_t = self.sal.get_temp().unwrap_or(0.0);
|
||||
|
||||
// Equilibrium Gating
|
||||
while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) {
|
||||
while step_start.elapsed() < Duration::from_secs(60) {
|
||||
self.check_safety_abort()?;
|
||||
|
||||
self.send_telemetry(tick.get())?;
|
||||
|
||||
let t = self.sal.get_temp().unwrap_or(0.0);
|
||||
let dt_dt = (t - previous_step_temp) / 0.5;
|
||||
previous_step_temp = t;
|
||||
let dt_dt = (t - previous_t) / 0.5;
|
||||
|
||||
// Redundant safety check during step
|
||||
if t > 94.0 || dt_dt > 5.0 {
|
||||
warn!("Thermal Spike Detected! Aborting current step.");
|
||||
break;
|
||||
// # SAFETY: predictive hard-quench threshold raised to 8C/s
|
||||
if step_start.elapsed() > Duration::from_secs(2) && (t > 95.0 || dt_dt > 8.0) {
|
||||
warn!("USA: Safety Break triggered! T={:.1}C, dT/dt={:.1}C/s", t, dt_dt);
|
||||
let _ = self.sal.set_sustained_power_limit(PowerLimitWatts::try_new(3.0)?);
|
||||
break; // Just break the sweep loop
|
||||
}
|
||||
|
||||
step_temps.push_back(t);
|
||||
if step_temps.len() > 10 { step_temps.pop_front(); }
|
||||
|
||||
self.send_telemetry(tick.get())?;
|
||||
step_temps.push(t);
|
||||
tick.set(tick.get() + 1);
|
||||
|
||||
if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 {
|
||||
let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b));
|
||||
let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b));
|
||||
if (max - min) < 0.5 {
|
||||
info!("Equilibrium reached at {:.1}°C", t);
|
||||
break;
|
||||
}
|
||||
if step_start.elapsed() > Duration::from_secs(15) && self.engine.is_stable(&step_temps) {
|
||||
self.log(&format!(" Equilibrium reached at {:.1}°C", t))?;
|
||||
break;
|
||||
}
|
||||
previous_t = t;
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
// Record data point
|
||||
let metrics = self.workload.get_current_metrics().unwrap_or_default();
|
||||
self.profile.points.push(ThermalPoint {
|
||||
power_w: self.sal.get_power_w().unwrap_or(watts),
|
||||
power_w: self.sal.get_power_w().unwrap_or(current_w),
|
||||
temp_c: self.sal.get_temp().unwrap_or(0.0),
|
||||
freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0),
|
||||
fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0),
|
||||
@@ -293,64 +275,62 @@ impl BenchmarkOrchestrator {
|
||||
|
||||
self.workload.stop_workload()?;
|
||||
|
||||
// Performance Halt Condition
|
||||
// Efficiency Break
|
||||
if previous_ops > 0.0 {
|
||||
let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0;
|
||||
if gain < 1.0 {
|
||||
self.log("Diminishing returns reached. Stopping sweep.")?;
|
||||
self.log("Silicon Knee identified (gain < 1%). Finalizing...")?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
previous_ops = metrics.primary_ops_per_sec;
|
||||
current_w += 2.0;
|
||||
if current_w > 45.0 { break; }
|
||||
|
||||
self.log(&format!("Cooling down ({}s)...", bench_cfg.cool_down_s))?;
|
||||
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
|
||||
self.log(&format!("Cooling down ({}s)...", _bench_cfg.cool_down_s))?;
|
||||
thread::sleep(Duration::from_secs(_bench_cfg.cool_down_s));
|
||||
}
|
||||
|
||||
// 4. Physical Modeling Phase
|
||||
// 5. Modeling Phase
|
||||
self.ui_phase = BenchmarkPhase::PhysicalModeling;
|
||||
self.log("Phase: Silicon Physical Sweet Spot Calculation")?;
|
||||
|
||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||
let analyst = HeuristicAnalyst::new();
|
||||
let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0));
|
||||
|
||||
let mut res = self.generate_result(false);
|
||||
res.optimization_matrix = Some(matrix.clone());
|
||||
|
||||
info!("Identification complete. Knee: {:.1}W, Rθ: {:.3} K/W", res.silicon_knee_watts, res.thermal_resistance_kw);
|
||||
res.silicon_knee_watts = knee;
|
||||
|
||||
// 5. Finalizing Phase
|
||||
// 6. Finalizing Phase
|
||||
self.ui_phase = BenchmarkPhase::Finalizing;
|
||||
self.log("Phase: Generation of Optimized Configuration Sets")?;
|
||||
|
||||
let throttled_path = self.optional_config_out.clone()
|
||||
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
|
||||
|
||||
if let Some(path) = throttled_path {
|
||||
let throttled_source = self.facts.paths.configs.get("throttled");
|
||||
if let Some(path) = self.optional_config_out.clone().or_else(|| throttled_source.cloned()) {
|
||||
let config = crate::engine::formatters::throttled::ThrottledConfig {
|
||||
pl1_limit: res.silicon_knee_watts,
|
||||
pl2_limit: res.recommended_pl2,
|
||||
trip_temp: res.max_temp_c.max(90.0),
|
||||
pl2_limit: res.silicon_knee_watts * 1.25,
|
||||
trip_temp: 90.0,
|
||||
};
|
||||
crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?;
|
||||
self.log(&format!("✓ Saved Throttled profile to {}", path.display()))?;
|
||||
let _ = crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config, throttled_source);
|
||||
res.config_paths.insert("throttled".to_string(), path);
|
||||
}
|
||||
|
||||
let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc"));
|
||||
let i8k_source = self.facts.paths.configs.get("i8kmon");
|
||||
let i8k_out = base_out.join("i8kmon.conf");
|
||||
if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out, i8k_source).is_ok() {
|
||||
res.config_paths.insert("i8kmon".to_string(), i8k_out);
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Checks if the safety watchdog or user triggered an abort.
|
||||
fn check_safety_abort(&self) -> Result<()> {
|
||||
if self.emergency_abort.load(Ordering::SeqCst) {
|
||||
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog Triggered".to_string());
|
||||
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog".to_string());
|
||||
bail!("EMERGENCY_ABORT: {}", reason);
|
||||
}
|
||||
|
||||
if let Ok(cmd) = self.command_rx.try_recv() {
|
||||
match cmd {
|
||||
UiCommand::Abort => bail!("ABORTED"),
|
||||
}
|
||||
if let UiCommand::Abort = cmd { bail!("ABORTED"); }
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -365,49 +345,35 @@ impl BenchmarkOrchestrator {
|
||||
current_freq: self.sal.get_freq_mhz().unwrap_or(0.0),
|
||||
fans: self.sal.get_fan_rpms().unwrap_or_default(),
|
||||
governor: "performance".to_string(),
|
||||
pl1_limit: 0.0,
|
||||
pl2_limit: 0.0,
|
||||
fan_tier: "auto".to_string(),
|
||||
pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(),
|
||||
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
|
||||
phase: self.ui_phase,
|
||||
history_watts: Vec::new(),
|
||||
history_temp: Vec::new(),
|
||||
history_mhz: Vec::new(),
|
||||
history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(),
|
||||
log_event: Some(msg.to_string()),
|
||||
metadata: std::collections::HashMap::new(),
|
||||
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||
};
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed"))
|
||||
}
|
||||
|
||||
fn send_telemetry(&mut self, tick: u64) -> Result<()> {
|
||||
let temp = self.sal.get_temp().unwrap_or(0.0);
|
||||
let pwr = self.sal.get_power_w().unwrap_or(0.0);
|
||||
let freq = self.sal.get_freq_mhz().unwrap_or(0.0);
|
||||
|
||||
self.history_temp.push_back(temp);
|
||||
self.history_watts.push_back(pwr);
|
||||
self.history_mhz.push_back(freq);
|
||||
|
||||
if self.history_temp.len() > 120 {
|
||||
self.history_temp.pop_front();
|
||||
self.history_watts.pop_front();
|
||||
self.history_mhz.pop_front();
|
||||
}
|
||||
if self.history_temp.len() > 120 { self.history_temp.pop_front(); self.history_watts.pop_front(); self.history_mhz.pop_front(); }
|
||||
|
||||
let state = TelemetryState {
|
||||
cpu_model: self.cpu_model.clone(),
|
||||
total_ram_gb: self.total_ram_gb,
|
||||
tick,
|
||||
cpu_temp: temp,
|
||||
power_w: pwr,
|
||||
current_freq: freq,
|
||||
cpu_temp: temp, power_w: pwr, current_freq: freq,
|
||||
fans: self.sal.get_fan_rpms().unwrap_or_default(),
|
||||
governor: "performance".to_string(),
|
||||
pl1_limit: 15.0,
|
||||
pl2_limit: 25.0,
|
||||
fan_tier: "max".to_string(),
|
||||
pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(),
|
||||
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
|
||||
phase: self.ui_phase,
|
||||
history_watts: self.history_watts.iter().cloned().collect(),
|
||||
@@ -418,21 +384,19 @@ impl BenchmarkOrchestrator {
|
||||
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||
};
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed"))
|
||||
}
|
||||
|
||||
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
|
||||
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
|
||||
let r_theta = self.profile.r_theta;
|
||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||
let max_t = self.engine.get_max_temp(&self.profile);
|
||||
|
||||
OptimizationResult {
|
||||
profile: self.profile.clone(),
|
||||
silicon_knee_watts: knee,
|
||||
thermal_resistance_kw: r_theta,
|
||||
recommended_pl1: knee,
|
||||
recommended_pl2: knee * 1.25,
|
||||
max_temp_c: max_t,
|
||||
max_temp_c: self.profile.points.iter().map(|p| p.temp_c).max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)).unwrap_or(0.0),
|
||||
is_partial,
|
||||
config_paths: std::collections::HashMap::new(),
|
||||
optimization_matrix: None,
|
||||
|
||||
@@ -2,8 +2,7 @@ use anyhow::{Result, anyhow, Context};
|
||||
use std::path::{Path};
|
||||
use std::fs;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::sync::{Mutex, Arc};
|
||||
use tracing::{debug, warn, info};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use anyhow::Result;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct MockSal {
|
||||
pub temperature_sequence: std::sync::atomic::AtomicUsize,
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::fs;
|
||||
use std::path::{PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::Duration;
|
||||
use std::thread;
|
||||
use tracing::{info, warn, error, debug};
|
||||
|
||||
|
||||
@@ -1,35 +1,75 @@
|
||||
#[path = "../src/engine/formatters/throttled.rs"]
|
||||
mod throttled;
|
||||
|
||||
use throttled::{ThrottledTranslator, ThrottledConfig};
|
||||
use ember_tune_rs::engine::formatters::throttled::{ThrottledConfig, ThrottledTranslator};
|
||||
use ember_tune_rs::agent_analyst::{OptimizationMatrix, SystemProfile, FanCurvePoint};
|
||||
use ember_tune_rs::agent_integrator::ServiceIntegrator;
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
|
||||
fn test_throttled_formatter_non_destructive() {
|
||||
let fixture_path = "tests/fixtures/throttled.conf";
|
||||
let existing_content = fs::read_to_string(fixture_path).expect("Failed to read fixture");
|
||||
|
||||
fn test_throttled_merge_preserves_undervolt() {
|
||||
let existing = r#"[GENERAL]
|
||||
Update_Interval_ms: 1000
|
||||
|
||||
[UNDERVOLT]
|
||||
# CPU core undervolt
|
||||
CORE: -100
|
||||
# GPU undervolt
|
||||
GPU: -50
|
||||
|
||||
[AC]
|
||||
PL1_Tdp_W: 15
|
||||
PL2_Tdp_W: 25
|
||||
"#;
|
||||
|
||||
let config = ThrottledConfig {
|
||||
pl1_limit: 25.0,
|
||||
pl2_limit: 35.0,
|
||||
trip_temp: 90.0,
|
||||
pl1_limit: 22.0,
|
||||
pl2_limit: 28.0,
|
||||
trip_temp: 95.0,
|
||||
};
|
||||
|
||||
let merged = ThrottledTranslator::merge_conf(&existing_content, &config);
|
||||
let merged = ThrottledTranslator::merge_conf(existing, &config);
|
||||
|
||||
// Assert updates
|
||||
assert!(merged.contains("PL1_Tdp_W: 25"));
|
||||
assert!(merged.contains("PL2_Tdp_W: 35"));
|
||||
assert!(merged.contains("Trip_Temp_C: 90"));
|
||||
|
||||
// Assert preservation
|
||||
assert!(merged.contains("[UNDERVOLT]"));
|
||||
assert!(merged.contains("CORE: -100"));
|
||||
assert!(merged.contains("GPU: -50"));
|
||||
assert!(merged.contains("# Important: Preserving undervolt offsets is critical!"));
|
||||
assert!(merged.contains("Update_Interval_ms: 3000"));
|
||||
|
||||
// Check that we didn't lose the [GENERAL] section
|
||||
assert!(merged.contains("[GENERAL]"));
|
||||
assert!(merged.contains("# This is a complex test fixture"));
|
||||
assert!(merged.contains("PL1_Tdp_W: 22"));
|
||||
assert!(merged.contains("PL2_Tdp_W: 28"));
|
||||
assert!(merged.contains("Trip_Temp_C: 95"));
|
||||
assert!(merged.contains("[UNDERVOLT]"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i8kmon_merge_preserves_settings() {
|
||||
let dir = tempdir().unwrap();
|
||||
let config_path = dir.path().join("i8kmon.conf");
|
||||
|
||||
let existing = r#"set config(gen_shadow) 1
|
||||
set config(i8k_ignore_dmi) 1
|
||||
set config(daemon) 1
|
||||
|
||||
set config(0) {0 0 60 50}
|
||||
"#;
|
||||
fs::write(&config_path, existing).unwrap();
|
||||
|
||||
let matrix = OptimizationMatrix {
|
||||
silent: SystemProfile { name: "Silent".to_string(), pl1_watts: 10.0, pl2_watts: 12.0, fan_curve: vec![] },
|
||||
balanced: SystemProfile {
|
||||
name: "Balanced".to_string(),
|
||||
pl1_watts: 20.0,
|
||||
pl2_watts: 25.0,
|
||||
fan_curve: vec![
|
||||
FanCurvePoint { temp_on: 70.0, temp_off: 60.0, pwm_percent: 50 }
|
||||
]
|
||||
},
|
||||
performance: SystemProfile { name: "Perf".to_string(), pl1_watts: 30.0, pl2_watts: 35.0, fan_curve: vec![] },
|
||||
thermal_resistance_kw: 1.5,
|
||||
ambient_temp: 25.0,
|
||||
};
|
||||
|
||||
ServiceIntegrator::generate_i8kmon_config(&matrix, &config_path, Some(&config_path)).unwrap();
|
||||
|
||||
let result = fs::read_to_string(&config_path).unwrap();
|
||||
|
||||
assert!(result.contains("set config(gen_shadow) 1"));
|
||||
assert!(result.contains("set config(daemon) 1"));
|
||||
assert!(result.contains("set config(0) {1 1 70 -}")); // New config
|
||||
assert!(!result.contains("set config(0) {0 0 60 50}")); // Old config should be gone
|
||||
}
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use ember_tune_rs::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts};
|
||||
use ember_tune_rs::sal::safety::{HardwareStateGuard, PowerLimitWatts};
|
||||
use crate::common::fakesys::FakeSysBuilder;
|
||||
use std::fs;
|
||||
|
||||
mod common;
|
||||
|
||||
@@ -34,23 +32,22 @@ fn test_hardware_state_guard_panic_restoration() {
|
||||
#[test]
|
||||
fn test_tdp_limit_bounds_checking() {
|
||||
// 1. Valid value
|
||||
assert!(TdpLimitMicroWatts::new(15_000_000).is_ok());
|
||||
assert!(PowerLimitWatts::try_new(15.0).is_ok());
|
||||
|
||||
// 2. Too low (Dangerous 0W or below 5W)
|
||||
let low_res = TdpLimitMicroWatts::new(1_000_000);
|
||||
// 2. Too low (Dangerous 0W or below 3W)
|
||||
let low_res = PowerLimitWatts::try_new(1.0);
|
||||
assert!(low_res.is_err());
|
||||
assert!(low_res.unwrap_err().to_string().contains("below safety floor"));
|
||||
assert!(low_res.unwrap_err().to_string().contains("outside safe bounds"));
|
||||
|
||||
// 3. Too high (> 80W)
|
||||
let high_res = TdpLimitMicroWatts::new(100_000_000);
|
||||
// 3. Too high (> 100W)
|
||||
let high_res = PowerLimitWatts::try_new(150.0);
|
||||
assert!(high_res.is_err());
|
||||
assert!(high_res.unwrap_err().to_string().contains("exceeds safety ceiling"));
|
||||
assert!(high_res.unwrap_err().to_string().contains("outside safe bounds"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_0w_tdp_regression_prevention() {
|
||||
// The prime directive is to never set 0W.
|
||||
// Ensure the new() constructor explicitly fails for 0.
|
||||
let zero_res = TdpLimitMicroWatts::new(0);
|
||||
let zero_res = PowerLimitWatts::try_new(0.0);
|
||||
assert!(zero_res.is_err());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user