release/1.2.0 #2
151
src/load/mod.rs
151
src/load/mod.rs
@@ -1,61 +1,136 @@
|
|||||||
//! Defines the `Workload` trait for generating synthetic CPU/GPU load.
|
//! Load generation and performance measurement subsystem.
|
||||||
|
|
||||||
use anyhow::Result;
|
use anyhow::{Result, Context, anyhow};
|
||||||
use std::process::Child;
|
use std::process::{Child, Command, Stdio};
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use std::thread;
|
use std::thread;
|
||||||
|
use std::io::{BufRead, BufReader};
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
/// A trait for objects that can generate a measurable system load.
|
/// Standardized telemetry returned by any workload implementation.
|
||||||
pub trait Workload: Send + Sync {
|
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||||
/// Starts the workload with the specified number of threads and load percentage.
|
pub struct WorkloadMetrics {
|
||||||
///
|
/// Primary performance heuristic (e.g., Bogo Ops/s)
|
||||||
/// # Errors
|
pub primary_ops_per_sec: f64,
|
||||||
/// Returns an error if the underlying stress test process fails to spawn.
|
/// Time elapsed since the workload started
|
||||||
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>;
|
pub elapsed_time: Duration,
|
||||||
|
|
||||||
/// Stops the workload gracefully.
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// This method should aim to not fail, but may return an error if
|
|
||||||
/// forcefully killing the child process fails.
|
|
||||||
fn stop(&mut self) -> Result<()>;
|
|
||||||
|
|
||||||
/// Returns the current throughput of the workload (e.g., ops/sec).
|
|
||||||
///
|
|
||||||
/// # Errors
|
|
||||||
/// Returns an error if throughput cannot be measured.
|
|
||||||
fn get_throughput(&self) -> Result<f64>;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// An implementation of `Workload` that uses the `stress-ng` utility.
|
/// A normalized profile defining the intensity and constraints of the workload.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct IntensityProfile {
|
||||||
|
pub threads: usize,
|
||||||
|
pub load_percentage: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The replaceable interface for load generation and performance measurement.
|
||||||
|
pub trait Workload: Send + Sync {
|
||||||
|
/// Sets up prerequisites (e.g., binary checks).
|
||||||
|
fn initialize(&mut self) -> Result<()>;
|
||||||
|
|
||||||
|
/// Executes the load asynchronously.
|
||||||
|
fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()>;
|
||||||
|
|
||||||
|
/// Returns the current standardized telemetry object.
|
||||||
|
fn get_current_metrics(&self) -> Result<WorkloadMetrics>;
|
||||||
|
|
||||||
|
/// Gracefully and forcefully terminates the workload.
|
||||||
|
fn stop_workload(&mut self) -> Result<()>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Implementation of the Benchmarking Interface using stress-ng matrix stressors.
|
||||||
pub struct StressNg {
|
pub struct StressNg {
|
||||||
child: Option<Child>,
|
child: Option<Child>,
|
||||||
|
start_time: Option<Instant>,
|
||||||
|
latest_metrics: Arc<Mutex<WorkloadMetrics>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StressNg {
|
impl StressNg {
|
||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self { child: None }
|
Self {
|
||||||
|
child: None,
|
||||||
|
start_time: None,
|
||||||
|
latest_metrics: Arc::new(Mutex::new(WorkloadMetrics::default())),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Workload for StressNg {
|
impl Workload for StressNg {
|
||||||
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> {
|
fn initialize(&mut self) -> Result<()> {
|
||||||
self.stop()?;
|
let status = Command::new("stress-ng")
|
||||||
|
.arg("--version")
|
||||||
|
.stdout(Stdio::null())
|
||||||
|
.stderr(Stdio::null())
|
||||||
|
.status()
|
||||||
|
.context("stress-ng binary not found in PATH")?;
|
||||||
|
|
||||||
let child = std::process::Command::new("stress-ng")
|
if !status.success() {
|
||||||
|
return Err(anyhow!("stress-ng failed to initialize"));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()> {
|
||||||
|
self.stop_workload()?; // Ensure clean state
|
||||||
|
|
||||||
|
let threads = profile.threads.to_string();
|
||||||
|
let timeout = format!("{}s", duration.as_secs());
|
||||||
|
let load = profile.load_percentage.to_string();
|
||||||
|
|
||||||
|
let mut child = Command::new("stress-ng")
|
||||||
.args([
|
.args([
|
||||||
"--cpu", &threads.to_string(),
|
"--matrix", &threads,
|
||||||
"--cpu-load", &load_percent.to_string(),
|
"--cpu-load", &load,
|
||||||
"--quiet"
|
"--timeout", &timeout,
|
||||||
|
"--metrics-brief",
|
||||||
|
"--metrics-brief", // Repeat for stderr/stdout consistency
|
||||||
])
|
])
|
||||||
.spawn()?;
|
.stdout(Stdio::piped())
|
||||||
|
.stderr(Stdio::piped())
|
||||||
|
.spawn()
|
||||||
|
.context("Failed to spawn stress-ng")?;
|
||||||
|
|
||||||
|
self.start_time = Some(Instant::now());
|
||||||
|
|
||||||
|
// Spawn metrics parser thread
|
||||||
|
let metrics_ref = Arc::clone(&self.latest_metrics);
|
||||||
|
let stderr = child.stderr.take().expect("Failed to capture stderr");
|
||||||
|
|
||||||
|
thread::spawn(move || {
|
||||||
|
let reader = BufReader::new(stderr);
|
||||||
|
for line in reader.lines().flatten() {
|
||||||
|
// Parse stress-ng metrics line:
|
||||||
|
// stress-ng: info: [PID] matrix [OPS] [TIME] [BOGO OPS/S]
|
||||||
|
if line.contains("matrix") && line.contains("bogo ops/s") {
|
||||||
|
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||||
|
if let Some(ops_idx) = parts.iter().position(|&p| p == "ops/s") {
|
||||||
|
if let Some(ops_val) = parts.get(ops_idx - 1) {
|
||||||
|
if let Ok(ops) = ops_val.parse::<f64>() {
|
||||||
|
let mut m = metrics_ref.lock().unwrap();
|
||||||
|
m.primary_ops_per_sec = ops;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
self.child = Some(child);
|
self.child = Some(child);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn stop(&mut self) -> Result<()> {
|
fn get_current_metrics(&self) -> Result<WorkloadMetrics> {
|
||||||
|
let mut m = self.latest_metrics.lock().unwrap().clone();
|
||||||
|
if let Some(start) = self.start_time {
|
||||||
|
m.elapsed_time = start.elapsed();
|
||||||
|
}
|
||||||
|
Ok(m)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn stop_workload(&mut self) -> Result<()> {
|
||||||
if let Some(mut child) = self.child.take() {
|
if let Some(mut child) = self.child.take() {
|
||||||
|
// Polite SIGTERM
|
||||||
#[cfg(unix)]
|
#[cfg(unix)]
|
||||||
{
|
{
|
||||||
use libc::{kill, SIGTERM};
|
use libc::{kill, SIGTERM};
|
||||||
@@ -77,19 +152,13 @@ impl Workload for StressNg {
|
|||||||
let _ = child.wait();
|
let _ = child.wait();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
self.start_time = None;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the current throughput of the workload (e.g., ops/sec).
|
|
||||||
///
|
|
||||||
/// This is currently a stub and does not parse `stress-ng` output.
|
|
||||||
fn get_throughput(&self) -> Result<f64> {
|
|
||||||
Ok(0.0)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for StressNg {
|
impl Drop for StressNg {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
let _ = self.stop();
|
let _ = self.stop_workload();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,9 +14,10 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
|||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
use crate::sal::traits::{PlatformSal, SafetyStatus};
|
use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
|
||||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||||
use crate::load::Workload;
|
use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts};
|
||||||
|
use crate::load::{Workload, IntensityProfile};
|
||||||
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||||
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
||||||
|
|
||||||
@@ -44,6 +45,9 @@ pub struct BenchmarkOrchestrator {
|
|||||||
/// CLI override for the configuration output path.
|
/// CLI override for the configuration output path.
|
||||||
optional_config_out: Option<PathBuf>,
|
optional_config_out: Option<PathBuf>,
|
||||||
|
|
||||||
|
/// The safety membrane protecting the system.
|
||||||
|
safeguard: Option<HardwareStateGuard>,
|
||||||
|
|
||||||
/// Sliding window of power readings (Watts).
|
/// Sliding window of power readings (Watts).
|
||||||
history_watts: VecDeque<f32>,
|
history_watts: VecDeque<f32>,
|
||||||
/// Sliding window of temperature readings (Celsius).
|
/// Sliding window of temperature readings (Celsius).
|
||||||
@@ -97,12 +101,13 @@ impl BenchmarkOrchestrator {
|
|||||||
emergency_abort: Arc::new(AtomicBool::new(false)),
|
emergency_abort: Arc::new(AtomicBool::new(false)),
|
||||||
emergency_reason: Arc::new(Mutex::new(None)),
|
emergency_reason: Arc::new(Mutex::new(None)),
|
||||||
optional_config_out,
|
optional_config_out,
|
||||||
|
safeguard: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Executes the full benchmark sequence.
|
/// Executes the full benchmark sequence.
|
||||||
///
|
///
|
||||||
/// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop]
|
/// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload]
|
||||||
/// are called regardless of whether the benchmark succeeds or fails.
|
/// are called regardless of whether the benchmark succeeds or fails.
|
||||||
pub fn run(&mut self) -> Result<OptimizationResult> {
|
pub fn run(&mut self) -> Result<OptimizationResult> {
|
||||||
self.log("Starting ember-tune Benchmark Sequence.")?;
|
self.log("Starting ember-tune Benchmark Sequence.")?;
|
||||||
@@ -111,8 +116,16 @@ impl BenchmarkOrchestrator {
|
|||||||
|
|
||||||
let result = self.execute_benchmark();
|
let result = self.execute_benchmark();
|
||||||
|
|
||||||
|
// --- MANDATORY CLEANUP ---
|
||||||
self.log("Benchmark sequence finished. Restoring hardware defaults...")?;
|
self.log("Benchmark sequence finished. Restoring hardware defaults...")?;
|
||||||
let _ = self.workload.stop();
|
let _ = self.workload.stop_workload();
|
||||||
|
|
||||||
|
if let Some(mut sg) = self.safeguard.take() {
|
||||||
|
if let Err(e) = sg.release() {
|
||||||
|
anyhow::bail!("CRITICAL: USA Restoration Failure: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if let Err(e) = self.sal.restore() {
|
if let Err(e) = self.sal.restore() {
|
||||||
anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e);
|
anyhow::bail!("CRITICAL: Failed to restore hardware state: {}", e);
|
||||||
}
|
}
|
||||||
@@ -125,6 +138,19 @@ impl BenchmarkOrchestrator {
|
|||||||
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
|
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
|
||||||
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?;
|
let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?;
|
||||||
|
|
||||||
|
// 1. Snapshot & Arm Safeguard
|
||||||
|
let mut target_files = self.facts.rapl_paths.iter()
|
||||||
|
.map(|p| p.join("constraint_0_power_limit_uw"))
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw")));
|
||||||
|
if let Some(tp) = self.facts.paths.configs.get("throttled") {
|
||||||
|
target_files.push(tp.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()];
|
||||||
|
self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?);
|
||||||
|
|
||||||
|
// Phase 1: Audit & Baseline
|
||||||
self.phase = BenchmarkPhase::Auditing;
|
self.phase = BenchmarkPhase::Auditing;
|
||||||
for step in self.sal.audit() {
|
for step in self.sal.audit() {
|
||||||
if let Err(e) = step.outcome {
|
if let Err(e) = step.outcome {
|
||||||
@@ -132,9 +158,11 @@ impl BenchmarkOrchestrator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
self.workload.initialize().context("Failed to initialize workload")?;
|
||||||
self.log("Suppressing background services (tlp, thermald)...")?;
|
self.log("Suppressing background services (tlp, thermald)...")?;
|
||||||
self.sal.suppress().context("Failed to suppress background services")?;
|
self.sal.suppress().context("Failed to suppress background services")?;
|
||||||
|
|
||||||
|
// Baseline (Idle Calibration)
|
||||||
self.phase = BenchmarkPhase::IdleCalibration;
|
self.phase = BenchmarkPhase::IdleCalibration;
|
||||||
self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?;
|
self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?;
|
||||||
self.sal.set_fan_mode("auto")?;
|
self.sal.set_fan_mode("auto")?;
|
||||||
@@ -152,6 +180,7 @@ impl BenchmarkOrchestrator {
|
|||||||
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
|
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
|
||||||
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
|
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
|
||||||
|
|
||||||
|
// Phase 2: Stress Stepping
|
||||||
self.phase = BenchmarkPhase::StressTesting;
|
self.phase = BenchmarkPhase::StressTesting;
|
||||||
self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
|
self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
|
||||||
self.sal.set_fan_mode("max")?;
|
self.sal.set_fan_mode("max")?;
|
||||||
@@ -159,10 +188,16 @@ impl BenchmarkOrchestrator {
|
|||||||
let steps = bench_cfg.power_steps_watts.clone();
|
let steps = bench_cfg.power_steps_watts.clone();
|
||||||
for &pl in &steps {
|
for &pl in &steps {
|
||||||
self.log(&format!("Testing PL1 = {:.0}W...", pl))?;
|
self.log(&format!("Testing PL1 = {:.0}W...", pl))?;
|
||||||
self.sal.set_sustained_power_limit(pl)?;
|
|
||||||
self.sal.set_burst_power_limit(pl + 5.0)?;
|
|
||||||
|
|
||||||
self.workload.start(num_cpus::get(), 100)?;
|
let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?;
|
||||||
|
let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?;
|
||||||
|
self.sal.set_sustained_power_limit(pl1_uw)?;
|
||||||
|
self.sal.set_burst_power_limit(pl2_uw)?;
|
||||||
|
|
||||||
|
self.workload.run_workload(
|
||||||
|
Duration::from_secs(bench_cfg.stress_duration_max_s),
|
||||||
|
IntensityProfile { threads: num_cpus::get(), load_percentage: 100 }
|
||||||
|
)?;
|
||||||
|
|
||||||
let step_start = Instant::now();
|
let step_start = Instant::now();
|
||||||
let mut step_temps = VecDeque::with_capacity(30);
|
let mut step_temps = VecDeque::with_capacity(30);
|
||||||
@@ -188,26 +223,28 @@ impl BenchmarkOrchestrator {
|
|||||||
thread::sleep(Duration::from_millis(500));
|
thread::sleep(Duration::from_millis(500));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Record data point
|
||||||
let avg_p = self.sal.get_power_w().unwrap_or(0.0);
|
let avg_p = self.sal.get_power_w().unwrap_or(0.0);
|
||||||
let avg_t = self.sal.get_temp().unwrap_or(0.0);
|
let avg_t = self.sal.get_temp().unwrap_or(0.0);
|
||||||
let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0);
|
let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0);
|
||||||
let fans = self.sal.get_fan_rpms().unwrap_or_default();
|
let fans = self.sal.get_fan_rpms().unwrap_or_default();
|
||||||
let primary_fan = fans.first().cloned().unwrap_or(0);
|
let primary_fan = fans.first().cloned().unwrap_or(0);
|
||||||
let tp = self.workload.get_throughput().unwrap_or(0.0);
|
let metrics = self.workload.get_current_metrics().unwrap_or_default();
|
||||||
|
|
||||||
self.profile.points.push(ThermalPoint {
|
self.profile.points.push(ThermalPoint {
|
||||||
power_w: avg_p,
|
power_w: avg_p,
|
||||||
temp_c: avg_t,
|
temp_c: avg_t,
|
||||||
freq_mhz: avg_f,
|
freq_mhz: avg_f,
|
||||||
fan_rpm: primary_fan,
|
fan_rpm: primary_fan,
|
||||||
throughput: tp,
|
throughput: metrics.primary_ops_per_sec,
|
||||||
});
|
});
|
||||||
|
|
||||||
self.workload.stop()?;
|
self.workload.stop_workload()?;
|
||||||
self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?;
|
self.log(&format!(" Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?;
|
||||||
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
|
thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 4: Physical Modeling
|
||||||
self.phase = BenchmarkPhase::PhysicalModeling;
|
self.phase = BenchmarkPhase::PhysicalModeling;
|
||||||
self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?;
|
self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?;
|
||||||
|
|
||||||
@@ -218,6 +255,7 @@ impl BenchmarkOrchestrator {
|
|||||||
|
|
||||||
thread::sleep(Duration::from_secs(3));
|
thread::sleep(Duration::from_secs(3));
|
||||||
|
|
||||||
|
// Phase 5: Finalizing
|
||||||
self.phase = BenchmarkPhase::Finalizing;
|
self.phase = BenchmarkPhase::Finalizing;
|
||||||
self.log("Benchmark sequence complete. Generating configurations...")?;
|
self.log("Benchmark sequence complete. Generating configurations...")?;
|
||||||
|
|
||||||
@@ -227,8 +265,6 @@ impl BenchmarkOrchestrator {
|
|||||||
trip_temp: res.max_temp_c.max(95.0),
|
trip_temp: res.max_temp_c.max(95.0),
|
||||||
};
|
};
|
||||||
|
|
||||||
// 1. Throttled (Merged if exists)
|
|
||||||
// PRIORITY: optional_config_out > facts discovery > fallback
|
|
||||||
let throttled_path = self.optional_config_out.clone()
|
let throttled_path = self.optional_config_out.clone()
|
||||||
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
|
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
|
||||||
|
|
||||||
@@ -238,7 +274,6 @@ impl BenchmarkOrchestrator {
|
|||||||
res.config_paths.insert("throttled".to_string(), path.clone());
|
res.config_paths.insert("throttled".to_string(), path.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. i8kmon
|
|
||||||
if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
|
if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
|
||||||
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
|
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
|
||||||
t_ambient: self.profile.ambient_temp,
|
t_ambient: self.profile.ambient_temp,
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx};
|
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx};
|
||||||
|
use crate::sal::safety::TdpLimitMicroWatts;
|
||||||
use anyhow::{Result, Context, anyhow};
|
use anyhow::{Result, Context, anyhow};
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::{PathBuf};
|
use std::path::{PathBuf};
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
use tracing::{debug};
|
|
||||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||||
|
|
||||||
pub struct DellXps9380Sal {
|
pub struct DellXps9380Sal {
|
||||||
@@ -151,7 +151,6 @@ impl EnvironmentGuard for DellXps9380Sal {
|
|||||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||||
for s in services {
|
for s in services {
|
||||||
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
|
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
|
||||||
debug!("Suppressing service: {}", s);
|
|
||||||
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
|
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
|
||||||
suppressed.push(s.to_string());
|
suppressed.push(s.to_string());
|
||||||
}
|
}
|
||||||
@@ -251,18 +250,18 @@ impl ActuatorBus for DellXps9380Sal {
|
|||||||
match mode {
|
match mode {
|
||||||
"max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; }
|
"max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; }
|
||||||
"auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; }
|
"auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; }
|
||||||
_ => { debug!("Unknown fan mode: {}", mode); }
|
_ => {}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
fs::write(&self.pl1_path, ((watts * 1_000_000.0) as u64).to_string())?;
|
fs::write(&self.pl1_path, limit.as_u64().to_string())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
fs::write(&self.pl2_path, ((watts * 1_000_000.0) as u64).to_string())?;
|
fs::write(&self.pl2_path, limit.as_u64().to_string())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ use std::sync::Mutex;
|
|||||||
use tracing::{debug};
|
use tracing::{debug};
|
||||||
|
|
||||||
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
|
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
|
||||||
|
use crate::sal::safety::TdpLimitMicroWatts;
|
||||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||||
use crate::sal::heuristic::schema::HardwareDb;
|
use crate::sal::heuristic::schema::HardwareDb;
|
||||||
|
|
||||||
@@ -15,7 +16,7 @@ pub struct GenericLinuxSal {
|
|||||||
db: HardwareDb,
|
db: HardwareDb,
|
||||||
suppressed_services: Mutex<Vec<String>>,
|
suppressed_services: Mutex<Vec<String>>,
|
||||||
last_valid_temp: Mutex<(f32, Instant)>,
|
last_valid_temp: Mutex<(f32, Instant)>,
|
||||||
current_pl1: Mutex<f32>,
|
current_pl1: Mutex<u64>,
|
||||||
last_energy: Mutex<(u64, Instant)>,
|
last_energy: Mutex<(u64, Instant)>,
|
||||||
|
|
||||||
// --- Original State for Restoration ---
|
// --- Original State for Restoration ---
|
||||||
@@ -35,7 +36,7 @@ impl GenericLinuxSal {
|
|||||||
db,
|
db,
|
||||||
suppressed_services: Mutex::new(Vec::new()),
|
suppressed_services: Mutex::new(Vec::new()),
|
||||||
last_valid_temp: Mutex::new((0.0, Instant::now())),
|
last_valid_temp: Mutex::new((0.0, Instant::now())),
|
||||||
current_pl1: Mutex::new(15.0),
|
current_pl1: Mutex::new(15_000_000),
|
||||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||||
fact_sheet: facts,
|
fact_sheet: facts,
|
||||||
ctx,
|
ctx,
|
||||||
@@ -151,16 +152,16 @@ impl ActuatorBus for GenericLinuxSal {
|
|||||||
} else { Ok(()) }
|
} else { Ok(()) }
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
|
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
|
||||||
fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?;
|
fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?;
|
||||||
*self.current_pl1.lock().unwrap() = watts;
|
*self.current_pl1.lock().unwrap() = limit.as_u64();
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?;
|
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?;
|
||||||
fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?;
|
fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
|
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
|
||||||
|
use crate::sal::safety::TdpLimitMicroWatts;
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
|
||||||
pub struct MockSal {
|
pub struct MockSal {
|
||||||
@@ -59,10 +60,10 @@ impl ActuatorBus for MockSal {
|
|||||||
fn set_fan_mode(&self, _mode: &str) -> Result<()> {
|
fn set_fan_mode(&self, _mode: &str) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn set_sustained_power_limit(&self, _watts: f32) -> Result<()> {
|
fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn set_burst_power_limit(&self, _watts: f32) -> Result<()> {
|
fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ pub mod mock;
|
|||||||
pub mod dell_xps_9380;
|
pub mod dell_xps_9380;
|
||||||
pub mod generic_linux;
|
pub mod generic_linux;
|
||||||
pub mod heuristic;
|
pub mod heuristic;
|
||||||
|
pub mod safety;
|
||||||
|
|||||||
175
src/sal/safety.rs
Normal file
175
src/sal/safety.rs
Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
//! Universal Safeguard Architecture (USA) and Hardware Primitives.
|
||||||
|
//!
|
||||||
|
//! This module provides the `HardwareStateGuard` for guaranteed state
|
||||||
|
//! restoration and type-safe primitives to prevent dangerous hardware states.
|
||||||
|
|
||||||
|
use anyhow::{Result, bail, Context};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use tracing::{info, warn, error};
|
||||||
|
|
||||||
|
// --- Type-Driven Safety Primitives ---
|
||||||
|
|
||||||
|
/// Represents a safe TDP limit in microwatts.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
|
pub struct TdpLimitMicroWatts(u64);
|
||||||
|
|
||||||
|
impl TdpLimitMicroWatts {
|
||||||
|
/// Strict bounds to prevent hardware bricking.
|
||||||
|
pub const MIN_SAFE_UW: u64 = 5_000_000; // 5 Watts
|
||||||
|
pub const MAX_SAFE_UW: u64 = 80_000_000; // 80 Watts
|
||||||
|
|
||||||
|
/// Constructs a new TdpLimitMicroWatts, enforcing safety bounds.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns a `HardwareSafetyError` (via `anyhow::bail`) if the value is out of bounds.
|
||||||
|
pub fn new(microwatts: u64) -> Result<Self> {
|
||||||
|
if microwatts < Self::MIN_SAFE_UW {
|
||||||
|
bail!("HardwareSafetyError: Requested TDP {} uW is below the absolute safety floor of {} uW.", microwatts, Self::MIN_SAFE_UW);
|
||||||
|
}
|
||||||
|
if microwatts > Self::MAX_SAFE_UW {
|
||||||
|
bail!("HardwareSafetyError: Requested TDP {} uW exceeds absolute maximum of {} uW.", microwatts, Self::MAX_SAFE_UW);
|
||||||
|
}
|
||||||
|
Ok(Self(microwatts))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_u64(&self) -> u64 {
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_watts(&self) -> f32 {
|
||||||
|
self.0 as f32 / 1_000_000.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a safe Fan Speed in Percentage (0-100).
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
|
pub struct FanSpeedPercentage(u8);
|
||||||
|
|
||||||
|
impl FanSpeedPercentage {
|
||||||
|
/// Constructs a new FanSpeedPercentage, enforcing safety bounds.
|
||||||
|
pub fn new(percent: u8) -> Result<Self> {
|
||||||
|
if percent > 100 {
|
||||||
|
bail!("HardwareSafetyError: Fan speed percentage {} exceeds 100%.", percent);
|
||||||
|
}
|
||||||
|
Ok(Self(percent))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_u8(&self) -> u8 {
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a safe Thermal Threshold in Celsius.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
|
||||||
|
pub struct ThermalThresholdCelsius(f32);
|
||||||
|
|
||||||
|
impl ThermalThresholdCelsius {
|
||||||
|
pub const MAX_SAFE_C: f32 = 98.0;
|
||||||
|
|
||||||
|
/// Constructs a new ThermalThresholdCelsius, enforcing safety bounds.
|
||||||
|
pub fn new(celsius: f32) -> Result<Self> {
|
||||||
|
if celsius < 0.0 || celsius > Self::MAX_SAFE_C {
|
||||||
|
bail!("HardwareSafetyError: Thermal threshold {}°C is outside safe bounds (0.0 - {}).", celsius, Self::MAX_SAFE_C);
|
||||||
|
}
|
||||||
|
Ok(Self(celsius))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_f32(&self) -> f32 {
|
||||||
|
self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- The HardwareStateGuard (RAII Restorer) ---
|
||||||
|
|
||||||
|
/// Represents a deep snapshot of the system state before benchmarking.
|
||||||
|
#[derive(Debug, Default, Clone)]
|
||||||
|
pub struct SystemSnapshot {
|
||||||
|
/// Maps file paths to their raw string content (e.g., RAPL limits).
|
||||||
|
pub sysfs_nodes: HashMap<PathBuf, String>,
|
||||||
|
/// List of services that were active and subsequently stopped.
|
||||||
|
pub suppressed_services: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The Universal Safeguard wrapper.
|
||||||
|
///
|
||||||
|
/// Implements the "Ironclad Restorer" pattern via the [Drop] trait.
|
||||||
|
pub struct HardwareStateGuard {
|
||||||
|
snapshot: SystemSnapshot,
|
||||||
|
is_armed: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HardwareStateGuard {
|
||||||
|
/// Arms the safeguard by taking a snapshot of the target files and services.
|
||||||
|
///
|
||||||
|
/// # Errors
|
||||||
|
/// Returns an error if any critical sysfs node cannot be read.
|
||||||
|
pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result<Self> {
|
||||||
|
let mut snapshot = SystemSnapshot::default();
|
||||||
|
|
||||||
|
info!("USA: Arming safeguard and snapshotting system state...");
|
||||||
|
|
||||||
|
for path in target_files {
|
||||||
|
if path.exists() {
|
||||||
|
let content = fs::read_to_string(path)
|
||||||
|
.with_context(|| format!("Failed to snapshot {:?}", path))?;
|
||||||
|
snapshot.sysfs_nodes.insert(path.clone(), content.trim().to_string());
|
||||||
|
} else {
|
||||||
|
warn!("USA: Target node {:?} does not exist, skipping snapshot.", path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for service in target_services {
|
||||||
|
let status = std::process::Command::new("systemctl")
|
||||||
|
.args(["is-active", "--quiet", service])
|
||||||
|
.status();
|
||||||
|
|
||||||
|
if let Ok(s) = status {
|
||||||
|
if s.success() {
|
||||||
|
snapshot.suppressed_services.push(service.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
snapshot,
|
||||||
|
is_armed: true,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Explicit manual restoration (can be called upon successful exit).
|
||||||
|
pub fn release(&mut self) -> Result<()> {
|
||||||
|
if !self.is_armed {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("USA: Initiating Ironclad Restoration...");
|
||||||
|
|
||||||
|
// 1. Restore Power/Sysfs states
|
||||||
|
for (path, content) in &self.snapshot.sysfs_nodes {
|
||||||
|
if let Err(e) = fs::write(path, content) {
|
||||||
|
error!("USA RESTORATION FAILURE: Could not revert {:?}: {}", path, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Restart Services
|
||||||
|
for service in &self.snapshot.suppressed_services {
|
||||||
|
let _ = std::process::Command::new("systemctl")
|
||||||
|
.args(["start", service])
|
||||||
|
.status();
|
||||||
|
}
|
||||||
|
|
||||||
|
self.is_armed = false;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for HardwareStateGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if self.is_armed {
|
||||||
|
warn!("USA: HardwareStateGuard triggered via Drop (panic/unexpected exit). Reverting system state...");
|
||||||
|
let _ = self.release();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -157,6 +157,8 @@ impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
use crate::sal::safety::TdpLimitMicroWatts;
|
||||||
|
|
||||||
/// Provides a write-only interface for hardware actuators.
|
/// Provides a write-only interface for hardware actuators.
|
||||||
pub trait ActuatorBus: Send + Sync {
|
pub trait ActuatorBus: Send + Sync {
|
||||||
/// Sets the fan control mode (e.g., "auto" or "max").
|
/// Sets the fan control mode (e.g., "auto" or "max").
|
||||||
@@ -165,28 +167,28 @@ pub trait ActuatorBus: Send + Sync {
|
|||||||
/// Returns an error if the fan control command or `sysfs` write fails.
|
/// Returns an error if the fan control command or `sysfs` write fails.
|
||||||
fn set_fan_mode(&self, mode: &str) -> Result<()>;
|
fn set_fan_mode(&self, mode: &str) -> Result<()>;
|
||||||
|
|
||||||
/// Sets the sustained power limit (PL1) in Watts.
|
/// Sets the sustained power limit (PL1) using a validated wrapper.
|
||||||
///
|
///
|
||||||
/// # Errors
|
/// # Errors
|
||||||
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
||||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()>;
|
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
|
||||||
|
|
||||||
/// Sets the burst power limit (PL2) in Watts.
|
/// Sets the burst power limit (PL2) using a validated wrapper.
|
||||||
///
|
///
|
||||||
/// # Errors
|
/// # Errors
|
||||||
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
||||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()>;
|
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
|
impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
|
||||||
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
||||||
(**self).set_fan_mode(mode)
|
(**self).set_fan_mode(mode)
|
||||||
}
|
}
|
||||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
(**self).set_sustained_power_limit(watts)
|
(**self).set_sustained_power_limit(limit)
|
||||||
}
|
}
|
||||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||||
(**self).set_burst_power_limit(watts)
|
(**self).set_burst_power_limit(limit)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,23 @@
|
|||||||
use ember_tune_rs::orchestrator::BenchmarkOrchestrator;
|
use ember_tune_rs::orchestrator::BenchmarkOrchestrator;
|
||||||
use ember_tune_rs::sal::mock::MockSal;
|
use ember_tune_rs::sal::mock::MockSal;
|
||||||
use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet;
|
use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet;
|
||||||
use ember_tune_rs::load::Workload;
|
use ember_tune_rs::load::{Workload, IntensityProfile, WorkloadMetrics};
|
||||||
|
use std::time::Duration;
|
||||||
|
use anyhow::Result;
|
||||||
use std::sync::mpsc;
|
use std::sync::mpsc;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use anyhow::Result;
|
|
||||||
|
|
||||||
struct MockWorkload;
|
struct MockWorkload;
|
||||||
impl Workload for MockWorkload {
|
impl Workload for MockWorkload {
|
||||||
fn start(&mut self, _threads: usize, _load_percent: usize) -> Result<()> { Ok(()) }
|
fn initialize(&mut self) -> Result<()> { Ok(()) }
|
||||||
fn stop(&mut self) -> Result<()> { Ok(()) }
|
fn run_workload(&mut self, _duration: Duration, _profile: IntensityProfile) -> Result<()> { Ok(()) }
|
||||||
fn get_throughput(&self) -> Result<f64> { Ok(100.0) }
|
fn get_current_metrics(&self) -> Result<WorkloadMetrics> {
|
||||||
|
Ok(WorkloadMetrics {
|
||||||
|
primary_ops_per_sec: 100.0,
|
||||||
|
elapsed_time: Duration::from_secs(1),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
fn stop_workload(&mut self) -> Result<()> { Ok(()) }
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user