fixed hardware_db and improved stability and robustness of generic sal

This commit is contained in:
2026-02-26 15:52:44 +01:00
parent f87efa1d24
commit 073414a25e
13 changed files with 488 additions and 225 deletions

View File

@@ -4,14 +4,17 @@ use std::time::{Duration, Instant};
use std::thread;
use std::collections::VecDeque;
use sysinfo::System;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use crate::sal::traits::{PlatformSal};
use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
use crate::load::Workload;
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
pub struct BenchmarkOrchestrator {
sal: Box<dyn PlatformSal>,
sal: Arc<dyn PlatformSal>,
workload: Box<dyn Workload>,
telemetry_tx: mpsc::Sender<TelemetryState>,
command_rx: mpsc::Receiver<UiCommand>,
@@ -27,11 +30,15 @@ pub struct BenchmarkOrchestrator {
// --- Static Info ---
cpu_model: String,
total_ram_gb: u64,
// --- Safety ---
emergency_abort: Arc<AtomicBool>,
emergency_reason: Arc<Mutex<Option<String>>>,
}
impl BenchmarkOrchestrator {
pub fn new(
sal: Box<dyn PlatformSal>,
sal: Arc<dyn PlatformSal>,
workload: Box<dyn Workload>,
telemetry_tx: mpsc::Sender<TelemetryState>,
command_rx: mpsc::Receiver<UiCommand>,
@@ -57,12 +64,17 @@ impl BenchmarkOrchestrator {
history_mhz: VecDeque::with_capacity(120),
cpu_model,
total_ram_gb,
emergency_abort: Arc::new(AtomicBool::new(false)),
emergency_reason: Arc::new(Mutex::new(None)),
}
}
pub fn run(&mut self) -> Result<OptimizationResult> {
self.log("Starting ember-tune Benchmark Sequence.")?;
// Start Watchdog Monitor
let _watchdog_handle = self.spawn_watchdog_monitor();
// Phase 1: Audit & Baseline
self.phase = BenchmarkPhase::Auditing;
for step in self.sal.audit() {
@@ -111,11 +123,6 @@ impl BenchmarkOrchestrator {
while step_start.elapsed() < Duration::from_secs(45) {
self.check_abort()?;
if self.sal.check_emergency()? {
self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?;
self.workload.stop()?;
return Err(anyhow::anyhow!("Hardware Watchdog Triggered"));
}
let t = self.sal.get_temp().unwrap_or(0.0);
step_temps.push_back(t);
@@ -204,6 +211,35 @@ impl BenchmarkOrchestrator {
Ok(res)
}
fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> {
let abort = self.emergency_abort.clone();
let reason_store = self.emergency_reason.clone();
let sal = self.sal.clone();
thread::spawn(move || {
while !abort.load(Ordering::SeqCst) {
let status = sal.get_safety_status();
match status {
Ok(SafetyStatus::EmergencyAbort(reason)) => {
*reason_store.lock().unwrap() = Some(reason.clone());
abort.store(true, Ordering::SeqCst);
break;
}
Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => {
// Send warning log to UI
}
Ok(SafetyStatus::Nominal) => {}
Err(e) => {
*reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e));
abort.store(true, Ordering::SeqCst);
break;
}
}
thread::sleep(Duration::from_millis(100));
}
})
}
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
let knee = self.engine.find_silicon_knee(&self.profile);
@@ -221,6 +257,11 @@ impl BenchmarkOrchestrator {
}
fn check_abort(&self) -> Result<()> {
if self.emergency_abort.load(Ordering::SeqCst) {
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string());
return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason));
}
if let Ok(cmd) = self.command_rx.try_recv() {
match cmd {
UiCommand::Abort => {
@@ -250,6 +291,8 @@ impl BenchmarkOrchestrator {
history_mhz: Vec::new(),
log_event: Some(msg.to_string()),
metadata: std::collections::HashMap::new(),
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
};
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
}
@@ -287,6 +330,8 @@ impl BenchmarkOrchestrator {
history_mhz: self.history_mhz.iter().cloned().collect(),
log_event: None,
metadata: std::collections::HashMap::new(),
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
};
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
}