fixed hardware_db and improved stability and robustness of generic sal
This commit is contained in:
@@ -4,14 +4,17 @@ use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
use std::collections::VecDeque;
|
||||
use sysinfo::System;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::sal::traits::{PlatformSal};
|
||||
use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
|
||||
use crate::load::Workload;
|
||||
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
||||
|
||||
pub struct BenchmarkOrchestrator {
|
||||
sal: Box<dyn PlatformSal>,
|
||||
sal: Arc<dyn PlatformSal>,
|
||||
workload: Box<dyn Workload>,
|
||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||
command_rx: mpsc::Receiver<UiCommand>,
|
||||
@@ -27,11 +30,15 @@ pub struct BenchmarkOrchestrator {
|
||||
// --- Static Info ---
|
||||
cpu_model: String,
|
||||
total_ram_gb: u64,
|
||||
|
||||
// --- Safety ---
|
||||
emergency_abort: Arc<AtomicBool>,
|
||||
emergency_reason: Arc<Mutex<Option<String>>>,
|
||||
}
|
||||
|
||||
impl BenchmarkOrchestrator {
|
||||
pub fn new(
|
||||
sal: Box<dyn PlatformSal>,
|
||||
sal: Arc<dyn PlatformSal>,
|
||||
workload: Box<dyn Workload>,
|
||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||
command_rx: mpsc::Receiver<UiCommand>,
|
||||
@@ -57,12 +64,17 @@ impl BenchmarkOrchestrator {
|
||||
history_mhz: VecDeque::with_capacity(120),
|
||||
cpu_model,
|
||||
total_ram_gb,
|
||||
emergency_abort: Arc::new(AtomicBool::new(false)),
|
||||
emergency_reason: Arc::new(Mutex::new(None)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run(&mut self) -> Result<OptimizationResult> {
|
||||
self.log("Starting ember-tune Benchmark Sequence.")?;
|
||||
|
||||
// Start Watchdog Monitor
|
||||
let _watchdog_handle = self.spawn_watchdog_monitor();
|
||||
|
||||
// Phase 1: Audit & Baseline
|
||||
self.phase = BenchmarkPhase::Auditing;
|
||||
for step in self.sal.audit() {
|
||||
@@ -111,11 +123,6 @@ impl BenchmarkOrchestrator {
|
||||
|
||||
while step_start.elapsed() < Duration::from_secs(45) {
|
||||
self.check_abort()?;
|
||||
if self.sal.check_emergency()? {
|
||||
self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?;
|
||||
self.workload.stop()?;
|
||||
return Err(anyhow::anyhow!("Hardware Watchdog Triggered"));
|
||||
}
|
||||
|
||||
let t = self.sal.get_temp().unwrap_or(0.0);
|
||||
step_temps.push_back(t);
|
||||
@@ -204,6 +211,35 @@ impl BenchmarkOrchestrator {
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> {
|
||||
let abort = self.emergency_abort.clone();
|
||||
let reason_store = self.emergency_reason.clone();
|
||||
let sal = self.sal.clone();
|
||||
|
||||
thread::spawn(move || {
|
||||
while !abort.load(Ordering::SeqCst) {
|
||||
let status = sal.get_safety_status();
|
||||
match status {
|
||||
Ok(SafetyStatus::EmergencyAbort(reason)) => {
|
||||
*reason_store.lock().unwrap() = Some(reason.clone());
|
||||
abort.store(true, Ordering::SeqCst);
|
||||
break;
|
||||
}
|
||||
Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => {
|
||||
// Send warning log to UI
|
||||
}
|
||||
Ok(SafetyStatus::Nominal) => {}
|
||||
Err(e) => {
|
||||
*reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e));
|
||||
abort.store(true, Ordering::SeqCst);
|
||||
break;
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
|
||||
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
|
||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||
@@ -221,6 +257,11 @@ impl BenchmarkOrchestrator {
|
||||
}
|
||||
|
||||
fn check_abort(&self) -> Result<()> {
|
||||
if self.emergency_abort.load(Ordering::SeqCst) {
|
||||
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string());
|
||||
return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason));
|
||||
}
|
||||
|
||||
if let Ok(cmd) = self.command_rx.try_recv() {
|
||||
match cmd {
|
||||
UiCommand::Abort => {
|
||||
@@ -250,6 +291,8 @@ impl BenchmarkOrchestrator {
|
||||
history_mhz: Vec::new(),
|
||||
log_event: Some(msg.to_string()),
|
||||
metadata: std::collections::HashMap::new(),
|
||||
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||
};
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||
}
|
||||
@@ -287,6 +330,8 @@ impl BenchmarkOrchestrator {
|
||||
history_mhz: self.history_mhz.iter().cloned().collect(),
|
||||
log_event: None,
|
||||
metadata: std::collections::HashMap::new(),
|
||||
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||
};
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user