Fixed hardware_db and improved the stability and robustness of the generic SAL

This commit is contained in:
2026-02-26 15:52:44 +01:00
parent f87efa1d24
commit 073414a25e
13 changed files with 488 additions and 225 deletions

View File

@@ -2,19 +2,21 @@ use anyhow::{Result, anyhow};
use std::path::Path;
use std::fs;
use std::time::{Duration, Instant};
use std::thread;
use std::process::Command;
use tracing::{debug};
use std::sync::mpsc;
use tracing::{debug, warn};
use std::sync::Mutex;
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError};
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus};
use crate::sal::heuristic::discovery::SystemFactSheet;
use crate::sal::heuristic::schema::HardwareDb;
// NOTE(review): this listing appears to be a rendered diff whose +/- markers were
// stripped, so `suppressed_services` shows up twice below — presumably the
// `Vec<String>` line is the removed field and the `Mutex<Vec<String>>` line its
// replacement. Confirm against the actual commit.
/// State shared by the `PreflightAuditor`, `SensorBus`, `ActuatorBus`,
/// `EnvironmentGuard` and `HardwareWatchdog` implementations in this file.
pub struct GenericLinuxSal {
// Discovered system facts; the methods in this file read `vendor`, `temp_path`,
// `rapl_paths`, `fan_paths` and `active_conflicts` from it.
fact_sheet: SystemFactSheet,
// Hardware database; consulted for `preflight_checks` and `conflicts`.
db: HardwareDb,
// Services that `suppress` stopped successfully, kept so `restore` can start them again.
suppressed_services: Vec<String>,
suppressed_services: Mutex<Vec<String>>,
// Last temperature reading that differed from the stored one by more than 0.01,
// together with the instant it was recorded (written by `get_temp`).
last_valid_temp: Mutex<(f32, Instant)>,
// Wattage most recently written by `set_sustained_power_limit`.
current_pl1: Mutex<f32>,
// Previous `energy_uj` sample and the instant it was taken (used by `get_power_w`).
last_energy: Mutex<(u64, Instant)>,
}
impl GenericLinuxSal {
@@ -22,7 +24,10 @@ impl GenericLinuxSal {
Self {
fact_sheet,
db,
suppressed_services: Vec::new(),
suppressed_services: Mutex::new(Vec::new()),
last_valid_temp: Mutex::new((0.0, Instant::now())),
current_pl1: Mutex::new(15.0),
last_energy: Mutex::new((0, Instant::now())),
}
}
@@ -30,33 +35,18 @@ impl GenericLinuxSal {
self.fact_sheet.vendor.to_lowercase().contains("dell")
}
// NOTE(review): two definitions are interleaved here because the diff markers were
// stripped from this listing. `read_sysfs_timeout` (thread + channel with a deadline)
// appears to be the removed helper and `read_sysfs` its replacement; the single
// closing brace at the end is shared by both in this view.
/// Reads `path` on a spawned thread and waits at most `timeout` for the trimmed contents.
fn read_sysfs_timeout(&self, path: &Path, timeout: Duration) -> Result<String> {
let (tx, rx) = mpsc::channel();
// The worker thread needs an owned copy of the path because it is moved into the closure.
let path_buf = path.to_path_buf();
// The thread is never joined; if the read blocks, the thread stays blocked after the timeout.
thread::spawn(move || {
let res = fs::read_to_string(path_buf).map(|s| s.trim().to_string());
// The send result is ignored: the receiver may already have given up waiting.
let _ = tx.send(res);
});
match rx.recv_timeout(timeout) {
Ok(res) => res.map_err(|e| anyhow!("Failed to read sysfs: {}", e)),
// Any receive error (timeout or a disconnected sender) is reported as a timeout.
Err(_) => Err(anyhow!("Timeout reading sysfs path: {:?}", path)),
}
/// Read sysfs safely. We removed the thread-per-read timeout logic
/// as it was inefficient. sysfs reads are generally fast enough.
///
/// Returns the file contents with surrounding whitespace trimmed; an I/O error
/// is wrapped in `anyhow`. There is no deadline: the call blocks until the read
/// returns.
fn read_sysfs(&self, path: &Path) -> Result<String> {
fs::read_to_string(path).map(|s| s.trim().to_string()).map_err(|e| anyhow!(e))
}
}
impl PreflightAuditor for GenericLinuxSal {
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
let mut steps = Vec::new();
// 1. Static DB checks
for check in &self.db.preflight_checks {
let status = Command::new("sh")
.arg("-c")
.arg(&check.check_cmd)
.status();
let status = Command::new("sh").arg("-c").arg(&check.check_cmd).status();
steps.push(AuditStep {
description: check.name.clone(),
outcome: match status {
@@ -65,8 +55,6 @@ impl PreflightAuditor for GenericLinuxSal {
}
});
}
// 2. Conflict checks (Critical only)
for conflict_id in &self.fact_sheet.active_conflicts {
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
if conflict.severity == "Critical" {
@@ -77,7 +65,6 @@ impl PreflightAuditor for GenericLinuxSal {
}
}
}
Box::new(steps.into_iter())
}
}
@@ -86,31 +73,32 @@ impl SensorBus for GenericLinuxSal {
// NOTE(review): this listing appears to be a diff with its +/- markers stripped. The
// three lines from `read_sysfs_timeout` to `Ok(milli_celsius / 1000.0)` look like the
// removed body and the lines from `read_sysfs` onward like its replacement — confirm
// against the actual commit.
/// Reads the sensor file at `fact_sheet.temp_path`, parses it as a number and
/// divides by 1000.0 (the older body names the raw value `milli_celsius`).
///
/// Errors when no `temp_path` is set, when the read fails, or when the content
/// does not parse as `f32`.
fn get_temp(&self) -> Result<f32> {
let path = self.fact_sheet.temp_path.as_ref()
.ok_or_else(|| anyhow!("No temperature sensor path found"))?;
let content = self.read_sysfs_timeout(path, Duration::from_millis(200))?;
let milli_celsius: f32 = content.parse()?;
Ok(milli_celsius / 1000.0)
let content = self.read_sysfs(path)?;
let temp = content.parse::<f32>()? / 1000.0;
// `lock().unwrap()` panics if another thread panicked while holding this mutex.
let mut last = self.last_valid_temp.lock().unwrap();
// Only a reading that moved by more than 0.01 refreshes the stored value and its
// timestamp; `get_safety_status` uses the age of that timestamp to flag a stalled sensor.
if (temp - last.0).abs() > 0.01 { *last = (temp, Instant::now()); }
Ok(temp)
}
// NOTE(review): this listing appears to be a diff with its +/- markers stripped. The
// `e1`/`t1`/`thread::sleep`/`e2` lines that call `read_sysfs_timeout` look like the
// removed two-samples-100-ms-apart approach; the lines using `self.last_energy` look
// like the replacement, which compares against the sample stored by the previous call.
/// Estimates power from the change in `energy_uj` under the first RAPL path,
/// divided by the elapsed time between samples.
///
/// Errors when no RAPL path is known, when the read fails, or when the content
/// does not parse as `u64`.
fn get_power_w(&self) -> Result<f32> {
let rapl_path = self.fact_sheet.rapl_paths.first()
.ok_or_else(|| anyhow!("No RAPL path found"))?;
let energy_path = rapl_path.join("energy_uj");
let e1: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?;
let t1 = Instant::now();
thread::sleep(Duration::from_millis(100));
let e2: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?;
// The mutex is taken before the sysfs read, so it is held for the duration of the I/O.
// `lock().unwrap()` panics if the mutex is poisoned.
let mut last = self.last_energy.lock().unwrap();
let e2: u64 = self.read_sysfs(&energy_path)?.parse()?;
let t2 = Instant::now();
// NOTE(review): the constructor seeds this sample with `(0, Instant::now())`, so the
// first call divides the whole counter value by the time since construction.
let (e1, t1) = *last;
// `wrapping_sub` wraps modulo 2^64.
// NOTE(review): presumably the counter wraps at the kernel's `max_energy_range_uj`
// rather than at 2^64, in which case a wrap yields a huge delta — confirm.
let delta_e = e2.wrapping_sub(e1);
let delta_t = t2.duration_since(t1).as_secs_f32();
// The stored sample is overwritten before the short-interval check below.
*last = (e2, t2);
// Intervals under 10 ms report 0.0 instead of dividing by a near-zero duration.
if delta_t < 0.01 { return Ok(0.0); }
// Scale the delta by 1e-6 (the file name suggests microjoules) and divide by seconds.
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
}
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
let mut rpms = Vec::new();
for path in &self.fact_sheet.fan_paths {
if let Ok(content) = self.read_sysfs_timeout(path, Duration::from_millis(200)) {
if let Ok(content) = self.read_sysfs(path) {
if let Ok(rpm) = content.parse() { rpms.push(rpm); }
}
}
@@ -120,10 +108,8 @@ impl SensorBus for GenericLinuxSal {
fn get_freq_mhz(&self) -> Result<f32> {
let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
if path.exists() {
let khz: f32 = self.read_sysfs_timeout(path, Duration::from_millis(200))?.parse()?;
Ok(khz / 1000.0)
Ok(self.read_sysfs(path)?.parse::<f32>()? / 1000.0)
} else {
// Fallback: parse /proc/cpuinfo
let cpuinfo = fs::read_to_string("/proc/cpuinfo")?;
for line in cpuinfo.lines() {
if line.starts_with("cpu MHz") {
@@ -149,38 +135,32 @@ impl ActuatorBus for GenericLinuxSal {
let parts: Vec<&str> = cmd_str.split_whitespace().collect();
Command::new(parts[0]).args(&parts[1..]).status()?;
Ok(())
} else { Err(anyhow!("Dell fan command missing in DB")) }
} else {
debug!("Fan control not implemented for non-Dell systems yet");
Ok(())
}
} else { Err(anyhow!("Dell fan command missing")) }
} else { Ok(()) }
}
// NOTE(review): this listing appears to be a diff with its +/- markers stripped. The
// first `rapl_path`/`path`/`fs::write` group looks like the removed body; the second
// `rapl_path`/`fs::write` pair plus the `current_pl1` update looks like the replacement.
/// Writes `watts * 1_000_000` as a decimal integer to `constraint_0_power_limit_uw`
/// under the first RAPL path.
///
/// Errors when no RAPL path is known or when the write fails.
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
let rapl_path = self.fact_sheet.rapl_paths.first()
.ok_or_else(|| anyhow!("No RAPL path found for PL1"))?;
let path = rapl_path.join("constraint_0_power_limit_uw");
// The `as u64` cast truncates toward zero and saturates (negative or NaN becomes 0);
// `watts` is not range-checked here.
fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?;
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?;
// Reached only when the write succeeded (the `?` above returns early on failure).
// `lock().unwrap()` panics if the mutex is poisoned.
*self.current_pl1.lock().unwrap() = watts;
Ok(())
}
// NOTE(review): this listing appears to be a diff with its +/- markers stripped. The
// first `rapl_path`/`path`/`fs::write` group looks like the removed body and the
// second `rapl_path`/`fs::write` pair like its condensed replacement.
/// Writes `watts * 1_000_000` as a decimal integer to `constraint_1_power_limit_uw`
/// under the first RAPL path. The written value is not recorded anywhere in this
/// function.
///
/// Errors when no RAPL path is known or when the write fails.
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
let rapl_path = self.fact_sheet.rapl_paths.first()
.ok_or_else(|| anyhow!("No RAPL path found for PL2"))?;
let path = rapl_path.join("constraint_1_power_limit_uw");
// The `as u64` cast truncates toward zero and saturates (negative or NaN becomes 0);
// `watts` is not range-checked here.
fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?;
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?;
fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?;
Ok(())
}
}
impl EnvironmentGuard for GenericLinuxSal {
fn suppress(&mut self) -> Result<()> {
fn suppress(&self) -> Result<()> {
let mut suppressed = self.suppressed_services.lock().unwrap();
for conflict_id in &self.fact_sheet.active_conflicts {
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
for service in &conflict.services {
debug!("Stopping service: {}", service);
if Command::new("systemctl").arg("stop").arg(service).status()?.success() {
self.suppressed_services.push(service.clone());
suppressed.push(service.clone());
}
}
}
@@ -188,31 +168,30 @@ impl EnvironmentGuard for GenericLinuxSal {
Ok(())
}
// NOTE(review): this listing appears to be a diff with its +/- markers stripped. The
// `&mut self` signature, the loop over `self.suppressed_services.drain(..)` with its
// `debug!` line, and the multi-line `is_dell` block look like removed lines; the
// `&self` signature, the mutex-guarded loop and the one-line `is_dell` check look
// like their replacements.
/// Starts every service recorded in `suppressed_services` with `systemctl start`,
/// emptying the list, and on Dell machines asks for fan mode `"auto"`.
///
/// Failures of `systemctl` and of `set_fan_mode` are ignored, so this always
/// returns `Ok(())`.
fn restore(&mut self) -> Result<()> {
for service in self.suppressed_services.drain(..) {
debug!("Starting service: {}", service);
fn restore(&self) -> Result<()> {
// The guard lives until the end of the function, so the lock is held while the
// `systemctl` commands run. `lock().unwrap()` panics if the mutex is poisoned.
let mut suppressed = self.suppressed_services.lock().unwrap();
for service in suppressed.drain(..) {
// Best effort: the exit status of `systemctl start` is discarded.
let _ = Command::new("systemctl").arg("start").arg(service).status();
}
if self.is_dell() {
let _ = self.set_fan_mode("auto");
}
if self.is_dell() { let _ = self.set_fan_mode("auto"); }
Ok(())
}
}
// NOTE(review): this listing appears to be a diff with its +/- markers stripped.
// `check_emergency` (returning `bool`, with `Ok(false)` as its fall-through) looks like
// the removed method and `get_safety_status` (returning `SafetyStatus`) like its
// replacement; their lines are interleaved below.
impl HardwareWatchdog for GenericLinuxSal {
// Older form: a failed temperature read is skipped by the `if let` and falls through
// to `Ok(false)`.
fn check_emergency(&self) -> Result<bool> {
if let Ok(temp) = self.get_temp() {
if temp > 100.0 {
return Ok(true);
}
/// Reports `EmergencyAbort` when the temperature exceeds 100.0 or when the
/// stored reading is older than five seconds, and `Nominal` otherwise. A failed
/// temperature read is returned as an error via `?`.
fn get_safety_status(&self) -> Result<SafetyStatus> {
let temp = self.get_temp()?;
if temp > 100.0 {
return Ok(SafetyStatus::EmergencyAbort(format!("Thermal runaway: {:.1}°C", temp)));
}
Ok(false)
// `lock().unwrap()` panics if the mutex is poisoned.
let last = self.last_valid_temp.lock().unwrap();
// NOTE(review): `get_temp` refreshes this timestamp only when the reading changes by
// more than 0.01, so a steady reading for over five seconds also takes this branch —
// confirm that is intended.
if last.1.elapsed() > Duration::from_secs(5) {
return Ok(SafetyStatus::EmergencyAbort("Temperature sensor stalled".to_string()));
}
Ok(SafetyStatus::Nominal)
}
}
// NOTE(review): this listing appears to be a diff with its +/- markers stripped. The
// three-line `drop` looks like the removed form and the one-line `drop` like its
// replacement; both do the same thing.
/// Runs `restore` when the value is dropped and discards its result.
// NOTE(review): `restore` calls `lock().unwrap()`, which panics on a poisoned mutex;
// a panic inside `drop` while already unwinding aborts the process — confirm this
// is acceptable.
impl Drop for GenericLinuxSal {
fn drop(&mut self) {
let _ = self.restore();
}
fn drop(&mut self) { let _ = self.restore(); }
}