updated
This commit is contained in:
@@ -1,11 +1,12 @@
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx};
|
||||
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use anyhow::{Result, Context, anyhow};
|
||||
use std::fs;
|
||||
use std::path::{PathBuf};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
use std::sync::Mutex;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{info, debug};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
|
||||
/// Implementation of the System Abstraction Layer for the Dell XPS 13 9380.
|
||||
@@ -15,30 +16,66 @@ pub struct DellXps9380Sal {
|
||||
temp_path: PathBuf,
|
||||
pwr_path: PathBuf,
|
||||
fan_paths: Vec<PathBuf>,
|
||||
pwm_paths: Vec<PathBuf>,
|
||||
pwm_enable_paths: Vec<PathBuf>,
|
||||
pl1_paths: Vec<PathBuf>,
|
||||
pl2_paths: Vec<PathBuf>,
|
||||
freq_path: PathBuf,
|
||||
pl1_path: PathBuf,
|
||||
pl2_path: PathBuf,
|
||||
last_poll: Mutex<Instant>,
|
||||
last_temp: Mutex<f32>,
|
||||
last_fans: Mutex<Vec<u32>>,
|
||||
suppressed_services: Mutex<Vec<String>>,
|
||||
msr_file: Mutex<fs::File>,
|
||||
last_energy: Mutex<(u64, Instant)>,
|
||||
last_watts: Mutex<f32>,
|
||||
|
||||
// --- Original State for Restoration ---
|
||||
original_pl1: Mutex<Option<u64>>,
|
||||
original_pl2: Mutex<Option<u64>>,
|
||||
original_fan_mode: Mutex<Option<String>>,
|
||||
}
|
||||
|
||||
impl DellXps9380Sal {
|
||||
/// Initializes the Dell SAL, opening the MSR interface and discovering sensors.
|
||||
/// Initializes the Dell SAL, opening the MSR interface and discovering sensors and PWM nodes.
|
||||
pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result<Self> {
|
||||
let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?;
|
||||
let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?;
|
||||
let fan_paths = facts.fan_paths.clone();
|
||||
|
||||
// 1. Discover PWM and Enable nodes associated with the fan paths
|
||||
let mut pwm_paths = Vec::new();
|
||||
let mut pwm_enable_paths = Vec::new();
|
||||
for fan_p in &fan_paths {
|
||||
if let Some(parent) = fan_p.parent() {
|
||||
let fan_file = fan_p.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||
let fan_idx = fan_file.chars().filter(|c| c.is_ascii_digit()).collect::<String>();
|
||||
let idx = if fan_idx.is_empty() { "1".to_string() } else { fan_idx };
|
||||
|
||||
let pwm_p = parent.join(format!("pwm{}", idx));
|
||||
if pwm_p.exists() { pwm_paths.push(pwm_p); }
|
||||
|
||||
let enable_p = parent.join(format!("pwm{}_enable", idx));
|
||||
if enable_p.exists() { pwm_enable_paths.push(enable_p); }
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Map all RAPL constraints
|
||||
let mut pl1_paths = Vec::new();
|
||||
let mut pl2_paths = Vec::new();
|
||||
for rapl_p in &facts.rapl_paths {
|
||||
pl1_paths.push(rapl_p.join("constraint_0_power_limit_uw"));
|
||||
pl2_paths.push(rapl_p.join("constraint_1_power_limit_uw"));
|
||||
}
|
||||
|
||||
// 3. Physical Sensor Verification & Warm Cache Priming
|
||||
let mut initial_fans = Vec::new();
|
||||
for fan_p in &fan_paths {
|
||||
let mut rpm = 0;
|
||||
for _ in 0..3 {
|
||||
if let Ok(val) = fs::read_to_string(fan_p) {
|
||||
rpm = val.trim().parse::<u32>().unwrap_or(0);
|
||||
if rpm > 0 { break; }
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
info!("SAL Warm-Start: Fan sensor {:?} -> {} RPM", fan_p, rpm);
|
||||
initial_fans.push(rpm);
|
||||
}
|
||||
|
||||
let freq_path = ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
||||
let msr_path = ctx.sysfs_base.join("dev/cpu/0/msr");
|
||||
|
||||
@@ -47,25 +84,26 @@ impl DellXps9380Sal {
|
||||
|
||||
let initial_energy = fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0);
|
||||
|
||||
info!("SAL: Dell XPS 9380 Initialized. ({} fans, {} RAPL nodes found)",
|
||||
fan_paths.len(), facts.rapl_paths.len());
|
||||
|
||||
Ok(Self {
|
||||
temp_path,
|
||||
pwr_path: pwr_base.join("power1_average"),
|
||||
fan_paths,
|
||||
pwm_paths,
|
||||
pwm_enable_paths,
|
||||
pl1_paths,
|
||||
pl2_paths,
|
||||
freq_path,
|
||||
pl1_path: pwr_base.join("constraint_0_power_limit_uw"),
|
||||
pl2_path: pwr_base.join("constraint_1_power_limit_uw"),
|
||||
last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)),
|
||||
last_temp: Mutex::new(0.0),
|
||||
last_fans: Mutex::new(Vec::new()),
|
||||
suppressed_services: Mutex::new(Vec::new()),
|
||||
last_fans: Mutex::new(initial_fans),
|
||||
msr_file: Mutex::new(msr_file),
|
||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||
last_watts: Mutex::new(0.0),
|
||||
fact_sheet: facts,
|
||||
ctx,
|
||||
original_pl1: Mutex::new(None),
|
||||
original_pl2: Mutex::new(None),
|
||||
original_fan_mode: Mutex::new(None),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -93,7 +131,6 @@ impl PreflightAuditor for DellXps9380Sal {
|
||||
outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) }
|
||||
});
|
||||
|
||||
// RAPL Lock Check (MSR 0x610)
|
||||
let rapl_lock = match self.read_msr(0x610) {
|
||||
Ok(val) => {
|
||||
if (val & (1 << 63)) != 0 {
|
||||
@@ -104,19 +141,14 @@ impl PreflightAuditor for DellXps9380Sal {
|
||||
},
|
||||
Err(e) => Err(AuditError::ToolMissing(format!("Cannot read MSR 0x610: {}", e))),
|
||||
};
|
||||
steps.push(AuditStep {
|
||||
description: "MSR 0x610 RAPL Lock Status".to_string(),
|
||||
outcome: rapl_lock,
|
||||
});
|
||||
steps.push(AuditStep { description: "MSR 0x610 RAPL Lock Status".to_string(), outcome: rapl_lock });
|
||||
|
||||
let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"];
|
||||
for mod_name in modules {
|
||||
let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name));
|
||||
steps.push(AuditStep {
|
||||
description: format!("Kernel Module: {}", mod_name),
|
||||
outcome: if path.exists() { Ok(()) } else {
|
||||
Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name)))
|
||||
}
|
||||
outcome: if path.exists() { Ok(()) } else { Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) }
|
||||
});
|
||||
}
|
||||
|
||||
@@ -138,9 +170,7 @@ impl PreflightAuditor for DellXps9380Sal {
|
||||
let ac_status = fs::read_to_string(ac_status_path).unwrap_or_else(|_| "0".to_string());
|
||||
steps.push(AuditStep {
|
||||
description: "AC Power Connection".to_string(),
|
||||
outcome: if ac_status.trim() == "1" { Ok(()) } else {
|
||||
Err(AuditError::AcPowerMissing("System must be on AC power".to_string()))
|
||||
}
|
||||
outcome: if ac_status.trim() == "1" { Ok(()) } else { Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) }
|
||||
});
|
||||
|
||||
Box::new(steps.into_iter())
|
||||
@@ -148,49 +178,16 @@ impl PreflightAuditor for DellXps9380Sal {
|
||||
}
|
||||
|
||||
impl EnvironmentGuard for DellXps9380Sal {
|
||||
fn suppress(&self) -> Result<()> {
|
||||
if let Ok(pl1) = fs::read_to_string(&self.pl1_path) {
|
||||
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
|
||||
}
|
||||
if let Ok(pl2) = fs::read_to_string(&self.pl2_path) {
|
||||
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
|
||||
}
|
||||
*self.original_fan_mode.lock().unwrap() = Some("1".to_string());
|
||||
|
||||
let services = ["tlp", "thermald", "i8kmon"];
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for s in services {
|
||||
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
|
||||
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
|
||||
suppressed.push(s.to_string());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore(&self) -> Result<()> {
|
||||
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
|
||||
let _ = fs::write(&self.pl1_path, pl1.to_string());
|
||||
}
|
||||
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
|
||||
let _ = fs::write(&self.pl2_path, pl2.to_string());
|
||||
}
|
||||
if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") {
|
||||
let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]);
|
||||
}
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for s in suppressed.drain(..) {
|
||||
let _ = self.ctx.runner.run("systemctl", &["start", &s]);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn suppress(&self) -> Result<()> { Ok(()) }
|
||||
fn restore(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
impl SensorBus for DellXps9380Sal {
|
||||
fn get_temp(&self) -> Result<f32> {
|
||||
let mut last_poll = self.last_poll.lock().unwrap();
|
||||
let now = Instant::now();
|
||||
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
||||
// # SAFETY: High frequency polling for watchdog
|
||||
if now.duration_since(*last_poll) < Duration::from_millis(100) {
|
||||
return Ok(*self.last_temp.lock().unwrap());
|
||||
}
|
||||
let s = fs::read_to_string(&self.temp_path)?;
|
||||
@@ -201,7 +198,7 @@ impl SensorBus for DellXps9380Sal {
|
||||
}
|
||||
|
||||
fn get_power_w(&self) -> Result<f32> {
|
||||
let rapl_base = self.pl1_path.parent().context("RAPL path error")?;
|
||||
let rapl_base = self.fact_sheet.rapl_paths.first().context("RAPL path error")?;
|
||||
let energy_path = rapl_base.join("energy_uj");
|
||||
|
||||
if energy_path.exists() {
|
||||
@@ -212,14 +209,9 @@ impl SensorBus for DellXps9380Sal {
|
||||
let e2 = e2_str.trim().parse::<u64>()?;
|
||||
let t2 = Instant::now();
|
||||
let (e1, t1) = *last_energy;
|
||||
|
||||
let delta_e = e2.wrapping_sub(e1);
|
||||
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||
|
||||
if delta_t < 0.1 {
|
||||
return Ok(*last_watts); // Return cached if polled too fast
|
||||
}
|
||||
|
||||
if delta_t < 0.1 { return Ok(*last_watts); }
|
||||
let watts = (delta_e as f32 / 1_000_000.0) / delta_t;
|
||||
*last_energy = (e2, t2);
|
||||
*last_watts = watts;
|
||||
@@ -236,12 +228,27 @@ impl SensorBus for DellXps9380Sal {
|
||||
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
||||
return Ok(self.last_fans.lock().unwrap().clone());
|
||||
}
|
||||
|
||||
let mut fans = Vec::new();
|
||||
for path in &self.fan_paths {
|
||||
if let Ok(s) = fs::read_to_string(path) {
|
||||
if let Ok(rpm) = s.trim().parse::<u32>() { fans.push(rpm); }
|
||||
let mut val = 0;
|
||||
for i in 0..5 {
|
||||
match fs::read_to_string(path) {
|
||||
Ok(s) => {
|
||||
if let Ok(rpm) = s.trim().parse::<u32>() {
|
||||
val = rpm;
|
||||
if rpm > 0 { break; }
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
debug!("SAL: Fan poll retry {} for {:?} failed: {}", i+1, path, e);
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_millis(150));
|
||||
}
|
||||
fans.push(val);
|
||||
}
|
||||
|
||||
*self.last_fans.lock().unwrap() = fans.clone();
|
||||
*last_poll = now;
|
||||
Ok(fans)
|
||||
@@ -253,7 +260,6 @@ impl SensorBus for DellXps9380Sal {
|
||||
}
|
||||
|
||||
/// Reports the thermal-throttling flag read from the CPU.
///
/// Reads MSR `0x19C` through `read_msr` and returns `true` when bit 0 is set.
///
/// # Errors
///
/// Propagates any error returned by `read_msr`.
fn get_throttling_status(&self) -> Result<bool> {
    // MSR 0x19C bit 0 is "Thermal Status", bit 1 is "Thermal Log".
    // Only bit 0 is tested here; bit 1 is ignored.
    let val = self.read_msr(0x19C)?;
    Ok((val & 0x1) != 0)
}
|
||||
@@ -266,24 +272,47 @@ impl ActuatorBus for DellXps9380Sal {
|
||||
let tool_str = tool_path.to_string_lossy();
|
||||
|
||||
match mode {
|
||||
"max" | "Manual" => { self.ctx.runner.run(&tool_str, &["0"])?; }
|
||||
"max" | "Manual" => {
|
||||
self.ctx.runner.run(&tool_str, &["0"])?;
|
||||
// Disabling BIOS control requires immediate PWM override
|
||||
self.set_fan_speed(FanSpeedPercent::new(100)?)?;
|
||||
}
|
||||
"auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; }
|
||||
_ => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> {
|
||||
let pwm_val = ((speed.get() as u32 * 255) / 100) as u8;
|
||||
for p in &self.pwm_enable_paths { let _ = fs::write(p, "1"); }
|
||||
for path in &self.pwm_paths { let _ = fs::write(path, pwm_val.to_string()); }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
fs::write(&self.pl1_path, limit.as_u64().to_string())?;
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for path in &self.pl1_paths {
|
||||
debug!("SAL: Applying PL1 ({:.1}W) to {:?}", limit.get(), path);
|
||||
fs::write(path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL1 to {:?}", path))?;
|
||||
if let Some(parent) = path.parent() {
|
||||
let enable_p = parent.join("constraint_0_enabled");
|
||||
let _ = fs::write(&enable_p, "1");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
fs::write(&self.pl2_path, limit.as_u64().to_string())?;
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for path in &self.pl2_paths {
|
||||
debug!("SAL: Applying PL2 ({:.1}W) to {:?}", limit.get(), path);
|
||||
fs::write(path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL2 to {:?}", path))?;
|
||||
if let Some(parent) = path.parent() {
|
||||
let enable_p = parent.join("constraint_1_enabled");
|
||||
let _ = fs::write(&enable_p, "1");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -305,7 +334,5 @@ impl HardwareWatchdog for DellXps9380Sal {
|
||||
}
|
||||
|
||||
impl Drop for DellXps9380Sal {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.restore();
|
||||
}
|
||||
fn drop(&mut self) { }
|
||||
}
|
||||
|
||||
148
src/sal/discovery.rs
Normal file
148
src/sal/discovery.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
//! # Hardware Discovery Engine (Agent Sentinel)
|
||||
//!
|
||||
//! This module provides dynamic traversal of `/sys/class/hwmon` and `/sys/class/powercap`
|
||||
//! to locate sensors and actuators without relying on hardcoded indices.
|
||||
|
||||
use anyhow::{Result, Context, anyhow};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Result of a successful hardware discovery.
///
/// Produced by [`DiscoveryEngine::run`]. Every path is located underneath the sysfs root
/// that was handed to that call.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveredHardware {
    /// Path to the selected `temp*_input` node (the highest-scoring one that was found).
    pub temp_input: PathBuf,
    /// Paths to all detected fan RPM inputs (`fan*_input`).
    pub fan_inputs: Vec<PathBuf>,
    /// Paths to all detected fan PWM control nodes (`pwm*` names without an underscore).
    pub pwm_controls: Vec<PathBuf>,
    /// Paths to all detected fan PWM enable nodes (`pwm*_enable`).
    pub pwm_enables: Vec<PathBuf>,
    /// Paths to the matching powercap entries. These are the directories whose `name`
    /// file matched, not individual constraint files; callers join the constraint file
    /// name onto them.
    pub rapl_paths: Vec<PathBuf>,
}
|
||||
|
||||
/// Namespace type for sysfs hardware discovery. It carries no state; its entry point is
/// [`DiscoveryEngine::run`].
pub struct DiscoveryEngine;
|
||||
|
||||
impl DiscoveryEngine {
|
||||
/// Performs a full traversal of the sysfs hardware tree.
|
||||
pub fn run(sysfs_root: &Path) -> Result<DiscoveredHardware> {
|
||||
info!("Sentinel: Starting dynamic hardware discovery...");
|
||||
|
||||
let hwmon_path = sysfs_root.join("sys/class/hwmon");
|
||||
let (temp_input, fan_info) = Self::discover_hwmon(&hwmon_path)?;
|
||||
|
||||
let powercap_path = sysfs_root.join("sys/class/powercap");
|
||||
let rapl_paths = Self::discover_rapl(&powercap_path)?;
|
||||
|
||||
let hardware = DiscoveredHardware {
|
||||
temp_input,
|
||||
fan_inputs: fan_info.rpm_inputs,
|
||||
pwm_controls: fan_info.pwm_controls,
|
||||
pwm_enables: fan_info.pwm_enables,
|
||||
rapl_paths,
|
||||
};
|
||||
|
||||
info!("Sentinel: Discovery complete. Found {} fans and {} RAPL nodes.",
|
||||
hardware.fan_inputs.len(), hardware.rapl_paths.len());
|
||||
|
||||
Ok(hardware)
|
||||
}
|
||||
|
||||
fn discover_hwmon(base: &Path) -> Result<(PathBuf, FanHardware)> {
|
||||
let mut best_temp: Option<(u32, PathBuf)> = None;
|
||||
let mut fans = FanHardware::default();
|
||||
|
||||
let entries = fs::read_dir(base)
|
||||
.with_context(|| format!("Failed to read hwmon base: {:?}", base))?;
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let driver_name = fs::read_to_string(path.join("name"))
|
||||
.map(|s| s.trim().to_string())
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
|
||||
debug!("Discovery: Probing hwmon node {:?} (driver: {})", path, driver_name);
|
||||
|
||||
// 1. Temperature Discovery
|
||||
let temp_priority = match driver_name.as_str() {
|
||||
"coretemp" | "zenpower" => 10,
|
||||
"k10temp" => 9,
|
||||
"dell_smm" => 8,
|
||||
"acpitz" => 1,
|
||||
_ => 5,
|
||||
};
|
||||
|
||||
if let Ok(hw_entries) = fs::read_dir(&path) {
|
||||
for hw_entry in hw_entries.flatten() {
|
||||
let file_name = hw_entry.file_name().to_string_lossy().to_string();
|
||||
|
||||
// Temperature Inputs
|
||||
if file_name.starts_with("temp") && file_name.ends_with("_input") {
|
||||
let label_path = path.join(file_name.replace("_input", "_label"));
|
||||
let label = fs::read_to_string(label_path).unwrap_or_default().trim().to_string();
|
||||
|
||||
let label_priority = if label.contains("Package") || label.contains("Tdie") {
|
||||
2
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let total_priority = temp_priority + label_priority;
|
||||
if best_temp.is_none() || total_priority > best_temp.as_ref().unwrap().0 {
|
||||
best_temp = Some((total_priority, hw_entry.path()));
|
||||
}
|
||||
}
|
||||
|
||||
// Fan Inputs
|
||||
if file_name.starts_with("fan") && file_name.ends_with("_input") {
|
||||
fans.rpm_inputs.push(hw_entry.path());
|
||||
}
|
||||
|
||||
// PWM Controls
|
||||
if file_name.starts_with("pwm") && !file_name.contains("_") {
|
||||
fans.pwm_controls.push(hw_entry.path());
|
||||
}
|
||||
|
||||
// PWM Enables
|
||||
if file_name.starts_with("pwm") && file_name.ends_with("_enable") {
|
||||
fans.pwm_enables.push(hw_entry.path());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let temp_input = best_temp.map(|(_, p)| p)
|
||||
.ok_or_else(|| anyhow!("Failed to locate any valid temperature sensor in /sys/class/hwmon/"))?;
|
||||
|
||||
Ok((temp_input, fans))
|
||||
}
|
||||
|
||||
fn discover_rapl(base: &Path) -> Result<Vec<PathBuf>> {
|
||||
let mut paths = Vec::new();
|
||||
if !base.exists() {
|
||||
warn!("Discovery: /sys/class/powercap does not exist.");
|
||||
return Ok(paths);
|
||||
}
|
||||
|
||||
let entries = fs::read_dir(base)?;
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let name = fs::read_to_string(path.join("name")).unwrap_or_default().trim().to_string();
|
||||
|
||||
if name.contains("package") || name.contains("intel-rapl") {
|
||||
paths.push(path);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(paths)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fan-related hwmon nodes accumulated during a scan.
///
/// The three lists are filled independently of each other, so an element at a given
/// index in one list does not necessarily belong to the same fan as the element at that
/// index in another.
#[derive(Debug, Default)]
struct FanHardware {
    /// `fan*_input` RPM sensor paths.
    rpm_inputs: Vec<PathBuf>,
    /// `pwm*` control paths whose file name contains no underscore.
    pwm_controls: Vec<PathBuf>,
    /// `pwm*_enable` paths.
    pwm_enables: Vec<PathBuf>,
}
|
||||
@@ -1,11 +1,12 @@
|
||||
use anyhow::{Result, anyhow};
|
||||
use anyhow::{Result, anyhow, Context};
|
||||
use std::path::{Path};
|
||||
use std::fs;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::sync::Mutex;
|
||||
use std::sync::{Mutex, Arc};
|
||||
use tracing::{debug, warn, info};
|
||||
|
||||
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
|
||||
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
use crate::sal::heuristic::schema::HardwareDb;
|
||||
|
||||
@@ -13,14 +14,9 @@ pub struct GenericLinuxSal {
|
||||
ctx: EnvironmentCtx,
|
||||
fact_sheet: SystemFactSheet,
|
||||
db: HardwareDb,
|
||||
suppressed_services: Mutex<Vec<String>>,
|
||||
last_valid_temp: Mutex<(f32, Instant)>,
|
||||
current_pl1: Mutex<u64>,
|
||||
last_energy: Mutex<(u64, Instant)>,
|
||||
|
||||
// --- Original State for Restoration ---
|
||||
original_pl1: Mutex<Option<u64>>,
|
||||
original_pl2: Mutex<Option<u64>>,
|
||||
}
|
||||
|
||||
impl GenericLinuxSal {
|
||||
@@ -33,14 +29,11 @@ impl GenericLinuxSal {
|
||||
|
||||
Self {
|
||||
db,
|
||||
suppressed_services: Mutex::new(Vec::new()),
|
||||
last_valid_temp: Mutex::new((0.0, Instant::now())),
|
||||
current_pl1: Mutex::new(15_000_000),
|
||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||
fact_sheet: facts,
|
||||
ctx,
|
||||
original_pl1: Mutex::new(None),
|
||||
original_pl2: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,7 +128,6 @@ impl SensorBus for GenericLinuxSal {
|
||||
}
|
||||
|
||||
fn get_throttling_status(&self) -> Result<bool> {
|
||||
// Fallback: check if any cooling device is active (cur_state > 0)
|
||||
let cooling_base = self.ctx.sysfs_base.join("sys/class/thermal");
|
||||
if let Ok(entries) = fs::read_dir(cooling_base) {
|
||||
for entry in entries.flatten() {
|
||||
@@ -168,68 +160,37 @@ impl ActuatorBus for GenericLinuxSal {
|
||||
} else { Ok(()) }
|
||||
}
|
||||
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
|
||||
fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?;
|
||||
*self.current_pl1.lock().unwrap() = limit.as_u64();
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for rapl_path in &self.fact_sheet.rapl_paths {
|
||||
let limit_path = rapl_path.join("constraint_0_power_limit_uw");
|
||||
let enable_path = rapl_path.join("constraint_0_enabled");
|
||||
fs::write(&limit_path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL1 to {:?}", limit_path))?;
|
||||
let _ = fs::write(&enable_path, "1");
|
||||
}
|
||||
*self.current_pl1.lock().unwrap() = limit.as_microwatts();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?;
|
||||
fs::write(rapl_path.join("constraint_1_power_limit_uw"), limit.as_u64().to_string())?;
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for rapl_path in &self.fact_sheet.rapl_paths {
|
||||
let limit_path = rapl_path.join("constraint_1_power_limit_uw");
|
||||
let enable_path = rapl_path.join("constraint_1_enabled");
|
||||
fs::write(&limit_path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL2 to {:?}", limit_path))?;
|
||||
let _ = fs::write(&enable_path, "1");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl EnvironmentGuard for GenericLinuxSal {
|
||||
fn suppress(&self) -> Result<()> {
|
||||
// Snapshot Power Limits
|
||||
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
|
||||
if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) {
|
||||
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
|
||||
}
|
||||
if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) {
|
||||
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
|
||||
}
|
||||
}
|
||||
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for conflict_id in &self.fact_sheet.active_conflicts {
|
||||
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
||||
for service in &conflict.services {
|
||||
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() {
|
||||
let _ = self.ctx.runner.run("systemctl", &["stop", service]);
|
||||
suppressed.push(service.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore(&self) -> Result<()> {
|
||||
// Restore Power Limits
|
||||
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
|
||||
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
|
||||
let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string());
|
||||
}
|
||||
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
|
||||
let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for service in suppressed.drain(..) {
|
||||
let _ = self.ctx.runner.run("systemctl", &["start", &service]);
|
||||
}
|
||||
if self.is_dell() { let _ = self.set_fan_mode("auto"); }
|
||||
Ok(())
|
||||
}
|
||||
fn suppress(&self) -> Result<()> { Ok(()) }
|
||||
fn restore(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
impl HardwareWatchdog for GenericLinuxSal {
|
||||
@@ -245,7 +206,3 @@ impl HardwareWatchdog for GenericLinuxSal {
|
||||
Ok(SafetyStatus::Nominal)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for GenericLinuxSal {
|
||||
fn drop(&mut self) { let _ = self.restore(); }
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::sync::mpsc;
|
||||
use std::collections::HashMap;
|
||||
use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking};
|
||||
use crate::sys::SyscallRunner;
|
||||
use tracing::{debug, warn};
|
||||
use tracing::{debug, warn, info};
|
||||
|
||||
/// Registry of dynamically discovered paths for configs and tools.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
@@ -24,6 +24,7 @@ pub struct SystemFactSheet {
|
||||
pub fan_paths: Vec<PathBuf>,
|
||||
pub rapl_paths: Vec<PathBuf>,
|
||||
pub active_conflicts: Vec<String>,
|
||||
pub conflict_services: Vec<String>,
|
||||
pub paths: PathRegistry,
|
||||
pub bench_config: Option<Benchmarking>,
|
||||
}
|
||||
@@ -44,12 +45,17 @@ pub fn discover_facts(
|
||||
let rapl_paths = discover_rapl(base_path, &discovery.actuators);
|
||||
|
||||
let mut active_conflicts = Vec::new();
|
||||
let mut conflict_services = Vec::new();
|
||||
for conflict in conflicts {
|
||||
let mut found_active = false;
|
||||
for service in &conflict.services {
|
||||
if is_service_active(runner, service) {
|
||||
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
|
||||
active_conflicts.push(conflict.id.clone());
|
||||
break;
|
||||
if !found_active {
|
||||
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
|
||||
active_conflicts.push(conflict.id.clone());
|
||||
found_active = true;
|
||||
}
|
||||
conflict_services.push(service.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -57,13 +63,7 @@ pub fn discover_facts(
|
||||
let paths = discover_paths(base_path, discovery);
|
||||
|
||||
SystemFactSheet {
|
||||
vendor,
|
||||
model,
|
||||
temp_path,
|
||||
fan_paths,
|
||||
rapl_paths,
|
||||
active_conflicts,
|
||||
paths,
|
||||
vendor, model, temp_path, fan_paths, rapl_paths, active_conflicts, conflict_services, paths,
|
||||
bench_config: Some(bench_config),
|
||||
}
|
||||
}
|
||||
@@ -71,7 +71,6 @@ pub fn discover_facts(
|
||||
fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
|
||||
let mut registry = PathRegistry::default();
|
||||
|
||||
// 1. Discover Tools via PATH
|
||||
for (id, binary_name) in &discovery.tools {
|
||||
if let Ok(path) = which::which(binary_name) {
|
||||
debug!("Discovered tool: {} -> {:?}", id, path);
|
||||
@@ -79,7 +78,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Discover Configs via existence check
|
||||
for (id, candidates) in &discovery.configs {
|
||||
for candidate in candidates {
|
||||
let path = if candidate.starts_with('/') {
|
||||
@@ -104,12 +102,11 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
|
||||
registry
|
||||
}
|
||||
|
||||
/// Reads DMI information from sysfs with a safety timeout.
|
||||
fn read_dmi_info(base_path: &Path) -> (String, String) {
|
||||
let vendor = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/sys_vendor"), Duration::from_millis(100))
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
let model = read_sysfs_with_timeout(&base_path.join("sys/class/dmi/id/product_name"), Duration::from_millis(100))
|
||||
.unwrap_or_else(|| "Unknown".to_string());
|
||||
let vendor = fs::read_to_string(base_path.join("sys/class/dmi/id/sys_vendor"))
|
||||
.map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string());
|
||||
let model = fs::read_to_string(base_path.join("sys/class/dmi/id/product_name"))
|
||||
.map(|s| s.trim().to_string()).unwrap_or_else(|_| "Unknown".to_string());
|
||||
(vendor, model)
|
||||
}
|
||||
|
||||
@@ -119,49 +116,62 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
|
||||
let mut fan_candidates = Vec::new();
|
||||
|
||||
let hwmon_base = base_path.join("sys/class/hwmon");
|
||||
let entries = match fs::read_dir(&hwmon_base) {
|
||||
Ok(e) => e,
|
||||
Err(e) => {
|
||||
warn!("Could not read {:?}: {}", hwmon_base, e);
|
||||
return (None, Vec::new());
|
||||
}
|
||||
};
|
||||
let entries = fs::read_dir(&hwmon_base).map_err(|e| {
|
||||
warn!("Could not read {:?}: {}", hwmon_base, e);
|
||||
e
|
||||
}).ok();
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let hwmon_path = entry.path();
|
||||
|
||||
let driver_name = read_sysfs_with_timeout(&hwmon_path.join("name"), Duration::from_millis(100))
|
||||
.unwrap_or_default();
|
||||
if let Some(entries) = entries {
|
||||
for entry in entries.flatten() {
|
||||
let hwmon_path = entry.path();
|
||||
|
||||
// # SAFETY: Read driver name directly. This file is virtual and never blocks.
|
||||
// Using a timeout wrapper here was causing discovery to fail if the thread-pool lagged.
|
||||
let driver_name = fs::read_to_string(hwmon_path.join("name"))
|
||||
.map(|s| s.trim().to_string()).unwrap_or_default();
|
||||
|
||||
let priority = cfg.hwmon_priority
|
||||
.iter()
|
||||
.position(|p| p == &driver_name)
|
||||
.unwrap_or(usize::MAX);
|
||||
let priority = cfg.hwmon_priority
|
||||
.iter()
|
||||
.position(|p| driver_name.contains(p))
|
||||
.unwrap_or(usize::MAX);
|
||||
|
||||
if let Ok(hw_entries) = fs::read_dir(&hwmon_path) {
|
||||
for hw_entry in hw_entries.flatten() {
|
||||
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
if file_name.starts_with("temp") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
|
||||
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
|
||||
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
|
||||
if input_path.exists() {
|
||||
temp_candidates.push((priority, input_path));
|
||||
if let Ok(hw_entries) = fs::read_dir(&hwmon_path) {
|
||||
for hw_entry in hw_entries.flatten() {
|
||||
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
// 1. Temperatures
|
||||
if file_name.starts_with("temp") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) {
|
||||
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
|
||||
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
|
||||
if input_path.exists() {
|
||||
temp_candidates.push((priority, input_path));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if file_name.starts_with("fan") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
|
||||
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
|
||||
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
|
||||
if input_path.exists() {
|
||||
fan_candidates.push((priority, input_path));
|
||||
// 2. Fans (Label Match)
|
||||
if file_name.starts_with("fan") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) {
|
||||
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
|
||||
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
|
||||
if input_path.exists() {
|
||||
debug!("Discovered fan by label: {:?} (priority {})", input_path, priority);
|
||||
fan_candidates.push((priority, input_path));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Fans (Priority Fallback - CRITICAL FOR DELL 9380)
|
||||
// If we found a priority driver (e.g., dell_smm), we take every fan*_input we find.
|
||||
if priority < usize::MAX && file_name.starts_with("fan") && file_name.ends_with("_input") {
|
||||
if !fan_candidates.iter().any(|(_, p)| p == &hw_entry.path()) {
|
||||
info!("Heuristic Discovery: Force-adding unlabeled fan sensor from priority driver '{}': {:?}", driver_name, hw_entry.path());
|
||||
fan_candidates.push((priority, hw_entry.path()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -171,45 +181,45 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
|
||||
fan_candidates.sort_by_key(|(p, _)| *p);
|
||||
|
||||
let best_temp = temp_candidates.first().map(|(_, p)| p.clone());
|
||||
let best_fans = fan_candidates.into_iter().map(|(_, p)| p).collect();
|
||||
let best_fans: Vec<PathBuf> = fan_candidates.into_iter().map(|(_, p)| p).collect();
|
||||
|
||||
if best_fans.is_empty() {
|
||||
warn!("Heuristic Discovery: No fan RPM sensors found.");
|
||||
} else {
|
||||
info!("Heuristic Discovery: Final registry contains {} fan sensors.", best_fans.len());
|
||||
}
|
||||
|
||||
(best_temp, best_fans)
|
||||
}
|
||||
|
||||
/// Discovers RAPL powercap paths.
|
||||
fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec<PathBuf> {
|
||||
let mut paths = Vec::new();
|
||||
let powercap_base = base_path.join("sys/class/powercap");
|
||||
|
||||
let entries = match fs::read_dir(&powercap_base) {
|
||||
Ok(e) => e,
|
||||
Err(_) => return Vec::new(),
|
||||
};
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let dir_name = entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
if cfg.rapl_paths.contains(&dir_name) {
|
||||
paths.push(path);
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(name) = read_sysfs_with_timeout(&path.join("name"), Duration::from_millis(100)) {
|
||||
if cfg.rapl_paths.iter().any(|p| p == &name) {
|
||||
if let Ok(entries) = fs::read_dir(&powercap_base) {
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let dir_name = entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
if cfg.rapl_paths.contains(&dir_name) {
|
||||
paths.push(path);
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(name) = fs::read_to_string(path.join("name")) {
|
||||
if cfg.rapl_paths.iter().any(|p| p == name.trim()) {
|
||||
paths.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
paths
|
||||
}
|
||||
|
||||
/// Checks if a systemd service is currently active using the injected runner.
|
||||
pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool {
|
||||
runner.run("systemctl", &["is-active", "--quiet", service]).is_ok()
|
||||
}
|
||||
|
||||
/// Helper to read a sysfs file with a timeout.
|
||||
fn read_sysfs_with_timeout(path: &Path, timeout: Duration) -> Option<String> {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
let path_buf = path.to_path_buf();
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
|
||||
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use anyhow::Result;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct MockSal {
|
||||
pub temperature_sequence: std::sync::atomic::AtomicUsize,
|
||||
@@ -17,65 +18,36 @@ impl MockSal {
|
||||
impl PreflightAuditor for MockSal {
|
||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
||||
let steps = vec![
|
||||
AuditStep {
|
||||
description: "Mock Root Privileges".to_string(),
|
||||
outcome: Ok(()),
|
||||
},
|
||||
AuditStep {
|
||||
description: "Mock AC Power Status".to_string(),
|
||||
outcome: Ok(()),
|
||||
},
|
||||
AuditStep { description: "Mock Root Privileges".to_string(), outcome: Ok(()) },
|
||||
AuditStep { description: "Mock AC Power Status".to_string(), outcome: Ok(()) },
|
||||
];
|
||||
Box::new(steps.into_iter())
|
||||
}
|
||||
}
|
||||
|
||||
impl EnvironmentGuard for MockSal {
    /// No-op: the mock has no environment to suppress.
    fn suppress(&self) -> Result<()> { Ok(()) }

    /// No-op: the mock has nothing to restore.
    fn restore(&self) -> Result<()> { Ok(()) }
}
|
||||
|
||||
impl SensorBus for MockSal {
    /// Returns a synthetic temperature that rises with every call.
    ///
    /// Each call advances `temperature_sequence` by one. The reading starts at
    /// 40.0 and grows by 0.5 per call, with the added amount capped at 55.0
    /// (so the reading never exceeds 95.0).
    fn get_temp(&self) -> Result<f32> {
        // Support dynamic sequence for Step 5
        let seq = self.temperature_sequence.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
        Ok(40.0 + (seq as f32 * 0.5).min(55.0))
    }

    /// Returns a constant 15.0 W.
    fn get_power_w(&self) -> Result<f32> { Ok(15.0) }

    /// Returns constant readings for two fans.
    fn get_fan_rpms(&self) -> Result<Vec<u32>> { Ok(vec![2500, 2400]) }

    /// Returns a constant 3200.0 MHz.
    fn get_freq_mhz(&self) -> Result<f32> { Ok(3200.0) }

    /// Always reports "not throttling"; unlike `get_temp`, this does not
    /// advance the temperature sequence.
    fn get_throttling_status(&self) -> Result<bool> { Ok(false) }
}
|
||||
|
||||
impl ActuatorBus for MockSal {
|
||||
fn set_fan_mode(&self, _mode: &str) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn set_burst_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) }
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) }
|
||||
fn set_sustained_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) }
|
||||
fn set_burst_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
impl HardwareWatchdog for MockSal {
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||
Ok(SafetyStatus::Nominal)
|
||||
}
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus> { Ok(SafetyStatus::Nominal) }
|
||||
}
|
||||
|
||||
@@ -4,3 +4,4 @@ pub mod dell_xps_9380;
|
||||
pub mod generic_linux;
|
||||
pub mod heuristic;
|
||||
pub mod safety;
|
||||
pub mod discovery;
|
||||
|
||||
@@ -8,68 +8,81 @@ use anyhow::{Result, bail, Context};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{PathBuf};
|
||||
use tracing::{info, warn, error};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
use tracing::{info, warn, error, debug};
|
||||
|
||||
use crate::sal::traits::SensorBus;
|
||||
|
||||
// --- 1. Type-Driven Bounds Checking ---
|
||||
|
||||
/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TdpLimitMicroWatts(u64);
|
||||
/// Represents a validated TDP limit in Watts.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
|
||||
pub struct PowerLimitWatts(f32);
|
||||
|
||||
impl TdpLimitMicroWatts {
|
||||
/// # SAFETY:
|
||||
/// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability.
|
||||
pub const MIN_SAFE_UW: u64 = 5_000_000;
|
||||
/// # SAFETY:
|
||||
/// Values above 80W can exceed the thermal and electrical design limits of XPS chassis.
|
||||
pub const MAX_SAFE_UW: u64 = 80_000_000;
|
||||
impl PowerLimitWatts {
|
||||
/// Absolute safety floor. Setting TDP below 3W can induce system-wide
|
||||
/// CPU stalls and I/O deadlocks on certain Intel mobile chipsets.
|
||||
pub const MIN: f32 = 3.0;
|
||||
/// Safety ceiling for mobile thin-and-light chassis.
|
||||
pub const MAX: f32 = 100.0;
|
||||
|
||||
/// Validates and constructs a new TDP limit.
|
||||
pub fn new(microwatts: u64) -> Result<Self> {
|
||||
if microwatts < Self::MIN_SAFE_UW {
|
||||
bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts);
|
||||
/// Validates and constructs a new PowerLimitWatts.
|
||||
pub fn try_new(watts: f32) -> Result<Self> {
|
||||
if watts < Self::MIN || watts > Self::MAX {
|
||||
bail!("HardwareSafetyError: Requested TDP {:.1}W is outside safe bounds ({:.1}W - {:.1}W).", watts, Self::MIN, Self::MAX);
|
||||
}
|
||||
if microwatts > Self::MAX_SAFE_UW {
|
||||
bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts);
|
||||
}
|
||||
Ok(Self(microwatts))
|
||||
Ok(Self(watts))
|
||||
}
|
||||
|
||||
pub fn from_watts(watts: f32) -> Result<Self> {
|
||||
Self::new((watts * 1_000_000.0) as u64)
|
||||
Self::try_new(watts)
|
||||
}
|
||||
|
||||
pub fn as_u64(&self) -> u64 { self.0 }
|
||||
pub fn get(&self) -> f32 { self.0 }
|
||||
pub fn as_microwatts(&self) -> u64 { (self.0 * 1_000_000.0) as u64 }
|
||||
}
|
||||
|
||||
/// Represents a fan speed percentage (0-100%).
|
||||
/// Represents a validated fan speed percentage.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct FanSpeedPercentage(u8);
|
||||
pub struct FanSpeedPercent(u8);
|
||||
|
||||
impl FanSpeedPercentage {
|
||||
pub fn new(percent: u8) -> Result<Self> {
|
||||
impl FanSpeedPercent {
|
||||
pub fn try_new(percent: u8) -> Result<Self> {
|
||||
if percent > 100 {
|
||||
bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent);
|
||||
}
|
||||
Ok(Self(percent))
|
||||
}
|
||||
pub fn as_u8(&self) -> u8 { self.0 }
|
||||
|
||||
pub fn new(percent: u8) -> Result<Self> {
|
||||
Self::try_new(percent)
|
||||
}
|
||||
|
||||
pub fn get(&self) -> u8 { self.0 }
|
||||
}
|
||||
|
||||
/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C).
|
||||
/// Represents a thermal threshold in Celsius.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
|
||||
pub struct ThermalThresholdCelsius(f32);
|
||||
|
||||
impl ThermalThresholdCelsius {
|
||||
pub const MAX_SAFE_C: f32 = 98.0;
|
||||
|
||||
pub fn new(celsius: f32) -> Result<Self> {
|
||||
pub fn try_new(celsius: f32) -> Result<Self> {
|
||||
if celsius > Self::MAX_SAFE_C {
|
||||
bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius);
|
||||
bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit ({}C).", celsius, Self::MAX_SAFE_C);
|
||||
}
|
||||
Ok(Self(celsius))
|
||||
}
|
||||
pub fn as_f32(&self) -> f32 { self.0 }
|
||||
|
||||
pub fn new(celsius: f32) -> Result<Self> {
|
||||
Self::try_new(celsius)
|
||||
}
|
||||
|
||||
pub fn get(&self) -> f32 { self.0 }
|
||||
}
|
||||
|
||||
// --- 2. The HardwareStateGuard (RAII Restorer) ---
|
||||
@@ -78,6 +91,7 @@ impl ThermalThresholdCelsius {
|
||||
pub type RollbackAction = Box<dyn FnOnce() + Send + 'static>;
|
||||
|
||||
/// Holds a snapshot of the system state. Restores everything on Drop.
|
||||
/// This is the primary safety mechanism for Project Iron-Ember.
|
||||
pub struct HardwareStateGuard {
|
||||
/// Maps sysfs paths to their original string contents.
|
||||
snapshots: HashMap<PathBuf, String>,
|
||||
@@ -90,6 +104,9 @@ pub struct HardwareStateGuard {
|
||||
|
||||
impl HardwareStateGuard {
|
||||
/// Snapshots the requested files and neutralizes competing services.
|
||||
///
|
||||
/// # SAFETY:
|
||||
/// This MUST be acquired before any hardware mutation occurs.
|
||||
pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result<Self> {
|
||||
let mut snapshots = HashMap::new();
|
||||
let mut suppressed = Vec::new();
|
||||
@@ -101,10 +118,13 @@ impl HardwareStateGuard {
|
||||
let content = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to snapshot {:?}", path))?;
|
||||
snapshots.insert(path.clone(), content.trim().to_string());
|
||||
} else {
|
||||
debug!("USA: Skipping snapshot for non-existent path {:?}", path);
|
||||
}
|
||||
}
|
||||
|
||||
for svc in target_services {
|
||||
// Check if service is active before stopping
|
||||
let status = std::process::Command::new("systemctl")
|
||||
.args(["is-active", "--quiet", svc])
|
||||
.status();
|
||||
@@ -168,7 +188,75 @@ impl Drop for HardwareStateGuard {
|
||||
}
|
||||
}
|
||||
|
||||
// --- 3. Transactional Configuration ---
|
||||
// --- 3. The Active Watchdog ---
|
||||
|
||||
/// A standalone monitor that polls hardware thermals at high frequency.
|
||||
pub struct ThermalWatchdog {
|
||||
cancel_token: Arc<AtomicBool>,
|
||||
handle: Option<thread::JoinHandle<()>>,
|
||||
}
|
||||
|
||||
impl ThermalWatchdog {
|
||||
/// If temperature exceeds this ceiling, the watchdog triggers an emergency shutdown.
|
||||
pub const CRITICAL_TEMP: f32 = 95.0;
|
||||
/// High polling rate ensures we catch runaways before chassis saturation.
|
||||
pub const POLL_INTERVAL: Duration = Duration::from_millis(250);
|
||||
|
||||
/// Spawns the watchdog thread.
|
||||
pub fn spawn(sensors: Arc<dyn SensorBus>, cancel_token: Arc<AtomicBool>) -> Self {
|
||||
let ct = cancel_token.clone();
|
||||
let handle = thread::spawn(move || {
|
||||
let mut last_temp = 0.0;
|
||||
loop {
|
||||
if ct.load(Ordering::SeqCst) {
|
||||
debug!("Watchdog: Shutdown signal received.");
|
||||
break;
|
||||
}
|
||||
|
||||
match sensors.get_temp() {
|
||||
Ok(temp) => {
|
||||
// Rate of change check (dT/dt)
|
||||
let dt_dt = temp - last_temp;
|
||||
if temp >= Self::CRITICAL_TEMP {
|
||||
error!("WATCHDOG: CRITICAL THERMAL EVENT ({:.1}C). Triggering emergency abort!", temp);
|
||||
ct.store(true, Ordering::SeqCst);
|
||||
break;
|
||||
}
|
||||
|
||||
if dt_dt > 5.0 && temp > 85.0 {
|
||||
warn!("WATCHDOG: Dangerous thermal ramp detected (+{:.1}C in 250ms).", dt_dt);
|
||||
}
|
||||
|
||||
last_temp = temp;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("WATCHDOG: Sensor read failure: {}. Aborting for safety!", e);
|
||||
ct.store(true, Ordering::SeqCst);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
thread::sleep(Self::POLL_INTERVAL);
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
cancel_token,
|
||||
handle: Some(handle),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ThermalWatchdog {
|
||||
fn drop(&mut self) {
|
||||
self.cancel_token.store(true, Ordering::SeqCst);
|
||||
if let Some(h) = self.handle.take() {
|
||||
let _ = h.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- 4. Transactional Configuration ---
|
||||
|
||||
/// A staged set of changes to be applied to the hardware.
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -115,30 +115,20 @@ impl<T: EnvironmentGuard + ?Sized> EnvironmentGuard for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
|
||||
/// Provides a read-only interface to system telemetry sensors.
|
||||
pub trait SensorBus: Send + Sync {
|
||||
/// Returns the current package temperature in degrees Celsius.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read.
|
||||
fn get_temp(&self) -> Result<f32>;
|
||||
|
||||
/// Returns the current package power consumption in Watts.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the underlying RAPL or power sensor cannot be read.
|
||||
fn get_power_w(&self) -> Result<f32>;
|
||||
|
||||
/// Returns the current speed of all detected fans in RPM.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the fan sensor nodes cannot be read.
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>>;
|
||||
|
||||
/// Returns the current average CPU frequency in MHz.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read.
|
||||
fn get_freq_mhz(&self) -> Result<f32>;
|
||||
|
||||
/// Returns true if the system is currently thermally throttling.
|
||||
@@ -146,53 +136,33 @@ pub trait SensorBus: Send + Sync {
|
||||
}
|
||||
|
||||
/// Forwards every `SensorBus` call to the value inside the `Arc`, so a shared
/// handle can be used wherever a sensor bus is expected.
impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
    fn get_temp(&self) -> Result<f32> { (**self).get_temp() }
    fn get_power_w(&self) -> Result<f32> { (**self).get_power_w() }
    fn get_fan_rpms(&self) -> Result<Vec<u32>> { (**self).get_fan_rpms() }
    fn get_freq_mhz(&self) -> Result<f32> { (**self).get_freq_mhz() }
    fn get_throttling_status(&self) -> Result<bool> { (**self).get_throttling_status() }
}
|
||||
|
||||
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
|
||||
|
||||
/// Provides a write-only interface for hardware actuators.
|
||||
pub trait ActuatorBus: Send + Sync {
|
||||
/// Sets the fan control mode (e.g., "auto" or "max").
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()>;
|
||||
|
||||
/// Sets the fan speed directly using a validated percentage.
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>;
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()>;
|
||||
|
||||
/// Sets the sustained power limit (PL1) using a validated wrapper.
|
||||
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()>;
|
||||
|
||||
/// Sets the burst power limit (PL2) using a validated wrapper.
|
||||
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()>;
|
||||
}
|
||||
|
||||
impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
||||
(**self).set_fan_mode(mode)
|
||||
}
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> {
|
||||
(**self).set_fan_speed(speed)
|
||||
}
|
||||
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
(**self).set_sustained_power_limit(limit)
|
||||
}
|
||||
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
|
||||
(**self).set_burst_power_limit(limit)
|
||||
}
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) }
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { (**self).set_fan_speed(speed) }
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) }
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_burst_power_limit(limit) }
|
||||
}
|
||||
|
||||
/// Represents the high-level safety status of the system.
|
||||
|
||||
Reference in New Issue
Block a user