implemented more safeguards and autodiscovery

This commit is contained in:
2026-02-27 02:59:23 +01:00
parent f0925a3ab3
commit fe1f58b5ce
7 changed files with 248 additions and 156 deletions

View File

@@ -1,10 +1,11 @@
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx};
use crate::sal::safety::TdpLimitMicroWatts;
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
use anyhow::{Result, Context, anyhow};
use std::fs;
use std::path::{PathBuf};
use std::time::{Duration, Instant};
use std::sync::Mutex;
use tracing::{debug};
use crate::sal::heuristic::discovery::SystemFactSheet;
pub struct DellXps9380Sal {
@@ -22,11 +23,6 @@ pub struct DellXps9380Sal {
suppressed_services: Mutex<Vec<String>>,
msr_file: Mutex<fs::File>,
last_energy: Mutex<(u64, Instant)>,
// --- Original State for Restoration ---
original_pl1: Mutex<Option<u64>>,
original_pl2: Mutex<Option<u64>>,
original_fan_mode: Mutex<Option<String>>,
}
impl DellXps9380Sal {
@@ -58,9 +54,6 @@ impl DellXps9380Sal {
last_energy: Mutex::new((initial_energy, Instant::now())),
fact_sheet: facts,
ctx,
original_pl1: Mutex::new(None),
original_pl2: Mutex::new(None),
original_fan_mode: Mutex::new(None),
})
}
@@ -134,23 +127,11 @@ impl PreflightAuditor for DellXps9380Sal {
impl EnvironmentGuard for DellXps9380Sal {
fn suppress(&self) -> Result<()> {
// 1. Snapshot Power Limits
if let Ok(pl1) = fs::read_to_string(&self.pl1_path) {
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
}
if let Ok(pl2) = fs::read_to_string(&self.pl2_path) {
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
}
// 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active)
// We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1)
*self.original_fan_mode.lock().unwrap() = Some("1".to_string());
// 3. Stop Services
let services = ["tlp", "thermald", "i8kmon"];
let mut suppressed = self.suppressed_services.lock().unwrap();
let services = ["tlp", "thermald", "i8kmon"];
for s in services {
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
debug!("Suppressing service: {}", s);
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
suppressed.push(s.to_string());
}
@@ -159,20 +140,6 @@ impl EnvironmentGuard for DellXps9380Sal {
}
fn restore(&self) -> Result<()> {
// 1. Restore Power Limits
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
let _ = fs::write(&self.pl1_path, pl1.to_string());
}
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
let _ = fs::write(&self.pl2_path, pl2.to_string());
}
// 2. Restore Fan Mode (BIOS Control)
if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") {
let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]);
}
// 3. Restart Services
let mut suppressed = self.suppressed_services.lock().unwrap();
for s in suppressed.drain(..) {
let _ = self.ctx.runner.run("systemctl", &["start", &s]);
@@ -196,7 +163,6 @@ impl SensorBus for DellXps9380Sal {
}
fn get_power_w(&self) -> Result<f32> {
// FIX: Ensure we always read from energy_uj if available for delta calculation
let rapl_base = self.pl1_path.parent().context("RAPL path error")?;
let energy_path = rapl_base.join("energy_uj");
@@ -212,7 +178,6 @@ impl SensorBus for DellXps9380Sal {
if delta_t < 0.05 { return Ok(0.0); }
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
} else {
// Fallback to power1_average if it exists (units are µW)
let s = fs::read_to_string(&self.pwr_path)?;
Ok(s.trim().parse::<f32>()? / 1000000.0)
}
@@ -255,6 +220,17 @@ impl ActuatorBus for DellXps9380Sal {
Ok(())
}
/// Applies a validated fan speed request through the `dell_fan_ctrl` helper tool.
///
/// The percentage is reduced to a binary decision: above 50% the tool is
/// invoked with the single argument `"0"`; at or below 50% nothing is executed
/// and the current fan state is left untouched.
/// NOTE(review): `"0"` presumably takes fan control away from the BIOS — confirm
/// against the tool's documentation.
///
/// # Errors
/// Returns an error if the tool is not listed in the fact sheet, or if running
/// the tool fails. A tool failure used to be discarded silently, which made a
/// failed actuation indistinguishable from a successful one.
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> {
    // The lookup happens unconditionally so a missing tool is reported even for
    // requests that would not have invoked it.
    let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl")
        .ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?;

    if speed.as_u8() > 50 {
        let tool_str = tool_path.to_string_lossy();
        self.ctx.runner.run(&tool_str, &["0"])
            .context("Dell fan control tool failed while applying the requested fan speed")?;
    }
    Ok(())
}
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
fs::write(&self.pl1_path, limit.as_u64().to_string())?;
Ok(())

View File

@@ -3,10 +3,9 @@ use std::path::{Path};
use std::fs;
use std::time::{Duration, Instant};
use std::sync::Mutex;
use tracing::{debug};
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
use crate::sal::safety::TdpLimitMicroWatts;
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
use crate::sal::heuristic::discovery::SystemFactSheet;
use crate::sal::heuristic::schema::HardwareDb;
@@ -152,6 +151,10 @@ impl ActuatorBus for GenericLinuxSal {
} else { Ok(()) }
}
/// Accepts a validated fan speed but performs no action.
///
/// The `_speed` argument is ignored and `Ok(())` is always returned.
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
// NOTE(review): no fan actuation is attempted here, so callers see success regardless — confirm this is intended.
Ok(())
}
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
fs::write(rapl_path.join("constraint_0_power_limit_uw"), limit.as_u64().to_string())?;

View File

@@ -1,5 +1,5 @@
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
use crate::sal::safety::TdpLimitMicroWatts;
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
use anyhow::Result;
pub struct MockSal {
@@ -60,6 +60,9 @@ impl ActuatorBus for MockSal {
/// Mock implementation: ignores `_mode` and always reports success.
fn set_fan_mode(&self, _mode: &str) -> Result<()> {
Ok(())
}
/// Mock implementation: ignores `_speed` and always reports success.
fn set_fan_speed(&self, _speed: FanSpeedPercentage) -> Result<()> {
Ok(())
}
/// Mock implementation: ignores `_limit` and always reports success.
fn set_sustained_power_limit(&self, _limit: TdpLimitMicroWatts) -> Result<()> {
Ok(())
}

View File

@@ -1,175 +1,194 @@
//! Universal Safeguard Architecture (USA) and Hardware Primitives.
//!
//! This module provides the `HardwareStateGuard` for guaranteed state
//! restoration and type-safe primitives to prevent dangerous hardware states.
//! # Hardware Safety & Universal Safeguard Architecture
//!
//! This module implements the core safety logic for `ember-tune`. It uses the Rust
//! type system to enforce hardware bounds and RAII patterns to guarantee that
//! the system is restored to a safe state even after a crash.
use anyhow::{Result, bail, Context};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
use std::path::{PathBuf};
use tracing::{info, warn, error};
// --- Type-Driven Safety Primitives ---
// --- 1. Type-Driven Bounds Checking ---
/// Represents a safe TDP limit in microwatts.
/// Represents a TDP limit in microwatts, strictly bounded between 5W and 80W.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct TdpLimitMicroWatts(u64);
impl TdpLimitMicroWatts {
/// Strict bounds to prevent hardware bricking.
pub const MIN_SAFE_UW: u64 = 5_000_000; // 5 Watts
pub const MAX_SAFE_UW: u64 = 80_000_000; // 80 Watts
/// # SAFETY:
/// Values below 5W can cause CPU frequency to drop to 400MHz and induce system instability.
pub const MIN_SAFE_UW: u64 = 5_000_000;
/// # SAFETY:
/// Values above 80W can exceed the thermal and electrical design limits of XPS chassis.
pub const MAX_SAFE_UW: u64 = 80_000_000;
/// Constructs a new TdpLimitMicroWatts, enforcing safety bounds.
///
/// # Errors
/// Returns a `HardwareSafetyError` (via `anyhow::bail`) if the value is out of bounds.
/// Validates and constructs a new TDP limit.
pub fn new(microwatts: u64) -> Result<Self> {
if microwatts < Self::MIN_SAFE_UW {
bail!("HardwareSafetyError: Requested TDP {} uW is below the absolute safety floor of {} uW.", microwatts, Self::MIN_SAFE_UW);
bail!("HardwareSafetyError: Requested TDP {}uW is below safety floor (5W).", microwatts);
}
if microwatts > Self::MAX_SAFE_UW {
bail!("HardwareSafetyError: Requested TDP {} uW exceeds absolute maximum of {} uW.", microwatts, Self::MAX_SAFE_UW);
bail!("HardwareSafetyError: Requested TDP {}uW exceeds safety ceiling (80W).", microwatts);
}
Ok(Self(microwatts))
}
pub fn as_u64(&self) -> u64 {
self.0
}
pub fn as_watts(&self) -> f32 {
self.0 as f32 / 1_000_000.0
pub fn from_watts(watts: f32) -> Result<Self> {
Self::new((watts * 1_000_000.0) as u64)
}
pub fn as_u64(&self) -> u64 { self.0 }
}
/// Represents a safe Fan Speed in Percentage (0-100).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
/// Represents a fan speed percentage (0-100%).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FanSpeedPercentage(u8);
impl FanSpeedPercentage {
/// Constructs a new FanSpeedPercentage, enforcing safety bounds.
pub fn new(percent: u8) -> Result<Self> {
if percent > 100 {
bail!("HardwareSafetyError: Fan speed percentage {} exceeds 100%.", percent);
bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent);
}
Ok(Self(percent))
}
pub fn as_u8(&self) -> u8 {
self.0
}
pub fn as_u8(&self) -> u8 { self.0 }
}
/// Represents a safe Thermal Threshold in Celsius.
/// Represents a thermal threshold in Celsius, bounded to TjMax - 2°C (98°C).
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
pub struct ThermalThresholdCelsius(f32);
impl ThermalThresholdCelsius {
pub const MAX_SAFE_C: f32 = 98.0;
/// Constructs a new ThermalThresholdCelsius, enforcing safety bounds.
pub fn new(celsius: f32) -> Result<Self> {
if celsius < 0.0 || celsius > Self::MAX_SAFE_C {
bail!("HardwareSafetyError: Thermal threshold {}°C is outside safe bounds (0.0 - {}).", celsius, Self::MAX_SAFE_C);
if celsius > Self::MAX_SAFE_C {
bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit (98C).", celsius);
}
Ok(Self(celsius))
}
pub fn as_f32(&self) -> f32 {
self.0
}
pub fn as_f32(&self) -> f32 { self.0 }
}
// --- The HardwareStateGuard (RAII Restorer) ---
// --- 2. The HardwareStateGuard (RAII Restorer) ---
/// Represents a deep snapshot of the system state before benchmarking.
#[derive(Debug, Default, Clone)]
pub struct SystemSnapshot {
/// Maps file paths to their raw string content (e.g., RAPL limits).
pub sysfs_nodes: HashMap<PathBuf, String>,
/// List of services that were active and subsequently stopped.
pub suppressed_services: Vec<String>,
}
/// Defines an arbitrary action to take during restoration.
pub type RollbackAction = Box<dyn FnOnce() + Send + 'static>;
/// The Universal Safeguard wrapper.
///
/// Implements the "Ironclad Restorer" pattern via the [Drop] trait.
/// Holds a snapshot of the system state. Restores everything on Drop.
pub struct HardwareStateGuard {
snapshot: SystemSnapshot,
is_armed: bool,
/// Maps sysfs paths to their original string contents.
snapshots: HashMap<PathBuf, String>,
/// Services that were stopped and must be restarted.
suppressed_services: Vec<String>,
/// Arbitrary actions to perform on restoration (e.g., reset fan mode).
rollback_actions: Vec<RollbackAction>,
is_active: bool,
}
impl HardwareStateGuard {
/// Arms the safeguard by taking a snapshot of the target files and services.
///
/// # Errors
/// Returns an error if any critical sysfs node cannot be read.
/// Snapshots the requested files and neutralizes competing services.
pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result<Self> {
let mut snapshot = SystemSnapshot::default();
let mut snapshots = HashMap::new();
let mut suppressed = Vec::new();
info!("USA: Arming safeguard and snapshotting system state...");
info!("USA: Arming HardwareStateGuard. Snapshotting critical registers...");
for path in target_files {
if path.exists() {
let content = fs::read_to_string(path)
.with_context(|| format!("Failed to snapshot {:?}", path))?;
snapshot.sysfs_nodes.insert(path.clone(), content.trim().to_string());
} else {
warn!("USA: Target node {:?} does not exist, skipping snapshot.", path);
snapshots.insert(path.clone(), content.trim().to_string());
}
}
for service in target_services {
for svc in target_services {
let status = std::process::Command::new("systemctl")
.args(["is-active", "--quiet", service])
.args(["is-active", "--quiet", svc])
.status();
if let Ok(s) = status {
if s.success() {
snapshot.suppressed_services.push(service.clone());
info!("USA: Neutralizing service '{}'", svc);
let _ = std::process::Command::new("systemctl").args(["stop", svc]).status();
suppressed.push(svc.clone());
}
}
}
Ok(Self {
snapshot,
is_armed: true,
snapshots,
suppressed_services: suppressed,
rollback_actions: Vec::new(),
is_active: true,
})
}
/// Explicit manual restoration (can be called upon successful exit).
pub fn release(&mut self) -> Result<()> {
if !self.is_armed {
return Ok(());
}
/// Registers a custom action to be performed when the guard is released.
///
/// The action is appended to the guard's list of rollback actions. `release`
/// drains that list and invokes each action in registration order, after the
/// snapshotted files have been written back and the suppressed services have
/// been started again.
pub fn on_rollback(&mut self, action: RollbackAction) {
self.rollback_actions.push(action);
}
info!("USA: Initiating Ironclad Restoration...");
/// Explicitly release and restore the hardware state.
pub fn release(&mut self) -> Result<()> {
if !self.is_active { return Ok(()); }
info!("USA: Releasing guard. Restoring hardware to pre-flight state...");
// 1. Restore Power/Sysfs states
for (path, content) in &self.snapshot.sysfs_nodes {
for (path, content) in &self.snapshots {
if let Err(e) = fs::write(path, content) {
error!("USA RESTORATION FAILURE: Could not revert {:?}: {}", path, e);
error!("CRITICAL: Failed to restore {:?}: {}", path, e);
}
}
// 2. Restart Services
for service in &self.snapshot.suppressed_services {
let _ = std::process::Command::new("systemctl")
.args(["start", service])
.status();
for svc in &self.suppressed_services {
let _ = std::process::Command::new("systemctl").args(["start", svc]).status();
}
self.is_armed = false;
// 3. Perform Custom Rollback Actions
for action in self.rollback_actions.drain(..) {
(action)();
}
self.is_active = false;
Ok(())
}
}
impl Drop for HardwareStateGuard {
fn drop(&mut self) {
if self.is_armed {
warn!("USA: HardwareStateGuard triggered via Drop (panic/unexpected exit). Reverting system state...");
if self.is_active {
warn!("USA: Guard dropped prematurely (panic/SIGTERM). Force-restoring system...");
let _ = self.release();
}
}
}
// --- 3. Transactional Configuration ---
/// A staged set of changes to be applied to the hardware.
///
/// Changes are only recorded here; nothing is written until `commit` is
/// called, which consumes the transaction.
#[derive(Default)]
pub struct ConfigurationTransaction {
/// Pending writes as `(target path, contents)` pairs, in the order they were added.
changes: Vec<(PathBuf, String)>,
}
impl ConfigurationTransaction {
    /// Stages a write of `value` to `path`.
    ///
    /// The filesystem is not touched until [`commit`](Self::commit) runs.
    pub fn add_change(&mut self, path: PathBuf, value: String) {
        let staged = (path, value);
        self.changes.push(staged);
    }

    /// Writes every staged change to disk, in the order it was added.
    ///
    /// # Errors
    /// Stops at the first write that fails and returns that error, annotated
    /// with the offending path. Writes that already succeeded are not undone
    /// here; any rollback is left to a `HardwareStateGuard` held by the caller.
    pub fn commit(self) -> Result<()> {
        self.changes.into_iter().try_for_each(|(target, payload)| {
            fs::write(&target, payload)
                .with_context(|| format!("Failed to apply change to {:?}", target))
        })
    }
}

View File

@@ -157,26 +157,20 @@ impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
}
}
use crate::sal::safety::TdpLimitMicroWatts;
use crate::sal::safety::{TdpLimitMicroWatts, FanSpeedPercentage};
/// Provides a write-only interface for hardware actuators.
///
/// Every method takes `&self` and the trait requires `Send + Sync`, so an
/// implementation can be shared behind an `Arc`.
pub trait ActuatorBus: Send + Sync {
/// Sets the fan control mode (e.g., "auto" or "max").
///
/// # Errors
/// Returns an error if the fan control command or `sysfs` write fails.
fn set_fan_mode(&self, mode: &str) -> Result<()>;
/// Sets the fan speed directly using a validated percentage.
///
/// How the percentage is applied is implementation-defined: an implementation
/// may treat the request as a no-op or act only above some threshold.
///
/// # Errors
/// Returns an error if the implementation cannot apply the request.
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()>;
/// Sets the sustained power limit (PL1) using a validated wrapper.
///
/// # Errors
/// Returns an error if the RAPL `sysfs` node cannot be written to.
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
/// Sets the burst power limit (PL2) using a validated wrapper.
///
/// # Errors
/// Returns an error if the RAPL `sysfs` node cannot be written to.
fn set_burst_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()>;
}
@@ -184,6 +178,9 @@ impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
fn set_fan_mode(&self, mode: &str) -> Result<()> {
(**self).set_fan_mode(mode)
}
fn set_fan_speed(&self, speed: FanSpeedPercentage) -> Result<()> {
(**self).set_fan_speed(speed)
}
fn set_sustained_power_limit(&self, limit: TdpLimitMicroWatts) -> Result<()> {
(**self).set_sustained_power_limit(limit)
}