updated docs for release

This commit is contained in:
2026-02-26 17:17:57 +01:00
parent 667d94af7a
commit f4656619be
10 changed files with 335 additions and 81 deletions

View File

@@ -1,11 +1,21 @@
use anyhow::Result;
use thiserror::Error;
//! Core traits defining the System Abstraction Layer (SAL).
//!
//! This module provides a set of hardware-agnostic interfaces that the
//! `BenchmarkOrchestrator` uses to interact with the underlying system.
//! These traits allow `ember-tune` to support diverse hardware by abstracting
//! away platform-specific details.
use miette::Diagnostic;
use std::sync::Arc;
use std::path::PathBuf;
use crate::sys::SyscallRunner;
use anyhow::Result;
use thiserror::Error;
/// Context holding OS abstractions (filesystem base and syscall runner).
///
/// This is injected into SAL implementations to allow for a mocked "virtual"
/// environment during testing, preventing `cargo test` from mutating the host system.
#[derive(Clone)]
pub struct EnvironmentCtx {
pub sysfs_base: PathBuf,
@@ -13,6 +23,7 @@ pub struct EnvironmentCtx {
}
impl EnvironmentCtx {
/// Creates a production-ready context pointing to the real filesystem root.
pub fn production() -> Self {
Self {
sysfs_base: PathBuf::from("/"),
@@ -21,41 +32,52 @@ impl EnvironmentCtx {
}
}
/// Errors that can occur during the pre-flight system audit.
///
/// Every variant carries a diagnostic code, an `error` severity, and a
/// remediation hint. The hint is declared as `help(...)` inside the variant's
/// `#[diagnostic(...)]` attribute, next to `code` and `severity`, so all of the
/// diagnostic metadata for a variant lives in one place.
#[derive(Error, Diagnostic, Debug, Clone)]
pub enum AuditError {
    /// The user does not have root privileges (i.e., is not running as `uid=0`).
    #[error("Missing root privileges.")]
    #[diagnostic(
        code(ember_tune::root_required),
        severity(error),
        help("ember-tune requires direct hardware access (MSRs, sysfs). Please run with 'sudo'.")
    )]
    RootRequired,
    /// A required kernel parameter is missing from the boot command line.
    ///
    /// The payload is the parameter text; it is interpolated into both the
    /// error message and the remediation hint.
    #[error("Missing kernel parameter: {0}")]
    #[diagnostic(
        code(ember_tune::missing_kernel_param),
        severity(error),
        help("Add '{0}' to your GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub, then run 'sudo update-grub' and reboot.")
    )]
    MissingKernelParam(String),
    /// The system is running on battery power.
    ///
    /// The payload is appended to the error message as additional detail.
    #[error("System is running on battery: {0}")]
    #[diagnostic(
        code(ember_tune::ac_power_missing),
        severity(error),
        help("Thermal benchmarking requires a stable AC power source to ensure consistent PL limits. Please plug in your charger.")
    )]
    AcPowerMissing(String),
    /// The Linux kernel version is known to be incompatible.
    ///
    /// The payload is the kernel version; it is interpolated into both the
    /// error message and the remediation hint.
    #[error("Incompatible kernel version: {0}")]
    #[diagnostic(
        code(ember_tune::kernel_incompatible),
        severity(error),
        help("Your kernel version '{0}' may not support the required RAPL or SMM interfaces. Please upgrade to a recent LTS kernel (6.1+).")
    )]
    KernelIncompatible(String),
    /// A required kernel module or CLI tool is not available.
    ///
    /// The payload is the tool name; the remediation hint also uses it as the
    /// package name in its example install command.
    #[error("Required tool missing: {0}")]
    #[diagnostic(
        code(ember_tune::tool_missing),
        severity(error),
        help("The utility '{0}' is required for this SAL. Please install it using your package manager (e.g., 'sudo apt install {0}').")
    )]
    ToolMissing(String),
}
/// A single, verifiable step in the pre-flight audit process.
///
/// Values of this type are the items yielded by [`PreflightAuditor::audit`].
pub struct AuditStep {
/// Human-readable description of the check.
pub description: String,
/// The outcome of the check: `Ok(())`, or the [`AuditError`] produced by
/// the check.
pub outcome: Result<(), AuditError>,
}
/// Evaluates immutable system states (e.g., kernel bootline parameters, AC power
/// status) before the benchmark begins.
///
/// Implementors must be `Send + Sync`.
pub trait PreflightAuditor: Send + Sync {
/// Returns an iterator of [`AuditStep`] results.
///
/// This allows the UI to show a live checklist of system verification steps.
/// The boxed iterator carries the lifetime of `&self` (`'_`), so it may
/// borrow from the auditor and cannot outlive it.
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_>;
}
@@ -65,9 +87,22 @@ impl<T: PreflightAuditor + ?Sized> PreflightAuditor for Arc<T> {
}
}
/// Manages system services that conflict with the benchmark, suppressing
/// conflicting daemons such as `tlp` and `thermald`.
///
/// Implementors must be `Send + Sync`.
///
/// # Invariants
/// The `Drop` trait is *not* used for guaranteed cleanup. The orchestrator must
/// explicitly call `restore()` to ensure hardware state is reset.
pub trait EnvironmentGuard: Send + Sync {
/// Stops any conflicting system daemons (e.g., `tlp`, `thermald`).
///
/// # Errors
/// Returns an error if the `systemctl` command fails.
fn suppress(&self) -> Result<()>;
/// Restarts any services that were stopped by `suppress`.
///
/// # Errors
/// Returns an error if the `systemctl` command fails.
fn restore(&self) -> Result<()>;
}
@@ -80,11 +115,30 @@ impl<T: EnvironmentGuard + ?Sized> EnvironmentGuard for Arc<T> {
}
}
/// Provides a read-only interface to system telemetry sensors, exposing
/// standardized metrics.
///
/// Implementors must be `Send + Sync`. Every method takes `&self` and returns
/// a `Result`, so a failed sensor read is reported to the caller as an error.
pub trait SensorBus: Send + Sync {
/// Returns the current package temperature in degrees Celsius.
///
/// # Errors
/// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read.
fn get_temp(&self) -> Result<f32>;
/// Returns the current package power consumption in Watts.
///
/// # Errors
/// Returns an error if the underlying RAPL or power sensor cannot be read.
fn get_power_w(&self) -> Result<f32>;
/// Returns the current speed of all detected fans in RPM.
///
/// # Errors
/// Returns an error if the fan sensor nodes cannot be read.
fn get_fan_rpms(&self) -> Result<Vec<u32>>;
/// Returns the current average CPU frequency in MHz.
///
/// # Errors
/// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read.
fn get_freq_mhz(&self) -> Result<f32>;
}
@@ -103,10 +157,24 @@ impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
}
}
/// Provides a write-only interface for issuing commands to hardware actuators.
///
/// Implementors must be `Send + Sync`. Every method takes `&self` and returns
/// `Result<()>`, so a failed write is reported to the caller as an error.
pub trait ActuatorBus: Send + Sync {
/// Sets the fan control mode (e.g., "auto" or "max").
///
/// # Errors
/// Returns an error if the fan control command or `sysfs` write fails.
fn set_fan_mode(&self, mode: &str) -> Result<()>;
/// Sets the sustained power limit (PL1) in Watts.
///
/// # Errors
/// Returns an error if the RAPL `sysfs` node cannot be written to.
fn set_sustained_power_limit(&self, watts: f32) -> Result<()>;
/// Sets the burst power limit (PL2) in Watts.
///
/// # Errors
/// Returns an error if the RAPL `sysfs` node cannot be written to.
fn set_burst_power_limit(&self, watts: f32) -> Result<()>;
}
@@ -122,8 +190,27 @@ impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
}
}
/// Represents the high-level safety status of the system, as reported by the
/// concurrent monitor for catastrophic states ([`HardwareWatchdog`]).
///
/// This is the success value of [`HardwareWatchdog::get_safety_status`].
/// NOTE(review): the `String` payloads presumably carry a human-readable
/// explanation of the condition — confirm against the implementors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafetyStatus {
/// The system is operating within normal parameters.
Nominal,
/// A non-critical issue was detected and may have been auto-corrected.
Warning(String),
/// A potentially dangerous state was detected, but is not yet an emergency.
Critical(String),
/// A critical failure has occurred, requiring an immediate shutdown of the benchmark.
EmergencyAbort(String),
}
/// A high-frequency monitor for catastrophic hardware states.
///
/// Implementors must be `Send + Sync`.
pub trait HardwareWatchdog: Send + Sync {
/// Returns the current [`SafetyStatus`] of the system.
///
/// # Errors
/// This method can return an error if a sensor required for a safety check
/// (e.g., the thermal sensor) fails to read. The orchestrator must treat
/// this as an [`SafetyStatus::EmergencyAbort`] condition.
fn get_safety_status(&self) -> Result<SafetyStatus>;
}
@@ -133,15 +220,10 @@ impl<T: HardwareWatchdog + ?Sized> HardwareWatchdog for Arc<T> {
}
}
// NOTE(review): this undocumented `SafetyStatus` definition has the same
// derives and variants as the documented `SafetyStatus` enum that appears
// earlier in this text. It looks like the pre-change copy shown by the diff —
// confirm that only one definition exists in the actual source file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafetyStatus {
Nominal,
Warning(String),
Critical(String),
EmergencyAbort(String),
}
/// Aggregate trait for a complete platform implementation.
///
/// This "super-trait" combines all SAL interfaces into a single object-safe
/// trait, simplifying dependency injection into the `BenchmarkOrchestrator`.
/// It declares no methods of its own; its whole contract is the five
/// supertraits listed below.
pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {}
// Blanket implementation: any type that implements all five SAL traits is
// automatically a `PlatformSal`. The `?Sized` bound lifts the implicit `Sized`
// requirement so the impl also applies to unsized types.
impl<T: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog + ?Sized> PlatformSal for T {}