updated docs for release
This commit is contained in:
@@ -1,11 +1,21 @@
|
||||
use anyhow::Result;
|
||||
use thiserror::Error;
|
||||
//! Core traits defining the System Abstraction Layer (SAL).
|
||||
//!
|
||||
//! This module provides a set of hardware-agnostic interfaces that the
|
||||
//! `BenchmarkOrchestrator` uses to interact with the underlying system.
|
||||
//! These traits allow `ember-tune` to support diverse hardware by abstracting
|
||||
//! away platform-specific details.
|
||||
|
||||
use miette::Diagnostic;
|
||||
use std::sync::Arc;
|
||||
use std::path::PathBuf;
|
||||
use crate::sys::SyscallRunner;
|
||||
use anyhow::Result;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Context holding OS abstractions (filesystem base and syscall runner).
|
||||
///
|
||||
/// This is injected into SAL implementations to allow for a mocked "virtual"
|
||||
/// environment during testing, preventing `cargo test` from mutating the host system.
|
||||
#[derive(Clone)]
|
||||
pub struct EnvironmentCtx {
|
||||
pub sysfs_base: PathBuf,
|
||||
@@ -13,6 +23,7 @@ pub struct EnvironmentCtx {
|
||||
}
|
||||
|
||||
impl EnvironmentCtx {
|
||||
/// Creates a production-ready context pointing to the real filesystem root.
|
||||
pub fn production() -> Self {
|
||||
Self {
|
||||
sysfs_base: PathBuf::from("/"),
|
||||
@@ -21,41 +32,52 @@ impl EnvironmentCtx {
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors that can occur during the pre-flight system audit.
|
||||
#[derive(Error, Diagnostic, Debug, Clone)]
|
||||
pub enum AuditError {
|
||||
/// The user does not have root privileges (`uid=0`).
|
||||
#[error("Missing root privileges.")]
|
||||
#[diagnostic(code(ember_tune::root_required), severity(error))]
|
||||
#[help("ember-tune requires direct hardware access (MSRs, sysfs). Please run with 'sudo'.")]
|
||||
RootRequired,
|
||||
|
||||
/// A required kernel parameter is missing from the boot command line.
|
||||
#[error("Missing kernel parameter: {0}")]
|
||||
#[diagnostic(code(ember_tune::missing_kernel_param), severity(error))]
|
||||
#[help("Add '{0}' to your GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub, then run 'sudo update-grub' and reboot.")]
|
||||
MissingKernelParam(String),
|
||||
|
||||
/// The system is running on battery power.
|
||||
#[error("System is running on battery: {0}")]
|
||||
#[diagnostic(code(ember_tune::ac_power_missing), severity(error))]
|
||||
#[help("Thermal benchmarking requires a stable AC power source to ensure consistent PL limits. Please plug in your charger.")]
|
||||
AcPowerMissing(String),
|
||||
|
||||
/// The Linux kernel version is known to be incompatible.
|
||||
#[error("Incompatible kernel version: {0}")]
|
||||
#[diagnostic(code(ember_tune::kernel_incompatible), severity(error))]
|
||||
#[help("Your kernel version '{0}' may not support the required RAPL or SMM interfaces. Please upgrade to a recent LTS kernel (6.1+).")]
|
||||
KernelIncompatible(String),
|
||||
|
||||
/// A required kernel module or CLI tool is not available.
|
||||
#[error("Required tool missing: {0}")]
|
||||
#[diagnostic(code(ember_tune::tool_missing), severity(error))]
|
||||
#[help("The utility '{0}' is required for this SAL. Please install it using your package manager (e.g., 'sudo apt install {0}').")]
|
||||
ToolMissing(String),
|
||||
}
|
||||
|
||||
/// A single, verifiable step in the pre-flight audit process.
|
||||
pub struct AuditStep {
|
||||
/// Human-readable description of the check.
|
||||
pub description: String,
|
||||
/// The outcome of the check.
|
||||
pub outcome: Result<(), AuditError>,
|
||||
}
|
||||
|
||||
/// Evaluates immutable system states (e.g., kernel bootline parameters, AC power status).
|
||||
/// Evaluates immutable system states before the benchmark begins.
|
||||
pub trait PreflightAuditor: Send + Sync {
|
||||
/// Returns an iterator of [AuditStep] results.
|
||||
/// This allows the UI to show a live checklist of system verification steps.
|
||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_>;
|
||||
}
|
||||
|
||||
@@ -65,9 +87,22 @@ impl<T: PreflightAuditor + ?Sized> PreflightAuditor for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Suppresses conflicting daemons (tlp, thermald).
|
||||
/// Manages system services that conflict with the benchmark.
|
||||
///
|
||||
/// # Invariants
|
||||
/// The `Drop` trait is *not* used for guaranteed cleanup. The orchestrator must
|
||||
/// explicitly call `restore()` to ensure hardware state is reset.
|
||||
pub trait EnvironmentGuard: Send + Sync {
|
||||
/// Stops any conflicting system daemons (e.g., `tlp`, `thermald`).
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the `systemctl` command fails.
|
||||
fn suppress(&self) -> Result<()>;
|
||||
|
||||
/// Restarts any services that were stopped by `suppress`.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the `systemctl` command fails.
|
||||
fn restore(&self) -> Result<()>;
|
||||
}
|
||||
|
||||
@@ -80,11 +115,30 @@ impl<T: EnvironmentGuard + ?Sized> EnvironmentGuard for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Read-only interface for standardized metrics.
|
||||
/// Provides a read-only interface to system telemetry sensors.
|
||||
pub trait SensorBus: Send + Sync {
|
||||
/// Returns the current package temperature in degrees Celsius.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read.
|
||||
fn get_temp(&self) -> Result<f32>;
|
||||
|
||||
/// Returns the current package power consumption in Watts.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the underlying RAPL or power sensor cannot be read.
|
||||
fn get_power_w(&self) -> Result<f32>;
|
||||
|
||||
/// Returns the current speed of all detected fans in RPM.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the fan sensor nodes cannot be read.
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>>;
|
||||
|
||||
/// Returns the current average CPU frequency in MHz.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read.
|
||||
fn get_freq_mhz(&self) -> Result<f32>;
|
||||
}
|
||||
|
||||
@@ -103,10 +157,24 @@ impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Write-only interface for hardware commands.
|
||||
/// Provides a write-only interface for hardware actuators.
|
||||
pub trait ActuatorBus: Send + Sync {
|
||||
/// Sets the fan control mode (e.g., "auto" or "max").
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the fan control command or `sysfs` write fails.
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()>;
|
||||
|
||||
/// Sets the sustained power limit (PL1) in Watts.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()>;
|
||||
|
||||
/// Sets the burst power limit (PL2) in Watts.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()>;
|
||||
}
|
||||
|
||||
@@ -122,8 +190,27 @@ impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Concurrent monitor for catastrophic states.
|
||||
/// High-level safety verdict for the system, ordered here from benign to fatal.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafetyStatus {
    /// Everything is operating within normal parameters.
    Nominal,
    /// A non-critical issue was detected; it may already have been auto-corrected.
    Warning(String),
    /// A potentially dangerous state was detected that is not yet an emergency.
    Critical(String),
    /// A critical failure occurred and the benchmark must be shut down immediately.
    EmergencyAbort(String),
}
|
||||
|
||||
/// A high-frequency monitor for catastrophic hardware states.
|
||||
pub trait HardwareWatchdog: Send + Sync {
|
||||
/// Returns the current [SafetyStatus] of the system.
|
||||
///
|
||||
/// # Errors
|
||||
/// This method can return an error if a sensor required for a safety check
|
||||
/// (e.g., the thermal sensor) fails to read. The orchestrator must treat
|
||||
/// this as an `EmergencyAbort` condition.
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus>;
|
||||
}
|
||||
|
||||
@@ -133,15 +220,10 @@ impl<T: HardwareWatchdog + ?Sized> HardwareWatchdog for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): second, undocumented `SafetyStatus` definition with the same
// four variants as the documented enum that appears earlier in this listing.
// It looks like the removed side of the `-133,15 +220,10` hunk — confirm it is
// not meant to coexist with the documented one, since two definitions of the
// same name in one module do not compile.
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum SafetyStatus {
|
||||
Nominal,
|
||||
Warning(String),
|
||||
Critical(String),
|
||||
EmergencyAbort(String),
|
||||
}
|
||||
|
||||
/// Aggregate trait for a complete platform implementation.
|
||||
///
|
||||
/// This "super-trait" combines all SAL interfaces into a single object-safe
|
||||
/// trait, simplifying dependency injection into the `BenchmarkOrchestrator`.
|
||||
pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {}
|
||||
|
||||
impl<T: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog + ?Sized> PlatformSal for T {}
|
||||
|
||||
Reference in New Issue
Block a user