230 lines
8.3 KiB
Rust
230 lines
8.3 KiB
Rust
//! Core traits defining the System Abstraction Layer (SAL).
//!
//! This module provides a set of hardware-agnostic interfaces that the
//! `BenchmarkOrchestrator` uses to interact with the underlying system.
//! These traits allow `ember-tune` to support diverse hardware by abstracting
//! away platform-specific details.
|
|
|
|
use miette::Diagnostic;
|
|
use std::sync::Arc;
|
|
use std::path::PathBuf;
|
|
use crate::sys::SyscallRunner;
|
|
use anyhow::Result;
|
|
use thiserror::Error;
|
|
|
|
/// Context holding OS abstractions (filesystem base and syscall runner).
|
|
///
|
|
/// This is injected into SAL implementations to allow for a mocked "virtual"
|
|
/// environment during testing, preventing `cargo test` from mutating the host system.
|
|
#[derive(Clone)]
|
|
pub struct EnvironmentCtx {
|
|
pub sysfs_base: PathBuf,
|
|
pub runner: Arc<dyn SyscallRunner>,
|
|
}
|
|
|
|
impl EnvironmentCtx {
|
|
/// Creates a production-ready context pointing to the real filesystem root.
|
|
pub fn production() -> Self {
|
|
Self {
|
|
sysfs_base: PathBuf::from("/"),
|
|
runner: Arc::new(crate::sys::RealSyscallRunner),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Errors that can occur during the pre-flight system audit.
|
|
#[derive(Error, Diagnostic, Debug, Clone)]
|
|
pub enum AuditError {
|
|
/// The user does not have root privileges (`uid=0`).
|
|
#[error("Missing root privileges.")]
|
|
#[diagnostic(code(ember_tune::root_required), severity(error))]
|
|
#[help("ember-tune requires direct hardware access (MSRs, sysfs). Please run with 'sudo'.")]
|
|
RootRequired,
|
|
|
|
/// A required kernel parameter is missing from the boot command line.
|
|
#[error("Missing kernel parameter: {0}")]
|
|
#[diagnostic(code(ember_tune::missing_kernel_param), severity(error))]
|
|
#[help("Add '{0}' to your GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub, then run 'sudo update-grub' and reboot.")]
|
|
MissingKernelParam(String),
|
|
|
|
/// The system is running on battery power.
|
|
#[error("System is running on battery: {0}")]
|
|
#[diagnostic(code(ember_tune::ac_power_missing), severity(error))]
|
|
#[help("Thermal benchmarking requires a stable AC power source to ensure consistent PL limits. Please plug in your charger.")]
|
|
AcPowerMissing(String),
|
|
|
|
/// The Linux kernel version is known to be incompatible.
|
|
#[error("Incompatible kernel version: {0}")]
|
|
#[diagnostic(code(ember_tune::kernel_incompatible), severity(error))]
|
|
#[help("Your kernel version '{0}' may not support the required RAPL or SMM interfaces. Please upgrade to a recent LTS kernel (6.1+).")]
|
|
KernelIncompatible(String),
|
|
|
|
/// A required kernel module or CLI tool is not available.
|
|
#[error("Required tool missing: {0}")]
|
|
#[diagnostic(code(ember_tune::tool_missing), severity(error))]
|
|
#[help("The utility '{0}' is required for this SAL. Please install it using your package manager (e.g., 'sudo apt install {0}').")]
|
|
ToolMissing(String),
|
|
}
|
|
|
|
/// A single, verifiable step in the pre-flight audit process.
|
|
pub struct AuditStep {
|
|
/// Human-readable description of the check.
|
|
pub description: String,
|
|
/// The outcome of the check.
|
|
pub outcome: Result<(), AuditError>,
|
|
}
|
|
|
|
/// Evaluates immutable system states before the benchmark begins.
|
|
pub trait PreflightAuditor: Send + Sync {
|
|
/// Returns an iterator of [AuditStep] results.
|
|
/// This allows the UI to show a live checklist of system verification steps.
|
|
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_>;
|
|
}
|
|
|
|
impl<T: PreflightAuditor + ?Sized> PreflightAuditor for Arc<T> {
|
|
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
|
(**self).audit()
|
|
}
|
|
}
|
|
|
|
/// Manages system services that conflict with the benchmark.
|
|
///
|
|
/// # Invariants
|
|
/// The `Drop` trait is *not* used for guaranteed cleanup. The orchestrator must
|
|
/// explicitly call `restore()` to ensure hardware state is reset.
|
|
pub trait EnvironmentGuard: Send + Sync {
|
|
/// Stops any conflicting system daemons (e.g., `tlp`, `thermald`).
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the `systemctl` command fails.
|
|
fn suppress(&self) -> Result<()>;
|
|
|
|
/// Restarts any services that were stopped by `suppress`.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the `systemctl` command fails.
|
|
fn restore(&self) -> Result<()>;
|
|
}
|
|
|
|
/// Forwarding impl: an `Arc<T>` guards the environment by delegating to the
/// `T` it points to.
impl<T: ?Sized> EnvironmentGuard for Arc<T>
where
    T: EnvironmentGuard,
{
    fn suppress(&self) -> Result<()> {
        // `&Arc<T>` deref-coerces to `&T`.
        T::suppress(self)
    }

    fn restore(&self) -> Result<()> {
        T::restore(self)
    }
}
|
|
|
|
/// Provides a read-only interface to system telemetry sensors.
|
|
pub trait SensorBus: Send + Sync {
|
|
/// Returns the current package temperature in degrees Celsius.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the underlying `hwmon` or `sysfs` node cannot be read.
|
|
fn get_temp(&self) -> Result<f32>;
|
|
|
|
/// Returns the current package power consumption in Watts.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the underlying RAPL or power sensor cannot be read.
|
|
fn get_power_w(&self) -> Result<f32>;
|
|
|
|
/// Returns the current speed of all detected fans in RPM.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the fan sensor nodes cannot be read.
|
|
fn get_fan_rpms(&self) -> Result<Vec<u32>>;
|
|
|
|
/// Returns the current average CPU frequency in MHz.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if `/proc/cpuinfo` or a `cpufreq` sysfs node cannot be read.
|
|
fn get_freq_mhz(&self) -> Result<f32>;
|
|
}
|
|
|
|
/// Forwarding impl: every sensor read on an `Arc<T>` is delegated to the
/// `T` it points to.
impl<T: ?Sized> SensorBus for Arc<T>
where
    T: SensorBus,
{
    fn get_temp(&self) -> Result<f32> {
        // `&Arc<T>` deref-coerces to `&T` in each call below.
        T::get_temp(self)
    }

    fn get_power_w(&self) -> Result<f32> {
        T::get_power_w(self)
    }

    fn get_fan_rpms(&self) -> Result<Vec<u32>> {
        T::get_fan_rpms(self)
    }

    fn get_freq_mhz(&self) -> Result<f32> {
        T::get_freq_mhz(self)
    }
}
|
|
|
|
/// Provides a write-only interface for hardware actuators.
|
|
pub trait ActuatorBus: Send + Sync {
|
|
/// Sets the fan control mode (e.g., "auto" or "max").
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the fan control command or `sysfs` write fails.
|
|
fn set_fan_mode(&self, mode: &str) -> Result<()>;
|
|
|
|
/// Sets the sustained power limit (PL1) in Watts.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
|
fn set_sustained_power_limit(&self, watts: f32) -> Result<()>;
|
|
|
|
/// Sets the burst power limit (PL2) in Watts.
|
|
///
|
|
/// # Errors
|
|
/// Returns an error if the RAPL `sysfs` node cannot be written to.
|
|
fn set_burst_power_limit(&self, watts: f32) -> Result<()>;
|
|
}
|
|
|
|
/// Forwarding impl: every actuator write on an `Arc<T>` is delegated to the
/// `T` it points to.
impl<T: ?Sized> ActuatorBus for Arc<T>
where
    T: ActuatorBus,
{
    fn set_fan_mode(&self, mode: &str) -> Result<()> {
        // `&Arc<T>` deref-coerces to `&T` in each call below.
        T::set_fan_mode(self, mode)
    }

    fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
        T::set_sustained_power_limit(self, watts)
    }

    fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
        T::set_burst_power_limit(self, watts)
    }
}
|
|
|
|
/// High-level safety status of the system.
///
/// The `String` carried by the non-nominal variants is a free-form message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafetyStatus {
    /// Everything is operating within normal parameters.
    Nominal,
    /// A non-critical issue was detected; it may have been auto-corrected.
    Warning(String),
    /// A potentially dangerous state was detected, but it is not yet an emergency.
    Critical(String),
    /// A critical failure occurred and the benchmark must be shut down immediately.
    EmergencyAbort(String),
}
|
|
|
|
/// A high-frequency monitor for catastrophic hardware states.
|
|
pub trait HardwareWatchdog: Send + Sync {
|
|
/// Returns the current [SafetyStatus] of the system.
|
|
///
|
|
/// # Errors
|
|
/// This method can return an error if a sensor required for a safety check
|
|
/// (e.g., the thermal sensor) fails to read. The orchestrator must treat
|
|
/// this as an `EmergencyAbort` condition.
|
|
fn get_safety_status(&self) -> Result<SafetyStatus>;
|
|
}
|
|
|
|
impl<T: HardwareWatchdog + ?Sized> HardwareWatchdog for Arc<T> {
|
|
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
|
(**self).get_safety_status()
|
|
}
|
|
}
|
|
|
|
/// Aggregate trait for a complete platform implementation.
|
|
///
|
|
/// This "super-trait" combines all SAL interfaces into a single object-safe
|
|
/// trait, simplifying dependency injection into the `BenchmarkOrchestrator`.
|
|
pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {}
|
|
|
|
/// Blanket impl: any type (sized or not) that implements all five SAL
/// interfaces is automatically a `PlatformSal`.
impl<T: ?Sized> PlatformSal for T
where
    T: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog,
{
}
|