release/1.2.0 #2
@@ -1,5 +1,5 @@
|
|||||||
[metadata]
|
[metadata]
|
||||||
version = "1.0.0"
|
version = "1.1.0"
|
||||||
updated = "2026-02-26"
|
updated = "2026-02-26"
|
||||||
description = "Hardware and Conflict Database for ember-tune Thermal Engine"
|
description = "Hardware and Conflict Database for ember-tune Thermal Engine"
|
||||||
|
|
||||||
@@ -29,6 +29,14 @@ severity = "Medium"
|
|||||||
fix_action = "SuspendService"
|
fix_action = "SuspendService"
|
||||||
help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification."
|
help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification."
|
||||||
|
|
||||||
|
[[conflicts]]
|
||||||
|
id = "dell_fan_collision"
|
||||||
|
services = ["i8kmon.service"]
|
||||||
|
contention = "Dell SMM Fan Control"
|
||||||
|
severity = "High"
|
||||||
|
fix_action = "SuspendService"
|
||||||
|
help_text = "i8kmon fights with ember-tune for SMM fan duty cycles. Suspend during benchmark."
|
||||||
|
|
||||||
# manufacturer wide logic
|
# manufacturer wide logic
|
||||||
|
|
||||||
[ecosystems.dell]
|
[ecosystems.dell]
|
||||||
@@ -38,6 +46,7 @@ drivers = ["dell_smm_hwmon"]
|
|||||||
fan_manual_mode_cmd = "dell-bios-fan-control 0"
|
fan_manual_mode_cmd = "dell-bios-fan-control 0"
|
||||||
fan_auto_mode_cmd = "dell-bios-fan-control 1"
|
fan_auto_mode_cmd = "dell-bios-fan-control 1"
|
||||||
safety_register = "0x1FC" # BD PROCHOT MSR
|
safety_register = "0x1FC" # BD PROCHOT MSR
|
||||||
|
help_text = "Dell systems often require 'SMM Security Mitigation' disabled in BIOS for fan control."
|
||||||
|
|
||||||
[ecosystems.lenovo]
|
[ecosystems.lenovo]
|
||||||
vendor_regex = "LENOVO"
|
vendor_regex = "LENOVO"
|
||||||
@@ -60,6 +69,13 @@ fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable"
|
|||||||
vendor_regex = "Framework"
|
vendor_regex = "Framework"
|
||||||
ec_tool = "ectool"
|
ec_tool = "ectool"
|
||||||
optimization = "Direct-FFI-SMC"
|
optimization = "Direct-FFI-SMC"
|
||||||
|
polling_cap_ms = 500
|
||||||
|
|
||||||
|
[ecosystems.surface]
|
||||||
|
vendor_regex = "Microsoft Corporation"
|
||||||
|
product_regex = "Surface.*"
|
||||||
|
drivers = ["surface_acpi"]
|
||||||
|
profiles_path = "/sys/bus/platform/devices/surface_performance/platform_profile"
|
||||||
|
|
||||||
# quirks: model quirks and fixes
|
# quirks: model quirks and fixes
|
||||||
|
|
||||||
@@ -85,6 +101,7 @@ id = "asus_fan_hex_support"
|
|||||||
issue = "Custom Hex Curve Interface"
|
issue = "Custom Hex Curve Interface"
|
||||||
target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve"
|
target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve"
|
||||||
format = "HexPair16"
|
format = "HexPair16"
|
||||||
|
action = "ManualFanControlRequired"
|
||||||
|
|
||||||
[[quirks]]
|
[[quirks]]
|
||||||
model_regex = "Spectre x360"
|
model_regex = "Spectre x360"
|
||||||
@@ -92,15 +109,23 @@ id = "hp_rapl_lockout"
|
|||||||
issue = "Hardware MSR Lockout"
|
issue = "Hardware MSR Lockout"
|
||||||
action = "WarnUserMSRLocked"
|
action = "WarnUserMSRLocked"
|
||||||
|
|
||||||
|
[[quirks]]
|
||||||
|
model_regex = "Framework.*"
|
||||||
|
id = "framework_prochot_stuck"
|
||||||
|
issue = "BD PROCHOT wedged at 200MHz"
|
||||||
|
monitor_msr = "0x1FC"
|
||||||
|
reset_bit = 0
|
||||||
|
action = "ClearBitOnSafeTemp"
|
||||||
|
|
||||||
# heuristic discovery
|
# heuristic discovery
|
||||||
|
|
||||||
[discovery.sensors]
|
[discovery.sensors]
|
||||||
temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"]
|
temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature", "Core 0", "Composite"]
|
||||||
fan_labels = ["CPU Fan", "GPU Fan", "System Fan"]
|
fan_labels = ["CPU Fan", "GPU Fan", "System Fan", "Processor Fan"]
|
||||||
hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"]
|
hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm", "thinkpad", "asus"]
|
||||||
|
|
||||||
[discovery.actuators]
|
[discovery.actuators]
|
||||||
rapl_paths = ["intel-rapl:0", "package-0"]
|
rapl_paths = ["intel-rapl:0", "package-0", "intel-rapl:1"]
|
||||||
amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"]
|
amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"]
|
||||||
governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"]
|
governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"]
|
||||||
|
|
||||||
@@ -113,5 +138,10 @@ fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit
|
|||||||
|
|
||||||
[[preflight_checks]]
|
[[preflight_checks]]
|
||||||
name = "Kernel Lockdown Status"
|
name = "Kernel Lockdown Status"
|
||||||
check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'"
|
check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]' || ! [ -f /sys/kernel/security/lockdown ]"
|
||||||
fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module."
|
fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module."
|
||||||
|
|
||||||
|
[[preflight_checks]]
|
||||||
|
name = "Intel P-State Check"
|
||||||
|
check_cmd = "[ -d /sys/devices/system/cpu/intel_pstate ] || [ -d /sys/devices/system/cpu/cpufreq/policy0 ]"
|
||||||
|
fail_help = "CPU Frequency scaling driver not detected. Ensure intel_pstate or acpi-cpufreq is loaded."
|
||||||
|
|||||||
117
assets/hardware_db.toml.bak
Normal file
117
assets/hardware_db.toml.bak
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
[metadata]
|
||||||
|
version = "1.0.0"
|
||||||
|
updated = "2026-02-26"
|
||||||
|
description = "Hardware and Conflict Database for ember-tune Thermal Engine"
|
||||||
|
|
||||||
|
# service collision
|
||||||
|
|
||||||
|
[[conflicts]]
|
||||||
|
id = "tlp_vs_ppd"
|
||||||
|
services = ["tlp.service", "power-profiles-daemon.service"]
|
||||||
|
contention = "ACPI Platform Profile / EPP"
|
||||||
|
severity = "Critical"
|
||||||
|
fix_action = "MaskBoth"
|
||||||
|
help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both to allow ember-tune deterministic control."
|
||||||
|
|
||||||
|
[[conflicts]]
|
||||||
|
id = "thermal_logic_collision"
|
||||||
|
services = ["thermald.service", "throttled.service"]
|
||||||
|
contention = "RAPL / MSR / BD-PROCHOT"
|
||||||
|
severity = "High"
|
||||||
|
fix_action = "SuspendService"
|
||||||
|
help_text = "Thermald and Throttled create a 'register ping-pong' loop. Disable throttled; ember-tune will manage RAPL limits."
|
||||||
|
|
||||||
|
[[conflicts]]
|
||||||
|
id = "freq_scaling_collision"
|
||||||
|
services = ["auto-cpufreq.service"]
|
||||||
|
contention = "CPU Scaling Governor"
|
||||||
|
severity = "Medium"
|
||||||
|
fix_action = "SuspendService"
|
||||||
|
help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification."
|
||||||
|
|
||||||
|
# manufacturer wide logic
|
||||||
|
|
||||||
|
[ecosystems.dell]
|
||||||
|
vendor_regex = "(Dell.*|Precision.*|Latitude.*|XPS.*)"
|
||||||
|
polling_cap_ms = 1000
|
||||||
|
drivers = ["dell_smm_hwmon"]
|
||||||
|
fan_manual_mode_cmd = "dell-bios-fan-control 0"
|
||||||
|
fan_auto_mode_cmd = "dell-bios-fan-control 1"
|
||||||
|
safety_register = "0x1FC" # BD PROCHOT MSR
|
||||||
|
|
||||||
|
[ecosystems.lenovo]
|
||||||
|
vendor_regex = "LENOVO"
|
||||||
|
lap_mode_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode"
|
||||||
|
profiles_path = "/sys/firmware/acpi/platform_profile"
|
||||||
|
ec_write_required = false # Varies by model
|
||||||
|
|
||||||
|
[ecosystems.asus]
|
||||||
|
vendor_regex = "ASUSTeK.*"
|
||||||
|
thermal_policy_path = "/sys/devices/platform/asus-nb-wmi/throttle_thermal_policy"
|
||||||
|
policy_map = { Balanced = 0, Turbo = 1, Silent = 2 }
|
||||||
|
|
||||||
|
[ecosystems.hp]
|
||||||
|
vendor_regex = "HP"
|
||||||
|
msr_lock_register = "0x610"
|
||||||
|
msr_lock_bit = 63
|
||||||
|
fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable"
|
||||||
|
|
||||||
|
[ecosystems.framework]
|
||||||
|
vendor_regex = "Framework"
|
||||||
|
ec_tool = "ectool"
|
||||||
|
optimization = "Direct-FFI-SMC"
|
||||||
|
|
||||||
|
# quirks: model quirks and fixes
|
||||||
|
|
||||||
|
[[quirks]]
|
||||||
|
model_regex = "XPS 13 93.*"
|
||||||
|
id = "dell_bd_prochot_fix"
|
||||||
|
issue = "False Positive 400MHz Lock"
|
||||||
|
monitor_msr = "0x1FC"
|
||||||
|
reset_bit = 0
|
||||||
|
action = "ClearBitOnSafeTemp"
|
||||||
|
|
||||||
|
[[quirks]]
|
||||||
|
model_regex = "ThinkPad T14.*"
|
||||||
|
id = "lenovo_lap_throttling"
|
||||||
|
issue = "11W TDP Lock in Lap Mode"
|
||||||
|
trigger_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode"
|
||||||
|
trigger_value = "1"
|
||||||
|
action = "AbortOnLapMode"
|
||||||
|
|
||||||
|
[[quirks]]
|
||||||
|
model_regex = "ROG Zephyrus G14"
|
||||||
|
id = "asus_fan_hex_support"
|
||||||
|
issue = "Custom Hex Curve Interface"
|
||||||
|
target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve"
|
||||||
|
format = "HexPair16"
|
||||||
|
|
||||||
|
[[quirks]]
|
||||||
|
model_regex = "Spectre x360"
|
||||||
|
id = "hp_rapl_lockout"
|
||||||
|
issue = "Hardware MSR Lockout"
|
||||||
|
action = "WarnUserMSRLocked"
|
||||||
|
|
||||||
|
# heuristic discovery
|
||||||
|
|
||||||
|
[discovery.sensors]
|
||||||
|
temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"]
|
||||||
|
fan_labels = ["CPU Fan", "GPU Fan", "System Fan"]
|
||||||
|
hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"]
|
||||||
|
|
||||||
|
[discovery.actuators]
|
||||||
|
rapl_paths = ["intel-rapl:0", "package-0"]
|
||||||
|
amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"]
|
||||||
|
governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"]
|
||||||
|
|
||||||
|
# env health verification
|
||||||
|
|
||||||
|
[[preflight_checks]]
|
||||||
|
name = "MSR Write Access"
|
||||||
|
check_cmd = "grep -q 'msr.allow_writes=on' /proc/cmdline"
|
||||||
|
fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit manipulation."
|
||||||
|
|
||||||
|
[[preflight_checks]]
|
||||||
|
name = "Kernel Lockdown Status"
|
||||||
|
check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'"
|
||||||
|
fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module."
|
||||||
@@ -91,24 +91,30 @@ impl OptimizerEngine {
|
|||||||
// 1. Efficiency Metric (Throughput per Watt)
|
// 1. Efficiency Metric (Throughput per Watt)
|
||||||
// If throughput is 0 (unsupported), fallback to Frequency per Watt
|
// If throughput is 0 (unsupported), fallback to Frequency per Watt
|
||||||
let efficiency_curr = if curr.throughput > 0.0 {
|
let efficiency_curr = if curr.throughput > 0.0 {
|
||||||
curr.throughput as f32 / curr.power_w.max(0.1)
|
curr.throughput as f32 / curr.power_w.max(1.0)
|
||||||
} else {
|
} else {
|
||||||
curr.freq_mhz / curr.power_w.max(0.1)
|
curr.freq_mhz / curr.power_w.max(1.0)
|
||||||
};
|
};
|
||||||
|
|
||||||
let efficiency_next = if next.throughput > 0.0 {
|
let efficiency_next = if next.throughput > 0.0 {
|
||||||
next.throughput as f32 / next.power_w.max(0.1)
|
next.throughput as f32 / next.power_w.max(1.0)
|
||||||
} else {
|
} else {
|
||||||
next.freq_mhz / next.power_w.max(0.1)
|
next.freq_mhz / next.power_w.max(1.0)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Diminishing returns: how much efficiency drops per additional watt
|
// Diminishing returns: how much efficiency drops per additional watt
|
||||||
let efficiency_drop = (efficiency_curr - efficiency_next) / (next.power_w - curr.power_w).max(0.1);
|
let p_delta = (next.power_w - curr.power_w).max(0.5);
|
||||||
|
let efficiency_drop = (efficiency_curr - efficiency_next) / p_delta;
|
||||||
|
|
||||||
// 2. Thermal Acceleration (d2T/dW2)
|
// 2. Thermal Acceleration (d2T/dW2)
|
||||||
let dt_dw_prev = (curr.temp_c - prev.temp_c) / (curr.power_w - prev.power_w).max(0.1);
|
let p_delta_prev = (curr.power_w - prev.power_w).max(0.5);
|
||||||
let dt_dw_next = (next.temp_c - curr.temp_c) / (next.power_w - curr.power_w).max(0.1);
|
let p_delta_next = (next.power_w - curr.power_w).max(0.5);
|
||||||
let temp_accel = (dt_dw_next - dt_dw_prev) / (next.power_w - prev.power_w).max(0.1);
|
|
||||||
|
let dt_dw_prev = (curr.temp_c - prev.temp_c) / p_delta_prev;
|
||||||
|
let dt_dw_next = (next.temp_c - curr.temp_c) / p_delta_next;
|
||||||
|
|
||||||
|
let p_total_delta = (next.power_w - prev.power_w).max(1.0);
|
||||||
|
let temp_accel = (dt_dw_next - dt_dw_prev) / p_total_delta;
|
||||||
|
|
||||||
// 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall)
|
// 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall)
|
||||||
let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput);
|
let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput);
|
||||||
|
|||||||
@@ -1,16 +1,16 @@
|
|||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use std::process::Child;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
use std::thread;
|
||||||
|
|
||||||
pub trait Workload {
|
pub trait Workload: Send + Sync {
|
||||||
/// Starts the workload with specified threads and load percentage.
|
|
||||||
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>;
|
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>;
|
||||||
/// Stops the workload.
|
|
||||||
fn stop(&mut self) -> Result<()>;
|
fn stop(&mut self) -> Result<()>;
|
||||||
/// Returns the current throughput (e.g., ops/sec).
|
|
||||||
fn get_throughput(&self) -> Result<f64>;
|
fn get_throughput(&self) -> Result<f64>;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct StressNg {
|
pub struct StressNg {
|
||||||
child: Option<std::process::Child>,
|
child: Option<Child>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StressNg {
|
impl StressNg {
|
||||||
@@ -21,7 +21,7 @@ impl StressNg {
|
|||||||
|
|
||||||
impl Workload for StressNg {
|
impl Workload for StressNg {
|
||||||
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> {
|
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> {
|
||||||
self.stop()?; // Ensure any previous instance is stopped
|
self.stop()?;
|
||||||
|
|
||||||
let child = std::process::Command::new("stress-ng")
|
let child = std::process::Command::new("stress-ng")
|
||||||
.args([
|
.args([
|
||||||
@@ -37,15 +37,32 @@ impl Workload for StressNg {
|
|||||||
|
|
||||||
fn stop(&mut self) -> Result<()> {
|
fn stop(&mut self) -> Result<()> {
|
||||||
if let Some(mut child) = self.child.take() {
|
if let Some(mut child) = self.child.take() {
|
||||||
let _ = child.kill();
|
// Try SIGTERM first
|
||||||
let _ = child.wait();
|
#[cfg(unix)]
|
||||||
|
{
|
||||||
|
use libc::{kill, SIGTERM};
|
||||||
|
unsafe { kill(child.id() as i32, SIGTERM); }
|
||||||
|
}
|
||||||
|
|
||||||
|
let start = Instant::now();
|
||||||
|
let mut exited = false;
|
||||||
|
while start.elapsed() < Duration::from_secs(2) {
|
||||||
|
if let Ok(Some(_)) = child.try_wait() {
|
||||||
|
exited = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
thread::sleep(Duration::from_millis(100));
|
||||||
|
}
|
||||||
|
|
||||||
|
if !exited {
|
||||||
|
let _ = child.kill();
|
||||||
|
let _ = child.wait();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_throughput(&self) -> Result<f64> {
|
fn get_throughput(&self) -> Result<f64> {
|
||||||
// In a real implementation, we would parse stress-ng's temporary results
|
|
||||||
// or use a different workload that provides live throughput.
|
|
||||||
Ok(0.0)
|
Ok(0.0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
64
src/main.rs
64
src/main.rs
@@ -6,7 +6,7 @@ mod ui;
|
|||||||
mod engine;
|
mod engine;
|
||||||
mod cli;
|
mod cli;
|
||||||
|
|
||||||
use miette::{Result, IntoDiagnostic, Diagnostic, Report};
|
use miette::{Result, IntoDiagnostic, Diagnostic, Report, Context};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use std::sync::mpsc;
|
use std::sync::mpsc;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
@@ -30,7 +30,7 @@ use mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
|||||||
use sal::traits::{AuditError, PlatformSal};
|
use sal::traits::{AuditError, PlatformSal};
|
||||||
use sal::mock::MockSal;
|
use sal::mock::MockSal;
|
||||||
use sal::heuristic::engine::HeuristicEngine;
|
use sal::heuristic::engine::HeuristicEngine;
|
||||||
use load::StressNg;
|
use load::{StressNg, Workload};
|
||||||
use orchestrator::BenchmarkOrchestrator;
|
use orchestrator::BenchmarkOrchestrator;
|
||||||
use ui::dashboard::{draw_dashboard, DashboardState};
|
use ui::dashboard::{draw_dashboard, DashboardState};
|
||||||
use engine::OptimizationResult;
|
use engine::OptimizationResult;
|
||||||
@@ -108,10 +108,10 @@ fn main() -> Result<()> {
|
|||||||
info!("ember-tune starting with args: {:?}", args);
|
info!("ember-tune starting with args: {:?}", args);
|
||||||
|
|
||||||
// 2. Platform Detection & Audit
|
// 2. Platform Detection & Audit
|
||||||
let sal: Box<dyn PlatformSal> = if args.mock {
|
let sal: Arc<dyn PlatformSal> = if args.mock {
|
||||||
Box::new(MockSal::new())
|
Arc::new(MockSal::new())
|
||||||
} else {
|
} else {
|
||||||
HeuristicEngine::detect_and_build()?
|
HeuristicEngine::detect_and_build()?.into()
|
||||||
};
|
};
|
||||||
|
|
||||||
println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan());
|
println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan());
|
||||||
@@ -122,9 +122,7 @@ fn main() -> Result<()> {
|
|||||||
io::Write::flush(&mut io::stdout()).into_diagnostic()?;
|
io::Write::flush(&mut io::stdout()).into_diagnostic()?;
|
||||||
|
|
||||||
match step.outcome {
|
match step.outcome {
|
||||||
Ok(_) => {
|
Ok(_) => { println!("{}", console::style("[✓]").green()); }
|
||||||
println!("{}", console::style("[✓]").green());
|
|
||||||
}
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
println!("{}", console::style("[✗]").red());
|
println!("{}", console::style("[✗]").red());
|
||||||
audit_failures.push(e);
|
audit_failures.push(e);
|
||||||
@@ -137,10 +135,8 @@ fn main() -> Result<()> {
|
|||||||
return Err(Report::new(MultiAuditError { errors: audit_failures }));
|
return Err(Report::new(MultiAuditError { errors: audit_failures }));
|
||||||
}
|
}
|
||||||
|
|
||||||
println!("{}", console::style("✓ All pre-flight audits passed.").green().bold());
|
|
||||||
thread::sleep(Duration::from_secs(1));
|
|
||||||
|
|
||||||
if args.audit_only {
|
if args.audit_only {
|
||||||
|
println!("{}", console::style("✓ All pre-flight audits passed.").green().bold());
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -159,21 +155,22 @@ fn main() -> Result<()> {
|
|||||||
let (telemetry_tx, telemetry_rx) = mpsc::channel::<TelemetryState>();
|
let (telemetry_tx, telemetry_rx) = mpsc::channel::<TelemetryState>();
|
||||||
let (command_tx, command_rx) = mpsc::channel::<UiCommand>();
|
let (command_tx, command_rx) = mpsc::channel::<UiCommand>();
|
||||||
|
|
||||||
|
let c_tx = command_tx.clone();
|
||||||
ctrlc::set_handler(move || {
|
ctrlc::set_handler(move || {
|
||||||
|
let _ = c_tx.send(UiCommand::Abort);
|
||||||
r.store(false, Ordering::SeqCst);
|
r.store(false, Ordering::SeqCst);
|
||||||
}).expect("Error setting Ctrl-C handler");
|
}).expect("Error setting Ctrl-C handler");
|
||||||
|
|
||||||
// 5. Spawn Backend Orchestrator
|
// 5. Spawn Backend Orchestrator
|
||||||
|
let sal_backend = sal.clone();
|
||||||
let backend_handle = thread::spawn(move || {
|
let backend_handle = thread::spawn(move || {
|
||||||
let workload = Box::new(StressNg::new());
|
let workload = Box::new(StressNg::new());
|
||||||
|
|
||||||
let mut orchestrator = BenchmarkOrchestrator::new(
|
let mut orchestrator = BenchmarkOrchestrator::new(
|
||||||
sal,
|
sal_backend,
|
||||||
workload,
|
workload,
|
||||||
telemetry_tx,
|
telemetry_tx,
|
||||||
command_rx,
|
command_rx,
|
||||||
);
|
);
|
||||||
|
|
||||||
orchestrator.run()
|
orchestrator.run()
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -197,6 +194,8 @@ fn main() -> Result<()> {
|
|||||||
history_mhz: Vec::new(),
|
history_mhz: Vec::new(),
|
||||||
log_event: None,
|
log_event: None,
|
||||||
metadata: std::collections::HashMap::new(),
|
metadata: std::collections::HashMap::new(),
|
||||||
|
is_emergency: false,
|
||||||
|
emergency_reason: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let tick_rate = Duration::from_millis(100);
|
let tick_rate = Duration::from_millis(100);
|
||||||
@@ -233,29 +232,38 @@ fn main() -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if last_tick.elapsed() >= tick_rate {
|
if last_tick.elapsed() >= tick_rate { last_tick = Instant::now(); }
|
||||||
last_tick = Instant::now();
|
if backend_handle.is_finished() { break; }
|
||||||
}
|
|
||||||
|
|
||||||
if backend_handle.is_finished() {
|
|
||||||
thread::sleep(Duration::from_secs(1));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 7. Terminal Restoration
|
// 7. Terminal Restoration
|
||||||
disable_raw_mode().into_diagnostic()?;
|
let _ = disable_raw_mode();
|
||||||
execute!(terminal.backend_mut(), LeaveAlternateScreen).into_diagnostic()?;
|
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen);
|
||||||
terminal.show_cursor().into_diagnostic()?;
|
let _ = terminal.show_cursor();
|
||||||
|
|
||||||
// 8. Final Report (Post-TUI)
|
// 8. Final Report & Hardware Restoration
|
||||||
match backend_handle.join() {
|
let join_res = backend_handle.join();
|
||||||
|
|
||||||
|
// Explicit hardware restoration
|
||||||
|
info!("Restoring hardware state...");
|
||||||
|
if let Err(e) = sal.restore() {
|
||||||
|
error!("Failed to restore hardware state: {}", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
match join_res {
|
||||||
Ok(Ok(result)) => {
|
Ok(Ok(result)) => {
|
||||||
print_summary_report(&result);
|
print_summary_report(&result);
|
||||||
}
|
}
|
||||||
Ok(Err(e)) => {
|
Ok(Err(e)) => {
|
||||||
if e.to_string() == "ABORTED" {
|
let err_str = e.to_string();
|
||||||
|
if err_str == "ABORTED" {
|
||||||
println!("{}", "Benchmark aborted by user.".yellow());
|
println!("{}", "Benchmark aborted by user.".yellow());
|
||||||
|
} else if err_str.contains("EMERGENCY_ABORT") {
|
||||||
|
println!();
|
||||||
|
println!("{}", " 🚨 EMERGENCY ABORT TRIGGERED ".bold().on_red().white());
|
||||||
|
println!("Reason: {}", err_str.replace("EMERGENCY_ABORT: ", "").red().bold());
|
||||||
|
println!("{}", "Hardware state has been restored to safe defaults.".yellow());
|
||||||
|
println!();
|
||||||
} else {
|
} else {
|
||||||
error!("Orchestrator encountered error: {}", e);
|
error!("Orchestrator encountered error: {}", e);
|
||||||
eprintln!("{} {}", "Error:".red().bold(), e);
|
eprintln!("{} {}", "Error:".red().bold(), e);
|
||||||
|
|||||||
@@ -42,6 +42,8 @@ pub struct TelemetryState {
|
|||||||
|
|
||||||
pub log_event: Option<String>,
|
pub log_event: Option<String>,
|
||||||
pub metadata: std::collections::HashMap<String, String>,
|
pub metadata: std::collections::HashMap<String, String>,
|
||||||
|
pub is_emergency: bool,
|
||||||
|
pub emergency_reason: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
|
|||||||
@@ -4,14 +4,17 @@ use std::time::{Duration, Instant};
|
|||||||
use std::thread;
|
use std::thread;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
use sysinfo::System;
|
use sysinfo::System;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Mutex;
|
||||||
|
|
||||||
use crate::sal::traits::{PlatformSal};
|
use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
|
||||||
use crate::load::Workload;
|
use crate::load::Workload;
|
||||||
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||||
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
||||||
|
|
||||||
pub struct BenchmarkOrchestrator {
|
pub struct BenchmarkOrchestrator {
|
||||||
sal: Box<dyn PlatformSal>,
|
sal: Arc<dyn PlatformSal>,
|
||||||
workload: Box<dyn Workload>,
|
workload: Box<dyn Workload>,
|
||||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||||
command_rx: mpsc::Receiver<UiCommand>,
|
command_rx: mpsc::Receiver<UiCommand>,
|
||||||
@@ -27,11 +30,15 @@ pub struct BenchmarkOrchestrator {
|
|||||||
// --- Static Info ---
|
// --- Static Info ---
|
||||||
cpu_model: String,
|
cpu_model: String,
|
||||||
total_ram_gb: u64,
|
total_ram_gb: u64,
|
||||||
|
|
||||||
|
// --- Safety ---
|
||||||
|
emergency_abort: Arc<AtomicBool>,
|
||||||
|
emergency_reason: Arc<Mutex<Option<String>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BenchmarkOrchestrator {
|
impl BenchmarkOrchestrator {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
sal: Box<dyn PlatformSal>,
|
sal: Arc<dyn PlatformSal>,
|
||||||
workload: Box<dyn Workload>,
|
workload: Box<dyn Workload>,
|
||||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||||
command_rx: mpsc::Receiver<UiCommand>,
|
command_rx: mpsc::Receiver<UiCommand>,
|
||||||
@@ -57,12 +64,17 @@ impl BenchmarkOrchestrator {
|
|||||||
history_mhz: VecDeque::with_capacity(120),
|
history_mhz: VecDeque::with_capacity(120),
|
||||||
cpu_model,
|
cpu_model,
|
||||||
total_ram_gb,
|
total_ram_gb,
|
||||||
|
emergency_abort: Arc::new(AtomicBool::new(false)),
|
||||||
|
emergency_reason: Arc::new(Mutex::new(None)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(&mut self) -> Result<OptimizationResult> {
|
pub fn run(&mut self) -> Result<OptimizationResult> {
|
||||||
self.log("Starting ember-tune Benchmark Sequence.")?;
|
self.log("Starting ember-tune Benchmark Sequence.")?;
|
||||||
|
|
||||||
|
// Start Watchdog Monitor
|
||||||
|
let _watchdog_handle = self.spawn_watchdog_monitor();
|
||||||
|
|
||||||
// Phase 1: Audit & Baseline
|
// Phase 1: Audit & Baseline
|
||||||
self.phase = BenchmarkPhase::Auditing;
|
self.phase = BenchmarkPhase::Auditing;
|
||||||
for step in self.sal.audit() {
|
for step in self.sal.audit() {
|
||||||
@@ -111,11 +123,6 @@ impl BenchmarkOrchestrator {
|
|||||||
|
|
||||||
while step_start.elapsed() < Duration::from_secs(45) {
|
while step_start.elapsed() < Duration::from_secs(45) {
|
||||||
self.check_abort()?;
|
self.check_abort()?;
|
||||||
if self.sal.check_emergency()? {
|
|
||||||
self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?;
|
|
||||||
self.workload.stop()?;
|
|
||||||
return Err(anyhow::anyhow!("Hardware Watchdog Triggered"));
|
|
||||||
}
|
|
||||||
|
|
||||||
let t = self.sal.get_temp().unwrap_or(0.0);
|
let t = self.sal.get_temp().unwrap_or(0.0);
|
||||||
step_temps.push_back(t);
|
step_temps.push_back(t);
|
||||||
@@ -204,6 +211,35 @@ impl BenchmarkOrchestrator {
|
|||||||
Ok(res)
|
Ok(res)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> {
|
||||||
|
let abort = self.emergency_abort.clone();
|
||||||
|
let reason_store = self.emergency_reason.clone();
|
||||||
|
let sal = self.sal.clone();
|
||||||
|
|
||||||
|
thread::spawn(move || {
|
||||||
|
while !abort.load(Ordering::SeqCst) {
|
||||||
|
let status = sal.get_safety_status();
|
||||||
|
match status {
|
||||||
|
Ok(SafetyStatus::EmergencyAbort(reason)) => {
|
||||||
|
*reason_store.lock().unwrap() = Some(reason.clone());
|
||||||
|
abort.store(true, Ordering::SeqCst);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Ok(SafetyStatus::Warning(_msg)) | Ok(SafetyStatus::Critical(_msg)) => {
|
||||||
|
// Send warning log to UI
|
||||||
|
}
|
||||||
|
Ok(SafetyStatus::Nominal) => {}
|
||||||
|
Err(e) => {
|
||||||
|
*reason_store.lock().unwrap() = Some(format!("Watchdog Sensor Failure: {}", e));
|
||||||
|
abort.store(true, Ordering::SeqCst);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
thread::sleep(Duration::from_millis(100));
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
|
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
|
||||||
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
|
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
|
||||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||||
@@ -221,6 +257,11 @@ impl BenchmarkOrchestrator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn check_abort(&self) -> Result<()> {
|
fn check_abort(&self) -> Result<()> {
|
||||||
|
if self.emergency_abort.load(Ordering::SeqCst) {
|
||||||
|
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Unknown safety trigger".to_string());
|
||||||
|
return Err(anyhow::anyhow!("EMERGENCY_ABORT: {}", reason));
|
||||||
|
}
|
||||||
|
|
||||||
if let Ok(cmd) = self.command_rx.try_recv() {
|
if let Ok(cmd) = self.command_rx.try_recv() {
|
||||||
match cmd {
|
match cmd {
|
||||||
UiCommand::Abort => {
|
UiCommand::Abort => {
|
||||||
@@ -250,6 +291,8 @@ impl BenchmarkOrchestrator {
|
|||||||
history_mhz: Vec::new(),
|
history_mhz: Vec::new(),
|
||||||
log_event: Some(msg.to_string()),
|
log_event: Some(msg.to_string()),
|
||||||
metadata: std::collections::HashMap::new(),
|
metadata: std::collections::HashMap::new(),
|
||||||
|
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||||
|
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||||
};
|
};
|
||||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||||
}
|
}
|
||||||
@@ -287,6 +330,8 @@ impl BenchmarkOrchestrator {
|
|||||||
history_mhz: self.history_mhz.iter().cloned().collect(),
|
history_mhz: self.history_mhz.iter().cloned().collect(),
|
||||||
log_event: None,
|
log_event: None,
|
||||||
metadata: std::collections::HashMap::new(),
|
metadata: std::collections::HashMap::new(),
|
||||||
|
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||||
|
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||||
};
|
};
|
||||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,11 +1,11 @@
|
|||||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep};
|
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus};
|
||||||
use anyhow::{Result, Context};
|
use anyhow::{Result, Context};
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
use tracing::debug;
|
use tracing::{debug, warn};
|
||||||
|
|
||||||
pub struct DellXps9380Sal {
|
pub struct DellXps9380Sal {
|
||||||
temp_path: PathBuf,
|
temp_path: PathBuf,
|
||||||
@@ -18,6 +18,8 @@ pub struct DellXps9380Sal {
|
|||||||
last_temp: Mutex<f32>,
|
last_temp: Mutex<f32>,
|
||||||
last_fans: Mutex<Vec<u32>>,
|
last_fans: Mutex<Vec<u32>>,
|
||||||
suppressed_services: Mutex<Vec<String>>,
|
suppressed_services: Mutex<Vec<String>>,
|
||||||
|
msr_file: Mutex<fs::File>,
|
||||||
|
last_energy: Mutex<(u64, Instant)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DellXps9380Sal {
|
impl DellXps9380Sal {
|
||||||
@@ -35,7 +37,6 @@ impl DellXps9380Sal {
|
|||||||
|
|
||||||
if name == "dell_smm" {
|
if name == "dell_smm" {
|
||||||
temp_path = Some(p.join("temp1_input"));
|
temp_path = Some(p.join("temp1_input"));
|
||||||
// Discover all fans
|
|
||||||
if let Ok(fan_entries) = fs::read_dir(&p) {
|
if let Ok(fan_entries) = fs::read_dir(&p) {
|
||||||
for fan_entry in fan_entries.flatten() {
|
for fan_entry in fan_entries.flatten() {
|
||||||
let fan_p = fan_entry.path();
|
let fan_p = fan_entry.path();
|
||||||
@@ -54,7 +55,6 @@ impl DellXps9380Sal {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Discovery for RAPL via powercap
|
|
||||||
if let Ok(entries) = fs::read_dir("/sys/class/powercap") {
|
if let Ok(entries) = fs::read_dir("/sys/class/powercap") {
|
||||||
for entry in entries.flatten() {
|
for entry in entries.flatten() {
|
||||||
let p = entry.path();
|
let p = entry.path();
|
||||||
@@ -72,6 +72,9 @@ impl DellXps9380Sal {
|
|||||||
|
|
||||||
let rapl_base = rapl_base_path.context("Could not find RAPL package-0 path in powercap")?;
|
let rapl_base = rapl_base_path.context("Could not find RAPL package-0 path in powercap")?;
|
||||||
let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
||||||
|
|
||||||
|
let msr_file = fs::OpenOptions::new().read(true).write(true).open("/dev/cpu/0/msr")
|
||||||
|
.context("Failed to open /dev/cpu/0/msr. Is the 'msr' module loaded?")?;
|
||||||
|
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
temp_path: temp_path.context("Could not find dell_smm temperature path")?,
|
temp_path: temp_path.context("Could not find dell_smm temperature path")?,
|
||||||
@@ -84,68 +87,64 @@ impl DellXps9380Sal {
|
|||||||
last_temp: Mutex::new(0.0),
|
last_temp: Mutex::new(0.0),
|
||||||
last_fans: Mutex::new(Vec::new()),
|
last_fans: Mutex::new(Vec::new()),
|
||||||
suppressed_services: Mutex::new(Vec::new()),
|
suppressed_services: Mutex::new(Vec::new()),
|
||||||
|
msr_file: Mutex::new(msr_file),
|
||||||
|
last_energy: Mutex::new((0, Instant::now())),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn read_msr(&self, msr: u32) -> Result<u64> {
|
||||||
|
use std::os::unix::fs::FileExt;
|
||||||
|
let mut buf = [0u8; 8];
|
||||||
|
let file = self.msr_file.lock().unwrap();
|
||||||
|
file.read_at(&mut buf, msr as u64)?;
|
||||||
|
Ok(u64::from_le_bytes(buf))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_msr(&self, msr: u32, val: u64) -> Result<()> {
|
||||||
|
use std::os::unix::fs::FileExt;
|
||||||
|
let file = self.msr_file.lock().unwrap();
|
||||||
|
file.write_at(&val.to_le_bytes(), msr as u64)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PreflightAuditor for DellXps9380Sal {
|
impl PreflightAuditor for DellXps9380Sal {
|
||||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
||||||
let mut steps = Vec::new();
|
let mut steps = Vec::new();
|
||||||
|
|
||||||
// 1. Root check
|
|
||||||
steps.push(AuditStep {
|
steps.push(AuditStep {
|
||||||
description: "Root Privileges".to_string(),
|
description: "Root Privileges".to_string(),
|
||||||
outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) }
|
outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) }
|
||||||
});
|
});
|
||||||
|
|
||||||
// 2. Kernel modules check (simplified check via sysfs/proc)
|
|
||||||
let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"];
|
let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"];
|
||||||
for mod_name in modules {
|
for mod_name in modules {
|
||||||
let path = format!("/sys/module/{}", mod_name);
|
let path = format!("/sys/module/{}", mod_name);
|
||||||
steps.push(AuditStep {
|
steps.push(AuditStep {
|
||||||
description: format!("Kernel Module: {}", mod_name),
|
description: format!("Kernel Module: {}", mod_name),
|
||||||
outcome: if PathBuf::from(path).exists() { Ok(()) } else {
|
outcome: if PathBuf::from(path).exists() { Ok(()) } else {
|
||||||
Err(AuditError::ToolMissing(format!("Module '{}' not loaded. Run 'sudo modprobe {}'", mod_name, mod_name)))
|
Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name)))
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Kernel parameters check
|
|
||||||
let cmdline = fs::read_to_string("/proc/cmdline").unwrap_or_default();
|
let cmdline = fs::read_to_string("/proc/cmdline").unwrap_or_default();
|
||||||
steps.push(AuditStep {
|
let params = [
|
||||||
description: "Kernel Param: dell_smm_hwmon.ignore_dmi=1".to_string(),
|
("dell_smm_hwmon.ignore_dmi=1", "dell_smm_hwmon.ignore_dmi=1"),
|
||||||
outcome: if cmdline.contains("dell_smm_hwmon.ignore_dmi=1") { Ok(()) } else {
|
("dell_smm_hwmon.restricted=0", "dell_smm_hwmon.restricted=0"),
|
||||||
Err(AuditError::MissingKernelParam("dell_smm_hwmon.ignore_dmi=1".to_string()))
|
("msr.allow_writes=on", "msr.allow_writes=on"),
|
||||||
}
|
];
|
||||||
});
|
for (label, p) in params {
|
||||||
steps.push(AuditStep {
|
steps.push(AuditStep {
|
||||||
description: "Kernel Param: dell_smm_hwmon.restricted=0".to_string(),
|
description: format!("Kernel Param: {}", label),
|
||||||
outcome: if cmdline.contains("dell_smm_hwmon.restricted=0") { Ok(()) } else {
|
outcome: if cmdline.contains(p) { Ok(()) } else { Err(AuditError::MissingKernelParam(p.to_string())) }
|
||||||
Err(AuditError::MissingKernelParam("dell_smm_hwmon.restricted=0".to_string()))
|
});
|
||||||
}
|
}
|
||||||
});
|
|
||||||
steps.push(AuditStep {
|
|
||||||
description: "Kernel Param: msr.allow_writes=on".to_string(),
|
|
||||||
outcome: if cmdline.contains("msr.allow_writes=on") { Ok(()) } else {
|
|
||||||
Err(AuditError::MissingKernelParam("msr.allow_writes=on".to_string()))
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// 4. Lockdown check
|
|
||||||
let lockdown = fs::read_to_string("/sys/kernel/security/lockdown").unwrap_or_default();
|
|
||||||
steps.push(AuditStep {
|
|
||||||
description: "Kernel Lockdown Status".to_string(),
|
|
||||||
outcome: if lockdown.contains("[none]") || lockdown.is_empty() { Ok(()) } else {
|
|
||||||
Err(AuditError::KernelIncompatible("Kernel is in lockdown mode. Set to 'none' to allow MSR/SMM writes.".to_string()))
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// 5. Check AC power
|
|
||||||
let ac_status = fs::read_to_string("/sys/class/power_supply/AC/online").unwrap_or_else(|_| "0".to_string());
|
let ac_status = fs::read_to_string("/sys/class/power_supply/AC/online").unwrap_or_else(|_| "0".to_string());
|
||||||
steps.push(AuditStep {
|
steps.push(AuditStep {
|
||||||
description: "AC Power Connection".to_string(),
|
description: "AC Power Connection".to_string(),
|
||||||
outcome: if ac_status.trim() == "1" { Ok(()) } else {
|
outcome: if ac_status.trim() == "1" { Ok(()) } else {
|
||||||
Err(AuditError::AcPowerMissing("System must be on AC power for benchmarking".to_string()))
|
Err(AuditError::AcPowerMissing("System must be on AC power".to_string()))
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -154,12 +153,11 @@ impl PreflightAuditor for DellXps9380Sal {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl EnvironmentGuard for DellXps9380Sal {
|
impl EnvironmentGuard for DellXps9380Sal {
|
||||||
fn suppress(&mut self) -> Result<()> {
|
fn suppress(&self) -> Result<()> {
|
||||||
let services = ["tlp", "thermald", "i8kmon"];
|
let services = ["tlp", "thermald", "i8kmon"];
|
||||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||||
for s in services {
|
for s in services {
|
||||||
if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() {
|
if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() {
|
||||||
debug!("Suppressing service: {}", s);
|
|
||||||
Command::new("systemctl").args(["stop", s]).status()?;
|
Command::new("systemctl").args(["stop", s]).status()?;
|
||||||
suppressed.push(s.to_string());
|
suppressed.push(s.to_string());
|
||||||
}
|
}
|
||||||
@@ -167,7 +165,7 @@ impl EnvironmentGuard for DellXps9380Sal {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn restore(&mut self) -> Result<()> {
|
fn restore(&self) -> Result<()> {
|
||||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||||
for s in suppressed.drain(..) {
|
for s in suppressed.drain(..) {
|
||||||
let _ = Command::new("systemctl").args(["start", &s]).status();
|
let _ = Command::new("systemctl").args(["start", &s]).status();
|
||||||
@@ -176,38 +174,31 @@ impl EnvironmentGuard for DellXps9380Sal {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for DellXps9380Sal {
|
|
||||||
fn drop(&mut self) {
|
|
||||||
let _ = self.restore();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
impl SensorBus for DellXps9380Sal {
|
impl SensorBus for DellXps9380Sal {
|
||||||
fn get_temp(&self) -> Result<f32> {
|
fn get_temp(&self) -> Result<f32> {
|
||||||
// Enforce 1000ms rate limit for Dell SMM as per GEMINI.md
|
|
||||||
let mut last_poll = self.last_poll.lock().unwrap();
|
let mut last_poll = self.last_poll.lock().unwrap();
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
|
||||||
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
||||||
return Ok(*self.last_temp.lock().unwrap());
|
return Ok(*self.last_temp.lock().unwrap());
|
||||||
}
|
}
|
||||||
|
|
||||||
let s = fs::read_to_string(&self.temp_path)?;
|
let s = fs::read_to_string(&self.temp_path)?;
|
||||||
let val = s.trim().parse::<f32>()? / 1000.0;
|
let val = s.trim().parse::<f32>()? / 1000.0;
|
||||||
|
|
||||||
*self.last_temp.lock().unwrap() = val;
|
*self.last_temp.lock().unwrap() = val;
|
||||||
*last_poll = now;
|
*last_poll = now;
|
||||||
|
|
||||||
Ok(val)
|
Ok(val)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_power_w(&self) -> Result<f32> {
|
fn get_power_w(&self) -> Result<f32> {
|
||||||
if self.pwr_path.to_string_lossy().contains("energy_uj") {
|
if self.pwr_path.to_string_lossy().contains("energy_uj") {
|
||||||
let e1 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
|
let mut last = self.last_energy.lock().unwrap();
|
||||||
std::thread::sleep(Duration::from_millis(100));
|
|
||||||
let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
|
let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
|
||||||
Ok((e2.saturating_sub(e1)) as f32 / 100000.0)
|
let t2 = Instant::now();
|
||||||
|
let (e1, t1) = *last;
|
||||||
|
let delta_e = e2.wrapping_sub(e1);
|
||||||
|
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||||
|
*last = (e2, t2);
|
||||||
|
if delta_t < 0.01 { return Ok(0.0); }
|
||||||
|
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
|
||||||
} else {
|
} else {
|
||||||
let s = fs::read_to_string(&self.pwr_path)?;
|
let s = fs::read_to_string(&self.pwr_path)?;
|
||||||
Ok(s.trim().parse::<f32>()? / 1000000.0)
|
Ok(s.trim().parse::<f32>()? / 1000000.0)
|
||||||
@@ -217,66 +208,65 @@ impl SensorBus for DellXps9380Sal {
|
|||||||
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
||||||
let mut last_poll = self.last_poll.lock().unwrap();
|
let mut last_poll = self.last_poll.lock().unwrap();
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
|
||||||
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
||||||
return Ok(self.last_fans.lock().unwrap().clone());
|
return Ok(self.last_fans.lock().unwrap().clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut fans = Vec::new();
|
let mut fans = Vec::new();
|
||||||
for path in &self.fan_paths {
|
for path in &self.fan_paths {
|
||||||
if let Ok(s) = fs::read_to_string(path) {
|
if let Ok(s) = fs::read_to_string(path) {
|
||||||
if let Ok(rpm) = s.trim().parse::<u32>() {
|
if let Ok(rpm) = s.trim().parse::<u32>() { fans.push(rpm); }
|
||||||
fans.push(rpm);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*self.last_fans.lock().unwrap() = fans.clone();
|
*self.last_fans.lock().unwrap() = fans.clone();
|
||||||
*last_poll = now;
|
*last_poll = now;
|
||||||
|
|
||||||
Ok(fans)
|
Ok(fans)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_freq_mhz(&self) -> Result<f32> {
|
fn get_freq_mhz(&self) -> Result<f32> {
|
||||||
let s = fs::read_to_string(&self.freq_path)?;
|
let s = fs::read_to_string(&self.freq_path)?;
|
||||||
let val = s.trim().parse::<f32>()? / 1000.0;
|
Ok(s.trim().parse::<f32>()? / 1000.0)
|
||||||
Ok(val)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ActuatorBus for DellXps9380Sal {
|
impl ActuatorBus for DellXps9380Sal {
|
||||||
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
||||||
match mode {
|
match mode {
|
||||||
"max" | "Manual" => {
|
"max" | "Manual" => { Command::new("dell-bios-fan-control").arg("0").status()?; }
|
||||||
Command::new("dell-bios-fan-control").arg("0").status()?;
|
"auto" | "Auto" => { Command::new("dell-bios-fan-control").arg("1").status()?; }
|
||||||
}
|
_ => { debug!("Unknown fan mode: {}", mode); }
|
||||||
"auto" | "Auto" => {
|
|
||||||
Command::new("dell-bios-fan-control").arg("1").status()?;
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
debug!("Unknown fan mode requested: {}", mode);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
||||||
let uw = (watts * 1_000_000.0) as u64;
|
fs::write(&self.pl1_path, ((watts * 1_000_000.0) as u64).to_string())?;
|
||||||
fs::write(&self.pl1_path, uw.to_string())?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
||||||
let uw = (watts * 1_000_000.0) as u64;
|
fs::write(&self.pl2_path, ((watts * 1_000_000.0) as u64).to_string())?;
|
||||||
fs::write(&self.pl2_path, uw.to_string())?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HardwareWatchdog for DellXps9380Sal {
|
impl HardwareWatchdog for DellXps9380Sal {
|
||||||
fn check_emergency(&self) -> Result<bool> {
|
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||||
// Check for thermal throttling or BD PROCHOT
|
let temp = self.get_temp()?;
|
||||||
// Simplified for now
|
if temp > 98.0 {
|
||||||
Ok(false)
|
return Ok(SafetyStatus::EmergencyAbort(format!("Thermal Runaway: {:.1}°C", temp)));
|
||||||
|
}
|
||||||
|
if let Ok(msr_val) = self.read_msr(0x1FC) {
|
||||||
|
if (msr_val & 0x1) != 0 && temp < 85.0 {
|
||||||
|
let _ = self.write_msr(0x1FC, msr_val & !0x1);
|
||||||
|
return Ok(SafetyStatus::Warning("BD PROCHOT Latch Cleared".to_string()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(SafetyStatus::Nominal)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for DellXps9380Sal {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
let _ = self.restore();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,19 +2,21 @@ use anyhow::{Result, anyhow};
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
use std::thread;
|
|
||||||
use std::process::Command;
|
use std::process::Command;
|
||||||
use tracing::{debug};
|
use tracing::{debug, warn};
|
||||||
use std::sync::mpsc;
|
use std::sync::Mutex;
|
||||||
|
|
||||||
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError};
|
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus};
|
||||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||||
use crate::sal::heuristic::schema::HardwareDb;
|
use crate::sal::heuristic::schema::HardwareDb;
|
||||||
|
|
||||||
pub struct GenericLinuxSal {
|
pub struct GenericLinuxSal {
|
||||||
fact_sheet: SystemFactSheet,
|
fact_sheet: SystemFactSheet,
|
||||||
db: HardwareDb,
|
db: HardwareDb,
|
||||||
suppressed_services: Vec<String>,
|
suppressed_services: Mutex<Vec<String>>,
|
||||||
|
last_valid_temp: Mutex<(f32, Instant)>,
|
||||||
|
current_pl1: Mutex<f32>,
|
||||||
|
last_energy: Mutex<(u64, Instant)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl GenericLinuxSal {
|
impl GenericLinuxSal {
|
||||||
@@ -22,7 +24,10 @@ impl GenericLinuxSal {
|
|||||||
Self {
|
Self {
|
||||||
fact_sheet,
|
fact_sheet,
|
||||||
db,
|
db,
|
||||||
suppressed_services: Vec::new(),
|
suppressed_services: Mutex::new(Vec::new()),
|
||||||
|
last_valid_temp: Mutex::new((0.0, Instant::now())),
|
||||||
|
current_pl1: Mutex::new(15.0),
|
||||||
|
last_energy: Mutex::new((0, Instant::now())),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -30,33 +35,18 @@ impl GenericLinuxSal {
|
|||||||
self.fact_sheet.vendor.to_lowercase().contains("dell")
|
self.fact_sheet.vendor.to_lowercase().contains("dell")
|
||||||
}
|
}
|
||||||
|
|
||||||
fn read_sysfs_timeout(&self, path: &Path, timeout: Duration) -> Result<String> {
|
/// Read sysfs safely. We removed the thread-per-read timeout logic
|
||||||
let (tx, rx) = mpsc::channel();
|
/// as it was inefficient. sysfs reads are generally fast enough.
|
||||||
let path_buf = path.to_path_buf();
|
fn read_sysfs(&self, path: &Path) -> Result<String> {
|
||||||
|
fs::read_to_string(path).map(|s| s.trim().to_string()).map_err(|e| anyhow!(e))
|
||||||
thread::spawn(move || {
|
|
||||||
let res = fs::read_to_string(path_buf).map(|s| s.trim().to_string());
|
|
||||||
let _ = tx.send(res);
|
|
||||||
});
|
|
||||||
|
|
||||||
match rx.recv_timeout(timeout) {
|
|
||||||
Ok(res) => res.map_err(|e| anyhow!("Failed to read sysfs: {}", e)),
|
|
||||||
Err(_) => Err(anyhow!("Timeout reading sysfs path: {:?}", path)),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PreflightAuditor for GenericLinuxSal {
|
impl PreflightAuditor for GenericLinuxSal {
|
||||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
||||||
let mut steps = Vec::new();
|
let mut steps = Vec::new();
|
||||||
|
|
||||||
// 1. Static DB checks
|
|
||||||
for check in &self.db.preflight_checks {
|
for check in &self.db.preflight_checks {
|
||||||
let status = Command::new("sh")
|
let status = Command::new("sh").arg("-c").arg(&check.check_cmd).status();
|
||||||
.arg("-c")
|
|
||||||
.arg(&check.check_cmd)
|
|
||||||
.status();
|
|
||||||
|
|
||||||
steps.push(AuditStep {
|
steps.push(AuditStep {
|
||||||
description: check.name.clone(),
|
description: check.name.clone(),
|
||||||
outcome: match status {
|
outcome: match status {
|
||||||
@@ -65,8 +55,6 @@ impl PreflightAuditor for GenericLinuxSal {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. Conflict checks (Critical only)
|
|
||||||
for conflict_id in &self.fact_sheet.active_conflicts {
|
for conflict_id in &self.fact_sheet.active_conflicts {
|
||||||
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
||||||
if conflict.severity == "Critical" {
|
if conflict.severity == "Critical" {
|
||||||
@@ -77,7 +65,6 @@ impl PreflightAuditor for GenericLinuxSal {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Box::new(steps.into_iter())
|
Box::new(steps.into_iter())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -86,31 +73,32 @@ impl SensorBus for GenericLinuxSal {
|
|||||||
fn get_temp(&self) -> Result<f32> {
|
fn get_temp(&self) -> Result<f32> {
|
||||||
let path = self.fact_sheet.temp_path.as_ref()
|
let path = self.fact_sheet.temp_path.as_ref()
|
||||||
.ok_or_else(|| anyhow!("No temperature sensor path found"))?;
|
.ok_or_else(|| anyhow!("No temperature sensor path found"))?;
|
||||||
let content = self.read_sysfs_timeout(path, Duration::from_millis(200))?;
|
let content = self.read_sysfs(path)?;
|
||||||
let milli_celsius: f32 = content.parse()?;
|
let temp = content.parse::<f32>()? / 1000.0;
|
||||||
Ok(milli_celsius / 1000.0)
|
let mut last = self.last_valid_temp.lock().unwrap();
|
||||||
|
if (temp - last.0).abs() > 0.01 { *last = (temp, Instant::now()); }
|
||||||
|
Ok(temp)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_power_w(&self) -> Result<f32> {
|
fn get_power_w(&self) -> Result<f32> {
|
||||||
let rapl_path = self.fact_sheet.rapl_paths.first()
|
let rapl_path = self.fact_sheet.rapl_paths.first()
|
||||||
.ok_or_else(|| anyhow!("No RAPL path found"))?;
|
.ok_or_else(|| anyhow!("No RAPL path found"))?;
|
||||||
let energy_path = rapl_path.join("energy_uj");
|
let energy_path = rapl_path.join("energy_uj");
|
||||||
|
let mut last = self.last_energy.lock().unwrap();
|
||||||
let e1: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?;
|
let e2: u64 = self.read_sysfs(&energy_path)?.parse()?;
|
||||||
let t1 = Instant::now();
|
|
||||||
thread::sleep(Duration::from_millis(100));
|
|
||||||
let e2: u64 = self.read_sysfs_timeout(&energy_path, Duration::from_millis(200))?.parse()?;
|
|
||||||
let t2 = Instant::now();
|
let t2 = Instant::now();
|
||||||
|
let (e1, t1) = *last;
|
||||||
let delta_e = e2.wrapping_sub(e1);
|
let delta_e = e2.wrapping_sub(e1);
|
||||||
let delta_t = t2.duration_since(t1).as_secs_f32();
|
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||||
|
*last = (e2, t2);
|
||||||
|
if delta_t < 0.01 { return Ok(0.0); }
|
||||||
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
|
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
||||||
let mut rpms = Vec::new();
|
let mut rpms = Vec::new();
|
||||||
for path in &self.fact_sheet.fan_paths {
|
for path in &self.fact_sheet.fan_paths {
|
||||||
if let Ok(content) = self.read_sysfs_timeout(path, Duration::from_millis(200)) {
|
if let Ok(content) = self.read_sysfs(path) {
|
||||||
if let Ok(rpm) = content.parse() { rpms.push(rpm); }
|
if let Ok(rpm) = content.parse() { rpms.push(rpm); }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -120,10 +108,8 @@ impl SensorBus for GenericLinuxSal {
|
|||||||
fn get_freq_mhz(&self) -> Result<f32> {
|
fn get_freq_mhz(&self) -> Result<f32> {
|
||||||
let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
let path = Path::new("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
||||||
if path.exists() {
|
if path.exists() {
|
||||||
let khz: f32 = self.read_sysfs_timeout(path, Duration::from_millis(200))?.parse()?;
|
Ok(self.read_sysfs(path)?.parse::<f32>()? / 1000.0)
|
||||||
Ok(khz / 1000.0)
|
|
||||||
} else {
|
} else {
|
||||||
// Fallback: parse /proc/cpuinfo
|
|
||||||
let cpuinfo = fs::read_to_string("/proc/cpuinfo")?;
|
let cpuinfo = fs::read_to_string("/proc/cpuinfo")?;
|
||||||
for line in cpuinfo.lines() {
|
for line in cpuinfo.lines() {
|
||||||
if line.starts_with("cpu MHz") {
|
if line.starts_with("cpu MHz") {
|
||||||
@@ -149,38 +135,32 @@ impl ActuatorBus for GenericLinuxSal {
|
|||||||
let parts: Vec<&str> = cmd_str.split_whitespace().collect();
|
let parts: Vec<&str> = cmd_str.split_whitespace().collect();
|
||||||
Command::new(parts[0]).args(&parts[1..]).status()?;
|
Command::new(parts[0]).args(&parts[1..]).status()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
} else { Err(anyhow!("Dell fan command missing in DB")) }
|
} else { Err(anyhow!("Dell fan command missing")) }
|
||||||
} else {
|
} else { Ok(()) }
|
||||||
debug!("Fan control not implemented for non-Dell systems yet");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
||||||
let rapl_path = self.fact_sheet.rapl_paths.first()
|
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL1 path"))?;
|
||||||
.ok_or_else(|| anyhow!("No RAPL path found for PL1"))?;
|
fs::write(rapl_path.join("constraint_0_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?;
|
||||||
let path = rapl_path.join("constraint_0_power_limit_uw");
|
*self.current_pl1.lock().unwrap() = watts;
|
||||||
fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
||||||
let rapl_path = self.fact_sheet.rapl_paths.first()
|
let rapl_path = self.fact_sheet.rapl_paths.first().ok_or_else(|| anyhow!("No PL2 path"))?;
|
||||||
.ok_or_else(|| anyhow!("No RAPL path found for PL2"))?;
|
fs::write(rapl_path.join("constraint_1_power_limit_uw"), ((watts * 1_000_000.0) as u64).to_string())?;
|
||||||
let path = rapl_path.join("constraint_1_power_limit_uw");
|
|
||||||
fs::write(path, ((watts * 1_000_000.0) as u64).to_string())?;
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl EnvironmentGuard for GenericLinuxSal {
|
impl EnvironmentGuard for GenericLinuxSal {
|
||||||
fn suppress(&mut self) -> Result<()> {
|
fn suppress(&self) -> Result<()> {
|
||||||
|
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||||
for conflict_id in &self.fact_sheet.active_conflicts {
|
for conflict_id in &self.fact_sheet.active_conflicts {
|
||||||
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
||||||
for service in &conflict.services {
|
for service in &conflict.services {
|
||||||
debug!("Stopping service: {}", service);
|
|
||||||
if Command::new("systemctl").arg("stop").arg(service).status()?.success() {
|
if Command::new("systemctl").arg("stop").arg(service).status()?.success() {
|
||||||
self.suppressed_services.push(service.clone());
|
suppressed.push(service.clone());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -188,31 +168,30 @@ impl EnvironmentGuard for GenericLinuxSal {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn restore(&mut self) -> Result<()> {
|
fn restore(&self) -> Result<()> {
|
||||||
for service in self.suppressed_services.drain(..) {
|
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||||
debug!("Starting service: {}", service);
|
for service in suppressed.drain(..) {
|
||||||
let _ = Command::new("systemctl").arg("start").arg(service).status();
|
let _ = Command::new("systemctl").arg("start").arg(service).status();
|
||||||
}
|
}
|
||||||
if self.is_dell() {
|
if self.is_dell() { let _ = self.set_fan_mode("auto"); }
|
||||||
let _ = self.set_fan_mode("auto");
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HardwareWatchdog for GenericLinuxSal {
|
impl HardwareWatchdog for GenericLinuxSal {
|
||||||
fn check_emergency(&self) -> Result<bool> {
|
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||||
if let Ok(temp) = self.get_temp() {
|
let temp = self.get_temp()?;
|
||||||
if temp > 100.0 {
|
if temp > 100.0 {
|
||||||
return Ok(true);
|
return Ok(SafetyStatus::EmergencyAbort(format!("Thermal runaway: {:.1}°C", temp)));
|
||||||
}
|
|
||||||
}
|
}
|
||||||
Ok(false)
|
let last = self.last_valid_temp.lock().unwrap();
|
||||||
|
if last.1.elapsed() > Duration::from_secs(5) {
|
||||||
|
return Ok(SafetyStatus::EmergencyAbort("Temperature sensor stalled".to_string()));
|
||||||
|
}
|
||||||
|
Ok(SafetyStatus::Nominal)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Drop for GenericLinuxSal {
|
impl Drop for GenericLinuxSal {
|
||||||
fn drop(&mut self) {
|
fn drop(&mut self) { let _ = self.restore(); }
|
||||||
let _ = self.restore();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ pub struct Conflict {
|
|||||||
#[derive(Debug, Deserialize, Clone)]
|
#[derive(Debug, Deserialize, Clone)]
|
||||||
pub struct Ecosystem {
|
pub struct Ecosystem {
|
||||||
pub vendor_regex: String,
|
pub vendor_regex: String,
|
||||||
|
pub product_regex: Option<String>,
|
||||||
pub polling_cap_ms: Option<u64>,
|
pub polling_cap_ms: Option<u64>,
|
||||||
pub drivers: Option<Vec<String>>,
|
pub drivers: Option<Vec<String>>,
|
||||||
pub fan_manual_mode_cmd: Option<String>,
|
pub fan_manual_mode_cmd: Option<String>,
|
||||||
@@ -46,6 +47,7 @@ pub struct Ecosystem {
|
|||||||
pub fan_boost_path: Option<String>,
|
pub fan_boost_path: Option<String>,
|
||||||
pub ec_tool: Option<String>,
|
pub ec_tool: Option<String>,
|
||||||
pub optimization: Option<String>,
|
pub optimization: Option<String>,
|
||||||
|
pub help_text: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize, Clone)]
|
#[derive(Debug, Deserialize, Clone)]
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep};
|
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, PlatformSal, SafetyStatus};
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
|
||||||
pub struct MockSal;
|
pub struct MockSal;
|
||||||
@@ -26,10 +26,10 @@ impl PreflightAuditor for MockSal {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl EnvironmentGuard for MockSal {
|
impl EnvironmentGuard for MockSal {
|
||||||
fn suppress(&mut self) -> Result<()> {
|
fn suppress(&self) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn restore(&mut self) -> Result<()> {
|
fn restore(&self) -> Result<()> {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -62,7 +62,7 @@ impl ActuatorBus for MockSal {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl HardwareWatchdog for MockSal {
|
impl HardwareWatchdog for MockSal {
|
||||||
fn check_emergency(&self) -> Result<bool> {
|
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||||
Ok(false)
|
Ok(SafetyStatus::Nominal)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -49,8 +49,17 @@ impl<T: PreflightAuditor + ?Sized> PreflightAuditor for Arc<T> {
|
|||||||
|
|
||||||
/// Suppresses conflicting daemons (tlp, thermald).
|
/// Suppresses conflicting daemons (tlp, thermald).
|
||||||
pub trait EnvironmentGuard: Send + Sync {
|
pub trait EnvironmentGuard: Send + Sync {
|
||||||
fn suppress(&mut self) -> Result<()>;
|
fn suppress(&self) -> Result<()>;
|
||||||
fn restore(&mut self) -> Result<()>;
|
fn restore(&self) -> Result<()>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T: EnvironmentGuard + ?Sized> EnvironmentGuard for Arc<T> {
|
||||||
|
fn suppress(&self) -> Result<()> {
|
||||||
|
(**self).suppress()
|
||||||
|
}
|
||||||
|
fn restore(&self) -> Result<()> {
|
||||||
|
(**self).restore()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read-only interface for standardized metrics.
|
/// Read-only interface for standardized metrics.
|
||||||
@@ -97,15 +106,23 @@ impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
|
|||||||
|
|
||||||
/// Concurrent monitor for catastrophic states.
|
/// Concurrent monitor for catastrophic states.
|
||||||
pub trait HardwareWatchdog: Send + Sync {
|
pub trait HardwareWatchdog: Send + Sync {
|
||||||
fn check_emergency(&self) -> Result<bool>;
|
fn get_safety_status(&self) -> Result<SafetyStatus>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: HardwareWatchdog + ?Sized> HardwareWatchdog for Arc<T> {
|
impl<T: HardwareWatchdog + ?Sized> HardwareWatchdog for Arc<T> {
|
||||||
fn check_emergency(&self) -> Result<bool> {
|
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||||
(**self).check_emergency()
|
(**self).get_safety_status()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Result of a [`HardwareWatchdog::get_safety_status`] check.
///
/// Every variant other than `Nominal` carries a free-form message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SafetyStatus {
    /// Nothing to report.
    Nominal,
    /// Warning-level condition, with its message.
    Warning(String),
    /// Critical-level condition, with its message.
    Critical(String),
    /// Abort-level condition, with its message.
    EmergencyAbort(String),
}
|
||||||
|
|
||||||
/// Aggregate trait for a complete platform implementation.
|
/// Aggregate trait for a complete platform implementation.
|
||||||
pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {}
|
pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {}
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ use ratatui::{
|
|||||||
widgets::{Block, Borders, List, ListItem, Paragraph, Chart, Dataset, Axis, BorderType, GraphType},
|
widgets::{Block, Borders, List, ListItem, Paragraph, Chart, Dataset, Axis, BorderType, GraphType},
|
||||||
symbols::Marker,
|
symbols::Marker,
|
||||||
Frame,
|
Frame,
|
||||||
|
prelude::Stylize,
|
||||||
};
|
};
|
||||||
use crate::mediator::TelemetryState;
|
use crate::mediator::TelemetryState;
|
||||||
use crate::ui::theme::*;
|
use crate::ui::theme::*;
|
||||||
@@ -83,6 +84,55 @@ pub fn draw_dashboard(
|
|||||||
draw_freq_graph(f, right_side_chunks[2], state);
|
draw_freq_graph(f, right_side_chunks[2], state);
|
||||||
|
|
||||||
draw_logs(f, chunks[3], ui_state);
|
draw_logs(f, chunks[3], ui_state);
|
||||||
|
|
||||||
|
if state.is_emergency {
|
||||||
|
draw_emergency_overlay(f, area, state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn draw_emergency_overlay(f: &mut Frame, area: Rect, state: &TelemetryState) {
|
||||||
|
let block = Block::default()
|
||||||
|
.borders(Borders::ALL)
|
||||||
|
.border_type(BorderType::Double)
|
||||||
|
.border_style(Style::default().fg(Color::Red).add_modifier(Modifier::BOLD))
|
||||||
|
.bg(Color::Black)
|
||||||
|
.title(" 🚨 EMERGENCY ABORT 🚨 ");
|
||||||
|
|
||||||
|
let area = centered_rect(60, 20, area);
|
||||||
|
let inner = block.inner(area);
|
||||||
|
f.render_widget(block, area);
|
||||||
|
|
||||||
|
let reason = state.emergency_reason.as_deref().unwrap_or("Unknown safety trigger");
|
||||||
|
let text = vec![
|
||||||
|
Line::from(vec![Span::styled("CRITICAL SAFETY LIMIT TRIGGERED", Style::default().fg(Color::Red).add_modifier(Modifier::BOLD))]),
|
||||||
|
Line::from(""),
|
||||||
|
Line::from(vec![Span::raw("Reason: "), Span::styled(reason, Style::default().fg(Color::Yellow))]),
|
||||||
|
Line::from(""),
|
||||||
|
Line::from("Hardware has been restored to safe defaults."),
|
||||||
|
Line::from("Exiting in 1 second..."),
|
||||||
|
];
|
||||||
|
|
||||||
|
f.render_widget(Paragraph::new(text).alignment(ratatui::layout::Alignment::Center), inner);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn centered_rect(percent_x: u16, percent_y: u16, r: Rect) -> Rect {
|
||||||
|
let popup_layout = Layout::default()
|
||||||
|
.direction(Direction::Vertical)
|
||||||
|
.constraints([
|
||||||
|
Constraint::Percentage((100 - percent_y) / 2),
|
||||||
|
Constraint::Percentage(percent_y),
|
||||||
|
Constraint::Percentage((100 - percent_y) / 2),
|
||||||
|
])
|
||||||
|
.split(r);
|
||||||
|
|
||||||
|
Layout::default()
|
||||||
|
.direction(Direction::Horizontal)
|
||||||
|
.constraints([
|
||||||
|
Constraint::Percentage((100 - percent_x) / 2),
|
||||||
|
Constraint::Percentage(percent_x),
|
||||||
|
Constraint::Percentage((100 - percent_x) / 2),
|
||||||
|
])
|
||||||
|
.split(popup_layout[1])[1]
|
||||||
}
|
}
|
||||||
|
|
||||||
fn draw_header(f: &mut Frame, area: Rect, state: &TelemetryState) {
|
fn draw_header(f: &mut Frame, area: Rect, state: &TelemetryState) {
|
||||||
|
|||||||
Reference in New Issue
Block a user