fixed dangerous states to be applied

This commit is contained in:
2026-02-27 00:59:36 +01:00
parent 4ed7228355
commit 4c4026a600
9 changed files with 116 additions and 29 deletions

View File

@@ -7,6 +7,7 @@
use serde::{Serialize, Deserialize};
use std::collections::HashMap;
use std::path::PathBuf;
use tracing::warn;
pub mod formatters;
@@ -180,6 +181,18 @@ impl OptimizerEngine {
}
}
let best_pl = if max_score > f32::MIN {
best_pl
} else {
profile.points.last().map(|p| p.power_w).unwrap_or(15.0)
};
// Safety Floor: Never recommend a TDP below 5W, as this bricks system performance.
if best_pl < 5.0 {
warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl);
return 15.0;
}
best_pl
}
}

View File

@@ -161,6 +161,7 @@ fn main() -> Result<()> {
// 5. Spawn Backend Orchestrator
let sal_backend = sal.clone();
let facts_backend = facts.clone();
let config_out = args.config_out.clone();
let backend_handle = thread::spawn(move || {
let workload = Box::new(StressNg::new());
let mut orchestrator = BenchmarkOrchestrator::new(
@@ -169,6 +170,7 @@ fn main() -> Result<()> {
workload,
telemetry_tx,
command_rx,
config_out,
);
orchestrator.run()
});

View File

@@ -12,6 +12,7 @@ use sysinfo::System;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::path::PathBuf;
use crate::sal::traits::{PlatformSal, SafetyStatus};
use crate::sal::heuristic::discovery::SystemFactSheet;
@@ -40,6 +41,8 @@ pub struct BenchmarkOrchestrator {
profile: ThermalProfile,
/// Mathematics engine for data smoothing and optimization.
engine: OptimizerEngine,
/// CLI override for the configuration output path.
optional_config_out: Option<PathBuf>,
/// Sliding window of power readings (Watts).
history_watts: VecDeque<f32>,
@@ -67,6 +70,7 @@ impl BenchmarkOrchestrator {
workload: Box<dyn Workload>,
telemetry_tx: mpsc::Sender<TelemetryState>,
command_rx: mpsc::Receiver<UiCommand>,
optional_config_out: Option<PathBuf>,
) -> Self {
let mut sys = System::new_all();
sys.refresh_all();
@@ -92,6 +96,7 @@ impl BenchmarkOrchestrator {
total_ram_gb,
emergency_abort: Arc::new(AtomicBool::new(false)),
emergency_reason: Arc::new(Mutex::new(None)),
optional_config_out,
}
}
@@ -222,12 +227,18 @@ impl BenchmarkOrchestrator {
trip_temp: res.max_temp_c.max(95.0),
};
if let Some(throttled_path) = self.facts.paths.configs.get("throttled") {
crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?;
self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?;
res.config_paths.insert("throttled".to_string(), throttled_path.clone());
// 1. Throttled (Merged if exists)
// PRIORITY: optional_config_out > facts discovery > fallback
let throttled_path = self.optional_config_out.clone()
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
if let Some(path) = throttled_path {
crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?;
self.log(&format!("✓ Saved '{}'.", path.display()))?;
res.config_paths.insert("throttled".to_string(), path.clone());
}
// 2. i8kmon
if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
t_ambient: self.profile.ambient_temp,

View File

@@ -22,6 +22,11 @@ pub struct DellXps9380Sal {
suppressed_services: Mutex<Vec<String>>,
msr_file: Mutex<fs::File>,
last_energy: Mutex<(u64, Instant)>,
// --- Original State for Restoration ---
original_pl1: Mutex<Option<u64>>,
original_pl2: Mutex<Option<u64>>,
original_fan_mode: Mutex<Option<String>>,
}
impl DellXps9380Sal {
@@ -53,6 +58,9 @@ impl DellXps9380Sal {
last_energy: Mutex::new((initial_energy, Instant::now())),
fact_sheet: facts,
ctx,
original_pl1: Mutex::new(None),
original_pl2: Mutex::new(None),
original_fan_mode: Mutex::new(None),
})
}
@@ -126,12 +134,25 @@ impl PreflightAuditor for DellXps9380Sal {
impl EnvironmentGuard for DellXps9380Sal {
fn suppress(&self) -> Result<()> {
// 1. Snapshot Power Limits
if let Ok(pl1) = fs::read_to_string(&self.pl1_path) {
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
}
if let Ok(pl2) = fs::read_to_string(&self.pl2_path) {
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
}
// 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active)
// We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1)
*self.original_fan_mode.lock().unwrap() = Some("1".to_string());
// 3. Stop Services
let services = ["tlp", "thermald", "i8kmon"];
let mut suppressed = self.suppressed_services.lock().unwrap();
for s in services {
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
debug!("Suppressing service: {}", s);
self.ctx.runner.run("systemctl", &["stop", s])?;
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
suppressed.push(s.to_string());
}
}
@@ -139,6 +160,20 @@ impl EnvironmentGuard for DellXps9380Sal {
}
fn restore(&self) -> Result<()> {
// 1. Restore Power Limits
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
let _ = fs::write(&self.pl1_path, pl1.to_string());
}
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
let _ = fs::write(&self.pl2_path, pl2.to_string());
}
// 2. Restore Fan Mode (BIOS Control)
if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") {
let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]);
}
// 3. Restart Services
let mut suppressed = self.suppressed_services.lock().unwrap();
for s in suppressed.drain(..) {
let _ = self.ctx.runner.run("systemctl", &["start", &s]);
@@ -162,17 +197,23 @@ impl SensorBus for DellXps9380Sal {
}
fn get_power_w(&self) -> Result<f32> {
if self.pwr_path.to_string_lossy().contains("energy_uj") {
// FIX: Ensure we always read from energy_uj if available for delta calculation
let rapl_base = self.pl1_path.parent().context("RAPL path error")?;
let energy_path = rapl_base.join("energy_uj");
if energy_path.exists() {
let mut last = self.last_energy.lock().unwrap();
let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
let e2_str = fs::read_to_string(&energy_path)?;
let e2 = e2_str.trim().parse::<u64>()?;
let t2 = Instant::now();
let (e1, t1) = *last;
let delta_e = e2.wrapping_sub(e1);
let delta_t = t2.duration_since(t1).as_secs_f32();
*last = (e2, t2);
if delta_t < 0.01 { return Ok(0.0); }
if delta_t < 0.05 { return Ok(0.0); }
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
} else {
// Fallback to power1_average if it exists (units are µW)
let s = fs::read_to_string(&self.pwr_path)?;
Ok(s.trim().parse::<f32>()? / 1000000.0)
}

View File

@@ -3,6 +3,7 @@ use std::path::{Path};
use std::fs;
use std::time::{Duration, Instant};
use std::sync::Mutex;
use tracing::{debug};
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
use crate::sal::heuristic::discovery::SystemFactSheet;
@@ -16,6 +17,10 @@ pub struct GenericLinuxSal {
last_valid_temp: Mutex<(f32, Instant)>,
current_pl1: Mutex<f32>,
last_energy: Mutex<(u64, Instant)>,
// --- Original State for Restoration ---
original_pl1: Mutex<Option<u64>>,
original_pl2: Mutex<Option<u64>>,
}
impl GenericLinuxSal {
@@ -34,6 +39,8 @@ impl GenericLinuxSal {
last_energy: Mutex::new((initial_energy, Instant::now())),
fact_sheet: facts,
ctx,
original_pl1: Mutex::new(None),
original_pl2: Mutex::new(None),
}
}
@@ -95,7 +102,7 @@ impl SensorBus for GenericLinuxSal {
let delta_e = e2.wrapping_sub(e1);
let delta_t = t2.duration_since(t1).as_secs_f32();
*last = (e2, t2);
if delta_t < 0.01 { return Ok(0.0); }
if delta_t < 0.05 { return Ok(0.0); }
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
}
@@ -160,12 +167,22 @@ impl ActuatorBus for GenericLinuxSal {
impl EnvironmentGuard for GenericLinuxSal {
fn suppress(&self) -> Result<()> {
// Snapshot Power Limits
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) {
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
}
if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) {
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
}
}
let mut suppressed = self.suppressed_services.lock().unwrap();
for conflict_id in &self.fact_sheet.active_conflicts {
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
for service in &conflict.services {
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() {
self.ctx.runner.run("systemctl", &["stop", service])?;
let _ = self.ctx.runner.run("systemctl", &["stop", service]);
suppressed.push(service.clone());
}
}
@@ -175,6 +192,16 @@ impl EnvironmentGuard for GenericLinuxSal {
}
fn restore(&self) -> Result<()> {
// Restore Power Limits
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string());
}
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string());
}
}
let mut suppressed = self.suppressed_services.lock().unwrap();
for service in suppressed.drain(..) {
let _ = self.ctx.runner.run("systemctl", &["start", &service]);

View File

@@ -1,11 +1,11 @@
use std::fs;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration};
use std::thread;
use std::sync::mpsc;
use std::collections::HashMap;
use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking};
use crate::sys::SyscallRunner;
use tracing::{debug, warn};
/// Registry of dynamically discovered paths for configs and tools.
@@ -31,6 +31,7 @@ pub struct SystemFactSheet {
/// Probes the system for hardware sensors, actuators, service conflicts, and paths.
pub fn discover_facts(
base_path: &Path,
runner: &dyn SyscallRunner,
discovery: &Discovery,
conflicts: &[Conflict],
bench_config: Benchmarking,
@@ -45,7 +46,7 @@ pub fn discover_facts(
let mut active_conflicts = Vec::new();
for conflict in conflicts {
for service in &conflict.services {
if is_service_active(service) {
if is_service_active(runner, service) {
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
active_conflicts.push(conflict.id.clone());
break;
@@ -93,7 +94,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
break;
}
}
// If not found, use the first one as default if any exist
if !registry.configs.contains_key(id) {
if let Some(first) = candidates.first() {
registry.configs.insert(id.clone(), PathBuf::from(first));
@@ -142,7 +142,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
for hw_entry in hw_entries.flatten() {
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
// Temperature Sensors
if file_name.starts_with("temp") && file_name.ends_with("_label") {
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
@@ -154,7 +153,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
}
}
// Fan Sensors
if file_name.starts_with("fan") && file_name.ends_with("_label") {
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
@@ -206,18 +204,9 @@ fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec<PathBuf> {
paths
}
/// Checks if a systemd service is currently active.
pub fn is_service_active(service: &str) -> bool {
let status = Command::new("systemctl")
.arg("is-active")
.arg("--quiet")
.arg(service)
.status();
match status {
Ok(s) => s.success(),
Err(_) => false,
}
/// Checks if a systemd service is currently active using the injected runner.
pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool {
runner.run("systemctl", &["is-active", "--quiet", service]).is_ok()
}
/// Helper to read a sysfs file with a timeout.

View File

@@ -24,7 +24,7 @@ impl HeuristicEngine {
.context("Failed to parse hardware_db.toml")?;
// 2. Discover Facts
let facts = discover_facts(&ctx.sysfs_base, &db.discovery, &db.conflicts, db.benchmarking.clone());
let facts = discover_facts(&ctx.sysfs_base, ctx.runner.as_ref(), &db.discovery, &db.conflicts, db.benchmarking.clone());
info!("System Identity: {} {}", facts.vendor, facts.model);
// 3. Routing Logic