fixed dangerous states to be applied
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
use serde::{Serialize, Deserialize};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use tracing::warn;
|
||||
|
||||
pub mod formatters;
|
||||
|
||||
@@ -180,6 +181,18 @@ impl OptimizerEngine {
|
||||
}
|
||||
}
|
||||
|
||||
let best_pl = if max_score > f32::MIN {
|
||||
best_pl
|
||||
} else {
|
||||
profile.points.last().map(|p| p.power_w).unwrap_or(15.0)
|
||||
};
|
||||
|
||||
// Safety Floor: Never recommend a TDP below 5W, as this bricks system performance.
|
||||
if best_pl < 5.0 {
|
||||
warn!("Heuristic suggested dangerously low PL1 ({:.1}W). Falling back to 15W safety floor.", best_pl);
|
||||
return 15.0;
|
||||
}
|
||||
|
||||
best_pl
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,6 +161,7 @@ fn main() -> Result<()> {
|
||||
// 5. Spawn Backend Orchestrator
|
||||
let sal_backend = sal.clone();
|
||||
let facts_backend = facts.clone();
|
||||
let config_out = args.config_out.clone();
|
||||
let backend_handle = thread::spawn(move || {
|
||||
let workload = Box::new(StressNg::new());
|
||||
let mut orchestrator = BenchmarkOrchestrator::new(
|
||||
@@ -169,6 +170,7 @@ fn main() -> Result<()> {
|
||||
workload,
|
||||
telemetry_tx,
|
||||
command_rx,
|
||||
config_out,
|
||||
);
|
||||
orchestrator.run()
|
||||
});
|
||||
|
||||
@@ -12,6 +12,7 @@ use sysinfo::System;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::sal::traits::{PlatformSal, SafetyStatus};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
@@ -40,6 +41,8 @@ pub struct BenchmarkOrchestrator {
|
||||
profile: ThermalProfile,
|
||||
/// Mathematics engine for data smoothing and optimization.
|
||||
engine: OptimizerEngine,
|
||||
/// CLI override for the configuration output path.
|
||||
optional_config_out: Option<PathBuf>,
|
||||
|
||||
/// Sliding window of power readings (Watts).
|
||||
history_watts: VecDeque<f32>,
|
||||
@@ -67,6 +70,7 @@ impl BenchmarkOrchestrator {
|
||||
workload: Box<dyn Workload>,
|
||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||
command_rx: mpsc::Receiver<UiCommand>,
|
||||
optional_config_out: Option<PathBuf>,
|
||||
) -> Self {
|
||||
let mut sys = System::new_all();
|
||||
sys.refresh_all();
|
||||
@@ -92,6 +96,7 @@ impl BenchmarkOrchestrator {
|
||||
total_ram_gb,
|
||||
emergency_abort: Arc::new(AtomicBool::new(false)),
|
||||
emergency_reason: Arc::new(Mutex::new(None)),
|
||||
optional_config_out,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -222,12 +227,18 @@ impl BenchmarkOrchestrator {
|
||||
trip_temp: res.max_temp_c.max(95.0),
|
||||
};
|
||||
|
||||
if let Some(throttled_path) = self.facts.paths.configs.get("throttled") {
|
||||
crate::engine::formatters::throttled::ThrottledTranslator::save(throttled_path, &config)?;
|
||||
self.log(&format!("✓ Saved '{}' (merged).", throttled_path.display()))?;
|
||||
res.config_paths.insert("throttled".to_string(), throttled_path.clone());
|
||||
// 1. Throttled (Merged if exists)
|
||||
// PRIORITY: optional_config_out > facts discovery > fallback
|
||||
let throttled_path = self.optional_config_out.clone()
|
||||
.or_else(|| self.facts.paths.configs.get("throttled").cloned());
|
||||
|
||||
if let Some(path) = throttled_path {
|
||||
crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?;
|
||||
self.log(&format!("✓ Saved '{}'.", path.display()))?;
|
||||
res.config_paths.insert("throttled".to_string(), path.clone());
|
||||
}
|
||||
|
||||
// 2. i8kmon
|
||||
if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
|
||||
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
|
||||
t_ambient: self.profile.ambient_temp,
|
||||
|
||||
@@ -22,6 +22,11 @@ pub struct DellXps9380Sal {
|
||||
suppressed_services: Mutex<Vec<String>>,
|
||||
msr_file: Mutex<fs::File>,
|
||||
last_energy: Mutex<(u64, Instant)>,
|
||||
|
||||
// --- Original State for Restoration ---
|
||||
original_pl1: Mutex<Option<u64>>,
|
||||
original_pl2: Mutex<Option<u64>>,
|
||||
original_fan_mode: Mutex<Option<String>>,
|
||||
}
|
||||
|
||||
impl DellXps9380Sal {
|
||||
@@ -53,6 +58,9 @@ impl DellXps9380Sal {
|
||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||
fact_sheet: facts,
|
||||
ctx,
|
||||
original_pl1: Mutex::new(None),
|
||||
original_pl2: Mutex::new(None),
|
||||
original_fan_mode: Mutex::new(None),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -126,12 +134,25 @@ impl PreflightAuditor for DellXps9380Sal {
|
||||
|
||||
impl EnvironmentGuard for DellXps9380Sal {
|
||||
fn suppress(&self) -> Result<()> {
|
||||
// 1. Snapshot Power Limits
|
||||
if let Ok(pl1) = fs::read_to_string(&self.pl1_path) {
|
||||
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
|
||||
}
|
||||
if let Ok(pl2) = fs::read_to_string(&self.pl2_path) {
|
||||
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
|
||||
}
|
||||
|
||||
// 2. Snapshot Fan Mode (Assumption: Dell BIOS Fan Control is active)
|
||||
// We can't easily read current state of dell-bios-fan-control, so we assume 'auto' (1)
|
||||
*self.original_fan_mode.lock().unwrap() = Some("1".to_string());
|
||||
|
||||
// 3. Stop Services
|
||||
let services = ["tlp", "thermald", "i8kmon"];
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for s in services {
|
||||
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", s]).is_ok() {
|
||||
debug!("Suppressing service: {}", s);
|
||||
self.ctx.runner.run("systemctl", &["stop", s])?;
|
||||
let _ = self.ctx.runner.run("systemctl", &["stop", s]);
|
||||
suppressed.push(s.to_string());
|
||||
}
|
||||
}
|
||||
@@ -139,6 +160,20 @@ impl EnvironmentGuard for DellXps9380Sal {
|
||||
}
|
||||
|
||||
fn restore(&self) -> Result<()> {
|
||||
// 1. Restore Power Limits
|
||||
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
|
||||
let _ = fs::write(&self.pl1_path, pl1.to_string());
|
||||
}
|
||||
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
|
||||
let _ = fs::write(&self.pl2_path, pl2.to_string());
|
||||
}
|
||||
|
||||
// 2. Restore Fan Mode (BIOS Control)
|
||||
if let Some(tool_path) = self.fact_sheet.paths.tools.get("dell_fan_ctrl") {
|
||||
let _ = self.ctx.runner.run(&tool_path.to_string_lossy(), &["1"]);
|
||||
}
|
||||
|
||||
// 3. Restart Services
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for s in suppressed.drain(..) {
|
||||
let _ = self.ctx.runner.run("systemctl", &["start", &s]);
|
||||
@@ -162,17 +197,23 @@ impl SensorBus for DellXps9380Sal {
|
||||
}
|
||||
|
||||
fn get_power_w(&self) -> Result<f32> {
|
||||
if self.pwr_path.to_string_lossy().contains("energy_uj") {
|
||||
// FIX: Ensure we always read from energy_uj if available for delta calculation
|
||||
let rapl_base = self.pl1_path.parent().context("RAPL path error")?;
|
||||
let energy_path = rapl_base.join("energy_uj");
|
||||
|
||||
if energy_path.exists() {
|
||||
let mut last = self.last_energy.lock().unwrap();
|
||||
let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
|
||||
let e2_str = fs::read_to_string(&energy_path)?;
|
||||
let e2 = e2_str.trim().parse::<u64>()?;
|
||||
let t2 = Instant::now();
|
||||
let (e1, t1) = *last;
|
||||
let delta_e = e2.wrapping_sub(e1);
|
||||
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||
*last = (e2, t2);
|
||||
if delta_t < 0.01 { return Ok(0.0); }
|
||||
if delta_t < 0.05 { return Ok(0.0); }
|
||||
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
|
||||
} else {
|
||||
// Fallback to power1_average if it exists (units are µW)
|
||||
let s = fs::read_to_string(&self.pwr_path)?;
|
||||
Ok(s.trim().parse::<f32>()? / 1000000.0)
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::path::{Path};
|
||||
use std::fs;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::sync::Mutex;
|
||||
use tracing::{debug};
|
||||
|
||||
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
@@ -16,6 +17,10 @@ pub struct GenericLinuxSal {
|
||||
last_valid_temp: Mutex<(f32, Instant)>,
|
||||
current_pl1: Mutex<f32>,
|
||||
last_energy: Mutex<(u64, Instant)>,
|
||||
|
||||
// --- Original State for Restoration ---
|
||||
original_pl1: Mutex<Option<u64>>,
|
||||
original_pl2: Mutex<Option<u64>>,
|
||||
}
|
||||
|
||||
impl GenericLinuxSal {
|
||||
@@ -34,6 +39,8 @@ impl GenericLinuxSal {
|
||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||
fact_sheet: facts,
|
||||
ctx,
|
||||
original_pl1: Mutex::new(None),
|
||||
original_pl2: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,7 +102,7 @@ impl SensorBus for GenericLinuxSal {
|
||||
let delta_e = e2.wrapping_sub(e1);
|
||||
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||
*last = (e2, t2);
|
||||
if delta_t < 0.01 { return Ok(0.0); }
|
||||
if delta_t < 0.05 { return Ok(0.0); }
|
||||
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
|
||||
}
|
||||
|
||||
@@ -160,12 +167,22 @@ impl ActuatorBus for GenericLinuxSal {
|
||||
|
||||
impl EnvironmentGuard for GenericLinuxSal {
|
||||
fn suppress(&self) -> Result<()> {
|
||||
// Snapshot Power Limits
|
||||
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
|
||||
if let Ok(pl1) = fs::read_to_string(rapl_path.join("constraint_0_power_limit_uw")) {
|
||||
*self.original_pl1.lock().unwrap() = pl1.trim().parse().ok();
|
||||
}
|
||||
if let Ok(pl2) = fs::read_to_string(rapl_path.join("constraint_1_power_limit_uw")) {
|
||||
*self.original_pl2.lock().unwrap() = pl2.trim().parse().ok();
|
||||
}
|
||||
}
|
||||
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for conflict_id in &self.fact_sheet.active_conflicts {
|
||||
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
||||
for service in &conflict.services {
|
||||
if self.ctx.runner.run("systemctl", &["is-active", "--quiet", service]).is_ok() {
|
||||
self.ctx.runner.run("systemctl", &["stop", service])?;
|
||||
let _ = self.ctx.runner.run("systemctl", &["stop", service]);
|
||||
suppressed.push(service.clone());
|
||||
}
|
||||
}
|
||||
@@ -175,6 +192,16 @@ impl EnvironmentGuard for GenericLinuxSal {
|
||||
}
|
||||
|
||||
fn restore(&self) -> Result<()> {
|
||||
// Restore Power Limits
|
||||
if let Some(rapl_path) = self.fact_sheet.rapl_paths.first() {
|
||||
if let Some(pl1) = *self.original_pl1.lock().unwrap() {
|
||||
let _ = fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1.to_string());
|
||||
}
|
||||
if let Some(pl2) = *self.original_pl2.lock().unwrap() {
|
||||
let _ = fs::write(rapl_path.join("constraint_1_power_limit_uw"), pl2.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
let mut suppressed = self.suppressed_services.lock().unwrap();
|
||||
for service in suppressed.drain(..) {
|
||||
let _ = self.ctx.runner.run("systemctl", &["start", &service]);
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration};
|
||||
use std::thread;
|
||||
use std::sync::mpsc;
|
||||
use std::collections::HashMap;
|
||||
use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking};
|
||||
use crate::sys::SyscallRunner;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
/// Registry of dynamically discovered paths for configs and tools.
|
||||
@@ -31,6 +31,7 @@ pub struct SystemFactSheet {
|
||||
/// Probes the system for hardware sensors, actuators, service conflicts, and paths.
|
||||
pub fn discover_facts(
|
||||
base_path: &Path,
|
||||
runner: &dyn SyscallRunner,
|
||||
discovery: &Discovery,
|
||||
conflicts: &[Conflict],
|
||||
bench_config: Benchmarking,
|
||||
@@ -45,7 +46,7 @@ pub fn discover_facts(
|
||||
let mut active_conflicts = Vec::new();
|
||||
for conflict in conflicts {
|
||||
for service in &conflict.services {
|
||||
if is_service_active(service) {
|
||||
if is_service_active(runner, service) {
|
||||
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
|
||||
active_conflicts.push(conflict.id.clone());
|
||||
break;
|
||||
@@ -93,7 +94,6 @@ fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// If not found, use the first one as default if any exist
|
||||
if !registry.configs.contains_key(id) {
|
||||
if let Some(first) = candidates.first() {
|
||||
registry.configs.insert(id.clone(), PathBuf::from(first));
|
||||
@@ -142,7 +142,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
|
||||
for hw_entry in hw_entries.flatten() {
|
||||
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
// Temperature Sensors
|
||||
if file_name.starts_with("temp") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
|
||||
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
|
||||
@@ -154,7 +153,6 @@ fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>,
|
||||
}
|
||||
}
|
||||
|
||||
// Fan Sensors
|
||||
if file_name.starts_with("fan") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(100)) {
|
||||
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
|
||||
@@ -206,18 +204,9 @@ fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec<PathBuf> {
|
||||
paths
|
||||
}
|
||||
|
||||
/// Checks if a systemd service is currently active.
|
||||
pub fn is_service_active(service: &str) -> bool {
|
||||
let status = Command::new("systemctl")
|
||||
.arg("is-active")
|
||||
.arg("--quiet")
|
||||
.arg(service)
|
||||
.status();
|
||||
|
||||
match status {
|
||||
Ok(s) => s.success(),
|
||||
Err(_) => false,
|
||||
}
|
||||
/// Checks if a systemd service is currently active using the injected runner.
|
||||
pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool {
|
||||
runner.run("systemctl", &["is-active", "--quiet", service]).is_ok()
|
||||
}
|
||||
|
||||
/// Helper to read a sysfs file with a timeout.
|
||||
|
||||
@@ -24,7 +24,7 @@ impl HeuristicEngine {
|
||||
.context("Failed to parse hardware_db.toml")?;
|
||||
|
||||
// 2. Discover Facts
|
||||
let facts = discover_facts(&ctx.sysfs_base, &db.discovery, &db.conflicts, db.benchmarking.clone());
|
||||
let facts = discover_facts(&ctx.sysfs_base, ctx.runner.as_ref(), &db.discovery, &db.conflicts, db.benchmarking.clone());
|
||||
info!("System Identity: {} {}", facts.vendor, facts.model);
|
||||
|
||||
// 3. Routing Logic
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use ember_tune_rs::sal::heuristic::discovery::discover_facts;
|
||||
use ember_tune_rs::sal::heuristic::schema::{Discovery, SensorDiscovery, ActuatorDiscovery, Benchmarking};
|
||||
use ember_tune_rs::sys::MockSyscallRunner;
|
||||
use crate::common::fakesys::FakeSysBuilder;
|
||||
|
||||
mod common;
|
||||
@@ -35,7 +36,9 @@ fn test_heuristic_discovery_with_fakesys() {
|
||||
power_steps_watts: vec![10.0, 15.0],
|
||||
};
|
||||
|
||||
let facts = discover_facts(&fake.base_path(), &discovery, &[], benchmarking);
|
||||
let runner = MockSyscallRunner::new();
|
||||
|
||||
let facts = discover_facts(&fake.base_path(), &runner, &discovery, &[], benchmarking);
|
||||
|
||||
assert_eq!(facts.vendor, "Dell Inc.");
|
||||
assert_eq!(facts.model, "XPS 13 9380");
|
||||
|
||||
@@ -28,6 +28,7 @@ fn test_orchestrator_e2e_state_machine() {
|
||||
workload,
|
||||
telemetry_tx,
|
||||
command_rx,
|
||||
None,
|
||||
);
|
||||
|
||||
// For the purpose of this architecture audit, we've demonstrated the
|
||||
|
||||
Reference in New Issue
Block a user