Merge pull request 'release/1.2.0' (#2) from release/1.2.0 into main
All checks were successful
Build and Release / release (push) Successful in 1m7s
All checks were successful
Build and Release / release (push) Successful in 1m7s
Reviewed-on: #2
This commit was merged in pull request #2.
This commit is contained in:
115
Cargo.lock
generated
115
Cargo.lock
generated
@@ -513,7 +513,7 @@ checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "ember-tune-rs"
|
||||
version = "1.1.0"
|
||||
version = "1.2.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -526,13 +526,17 @@ dependencies = [
|
||||
"num_cpus",
|
||||
"owo-colors",
|
||||
"ratatui",
|
||||
"regex",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sysinfo",
|
||||
"tempfile",
|
||||
"thiserror 2.0.18",
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-appender",
|
||||
"tracing-subscriber",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -541,6 +545,12 @@ version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
|
||||
|
||||
[[package]]
|
||||
name = "env_home"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7f84e12ccf0a7ddc17a6c41c93326024c42920d7ee630d04950e6926645c0fe"
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
@@ -586,6 +596,12 @@ dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "filedescriptor"
|
||||
version = "0.8.3"
|
||||
@@ -885,6 +901,15 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9"
|
||||
dependencies = [
|
||||
"regex-automata",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.8.0"
|
||||
@@ -1534,6 +1559,15 @@ dependencies = [
|
||||
"zmij",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_spanned"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8bbf91e5a4d6315eee45e704372590b30e260ee83af6639d64557f51b067776"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.9"
|
||||
@@ -1687,6 +1721,19 @@ dependencies = [
|
||||
"windows",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.4.1",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "terminal_size"
|
||||
version = "0.4.3"
|
||||
@@ -1852,6 +1899,45 @@ dependencies = [
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml"
|
||||
version = "1.0.3+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7614eaf19ad818347db24addfa201729cf2a9b6fdfd9eb0ab870fcacc606c0c"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"serde_core",
|
||||
"serde_spanned",
|
||||
"toml_datetime",
|
||||
"toml_parser",
|
||||
"toml_writer",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "1.0.0+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e"
|
||||
dependencies = [
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_parser"
|
||||
version = "1.0.9+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4"
|
||||
dependencies = [
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_writer"
|
||||
version = "1.0.6+spec-1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.44"
|
||||
@@ -1923,10 +2009,14 @@ version = "0.3.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex-automata",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
]
|
||||
@@ -2204,6 +2294,17 @@ dependencies = [
|
||||
"wezterm-dynamic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "8.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3fabb953106c3c8eea8306e4393700d7657561cb43122571b172bbfb7c7ba1d"
|
||||
dependencies = [
|
||||
"env_home",
|
||||
"rustix",
|
||||
"winsafe",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
@@ -2492,6 +2593,18 @@ version = "0.53.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.7.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
|
||||
|
||||
[[package]]
|
||||
name = "winsafe"
|
||||
version = "0.0.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904"
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen"
|
||||
version = "0.51.0"
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "ember-tune-rs"
|
||||
version = "1.1.0"
|
||||
version = "1.2.0"
|
||||
edition = "2024"
|
||||
authors = ["Nils Pukropp <nils@narl.io>"]
|
||||
readme = "README.md"
|
||||
@@ -23,8 +23,14 @@ serde_json = "1.0.149"
|
||||
clap = { version = "4.5", features = ["derive", "string", "wrap_help"] }
|
||||
color-eyre = "0.6"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = "0.3"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
tracing-appender = "0.2"
|
||||
sysinfo = "0.38"
|
||||
libc = "0.2"
|
||||
num_cpus = "1.17"
|
||||
toml = "1.0.3"
|
||||
regex = "1.12.3"
|
||||
which = "8.0.0"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
86
README.md
Normal file
86
README.md
Normal file
@@ -0,0 +1,86 @@
|
||||
# 🔥 ember-tune
|
||||
```text
|
||||
__________ ____ ______ ____ ______ __ __ _ __ ______
|
||||
/ ____/ |/ // __ )/ ____// __ \ /_ __/ / / / // | / // ____/
|
||||
/ __/ / /|_/ // __ / __/ / /_/ / / / / / / // |/ // __/
|
||||
/ /___ / / / // /_/ / /___ / _, _/ / / / /_/ // /| // /___
|
||||
/_____//_/ /_//_____/_____//_/ |_| /_/ \____//_/ |_//_____/
|
||||
|
||||
>>> Physically-grounded thermal & power optimization for Linux <<<
|
||||
```
|
||||
|
||||
> ### **Find your hardware's "Physical Sweet Spot" through automated trial-by-fire.**
|
||||
|
||||
`ember-tune` is a scientifically-driven hardware optimizer that replaces guesswork and manual tuning with a rigorous, automated engineering workflow. It determines the unique thermal properties of your specific laptop—including its Thermal Resistance (Rθ) and "Silicon Knee"—to generate optimal configurations for common Linux tuning daemons.
|
||||
|
||||
## ✨ Features
|
||||
|
||||
- **Automated Physical Benchmarking:** Measures real-world thermal performance under load to find the true "sweet spot" where performance-per-watt is maximized before thermal saturation causes diminishing returns.
|
||||
- **Heuristic Hardware Discovery:** Utilizes a data-driven System Abstraction Layer (SAL) that probes your system and automatically adapts to its unique quirks, drivers, and sensor paths.
|
||||
- **Non-Destructive Configuration:** Safely merges new, optimized power limits into your existing `throttled.conf`, preserving manual undervolt settings and comments.
|
||||
- **Universal Safeguard Architecture (USA):** Includes a high-frequency concurrent watchdog and RAII state restoration to guarantee your system is never left in a dangerous state.
|
||||
- **Real-time TUI Dashboard:** A `ratatui`-based terminal interface provides high-resolution telemetry throughout the benchmark.
|
||||
|
||||
## 🔬 How it Works: The Architecture
|
||||
|
||||
`ember-tune` is built on a decoupled, multi-threaded architecture to ensure the UI is always responsive and that hardware state is managed safely.
|
||||
|
||||
1. **The Heuristic Engine:** On startup, the engine probes your system's DMI, `sysfs`, and active services. It compares these "facts" against the `hardware_db.toml` to select the correct System Abstraction Layer (SAL).
|
||||
2. **The Orchestrator (Backend Thread):** This is the state machine that executes the benchmark. It communicates with hardware *only* through the SAL traits.
|
||||
3. **The TUI (Main Thread):** The `ratatui` dashboard renders `TelemetryState` snapshots received from the orchestrator via an MPSC channel.
|
||||
4. **The Watchdog (Safety Thread):** A high-priority thread that polls safety sensors every 100ms to trigger an atomic `EmergencyAbort` if failure conditions are met.
|
||||
|
||||
## ⚙️ Development Setup
|
||||
|
||||
`ember-tune` is a standard Cargo project.
|
||||
|
||||
**Prerequisites:**
|
||||
- `rustup`
|
||||
- `build-essential`
|
||||
- `libudev-dev`
|
||||
- `stress-ng` (Required for benchmarking)
|
||||
|
||||
```bash
|
||||
# 1. Clone and Build
|
||||
git clone https://gitea.com/narl/ember-tune.git
|
||||
cd ember-tune
|
||||
cargo build --release
|
||||
|
||||
# 2. Run the safe test suite
|
||||
cargo test
|
||||
```
|
||||
|
||||
**Running:**
|
||||
```bash
|
||||
# Run a full benchmark
|
||||
sudo ./target/release/ember-tune
|
||||
|
||||
# Run a mock benchmark for UI testing
|
||||
sudo ./target/release/ember-tune --mock
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 🤝 Contributing Quirk Data (`hardware_db.toml`)
|
||||
|
||||
**This is the most impactful way to contribute.** If your hardware isn't working perfectly, add a new entry to `assets/hardware_db.toml`.
|
||||
|
||||
### Example: Adding a Service Conflict
|
||||
```toml
|
||||
[[conflicts]]
|
||||
id = "laptop_mode_conflict"
|
||||
services = ["laptop-mode.service"]
|
||||
contention = "Multiple - I/O schedulers, Power limits"
|
||||
severity = "Medium"
|
||||
fix_action = "SuspendService"
|
||||
help_text = "laptop-mode-tools can override power-related sysfs settings."
|
||||
```
|
||||
|
||||
### Example: Defining a Model-Specific Quirk
|
||||
```toml
|
||||
[[quirks]]
|
||||
model_regex = "HP Envy 15-ep.*"
|
||||
id = "hp_fan_stuck_sensor"
|
||||
issue = "Fan sensor reports 0 RPM when active."
|
||||
action = "UseThermalVelocityFallback"
|
||||
```
|
||||
@@ -1,5 +1,5 @@
|
||||
[metadata]
|
||||
version = "1.0.0"
|
||||
version = "1.2.0"
|
||||
updated = "2026-02-26"
|
||||
description = "Hardware and Conflict Database for ember-tune Thermal Engine"
|
||||
|
||||
@@ -15,7 +15,7 @@ help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both
|
||||
|
||||
[[conflicts]]
|
||||
id = "thermal_logic_collision"
|
||||
services = ["thermald.service", "throttled.service"]
|
||||
services = ["thermald.service", "throttled.service", "lenovo_fix.service", "lenovo-throttling-fix.service"]
|
||||
contention = "RAPL / MSR / BD-PROCHOT"
|
||||
severity = "High"
|
||||
fix_action = "SuspendService"
|
||||
@@ -29,6 +29,14 @@ severity = "Medium"
|
||||
fix_action = "SuspendService"
|
||||
help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification."
|
||||
|
||||
[[conflicts]]
|
||||
id = "dell_fan_collision"
|
||||
services = ["i8kmon.service"]
|
||||
contention = "Dell SMM Fan Control"
|
||||
severity = "High"
|
||||
fix_action = "SuspendService"
|
||||
help_text = "i8kmon fights with ember-tune for SMM fan duty cycles. Suspend during benchmark."
|
||||
|
||||
# manufacturer wide logic
|
||||
|
||||
[ecosystems.dell]
|
||||
@@ -38,6 +46,7 @@ drivers = ["dell_smm_hwmon"]
|
||||
fan_manual_mode_cmd = "dell-bios-fan-control 0"
|
||||
fan_auto_mode_cmd = "dell-bios-fan-control 1"
|
||||
safety_register = "0x1FC" # BD PROCHOT MSR
|
||||
help_text = "Dell systems often require 'SMM Security Mitigation' disabled in BIOS for fan control."
|
||||
|
||||
[ecosystems.lenovo]
|
||||
vendor_regex = "LENOVO"
|
||||
@@ -60,6 +69,13 @@ fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable"
|
||||
vendor_regex = "Framework"
|
||||
ec_tool = "ectool"
|
||||
optimization = "Direct-FFI-SMC"
|
||||
polling_cap_ms = 500
|
||||
|
||||
[ecosystems.surface]
|
||||
vendor_regex = "Microsoft Corporation"
|
||||
product_regex = "Surface.*"
|
||||
drivers = ["surface_acpi"]
|
||||
profiles_path = "/sys/bus/platform/devices/surface_performance/platform_profile"
|
||||
|
||||
# quirks: model quirks and fixes
|
||||
|
||||
@@ -85,6 +101,7 @@ id = "asus_fan_hex_support"
|
||||
issue = "Custom Hex Curve Interface"
|
||||
target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve"
|
||||
format = "HexPair16"
|
||||
action = "ManualFanControlRequired"
|
||||
|
||||
[[quirks]]
|
||||
model_regex = "Spectre x360"
|
||||
@@ -92,20 +109,45 @@ id = "hp_rapl_lockout"
|
||||
issue = "Hardware MSR Lockout"
|
||||
action = "WarnUserMSRLocked"
|
||||
|
||||
[[quirks]]
|
||||
model_regex = "Framework.*"
|
||||
id = "framework_prochot_stuck"
|
||||
issue = "BD PROCHOT wedged at 200MHz"
|
||||
monitor_msr = "0x1FC"
|
||||
reset_bit = 0
|
||||
action = "ClearBitOnSafeTemp"
|
||||
|
||||
# heuristic discovery
|
||||
|
||||
[discovery.sensors]
|
||||
temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"]
|
||||
fan_labels = ["CPU Fan", "GPU Fan", "System Fan"]
|
||||
hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"]
|
||||
temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature", "Core 0", "Composite"]
|
||||
fan_labels = ["CPU Fan", "GPU Fan", "System Fan", "Processor Fan"]
|
||||
hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm", "thinkpad", "asus"]
|
||||
|
||||
[discovery.actuators]
|
||||
rapl_paths = ["intel-rapl:0", "package-0"]
|
||||
rapl_paths = ["intel-rapl:0", "package-0", "intel-rapl:1"]
|
||||
amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"]
|
||||
governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"]
|
||||
|
||||
[discovery.configs]
|
||||
throttled = ["/etc/throttled.conf", "/usr/local/etc/throttled.conf", "/etc/lenovo_fix.conf"]
|
||||
i8kmon = ["/etc/i8kmon.conf", "/etc/default/i8kmon"]
|
||||
tlp = ["/etc/tlp.conf", "/etc/default/tlp"]
|
||||
|
||||
[discovery.tools]
|
||||
dell_fan_ctrl = "dell-bios-fan-control"
|
||||
ectool = "ectool"
|
||||
ryzenadj = "ryzenadj"
|
||||
|
||||
# env health verification
|
||||
|
||||
[benchmarking]
|
||||
idle_duration_s = 10
|
||||
stress_duration_min_s = 15
|
||||
stress_duration_max_s = 45
|
||||
cool_down_s = 5
|
||||
power_steps_watts = [15.0, 20.0, 25.0, 30.0, 35.0]
|
||||
|
||||
[[preflight_checks]]
|
||||
name = "MSR Write Access"
|
||||
check_cmd = "grep -q 'msr.allow_writes=on' /proc/cmdline"
|
||||
@@ -113,5 +155,10 @@ fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit
|
||||
|
||||
[[preflight_checks]]
|
||||
name = "Kernel Lockdown Status"
|
||||
check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'"
|
||||
check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]' || ! [ -f /sys/kernel/security/lockdown ]"
|
||||
fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module."
|
||||
|
||||
[[preflight_checks]]
|
||||
name = "Intel P-State Check"
|
||||
check_cmd = "[ -d /sys/devices/system/cpu/intel_pstate ] || [ -d /sys/devices/system/cpu/cpufreq/policy0 ]"
|
||||
fail_help = "CPU Frequency scaling driver not detected. Ensure intel_pstate or acpi-cpufreq is loaded."
|
||||
|
||||
117
assets/hardware_db.toml.bak
Normal file
117
assets/hardware_db.toml.bak
Normal file
@@ -0,0 +1,117 @@
|
||||
[metadata]
|
||||
version = "1.0.0"
|
||||
updated = "2026-02-26"
|
||||
description = "Hardware and Conflict Database for ember-tune Thermal Engine"
|
||||
|
||||
# service collision
|
||||
|
||||
[[conflicts]]
|
||||
id = "tlp_vs_ppd"
|
||||
services = ["tlp.service", "power-profiles-daemon.service"]
|
||||
contention = "ACPI Platform Profile / EPP"
|
||||
severity = "Critical"
|
||||
fix_action = "MaskBoth"
|
||||
help_text = "TLP and Power-Profiles-Daemon fight over power envelopes. Mask both to allow ember-tune deterministic control."
|
||||
|
||||
[[conflicts]]
|
||||
id = "thermal_logic_collision"
|
||||
services = ["thermald.service", "throttled.service"]
|
||||
contention = "RAPL / MSR / BD-PROCHOT"
|
||||
severity = "High"
|
||||
fix_action = "SuspendService"
|
||||
help_text = "Thermald and Throttled create a 'register ping-pong' loop. Disable throttled; ember-tune will manage RAPL limits."
|
||||
|
||||
[[conflicts]]
|
||||
id = "freq_scaling_collision"
|
||||
services = ["auto-cpufreq.service"]
|
||||
contention = "CPU Scaling Governor"
|
||||
severity = "Medium"
|
||||
fix_action = "SuspendService"
|
||||
help_text = "Auto-cpufreq interferes with deterministic Silicon Knee identification."
|
||||
|
||||
# manufacturer wide logic
|
||||
|
||||
[ecosystems.dell]
|
||||
vendor_regex = "(Dell.*|Precision.*|Latitude.*|XPS.*)"
|
||||
polling_cap_ms = 1000
|
||||
drivers = ["dell_smm_hwmon"]
|
||||
fan_manual_mode_cmd = "dell-bios-fan-control 0"
|
||||
fan_auto_mode_cmd = "dell-bios-fan-control 1"
|
||||
safety_register = "0x1FC" # BD PROCHOT MSR
|
||||
|
||||
[ecosystems.lenovo]
|
||||
vendor_regex = "LENOVO"
|
||||
lap_mode_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode"
|
||||
profiles_path = "/sys/firmware/acpi/platform_profile"
|
||||
ec_write_required = false # Varies by model
|
||||
|
||||
[ecosystems.asus]
|
||||
vendor_regex = "ASUSTeK.*"
|
||||
thermal_policy_path = "/sys/devices/platform/asus-nb-wmi/throttle_thermal_policy"
|
||||
policy_map = { Balanced = 0, Turbo = 1, Silent = 2 }
|
||||
|
||||
[ecosystems.hp]
|
||||
vendor_regex = "HP"
|
||||
msr_lock_register = "0x610"
|
||||
msr_lock_bit = 63
|
||||
fan_boost_path = "/sys/devices/platform/hp-wmi/hwmon/hwmon*/pwm1_enable"
|
||||
|
||||
[ecosystems.framework]
|
||||
vendor_regex = "Framework"
|
||||
ec_tool = "ectool"
|
||||
optimization = "Direct-FFI-SMC"
|
||||
|
||||
# quirks: model quirks and fixes
|
||||
|
||||
[[quirks]]
|
||||
model_regex = "XPS 13 93.*"
|
||||
id = "dell_bd_prochot_fix"
|
||||
issue = "False Positive 400MHz Lock"
|
||||
monitor_msr = "0x1FC"
|
||||
reset_bit = 0
|
||||
action = "ClearBitOnSafeTemp"
|
||||
|
||||
[[quirks]]
|
||||
model_regex = "ThinkPad T14.*"
|
||||
id = "lenovo_lap_throttling"
|
||||
issue = "11W TDP Lock in Lap Mode"
|
||||
trigger_path = "/sys/devices/platform/thinkpad_acpi/dytc_lapmode"
|
||||
trigger_value = "1"
|
||||
action = "AbortOnLapMode"
|
||||
|
||||
[[quirks]]
|
||||
model_regex = "ROG Zephyrus G14"
|
||||
id = "asus_fan_hex_support"
|
||||
issue = "Custom Hex Curve Interface"
|
||||
target_path = "/sys/devices/platform/asus-nb-wmi/fan_curve"
|
||||
format = "HexPair16"
|
||||
|
||||
[[quirks]]
|
||||
model_regex = "Spectre x360"
|
||||
id = "hp_rapl_lockout"
|
||||
issue = "Hardware MSR Lockout"
|
||||
action = "WarnUserMSRLocked"
|
||||
|
||||
# heuristic discovery
|
||||
|
||||
[discovery.sensors]
|
||||
temp_labels = ["Package id 0", "Tdie", "Tctl", "CPU Temperature"]
|
||||
fan_labels = ["CPU Fan", "GPU Fan", "System Fan"]
|
||||
hwmon_priority = ["coretemp", "zenpower", "k10temp", "dell_smm"]
|
||||
|
||||
[discovery.actuators]
|
||||
rapl_paths = ["intel-rapl:0", "package-0"]
|
||||
amd_energy_paths = ["zenpower/energy1_input", "k10temp/energy1_input"]
|
||||
governor_files = ["energy_performance_preference", "energy_performance_hint", "scaling_governor"]
|
||||
|
||||
# env health verification
|
||||
|
||||
[[preflight_checks]]
|
||||
name = "MSR Write Access"
|
||||
check_cmd = "grep -q 'msr.allow_writes=on' /proc/cmdline"
|
||||
fail_help = "Add 'msr.allow_writes=on' to kernel parameters to allow power limit manipulation."
|
||||
|
||||
[[preflight_checks]]
|
||||
name = "Kernel Lockdown Status"
|
||||
check_cmd = "cat /sys/kernel/security/lockdown | grep -q '\\[none\\]'"
|
||||
fail_help = "Kernel Lockdown is enabled. MMIO/MSR actuators are restricted by the Linux Security Module."
|
||||
100
src/agent_analyst/mod.rs
Normal file
100
src/agent_analyst/mod.rs
Normal file
@@ -0,0 +1,100 @@
|
||||
//! Heuristic Analysis & Optimization Math (Agent Analyst)
|
||||
//!
|
||||
//! This module analyzes raw telemetry data to extract the "Optimal Real-World Settings".
|
||||
//! It calculates the Silicon Knee, Acoustic/Thermal Matrix (Hysteresis), and
|
||||
//! generates three distinct hardware states: Silent, Balanced, and Sustained Heavy.
|
||||
|
||||
use serde::{Serialize, Deserialize};
|
||||
use crate::engine::{ThermalProfile, OptimizerEngine};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FanCurvePoint {
|
||||
pub temp_on: f32,
|
||||
pub temp_off: f32,
|
||||
pub pwm_percent: u8,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SystemProfile {
|
||||
pub name: String,
|
||||
pub pl1_watts: f32,
|
||||
pub pl2_watts: f32,
|
||||
pub fan_curve: Vec<FanCurvePoint>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OptimizationMatrix {
|
||||
pub silent: SystemProfile,
|
||||
pub balanced: SystemProfile,
|
||||
pub performance: SystemProfile,
|
||||
pub thermal_resistance_kw: f32,
|
||||
pub ambient_temp: f32,
|
||||
}
|
||||
|
||||
pub struct HeuristicAnalyst {
|
||||
engine: OptimizerEngine,
|
||||
}
|
||||
|
||||
impl HeuristicAnalyst {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
engine: OptimizerEngine::new(5),
|
||||
}
|
||||
}
|
||||
|
||||
/// Analyzes the raw telemetry to generate the 3 optimal profiles.
|
||||
pub fn analyze(&self, profile: &ThermalProfile, max_soak_watts: f32) -> OptimizationMatrix {
|
||||
let r_theta = profile.r_theta;
|
||||
let silicon_knee = self.engine.find_silicon_knee(profile);
|
||||
let ambient = profile.ambient_temp;
|
||||
|
||||
// 1. State A: Silent / Battery (Scientific Passive Equilibrium)
|
||||
// Find P where T_core = 60C with fans OFF.
|
||||
let r_theta_passive = r_theta * 2.5;
|
||||
let silent_watts = ((60.0 - ambient) / r_theta_passive.max(0.1)).clamp(3.0, 15.0);
|
||||
|
||||
let silent_profile = SystemProfile {
|
||||
name: "Silent".to_string(),
|
||||
pl1_watts: silent_watts,
|
||||
pl2_watts: silent_watts * 1.2,
|
||||
fan_curve: vec![
|
||||
FanCurvePoint { temp_on: 65.0, temp_off: 55.0, pwm_percent: 0 },
|
||||
FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 30 },
|
||||
],
|
||||
};
|
||||
|
||||
// 2. State B: Balanced (The Silicon Knee)
|
||||
// We use R_theta to predict where the knee will sit thermally.
|
||||
let balanced_profile = SystemProfile {
|
||||
name: "Balanced".to_string(),
|
||||
pl1_watts: silicon_knee,
|
||||
pl2_watts: silicon_knee * 1.25,
|
||||
fan_curve: vec![
|
||||
FanCurvePoint { temp_on: ambient + 15.0, temp_off: ambient + 10.0, pwm_percent: 0 },
|
||||
FanCurvePoint { temp_on: ambient + 25.0, temp_off: ambient + 20.0, pwm_percent: 30 },
|
||||
FanCurvePoint { temp_on: 75.0, temp_off: 65.0, pwm_percent: 50 },
|
||||
FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 80 },
|
||||
],
|
||||
};
|
||||
|
||||
// 3. State C: Sustained Heavy
|
||||
let performance_profile = SystemProfile {
|
||||
name: "Performance".to_string(),
|
||||
pl1_watts: max_soak_watts,
|
||||
pl2_watts: max_soak_watts * 1.3,
|
||||
fan_curve: vec![
|
||||
FanCurvePoint { temp_on: 50.0, temp_off: 45.0, pwm_percent: 30 },
|
||||
FanCurvePoint { temp_on: 70.0, temp_off: 60.0, pwm_percent: 60 },
|
||||
FanCurvePoint { temp_on: 85.0, temp_off: 75.0, pwm_percent: 100 },
|
||||
],
|
||||
};
|
||||
|
||||
OptimizationMatrix {
|
||||
silent: silent_profile,
|
||||
balanced: balanced_profile,
|
||||
performance: performance_profile,
|
||||
thermal_resistance_kw: r_theta,
|
||||
ambient_temp: ambient,
|
||||
}
|
||||
}
|
||||
}
|
||||
154
src/agent_integrator/mod.rs
Normal file
154
src/agent_integrator/mod.rs
Normal file
@@ -0,0 +1,154 @@
|
||||
//! System Service Integration (Agent Integrator)
|
||||
//!
|
||||
//! This module translates the mathematical optimums defined by the Analyst
|
||||
//! into actionable, real-world Linux/OS service configurations.
|
||||
//! It generates templates for fan daemons (i8kmon, thinkfan) and handles
|
||||
//! resolution strategies for overlapping daemons.
|
||||
|
||||
use anyhow::Result;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::fs;
|
||||
use crate::agent_analyst::OptimizationMatrix;
|
||||
|
||||
pub struct ServiceIntegrator;
|
||||
|
||||
impl ServiceIntegrator {
|
||||
/// Generates and saves an i8kmon configuration based on the balanced profile.
|
||||
pub fn generate_i8kmon_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> {
|
||||
let profile = &matrix.balanced;
|
||||
|
||||
let mut conf = String::new();
|
||||
|
||||
// Read existing content to preserve daemon and other settings
|
||||
let existing = if let Some(src) = source_path {
|
||||
if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() }
|
||||
} else if output_path.exists() {
|
||||
fs::read_to_string(output_path).unwrap_or_default()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
if !existing.is_empty() {
|
||||
for line in existing.lines() {
|
||||
let trimmed = line.trim();
|
||||
// Filter out the old auto-generated config lines and fan configs
|
||||
if !trimmed.starts_with("set config(0)") &&
|
||||
!trimmed.starts_with("set config(1)") &&
|
||||
!trimmed.starts_with("set config(2)") &&
|
||||
!trimmed.starts_with("set config(3)") &&
|
||||
!trimmed.starts_with("# Auto-generated") &&
|
||||
!trimmed.starts_with("# Profile:") &&
|
||||
!trimmed.is_empty() {
|
||||
conf.push_str(line);
|
||||
conf.push('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
conf.push_str("\n# Auto-generated by ember-tune Integrator\n");
|
||||
conf.push_str(&format!("# Profile: {}\n", profile.name));
|
||||
conf.push_str(&format!("# Thermal Resistance: {:.3} K/W\n\n", matrix.thermal_resistance_kw));
|
||||
|
||||
for (i, p) in profile.fan_curve.iter().enumerate() {
|
||||
let state = match p.pwm_percent {
|
||||
0..=20 => 0,
|
||||
21..=50 => 1,
|
||||
51..=100 => 2,
|
||||
_ => 2,
|
||||
};
|
||||
|
||||
let off = if i == 0 { "-".to_string() } else { format!("{:.0}", p.temp_off) };
|
||||
conf.push_str(&format!("set config({}) {{{} {} {:.0} {}}}\n", i, state, state, p.temp_on, off));
|
||||
}
|
||||
|
||||
fs::write(output_path, conf)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generates a thinkfan configuration, merging with existing sensors if possible.
|
||||
pub fn generate_thinkfan_config(matrix: &OptimizationMatrix, output_path: &Path, source_path: Option<&PathBuf>) -> Result<()> {
|
||||
let profile = &matrix.balanced;
|
||||
|
||||
let mut conf = String::new();
|
||||
|
||||
let existing = if let Some(src) = source_path {
|
||||
if src.exists() { fs::read_to_string(src).unwrap_or_default() } else { String::new() }
|
||||
} else if output_path.exists() {
|
||||
fs::read_to_string(output_path).unwrap_or_default()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
if !existing.is_empty() {
|
||||
let mut in_sensors = false;
|
||||
for line in existing.lines() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed == "sensors:" { in_sensors = true; }
|
||||
if trimmed == "levels:" { in_sensors = false; }
|
||||
|
||||
if in_sensors {
|
||||
conf.push_str(line);
|
||||
conf.push('\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if conf.is_empty() {
|
||||
conf.push_str("sensors:\n - hwmon: /sys/class/hwmon/hwmon0/temp1_input\n\n");
|
||||
}
|
||||
|
||||
conf.push_str("\n# Auto-generated by ember-tune Integrator\n");
|
||||
conf.push_str("levels:\n");
|
||||
|
||||
for (i, p) in profile.fan_curve.iter().enumerate() {
|
||||
let level = match p.pwm_percent {
|
||||
0..=20 => 0,
|
||||
21..=40 => 1,
|
||||
41..=60 => 3,
|
||||
61..=80 => 5,
|
||||
_ => 7,
|
||||
};
|
||||
|
||||
let down = if i == 0 { 0.0 } else { p.temp_off };
|
||||
conf.push_str(&format!(" - [{}, {:.0}, {:.0}]\n", level, down, p.temp_on));
|
||||
}
|
||||
|
||||
fs::write(output_path, conf)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generates a resolution checklist/script for daemons.
|
||||
pub fn generate_conflict_resolution_script(output_path: &Path) -> Result<()> {
|
||||
let script = r#"#!/bin/bash
|
||||
# ember-tune Daemon Neutralization Script
|
||||
|
||||
# 1. Mask power-profiles-daemon (Prevent ACPI overrides)
|
||||
systemctl mask power-profiles-daemon
|
||||
|
||||
# 2. Filter TLP (Prevent CPU governor fights while keeping PCIe saving)
|
||||
sed -i 's/^CPU_SCALING_GOVERNOR_ON_AC=.*/CPU_SCALING_GOVERNOR_ON_AC=""/' /etc/tlp.conf
|
||||
sed -i 's/^CPU_BOOST_ON_AC=.*/CPU_BOOST_ON_AC=""/' /etc/tlp.conf
|
||||
systemctl restart tlp
|
||||
|
||||
# 3. Thermald Delegate (We provide the trips, it handles the rest)
|
||||
systemctl restart thermald
|
||||
"#;
|
||||
fs::write(output_path, script)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Generates a thermald configuration XML.
|
||||
pub fn generate_thermald_config(matrix: &OptimizationMatrix, output_path: &Path, _source_path: Option<&PathBuf>) -> Result<()> {
|
||||
let profile = &matrix.balanced;
|
||||
let mut xml = String::new();
|
||||
xml.push_str("<?xml version=\"1.0\"?>\n<ThermalConfiguration>\n <Platform>\n <Name>ember-tune Balanced</Name>\n <ProductName>Generic</ProductName>\n <Preference>balanced</Preference>\n <ThermalZones>\n <ThermalZone>\n <Type>cpu</Type>\n <TripPoints>\n");
|
||||
|
||||
for (i, p) in profile.fan_curve.iter().enumerate() {
|
||||
xml.push_str(&format!(" <TripPoint>\n <SensorType>cpu</SensorType>\n <Temperature>{}</Temperature>\n <Type>Passive</Type>\n <ControlId>{}</ControlId>\n </TripPoint>\n", p.temp_on * 1000.0, i));
|
||||
}
|
||||
|
||||
xml.push_str(" </TripPoints>\n </ThermalZone>\n </ThermalZones>\n </Platform>\n</ThermalConfiguration>\n");
|
||||
fs::write(output_path, xml)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
32
src/cli.rs
32
src/cli.rs
@@ -1,3 +1,8 @@
|
||||
//! Defines the command-line interface for `ember-tune`.
|
||||
//!
|
||||
//! This module uses the `clap` crate to define the CLI arguments, subcommands,
|
||||
//! and help text.
|
||||
|
||||
use clap::{Parser, builder::styling};
|
||||
use std::path::PathBuf;
|
||||
|
||||
@@ -7,27 +12,28 @@ const STYLES: styling::Styles = styling::Styles::styled()
|
||||
.literal(styling::AnsiColor::Cyan.on_default().bold())
|
||||
.placeholder(styling::AnsiColor::Cyan.on_default());
|
||||
|
||||
/// Scientifically-driven hardware power and thermal optimizer.
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(
|
||||
name = "ember-tune",
|
||||
author = "Nils Pukropp <nils@narl.io>",
|
||||
version = "1.0.0",
|
||||
about = "ember-tune: Scientifically-driven hardware power and thermal optimizer.",
|
||||
long_about = "ember-tune transforms manual laptop tuning into a rigorous, automated engineering workflow. \nIt executes a state machine to find the 'Physical Sweet Spot' of your specific hardware by measuring \nthe Silicon Knee, Thermal Resistance (Rθ), and Thermal Inertia, then outputs optimal \nconfigurations for tools like 'throttled' or 'ryzenadj'.",
|
||||
version = "1.1.0",
|
||||
about = "ember-tune: A physically-grounded thermal and power optimizer for Linux.",
|
||||
long_about = "ember-tune transforms manual laptop tuning into a rigorous, automated engineering workflow. \nIt executes a state machine to find the 'Physical Sweet Spot' of your specific hardware by measuring \nthe Silicon Knee, Thermal Resistance (Rθ), and Thermal Inertia, then outputs optimal \nconfigurations for tools like 'throttled' or 'i8kmon'.",
|
||||
styles = STYLES,
|
||||
after_help = "EXAMPLES:\n sudo ember-tune run # Run standard optimization\n sudo ember-tune run --dry-run # Audit and simulate without changes\n sudo ember-tune run --mock # Safe demo with fake hardware"
|
||||
after_help = "EXAMPLES:\n sudo ember-tune # Run standard optimization\n sudo ember-tune --audit-only # Validate system requirements only\n sudo ember-tune --mock # Safe demo with fake hardware"
|
||||
)]
|
||||
pub struct Cli {
|
||||
/// Path to output the optimized configuration file
|
||||
/// Path to output the final `throttled.conf` file.
|
||||
#[arg(
|
||||
short,
|
||||
long,
|
||||
default_value = "throttled.conf",
|
||||
help = "Destination for the generated configuration file (e.g. /etc/throttled.conf)"
|
||||
value_name = "THROTTLED_PATH",
|
||||
help = "Optional: Overrides the discovered or default path for throttled.conf."
|
||||
)]
|
||||
pub config_out: PathBuf,
|
||||
pub config_out: Option<PathBuf>,
|
||||
|
||||
/// Maximum safe temperature (Celsius) for the benchmark
|
||||
/// Maximum safe temperature (Celsius) for the benchmark.
|
||||
#[arg(
|
||||
short,
|
||||
long,
|
||||
@@ -36,7 +42,7 @@ pub struct Cli {
|
||||
)]
|
||||
pub max_temp: f32,
|
||||
|
||||
/// Enable verbose debug logging
|
||||
/// Enable verbose debug logging.
|
||||
#[arg(
|
||||
short,
|
||||
long,
|
||||
@@ -44,17 +50,17 @@ pub struct Cli {
|
||||
)]
|
||||
pub verbose: bool,
|
||||
|
||||
/// Use a mock hardware layer for safe testing
|
||||
/// Use a mock hardware layer for safe testing.
|
||||
#[arg(
|
||||
long,
|
||||
help = "Emulates hardware responses. Ideal for testing UI/Logic on unsupported systems."
|
||||
)]
|
||||
pub mock: bool,
|
||||
|
||||
/// Run pre-flight audit only
|
||||
/// Run pre-flight audit only, then exit.
|
||||
#[arg(
|
||||
long,
|
||||
help = "Validate system requirements and conflict management without starting the benchmark."
|
||||
help = "Validate system requirements and conflicts without starting the benchmark."
|
||||
)]
|
||||
pub audit_only: bool,
|
||||
}
|
||||
|
||||
@@ -1,41 +1,66 @@
|
||||
use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
/// Inputs from which `I8kmonTranslator::generate_conf` derives its fan thresholds.
pub struct I8kmonConfig {
|
||||
/// Ambient baseline temperature; the lower thresholds are computed as offsets from it.
pub t_ambient: f32,
|
||||
/// Temperature at which the highest fan state switches on.
pub t_max_fan: f32,
|
||||
/// Thermal resistance (Rθ) in K/W; scales how early the low fan threshold is placed.
pub thermal_resistance_kw: f32,
|
||||
}
|
||||
|
||||
/// Stateless namespace for rendering and saving an i8kmon configuration from an `I8kmonConfig`.
pub struct I8kmonTranslator;
|
||||
|
||||
impl I8kmonTranslator {
|
||||
pub fn generate_conf(config: &I8kmonConfig) -> String {
|
||||
// Higher resistance means we need to start fans sooner.
|
||||
// If R_theta is 2.5 K/W, it's quite high for a laptop.
|
||||
// We'll scale the 'low' threshold based on R_theta.
|
||||
let aggression_factor = (config.thermal_resistance_kw / 1.5).clamp(0.8, 1.5);
|
||||
|
||||
let t_off = config.t_ambient + 5.0;
|
||||
let t_low_on = config.t_ambient + 12.0;
|
||||
let t_low_off = config.t_ambient + 10.0;
|
||||
let t_low_on = config.t_ambient + (10.0 / aggression_factor);
|
||||
let t_low_off = t_low_on - 2.0;
|
||||
|
||||
let t_high_on = config.t_max_fan;
|
||||
let t_high_off = config.t_max_fan - 5.0;
|
||||
let t_low_trigger = (config.t_max_fan - 15.0).max(t_low_on + 2.0);
|
||||
let t_high_off = t_high_on - 5.0;
|
||||
|
||||
let t_mid_on = (t_low_on + t_high_on) / 2.0;
|
||||
let t_mid_off = t_mid_on - 3.0;
|
||||
|
||||
format!(
|
||||
r#"# Generated by ember-tune Optimizer
|
||||
# Grounded in physical thermal resistance
|
||||
# Grounded in physical thermal resistance (Rθ = {r_theta:.3} K/W)
|
||||
|
||||
set config(gen_shadow) 1
|
||||
set config(i8k_ignore_dmi) 1
|
||||
|
||||
# Fan states: {{state_low state_high temp_on temp_off}}
|
||||
# 0: Off
|
||||
set config(0) {{0 0 {t_low_on:.0} {t_off:.0}}}
|
||||
set config(1) {{1 1 {t_low_trigger:.0} {t_low_off:.0}}}
|
||||
set config(2) {{2 2 {t_high_on:.0} {t_high_off:.0}}}
|
||||
# 1: Low
|
||||
set config(1) {{1 1 {t_mid_on:.0} {t_low_off:.0}}}
|
||||
# 2: High
|
||||
set config(2) {{2 2 {t_high_on:.0} {t_mid_off:.0}}}
|
||||
|
||||
# Speed thresholds (approximate for XPS 9380)
|
||||
# Hysteresis reference (internal use)
|
||||
# High Off Threshold: {t_high_off:.0}
|
||||
|
||||
# Speed thresholds
|
||||
set config(speed_low) 2500
|
||||
set config(speed_high) 4500
|
||||
"#,
|
||||
r_theta = config.thermal_resistance_kw,
|
||||
t_low_on = t_low_on,
|
||||
t_off = t_off,
|
||||
t_low_trigger = t_low_trigger,
|
||||
t_mid_on = t_mid_on,
|
||||
t_low_off = t_low_off,
|
||||
t_high_on = t_high_on,
|
||||
t_high_off = t_high_off
|
||||
t_mid_off = t_mid_off
|
||||
)
|
||||
}
|
||||
|
||||
pub fn save(path: &Path, config: &I8kmonConfig) -> Result<()> {
|
||||
let content = Self::generate_conf(config);
|
||||
std::fs::write(path, content)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use anyhow::{Result};
|
||||
|
||||
pub struct ThrottledConfig {
|
||||
pub pl1_limit: f32,
|
||||
@@ -38,13 +40,11 @@ Trip_Temp_C: {trip:.0}
|
||||
}
|
||||
|
||||
/// Merges benchmarked values into an existing throttled.conf content.
|
||||
/// Preserves all other sections (like [UnderVOLT]), comments, and formatting.
|
||||
pub fn merge_conf(existing_content: &str, config: &ThrottledConfig) -> String {
|
||||
let mut sections = Vec::new();
|
||||
let mut current_section_name = String::new();
|
||||
let mut current_section_lines = Vec::new();
|
||||
|
||||
// 1. Parse into sections to ensure we only update keys in [BATTERY] and [AC]
|
||||
for line in existing_content.lines() {
|
||||
let trimmed = line.trim();
|
||||
if trimmed.starts_with('[') && trimmed.ends_with(']') {
|
||||
@@ -68,17 +68,14 @@ Trip_Temp_C: {trip:.0}
|
||||
let mut result_lines = Vec::new();
|
||||
let mut handled_sections = HashSet::new();
|
||||
|
||||
// 2. Process sections
|
||||
for (name, mut lines) in sections {
|
||||
if name == "BATTERY" || name == "AC" {
|
||||
handled_sections.insert(name.clone());
|
||||
let mut updated_keys = HashSet::new();
|
||||
|
||||
let mut new_lines = Vec::new();
|
||||
for line in lines {
|
||||
let mut updated = false;
|
||||
let trimmed = line.trim();
|
||||
|
||||
if !trimmed.starts_with('#') && !trimmed.is_empty() {
|
||||
if let Some((key, _)) = trimmed.split_once(':') {
|
||||
let key = key.trim();
|
||||
@@ -87,11 +84,7 @@ Trip_Temp_C: {trip:.0}
|
||||
if let Some(colon_idx) = line.find(':') {
|
||||
let prefix = &line[..colon_idx + 1];
|
||||
let rest = &line[colon_idx + 1..];
|
||||
let comment = if let Some(hash_idx) = rest.find('#') {
|
||||
&rest[hash_idx..]
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let comment = if let Some(hash_idx) = rest.find('#') { &rest[hash_idx..] } else { "" };
|
||||
new_lines.push(format!("{} {}{}", prefix, new_value, comment));
|
||||
updated_keys.insert(*target_key);
|
||||
updated = true;
|
||||
@@ -101,12 +94,8 @@ Trip_Temp_C: {trip:.0}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !updated {
|
||||
new_lines.push(line);
|
||||
}
|
||||
if !updated { new_lines.push(line); }
|
||||
}
|
||||
|
||||
for (target_key, new_value) in &target_keys {
|
||||
if !updated_keys.contains(*target_key) {
|
||||
new_lines.push(format!("{}: {}", target_key, new_value));
|
||||
@@ -117,7 +106,6 @@ Trip_Temp_C: {trip:.0}
|
||||
result_lines.extend(lines);
|
||||
}
|
||||
|
||||
// 3. Add missing sections if they didn't exist at all
|
||||
for section_name in &["BATTERY", "AC"] {
|
||||
if !handled_sections.contains(*section_name) {
|
||||
result_lines.push(String::new());
|
||||
@@ -127,7 +115,20 @@ Trip_Temp_C: {trip:.0}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result_lines.join("\n")
|
||||
}
|
||||
|
||||
pub fn save(path: &Path, config: &ThrottledConfig, source_path: Option<&std::path::PathBuf>) -> Result<()> {
|
||||
let existing = if let Some(src) = source_path {
|
||||
if src.exists() { std::fs::read_to_string(src).unwrap_or_default() } else { String::new() }
|
||||
} else if path.exists() {
|
||||
std::fs::read_to_string(path).unwrap_or_default()
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let content = if existing.is_empty() { Self::generate_conf(config) } else { Self::merge_conf(&existing, config) };
|
||||
std::fs::write(path, content)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,7 +1,17 @@
|
||||
//! The core mathematics and physics engine for `ember-tune`.
|
||||
//!
|
||||
//! This module contains the `OptimizerEngine`, which is responsible for all
|
||||
//! data smoothing, thermal resistance calculations, and the heuristic scoring
|
||||
//! used to identify the "Silicon Knee".
|
||||
|
||||
use serde::{Serialize, Deserialize};
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use tracing::{warn, debug};
|
||||
|
||||
pub mod formatters;
|
||||
|
||||
/// A single, atomic data point captured during the benchmark.
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct ThermalPoint {
|
||||
pub power_w: f32,
|
||||
@@ -11,23 +21,38 @@ pub struct ThermalPoint {
|
||||
pub throughput: f64,
|
||||
}
|
||||
|
||||
/// A complete thermal profile containing all data points for a benchmark run.
///
/// `Default` yields an empty point list with both scalar fields at `0.0`.
|
||||
#[derive(Debug, Default, Serialize, Deserialize, Clone)]
|
||||
pub struct ThermalProfile {
|
||||
/// The recorded samples that make up the profile.
pub points: Vec<ThermalPoint>,
|
||||
/// Ambient baseline temperature, in the same unit as `ThermalPoint::temp_c`.
pub ambient_temp: f32,
|
||||
/// Thermal resistance for this profile (presumably K/W, matching
/// `OptimizationResult::thermal_resistance_kw`; confirm against the producer).
pub r_theta: f32,
|
||||
}
|
||||
|
||||
/// The final, recommended parameters derived from the thermal benchmark.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct OptimizationResult {
|
||||
/// The full thermal profile used for calculations.
|
||||
pub profile: ThermalProfile,
|
||||
/// The power level (in Watts) where performance-per-watt plateaus.
|
||||
pub silicon_knee_watts: f32,
|
||||
/// The measured thermal resistance of the system (Kelvin/Watt).
|
||||
pub thermal_resistance_kw: f32,
|
||||
/// The recommended sustained power limit (PL1).
/// Printed by the summary report with a `W` (Watt) suffix.
|
||||
pub recommended_pl1: f32,
|
||||
/// The recommended burst power limit (PL2).
/// Printed by the summary report with a `W` (Watt) suffix.
|
||||
pub recommended_pl2: f32,
|
||||
/// The maximum temperature reached during the test.
|
||||
pub max_temp_c: f32,
|
||||
/// Indicates if the benchmark was aborted before completion.
|
||||
pub is_partial: bool,
|
||||
/// A map of configuration files that were written to.
/// Each key is a short identifier that the summary report prints next to its path.
|
||||
pub config_paths: HashMap<String, PathBuf>,
|
||||
/// The comprehensive optimization matrix (Silent, Balanced, Performance).
/// `None` when no matrix is attached to this result.
|
||||
pub optimization_matrix: Option<crate::agent_analyst::OptimizationMatrix>,
|
||||
}
|
||||
|
||||
/// Pure mathematics engine for thermal optimization.
|
||||
pub struct OptimizerEngine {
|
||||
/// Maximum number of trailing samples that `smooth` averages for each output value.
window_size: usize,
|
||||
}
|
||||
@@ -37,7 +62,7 @@ impl OptimizerEngine {
|
||||
Self { window_size }
|
||||
}
|
||||
|
||||
/// Applies a simple moving average (SMA) filter to a stream of values.
|
||||
/// Smoothes sensor jitter using a moving average with outlier rejection.
|
||||
pub fn smooth(&self, data: &[f32]) -> Vec<f32> {
|
||||
if data.is_empty() { return vec![]; }
|
||||
let mut smoothed = Vec::with_capacity(data.len());
|
||||
@@ -45,86 +70,81 @@ impl OptimizerEngine {
|
||||
for i in 0..data.len() {
|
||||
let start = if i < self.window_size { 0 } else { i - self.window_size + 1 };
|
||||
let end = i + 1;
|
||||
let sum: f32 = data[start..end].iter().sum();
|
||||
smoothed.push(sum / (end - start) as f32);
|
||||
|
||||
let window = &data[start..end];
|
||||
let avg: f32 = window.iter().sum::<f32>() / window.len() as f32;
|
||||
let filtered: Vec<f32> = window.iter()
|
||||
.filter(|&&v| (v - avg).abs() < 10.0)
|
||||
.cloned().collect();
|
||||
|
||||
if filtered.is_empty() {
|
||||
smoothed.push(avg);
|
||||
} else {
|
||||
smoothed.push(filtered.iter().sum::<f32>() / filtered.len() as f32);
|
||||
}
|
||||
}
|
||||
smoothed
|
||||
}
|
||||
|
||||
/// Calculates Thermal Resistance: R_theta = (T_core - T_ambient) / P_package
|
||||
pub fn calculate_thermal_resistance(&self, profile: &ThermalProfile) -> f32 {
|
||||
profile.points.iter()
|
||||
.max_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.map(|p| {
|
||||
if p.power_w < 1.0 { 0.0 }
|
||||
else { (p.temp_c - profile.ambient_temp) / p.power_w }
|
||||
})
|
||||
.unwrap_or(0.0)
|
||||
/// Evaluates if a series of temperature readings have reached thermal equilibrium.
|
||||
/// Criteria: Standard deviation < 0.25C over the last 10 seconds.
|
||||
pub fn is_stable(&self, temps: &[f32]) -> bool {
|
||||
if temps.len() < 20 { return false; } // Need at least 10s of data (500ms intervals)
|
||||
let window = &temps[temps.len() - 20..];
|
||||
|
||||
let avg = window.iter().sum::<f32>() / window.len() as f32;
|
||||
let variance = window.iter().map(|&t| (t - avg).powi(2)).sum::<f32>() / window.len() as f32;
|
||||
let std_dev = variance.sqrt();
|
||||
|
||||
debug!("Stability Check: StdDev={:.3}C (Target < 0.25C)", std_dev);
|
||||
std_dev < 0.25
|
||||
}
|
||||
|
||||
pub fn get_max_temp(&self, profile: &ThermalProfile) -> f32 {
|
||||
profile.points.iter()
|
||||
.map(|p| p.temp_c)
|
||||
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.unwrap_or(0.0)
|
||||
/// Predicts the steady-state temperature for a given target wattage.
|
||||
/// Formula: T_pred = T_ambient + (P_target * R_theta)
|
||||
pub fn predict_temp(&self, target_watts: f32, ambient: f32, r_theta: f32) -> f32 {
|
||||
ambient + (target_watts * r_theta)
|
||||
}
|
||||
|
||||
/// Finds the "Silicon Knee" - the point where performance per watt (efficiency)
|
||||
/// starts to diminish significantly and thermal density spikes.
|
||||
/// Calculates Thermal Resistance (K/W) using the steady-state delta.
|
||||
pub fn calculate_r_theta(&self, ambient: f32, steady_temp: f32, steady_power: f32) -> f32 {
|
||||
if steady_power < 1.0 { return 0.0; }
|
||||
(steady_temp - ambient) / steady_power
|
||||
}
|
||||
|
||||
/// Identifies the "Silicon Knee" by finding the point of maximum efficiency.
|
||||
pub fn find_silicon_knee(&self, profile: &ThermalProfile) -> f32 {
|
||||
if profile.points.len() < 3 {
|
||||
return profile.points.last().map(|p| p.power_w).unwrap_or(15.0);
|
||||
}
|
||||
if profile.points.is_empty() { return 15.0; }
|
||||
|
||||
let mut points = profile.points.clone();
|
||||
points.sort_by(|a, b| a.power_w.partial_cmp(&b.power_w).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
let mut best_pl = points[0].power_w;
|
||||
let mut max_score = f32::MIN;
|
||||
let efficiencies: Vec<(f32, f32)> = points.iter()
|
||||
.map(|p| {
|
||||
let perf = if p.throughput > 0.0 { p.throughput as f32 } else { p.freq_mhz };
|
||||
(p.power_w, perf / p.power_w.max(1.0))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Use a sliding window (3 points) to calculate gradients more robustly
|
||||
for i in 1..points.len() - 1 {
|
||||
let prev = &points[i - 1];
|
||||
let curr = &points[i];
|
||||
let next = &points[i + 1];
|
||||
if efficiencies.is_empty() { return 15.0; }
|
||||
|
||||
// 1. Efficiency Metric (Throughput per Watt)
|
||||
// If throughput is 0 (unsupported), fallback to Frequency per Watt
|
||||
let efficiency_curr = if curr.throughput > 0.0 {
|
||||
curr.throughput as f32 / curr.power_w.max(0.1)
|
||||
let max_efficiency = efficiencies.iter()
|
||||
.map(|(_, e)| *e)
|
||||
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
|
||||
.unwrap_or(1.0);
|
||||
|
||||
let mut knee_watts = points[0].power_w;
|
||||
for (watts, efficiency) in efficiencies {
|
||||
if efficiency >= (max_efficiency * 0.85) {
|
||||
knee_watts = watts;
|
||||
} else {
|
||||
curr.freq_mhz / curr.power_w.max(0.1)
|
||||
};
|
||||
|
||||
let efficiency_next = if next.throughput > 0.0 {
|
||||
next.throughput as f32 / next.power_w.max(0.1)
|
||||
} else {
|
||||
next.freq_mhz / next.power_w.max(0.1)
|
||||
};
|
||||
|
||||
// Diminishing returns: how much efficiency drops per additional watt
|
||||
let efficiency_drop = (efficiency_curr - efficiency_next) / (next.power_w - curr.power_w).max(0.1);
|
||||
|
||||
// 2. Thermal Acceleration (d2T/dW2)
|
||||
let dt_dw_prev = (curr.temp_c - prev.temp_c) / (curr.power_w - prev.power_w).max(0.1);
|
||||
let dt_dw_next = (next.temp_c - curr.temp_c) / (next.power_w - curr.power_w).max(0.1);
|
||||
let temp_accel = (dt_dw_next - dt_dw_prev) / (next.power_w - prev.power_w).max(0.1);
|
||||
|
||||
// 3. Wall Detection (Any drop in absolute frequency/throughput is a hard wall)
|
||||
let is_throttling = next.freq_mhz < curr.freq_mhz || (next.throughput > 0.0 && next.throughput < curr.throughput);
|
||||
let penalty = if is_throttling { 5000.0 } else { 0.0 };
|
||||
|
||||
// Heuristic scoring:
|
||||
// - Higher score is "Better" (The Knee is the peak of this curve)
|
||||
// - We want high efficiency (low drop) and low thermal acceleration.
|
||||
let score = (efficiency_curr * 10.0) - (efficiency_drop * 50.0) - (temp_accel * 20.0) - penalty;
|
||||
|
||||
if score > max_score {
|
||||
max_score = score;
|
||||
best_pl = curr.power_w;
|
||||
debug!("Efficiency drop at {:.1}W ({:.1}% of peak)", watts, (efficiency/max_efficiency)*100.0);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
best_pl
|
||||
knee_watts.clamp(PowerLimitWatts::MIN, PowerLimitWatts::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
use crate::sal::safety::PowerLimitWatts;
|
||||
|
||||
0
src/engine/profiles.rs
Normal file
0
src/engine/profiles.rs
Normal file
16
src/lib.rs
Normal file
16
src/lib.rs
Normal file
@@ -0,0 +1,16 @@
|
||||
//! # ember-tune: A physically-grounded thermal and power optimizer for Linux.
|
||||
//!
|
||||
//! This crate provides the core library for `ember-tune`, a tool that
|
||||
//! scientifically determines the optimal power and thermal settings for laptops
|
||||
//! by measuring physical properties like Thermal Resistance and the "Silicon Knee".
|
||||
|
||||
pub mod mediator;
|
||||
pub mod sal;
|
||||
pub mod load;
|
||||
pub mod orchestrator;
|
||||
pub mod ui;
|
||||
pub mod engine;
|
||||
pub mod cli;
|
||||
pub mod sys;
|
||||
pub mod agent_analyst;
|
||||
pub mod agent_integrator;
|
||||
183
src/load/mod.rs
183
src/load/mod.rs
@@ -1,57 +1,174 @@
|
||||
use anyhow::Result;
|
||||
//! Load generation and performance measurement subsystem.
|
||||
|
||||
pub trait Workload {
|
||||
/// Starts the workload with specified threads and load percentage.
|
||||
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()>;
|
||||
/// Stops the workload.
|
||||
fn stop(&mut self) -> Result<()>;
|
||||
/// Returns the current throughput (e.g., ops/sec).
|
||||
fn get_throughput(&self) -> Result<f64>;
|
||||
use anyhow::{Result, Context, anyhow};
|
||||
use std::process::{Child, Command, Stdio};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Standardized telemetry returned by any workload implementation.
///
/// `Default` yields zero throughput and a zero elapsed time.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
||||
pub struct WorkloadMetrics {
|
||||
/// Primary performance heuristic (e.g., Bogo Ops/s)
|
||||
pub primary_ops_per_sec: f64,
|
||||
/// Time elapsed since the workload started
|
||||
pub elapsed_time: Duration,
|
||||
}
|
||||
|
||||
/// Defines which subsystem to isolate during stress testing.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum StressVector {
|
||||
/// CPU-bound load; `StressNg` runs `--matrix` with the profile's thread count.
CpuMatrix,
|
||||
/// Memory-bound load; `StressNg` runs `--vm` with the profile's thread count and `--vm-bytes 80%`.
MemoryBandwidth,
|
||||
/// Combined load; `StressNg` gives half the threads (at least one) to each of `--matrix`
/// and `--vm`, with `--vm-bytes 40%`.
Mixed,
|
||||
}
|
||||
|
||||
/// A normalized profile defining the intensity and constraints of the workload.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IntensityProfile {
|
||||
/// Number of stressor instances to run; `StressNg` passes it as the stressor count.
pub threads: usize,
|
||||
/// Requested load level; `StressNg` forwards it unchanged as the `--cpu-load` value.
pub load_percentage: u8,
|
||||
/// Which subsystem(s) the workload should stress.
pub vector: StressVector,
|
||||
}
|
||||
|
||||
/// The replaceable interface for load generation and performance measurement.
|
||||
pub trait Workload: Send + Sync {
|
||||
/// Sets up prerequisites (e.g., binary checks).
|
||||
fn initialize(&mut self) -> Result<()>;
|
||||
|
||||
/// Executes the load asynchronously.
|
||||
fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()>;
|
||||
|
||||
/// Returns the current standardized telemetry object.
|
||||
fn get_current_metrics(&self) -> Result<WorkloadMetrics>;
|
||||
|
||||
/// Gracefully and forcefully terminates the workload.
|
||||
fn stop_workload(&mut self) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Implementation of the Benchmarking Interface using stress-ng matrix stressors.
|
||||
pub struct StressNg {
|
||||
child: Option<std::process::Child>,
|
||||
child: Option<Child>,
|
||||
start_time: Option<Instant>,
|
||||
latest_metrics: Arc<Mutex<WorkloadMetrics>>,
|
||||
}
|
||||
|
||||
impl StressNg {
|
||||
pub fn new() -> Self {
|
||||
Self { child: None }
|
||||
Self {
|
||||
child: None,
|
||||
start_time: None,
|
||||
latest_metrics: Arc::new(Mutex::new(WorkloadMetrics::default())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Workload for StressNg {
|
||||
fn start(&mut self, threads: usize, load_percent: usize) -> Result<()> {
|
||||
self.stop()?; // Ensure any previous instance is stopped
|
||||
fn initialize(&mut self) -> Result<()> {
|
||||
let status = Command::new("stress-ng")
|
||||
.arg("--version")
|
||||
.stdout(Stdio::null())
|
||||
.stderr(Stdio::null())
|
||||
.status()
|
||||
.context("stress-ng binary not found in PATH. Please install it.")?;
|
||||
|
||||
let child = std::process::Command::new("stress-ng")
|
||||
.args([
|
||||
"--cpu", &threads.to_string(),
|
||||
"--cpu-load", &load_percent.to_string(),
|
||||
"--quiet"
|
||||
])
|
||||
.spawn()?;
|
||||
|
||||
self.child = Some(child);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn stop(&mut self) -> Result<()> {
|
||||
if let Some(mut child) = self.child.take() {
|
||||
let _ = child.kill();
|
||||
let _ = child.wait();
|
||||
if !status.success() {
|
||||
return Err(anyhow!("stress-ng failed to initialize"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_throughput(&self) -> Result<f64> {
|
||||
// In a real implementation, we would parse stress-ng's temporary results
|
||||
// or use a different workload that provides live throughput.
|
||||
Ok(0.0)
|
||||
fn run_workload(&mut self, duration: Duration, profile: IntensityProfile) -> Result<()> {
|
||||
self.stop_workload()?;
|
||||
|
||||
let threads = profile.threads.to_string();
|
||||
let timeout = format!("{}s", duration.as_secs());
|
||||
let load = profile.load_percentage.to_string();
|
||||
|
||||
let mut cmd = Command::new("stress-ng");
|
||||
cmd.args(["--timeout", &timeout, "--metrics", "--quiet", "--cpu-load", &load]);
|
||||
|
||||
match profile.vector {
|
||||
StressVector::CpuMatrix => {
|
||||
cmd.args(["--matrix", &threads]);
|
||||
},
|
||||
StressVector::MemoryBandwidth => {
|
||||
cmd.args(["--vm", &threads, "--vm-bytes", "80%"]);
|
||||
},
|
||||
StressVector::Mixed => {
|
||||
let half = (profile.threads / 2).max(1).to_string();
|
||||
cmd.args(["--matrix", &half, "--vm", &half, "--vm-bytes", "40%"]);
|
||||
}
|
||||
}
|
||||
|
||||
let mut child = cmd.stderr(Stdio::piped()).spawn().context("Failed to spawn stress-ng")?;
|
||||
|
||||
self.start_time = Some(Instant::now());
|
||||
|
||||
// Spawn metrics parser thread
|
||||
let metrics_ref = Arc::clone(&self.latest_metrics);
|
||||
let stderr = child.stderr.take().expect("Failed to capture stderr");
|
||||
|
||||
thread::spawn(move || {
|
||||
let reader = BufReader::new(stderr);
|
||||
for line in reader.lines().flatten() {
|
||||
// Parse stress-ng metrics line
|
||||
if line.contains("matrix") || line.contains("vm") {
|
||||
let parts: Vec<&str> = line.split_whitespace().collect();
|
||||
if let Some(val) = parts.last() {
|
||||
if let Ok(ops) = val.parse::<f64>() {
|
||||
let mut m = metrics_ref.lock().unwrap();
|
||||
m.primary_ops_per_sec = ops;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
self.child = Some(child);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns a snapshot of the most recently parsed metrics.
///
/// When a start time is recorded, `elapsed_time` is replaced by the time elapsed since
/// then; otherwise the stored value is returned unchanged.
///
/// # Panics
/// Panics if the metrics mutex is poisoned.
fn get_current_metrics(&self) -> Result<WorkloadMetrics> {
    let mut snapshot = self.latest_metrics.lock().unwrap().clone();
    snapshot.elapsed_time = self
        .start_time
        .map_or(snapshot.elapsed_time, |started| started.elapsed());
    Ok(snapshot)
}
|
||||
|
||||
fn stop_workload(&mut self) -> Result<()> {
|
||||
if let Some(mut child) = self.child.take() {
|
||||
#[cfg(unix)]
|
||||
{
|
||||
use libc::{kill, SIGTERM};
|
||||
unsafe { kill(child.id() as i32, SIGTERM); }
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
let mut exited = false;
|
||||
while start.elapsed() < Duration::from_secs(2) {
|
||||
if let Ok(Some(_)) = child.try_wait() {
|
||||
exited = true;
|
||||
break;
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
|
||||
if !exited {
|
||||
let _ = child.kill();
|
||||
let _ = child.wait();
|
||||
}
|
||||
}
|
||||
self.start_time = None;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for StressNg {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.stop();
|
||||
let _ = self.stop_workload();
|
||||
}
|
||||
}
|
||||
|
||||
196
src/main.rs
196
src/main.rs
@@ -1,11 +1,3 @@
|
||||
mod mediator;
|
||||
mod sal;
|
||||
mod load;
|
||||
mod orchestrator;
|
||||
mod ui;
|
||||
mod engine;
|
||||
mod cli;
|
||||
|
||||
use miette::{Result, IntoDiagnostic, Diagnostic, Report};
|
||||
use thiserror::Error;
|
||||
use std::sync::mpsc;
|
||||
@@ -16,7 +8,8 @@ use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::io;
|
||||
|
||||
use clap::Parser;
|
||||
use tracing::{info, debug, error};
|
||||
use tracing::error;
|
||||
use tracing_subscriber::{fmt, prelude::*, EnvFilter};
|
||||
|
||||
use crossterm::{
|
||||
event::{self, Event, KeyCode},
|
||||
@@ -25,15 +18,16 @@ use crossterm::{
|
||||
};
|
||||
use ratatui::{backend::CrosstermBackend, Terminal};
|
||||
|
||||
use cli::Cli;
|
||||
use mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||
use sal::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError};
|
||||
use sal::mock::{MockAuditor, MockGuard, MockSensorBus, MockActuatorBus, MockWatchdog};
|
||||
use sal::dell_xps_9380::DellXps9380Sal;
|
||||
use load::StressNg;
|
||||
use orchestrator::BenchmarkOrchestrator;
|
||||
use ui::dashboard::{draw_dashboard, DashboardState};
|
||||
use engine::OptimizationResult;
|
||||
use ember_tune_rs::cli::Cli;
|
||||
use ember_tune_rs::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||
use ember_tune_rs::sal::traits::{AuditError, PlatformSal};
|
||||
use ember_tune_rs::sal::mock::MockSal;
|
||||
use ember_tune_rs::sal::heuristic::engine::HeuristicEngine;
|
||||
use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet;
|
||||
use ember_tune_rs::load::{StressNg};
|
||||
use ember_tune_rs::orchestrator::BenchmarkOrchestrator;
|
||||
use ember_tune_rs::ui::dashboard::{draw_dashboard, DashboardState};
|
||||
use ember_tune_rs::engine::OptimizationResult;
|
||||
use owo_colors::OwoColorize;
|
||||
|
||||
#[derive(Error, Diagnostic, Debug)]
|
||||
@@ -67,34 +61,32 @@ fn print_summary_report(result: &OptimizationResult) {
|
||||
println!("│ Burst (PL2): {:>5.1} W │", result.recommended_pl2);
|
||||
|
||||
println!("│ │");
|
||||
println!("│ {} │", "Apply to /etc/throttled.conf:".bold().magenta());
|
||||
println!("│ PL1_Tdp_W: {:<5.1} │", result.recommended_pl1);
|
||||
println!("│ PL2_Tdp_W: {:<5.1} │", result.recommended_pl2);
|
||||
println!("│ {} │", "Apply these to your system:".bold().magenta());
|
||||
for (id, path) in &result.config_paths {
|
||||
println!("│ {:<10}: {:<34} │", id, path.display());
|
||||
}
|
||||
println!("╰──────────────────────────────────────────────────╯");
|
||||
println!();
|
||||
}
|
||||
|
||||
fn setup_logging(verbose: bool) -> tracing_appender::non_blocking::WorkerGuard {
|
||||
let file_appender = tracing_appender::rolling::never("/var/log", "ember-tune.log");
|
||||
let (non_blocking, guard) = tracing_appender::non_blocking(file_appender);
|
||||
|
||||
let level = if verbose { tracing::Level::DEBUG } else { tracing::Level::INFO };
|
||||
|
||||
tracing_subscriber::fmt()
|
||||
.with_max_level(level)
|
||||
.with_writer(non_blocking)
|
||||
.with_ansi(false)
|
||||
.init();
|
||||
|
||||
guard
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// 1. Diagnostics & CLI Initialization
|
||||
let args = Cli::parse();
|
||||
let _log_guard = setup_logging(args.verbose);
|
||||
|
||||
// 1. Logging Setup (File-only by default, Stdout during Audit)
|
||||
let file_appender = tracing_appender::rolling::never(".", "ember-tune.log");
|
||||
let (non_blocking, _guard) = tracing_appender::non_blocking(file_appender);
|
||||
let level = if args.verbose { "debug" } else { "info" };
|
||||
|
||||
let file_layer = fmt::layer()
|
||||
.with_writer(non_blocking)
|
||||
.with_ansi(false);
|
||||
|
||||
// We use a simple println for the audit to avoid complex reload handles
|
||||
tracing_subscriber::registry()
|
||||
.with(EnvFilter::new(level))
|
||||
.with(file_layer)
|
||||
.init();
|
||||
|
||||
// Set panic hook to restore terminal state
|
||||
std::panic::set_hook(Box::new(|panic_info| {
|
||||
let _ = disable_raw_mode();
|
||||
let mut stdout = io::stdout();
|
||||
@@ -105,29 +97,24 @@ fn main() -> Result<()> {
|
||||
eprintln!("----------------------------------------\n");
|
||||
}));
|
||||
|
||||
info!("ember-tune starting with args: {:?}", args);
|
||||
|
||||
// 2. Pre-flight Audit (Before TUI)
|
||||
let auditor: Arc<dyn PreflightAuditor> = if args.mock {
|
||||
Arc::new(MockAuditor)
|
||||
} else {
|
||||
match DellXps9380Sal::init() {
|
||||
Ok(sal) => Arc::new(sal),
|
||||
Err(e) => return Err(miette::miette!("Failed to initialize Dell SAL: {}", e)),
|
||||
}
|
||||
};
|
||||
|
||||
println!("{}", console::style("─── Pre-flight System Audit ───").bold().cyan());
|
||||
|
||||
let ctx = ember_tune_rs::sal::traits::EnvironmentCtx::production();
|
||||
|
||||
let (sal_box, facts): (Box<dyn PlatformSal>, SystemFactSheet) = if args.mock {
|
||||
(Box::new(MockSal::new()), SystemFactSheet::default())
|
||||
} else {
|
||||
HeuristicEngine::detect_and_build(ctx)?
|
||||
};
|
||||
let sal: Arc<dyn PlatformSal> = sal_box.into();
|
||||
|
||||
let mut audit_failures = Vec::new();
|
||||
|
||||
for step in auditor.audit() {
|
||||
for step in sal.audit() {
|
||||
print!(" Checking {:<40} ", step.description);
|
||||
io::Write::flush(&mut io::stdout()).into_diagnostic()?;
|
||||
|
||||
match step.outcome {
|
||||
Ok(_) => {
|
||||
println!("{}", console::style("[✓]").green());
|
||||
}
|
||||
Ok(_) => { println!("{}", console::style("[✓]").green()); }
|
||||
Err(e) => {
|
||||
println!("{}", console::style("[✗]").red());
|
||||
audit_failures.push(e);
|
||||
@@ -140,78 +127,50 @@ fn main() -> Result<()> {
|
||||
return Err(Report::new(MultiAuditError { errors: audit_failures }));
|
||||
}
|
||||
|
||||
println!("{}", console::style("✓ All pre-flight audits passed.").green().bold());
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
|
||||
if args.audit_only {
|
||||
println!("{}", console::style("✓ All pre-flight audits passed.").green().bold());
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// 3. Terminal Setup
|
||||
// Entering TUI Mode - STDOUT is now strictly for Ratatui
|
||||
enable_raw_mode().into_diagnostic()?;
|
||||
let mut stdout = io::stdout();
|
||||
execute!(stdout, EnterAlternateScreen).into_diagnostic()?;
|
||||
let backend = CrosstermBackend::new(stdout);
|
||||
let mut terminal = Terminal::new(backend).into_diagnostic()?;
|
||||
execute!(stdout, EnterAlternateScreen, crossterm::cursor::Hide).into_diagnostic()?;
|
||||
let backend_stdout = io::stdout();
|
||||
let backend_term = CrosstermBackend::new(backend_stdout);
|
||||
let mut terminal = Terminal::new(backend_term).into_diagnostic()?;
|
||||
|
||||
// 4. State & Communication Setup
|
||||
let running = Arc::new(AtomicBool::new(true));
|
||||
let r = running.clone();
|
||||
|
||||
let (telemetry_tx, telemetry_rx) = mpsc::channel::<TelemetryState>();
|
||||
let (command_tx, command_rx) = mpsc::channel::<UiCommand>();
|
||||
|
||||
let c_tx = command_tx.clone();
|
||||
ctrlc::set_handler(move || {
|
||||
let _ = c_tx.send(UiCommand::Abort);
|
||||
r.store(false, Ordering::SeqCst);
|
||||
}).expect("Error setting Ctrl-C handler");
|
||||
|
||||
// 5. Spawn Backend Orchestrator
|
||||
let is_mock = args.mock;
|
||||
let b_auditor = auditor.clone();
|
||||
let sal_backend = sal.clone();
|
||||
let facts_backend = facts.clone();
|
||||
let config_out = args.config_out.clone();
|
||||
let backend_handle = thread::spawn(move || {
|
||||
let (guard, sensors, actuators, watchdog): (
|
||||
Box<dyn EnvironmentGuard>,
|
||||
Box<dyn SensorBus>,
|
||||
Box<dyn ActuatorBus>,
|
||||
Box<dyn HardwareWatchdog>,
|
||||
) = if is_mock {
|
||||
(
|
||||
Box::new(MockGuard::new()),
|
||||
Box::new(MockSensorBus),
|
||||
Box::new(MockActuatorBus),
|
||||
Box::new(MockWatchdog),
|
||||
)
|
||||
} else {
|
||||
// Re-init or share the SAL
|
||||
let sal = Arc::new(DellXps9380Sal::init().expect("Failed to init Dell SAL in backend"));
|
||||
(
|
||||
Box::new(sal::dell_xps_9380::DellXps9380Guard::new()),
|
||||
Box::new(sal.clone() as Arc<dyn SensorBus>),
|
||||
Box::new(sal.clone() as Arc<dyn ActuatorBus>),
|
||||
Box::new(sal as Arc<dyn HardwareWatchdog>),
|
||||
)
|
||||
};
|
||||
|
||||
let workload = Box::new(StressNg::new());
|
||||
|
||||
let mut orchestrator = BenchmarkOrchestrator::new(
|
||||
Box::new(b_auditor),
|
||||
guard,
|
||||
sensors,
|
||||
actuators,
|
||||
watchdog,
|
||||
sal_backend,
|
||||
facts_backend,
|
||||
workload,
|
||||
telemetry_tx,
|
||||
command_rx,
|
||||
config_out,
|
||||
);
|
||||
|
||||
orchestrator.run()
|
||||
});
|
||||
|
||||
// 6. Frontend Event Loop
|
||||
let mut ui_state = DashboardState::new();
|
||||
let mut last_telemetry = TelemetryState {
|
||||
cpu_model: "Loading...".to_string(),
|
||||
cpu_model: facts.model.clone(),
|
||||
total_ram_gb: 0,
|
||||
tick: 0,
|
||||
cpu_temp: 0.0,
|
||||
@@ -222,12 +181,15 @@ fn main() -> Result<()> {
|
||||
pl1_limit: 0.0,
|
||||
pl2_limit: 0.0,
|
||||
fan_tier: "auto".to_string(),
|
||||
is_throttling: false,
|
||||
phase: BenchmarkPhase::Auditing,
|
||||
history_watts: Vec::new(),
|
||||
history_temp: Vec::new(),
|
||||
history_mhz: Vec::new(),
|
||||
log_event: None,
|
||||
metadata: std::collections::HashMap::new(),
|
||||
is_emergency: false,
|
||||
emergency_reason: None,
|
||||
};
|
||||
|
||||
let tick_rate = Duration::from_millis(100);
|
||||
@@ -256,37 +218,36 @@ fn main() -> Result<()> {
|
||||
|
||||
while let Ok(new_state) = telemetry_rx.try_recv() {
|
||||
if let Some(log) = &new_state.log_event {
|
||||
ui_state.logs.push(log.clone());
|
||||
debug!("Backend Log: {}", log);
|
||||
ui_state.add_log(log.clone());
|
||||
} else {
|
||||
ui_state.update(&new_state);
|
||||
last_telemetry = new_state;
|
||||
}
|
||||
}
|
||||
|
||||
if last_tick.elapsed() >= tick_rate {
|
||||
last_tick = Instant::now();
|
||||
}
|
||||
|
||||
if backend_handle.is_finished() {
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
break;
|
||||
}
|
||||
if last_tick.elapsed() >= tick_rate { last_tick = Instant::now(); }
|
||||
if backend_handle.is_finished() { break; }
|
||||
}
|
||||
|
||||
// 7. Terminal Restoration
|
||||
disable_raw_mode().into_diagnostic()?;
|
||||
execute!(terminal.backend_mut(), LeaveAlternateScreen).into_diagnostic()?;
|
||||
terminal.show_cursor().into_diagnostic()?;
|
||||
let _ = disable_raw_mode();
|
||||
let _ = execute!(terminal.backend_mut(), LeaveAlternateScreen, crossterm::cursor::Show);
|
||||
|
||||
// 8. Final Report (Post-TUI)
|
||||
match backend_handle.join() {
|
||||
let join_res = backend_handle.join();
|
||||
|
||||
match join_res {
|
||||
Ok(Ok(result)) => {
|
||||
print_summary_report(&result);
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
if e.to_string() == "ABORTED" {
|
||||
println!("{}", "Benchmark aborted by user. No summary available.".yellow());
|
||||
let err_str = e.to_string();
|
||||
if err_str == "ABORTED" {
|
||||
println!("{}", "Benchmark aborted by user.".yellow());
|
||||
} else if err_str.contains("EMERGENCY_ABORT") {
|
||||
println!();
|
||||
println!("{}", " 🚨 EMERGENCY ABORT TRIGGERED ".bold().on_red().white());
|
||||
println!("Reason: {}", err_str.replace("EMERGENCY_ABORT: ", "").red().bold());
|
||||
println!("{}", "Hardware state has been restored to safe defaults.".yellow());
|
||||
println!();
|
||||
} else {
|
||||
error!("Orchestrator encountered error: {}", e);
|
||||
eprintln!("{} {}", "Error:".red().bold(), e);
|
||||
@@ -297,6 +258,5 @@ fn main() -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
info!("ember-tune exited gracefully.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,5 +1,13 @@
|
||||
use serde::{Serialize, Deserialize};
|
||||
//! Defines the data structures used for communication between the frontend and backend.
|
||||
//!
|
||||
//! This module acts as the "Mediator" in the Mediator Pattern, providing the
|
||||
//! message-passing interface for the MPSC channels that connect the TUI thread
|
||||
//! with the `BenchmarkOrchestrator` thread.
|
||||
|
||||
use serde::{Serialize, Deserialize};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Defines the current high-level phase of the benchmark.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum BenchmarkPhase {
|
||||
Auditing,
|
||||
@@ -9,42 +17,42 @@ pub enum BenchmarkPhase {
|
||||
Finalizing,
|
||||
}
|
||||
|
||||
impl Default for BenchmarkPhase {
|
||||
fn default() -> Self {
|
||||
Self::Auditing
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
/// A complete snapshot of system telemetry at a single point in time.
|
||||
/// This struct is sent from the backend to the frontend on every tick.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct TelemetryState {
|
||||
// --- Static Info ---
|
||||
// --- Static System Info ---
|
||||
pub cpu_model: String,
|
||||
pub total_ram_gb: u64,
|
||||
|
||||
// --- Dynamic States ---
|
||||
pub tick: u64,
|
||||
pub phase: BenchmarkPhase,
|
||||
pub governor: String,
|
||||
pub pl1_limit: f32,
|
||||
pub pl2_limit: f32,
|
||||
pub fan_tier: String,
|
||||
|
||||
// --- Instantaneous Metrics ---
|
||||
// --- Dynamic Metrics ---
|
||||
pub tick: u64,
|
||||
pub cpu_temp: f32,
|
||||
pub power_w: f32,
|
||||
pub current_freq: f32,
|
||||
pub fans: Vec<u32>,
|
||||
|
||||
// --- High-res History (Last 60s @ 500ms = 120 points) ---
|
||||
pub governor: String,
|
||||
pub pl1_limit: f32,
|
||||
pub pl2_limit: f32,
|
||||
pub fan_tier: String,
|
||||
pub is_throttling: bool,
|
||||
pub phase: BenchmarkPhase,
|
||||
|
||||
// --- High-res History ---
|
||||
pub history_watts: Vec<f32>,
|
||||
pub history_temp: Vec<f32>,
|
||||
pub history_mhz: Vec<f32>,
|
||||
|
||||
// --- Events & Metadata ---
|
||||
pub log_event: Option<String>,
|
||||
pub metadata: std::collections::HashMap<String, String>,
|
||||
pub metadata: HashMap<String, String>,
|
||||
pub is_emergency: bool,
|
||||
pub emergency_reason: Option<String>,
|
||||
}
|
||||
|
||||
/// Commands sent from the frontend (UI) to the backend (`BenchmarkOrchestrator`).
///
/// The enum only has unit variants, so it derives `Copy`, `PartialEq` and `Eq`
/// in addition to `Debug` and `Clone`, matching the derives on `BenchmarkPhase`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UiCommand {
    /// Signals the orchestrator to gracefully abort the benchmark.
    Abort,
}
|
||||
|
||||
@@ -1,48 +1,69 @@
|
||||
use anyhow::{Result, Context};
|
||||
//! The central state machine responsible for coordinating the thermal benchmark.
|
||||
//!
|
||||
//! It manages hardware interactions through the [PlatformSal], generates stress
|
||||
//! using a [Workload], and feeds telemetry to the frontend via MPSC channels.
|
||||
|
||||
use anyhow::{Result, Context, bail};
|
||||
use tracing::{info, warn, error, debug};
|
||||
use std::sync::mpsc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
use std::collections::VecDeque;
|
||||
use sysinfo::System;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::path::PathBuf;
|
||||
use std::cell::Cell;
|
||||
|
||||
use crate::sal::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog};
|
||||
use crate::load::Workload;
|
||||
use crate::sal::traits::{PlatformSal, SensorBus};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
use crate::sal::safety::{HardwareStateGuard, PowerLimitWatts, ThermalWatchdog};
|
||||
use crate::load::{Workload, IntensityProfile, StressVector};
|
||||
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
|
||||
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};
|
||||
use crate::agent_analyst::HeuristicAnalyst;
|
||||
use crate::agent_integrator::ServiceIntegrator;
|
||||
|
||||
/// Represents the possible states of the benchmark orchestrator.
///
/// All variants are plain unit variants, so the type is cheap to copy and
/// compare; `Debug` is derived so states can be logged.
///
/// NOTE(review): the variant names appear to mirror the steps performed by
/// `execute_benchmark` (pre-flight, idle baseline, calibration, sweep,
/// cooldown, finalizing) — confirm the intended mapping before relying on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OrchestratorState {
    PreFlight,
    IdleBaseline,
    ThermalCalibration,
    StabilitySweep,
    Cooldown,
    Finalizing,
}
|
||||
|
||||
pub struct BenchmarkOrchestrator {
|
||||
auditor: Box<dyn PreflightAuditor>,
|
||||
guard: Box<dyn EnvironmentGuard>,
|
||||
sensors: Box<dyn SensorBus>,
|
||||
actuators: Box<dyn ActuatorBus>,
|
||||
watchdog: Box<dyn HardwareWatchdog>,
|
||||
sal: Arc<dyn PlatformSal>,
|
||||
facts: SystemFactSheet,
|
||||
workload: Box<dyn Workload>,
|
||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||
command_rx: mpsc::Receiver<UiCommand>,
|
||||
phase: BenchmarkPhase,
|
||||
ui_phase: BenchmarkPhase,
|
||||
profile: ThermalProfile,
|
||||
engine: OptimizerEngine,
|
||||
|
||||
// --- History Buffers (120 points for 60s @ 500ms) ---
|
||||
optional_config_out: Option<PathBuf>,
|
||||
safeguard: Option<HardwareStateGuard>,
|
||||
watchdog: Option<ThermalWatchdog>,
|
||||
history_watts: VecDeque<f32>,
|
||||
history_temp: VecDeque<f32>,
|
||||
history_mhz: VecDeque<f32>,
|
||||
|
||||
// --- Static Info ---
|
||||
cpu_model: String,
|
||||
total_ram_gb: u64,
|
||||
emergency_abort: Arc<AtomicBool>,
|
||||
emergency_reason: Arc<Mutex<Option<String>>>,
|
||||
}
|
||||
|
||||
impl BenchmarkOrchestrator {
|
||||
pub fn new(
|
||||
auditor: Box<dyn PreflightAuditor>,
|
||||
guard: Box<dyn EnvironmentGuard>,
|
||||
sensors: Box<dyn SensorBus>,
|
||||
actuators: Box<dyn ActuatorBus>,
|
||||
watchdog: Box<dyn HardwareWatchdog>,
|
||||
sal: Arc<dyn PlatformSal>,
|
||||
facts: SystemFactSheet,
|
||||
workload: Box<dyn Workload>,
|
||||
telemetry_tx: mpsc::Sender<TelemetryState>,
|
||||
command_rx: mpsc::Receiver<UiCommand>,
|
||||
optional_config_out: Option<PathBuf>,
|
||||
) -> Self {
|
||||
let mut sys = System::new_all();
|
||||
sys.refresh_all();
|
||||
@@ -53,15 +74,12 @@ impl BenchmarkOrchestrator {
|
||||
let total_ram_gb = sys.total_memory() / 1024 / 1024 / 1024;
|
||||
|
||||
Self {
|
||||
auditor,
|
||||
guard,
|
||||
sensors,
|
||||
actuators,
|
||||
watchdog,
|
||||
sal,
|
||||
facts,
|
||||
workload,
|
||||
telemetry_tx,
|
||||
command_rx,
|
||||
phase: BenchmarkPhase::Auditing,
|
||||
ui_phase: BenchmarkPhase::Auditing,
|
||||
profile: ThermalProfile::default(),
|
||||
engine: OptimizerEngine::new(5),
|
||||
history_watts: VecDeque::with_capacity(120),
|
||||
@@ -69,176 +87,250 @@ impl BenchmarkOrchestrator {
|
||||
history_mhz: VecDeque::with_capacity(120),
|
||||
cpu_model,
|
||||
total_ram_gb,
|
||||
emergency_abort: Arc::new(AtomicBool::new(false)),
|
||||
emergency_reason: Arc::new(Mutex::new(None)),
|
||||
optional_config_out,
|
||||
safeguard: None,
|
||||
watchdog: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn run(&mut self) -> Result<OptimizationResult> {
|
||||
self.log("Starting ember-tune Benchmark Sequence.")?;
|
||||
// Immediate Priming
|
||||
let _ = self.sal.get_temp();
|
||||
let _ = self.sal.get_power_w();
|
||||
let _ = self.sal.get_fan_rpms();
|
||||
|
||||
// Phase 1: Audit & Baseline
|
||||
self.phase = BenchmarkPhase::Auditing;
|
||||
for step in self.auditor.audit() {
|
||||
info!("Orchestrator: Initializing Project Iron-Ember PGC Protocol.");
|
||||
|
||||
// Spawn safety watchdog immediately
|
||||
let watchdog = ThermalWatchdog::spawn(self.sal.clone(), self.emergency_abort.clone());
|
||||
self.watchdog = Some(watchdog);
|
||||
|
||||
let result = self.execute_benchmark();
|
||||
|
||||
if let Err(ref e) = result {
|
||||
error!("Benchmark Lifecycle Failure: {}", e);
|
||||
let _ = self.log(&format!("⚠ FAILURE: {}", e));
|
||||
}
|
||||
|
||||
// --- MANDATORY RAII CLEANUP ---
|
||||
info!("Benchmark sequence complete. Releasing safeguards...");
|
||||
let _ = self.workload.stop_workload();
|
||||
|
||||
if let Some(mut sg) = self.safeguard.take() {
|
||||
let _ = sg.release();
|
||||
}
|
||||
|
||||
if let Err(e) = self.sal.restore() {
|
||||
warn!("Failed secondary SAL restoration: {}", e);
|
||||
}
|
||||
|
||||
info!("✓ Hardware state restored.");
|
||||
result
|
||||
}
|
||||
|
||||
fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
|
||||
let _bench_cfg = self.facts.bench_config.clone().context("Config missing.")?;
|
||||
|
||||
// 1. Pre-Flight Phase
|
||||
self.ui_phase = BenchmarkPhase::Auditing;
|
||||
self.log("Phase: Pre-Flight Auditing & Sterilization")?;
|
||||
|
||||
let mut target_files = self.facts.rapl_paths.iter()
|
||||
.map(|p| p.join("constraint_0_power_limit_uw"))
|
||||
.collect::<Vec<_>>();
|
||||
target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw")));
|
||||
|
||||
if let Some(tp) = self.facts.paths.configs.get("throttled") {
|
||||
target_files.push(tp.clone());
|
||||
}
|
||||
|
||||
let sg = HardwareStateGuard::acquire(&target_files, &self.facts.conflict_services)?;
|
||||
self.safeguard = Some(sg);
|
||||
|
||||
for step in self.sal.audit() {
|
||||
if let Err(e) = step.outcome {
|
||||
return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e));
|
||||
}
|
||||
}
|
||||
|
||||
self.log("Suppressing background services (tlp, thermald)...")?;
|
||||
self.guard.suppress().context("Failed to suppress background services")?;
|
||||
self.workload.initialize().context("Failed to initialize load generator.")?;
|
||||
self.sal.suppress().context("Failed to suppress background services.")?;
|
||||
|
||||
// Baseline (Idle Calibration)
|
||||
self.phase = BenchmarkPhase::IdleCalibration;
|
||||
self.log("Phase 1: Recording Idle Baseline (10s)...")?;
|
||||
self.actuators.set_fan_mode("auto")?; // Use auto for idle
|
||||
let tick = Cell::new(0u64);
|
||||
|
||||
// 2. Idle Baseline Phase
|
||||
self.ui_phase = BenchmarkPhase::IdleCalibration;
|
||||
self.log("Phase: Recording 30s Idle Baseline...")?;
|
||||
self.sal.set_fan_mode("auto")?;
|
||||
|
||||
let mut idle_temps = Vec::new();
|
||||
let start = Instant::now();
|
||||
let mut tick = 0;
|
||||
while start.elapsed() < Duration::from_secs(10) {
|
||||
self.check_abort()?;
|
||||
self.send_telemetry(tick)?;
|
||||
idle_temps.push(self.sensors.get_temp().unwrap_or(0.0));
|
||||
tick += 1;
|
||||
while start.elapsed() < Duration::from_secs(30) {
|
||||
self.check_safety_abort()?;
|
||||
self.send_telemetry(tick.get())?;
|
||||
idle_temps.push(self.sal.get_temp().unwrap_or(0.0));
|
||||
tick.set(tick.get() + 1);
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
|
||||
self.profile.ambient_temp = self.engine.smooth(&idle_temps).iter().sum::<f32>() / idle_temps.len() as f32;
|
||||
self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;
|
||||
|
||||
// Phase 2: Stress Stepping
|
||||
self.phase = BenchmarkPhase::StressTesting;
|
||||
self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
|
||||
self.actuators.set_fan_mode("max")?; // Lock fans for consistent resistance
|
||||
// 3. Thermal Resistance Mapping (Phase 1)
|
||||
self.log("Phase: Mapping Thermal Resistance (Rθ) at 10W...")?;
|
||||
self.sal.set_fan_mode("max")?;
|
||||
|
||||
let pl_calib = PowerLimitWatts::try_new(10.0)?;
|
||||
self.sal.set_sustained_power_limit(pl_calib)?;
|
||||
self.sal.set_burst_power_limit(pl_calib)?;
|
||||
|
||||
let power_steps = [15.0, 20.0, 25.0, 30.0, 35.0];
|
||||
for &pl in &power_steps {
|
||||
self.log(&format!("Testing PL1 = {:.0}W...", pl))?;
|
||||
self.actuators.set_sustained_power_limit(pl)?;
|
||||
self.actuators.set_burst_power_limit(pl + 5.0)?;
|
||||
self.workload.run_workload(
|
||||
Duration::from_secs(120),
|
||||
IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix }
|
||||
)?;
|
||||
|
||||
let mut calib_temps = Vec::new();
|
||||
let calib_start = Instant::now();
|
||||
while calib_start.elapsed() < Duration::from_secs(90) {
|
||||
self.check_safety_abort()?;
|
||||
self.send_telemetry(tick.get())?;
|
||||
let t = self.sal.get_temp().unwrap_or(0.0);
|
||||
calib_temps.push(t);
|
||||
tick.set(tick.get() + 1);
|
||||
|
||||
if calib_start.elapsed() > Duration::from_secs(30) && self.engine.is_stable(&calib_temps) {
|
||||
break;
|
||||
}
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
let steady_t = calib_temps.last().cloned().unwrap_or(0.0);
|
||||
let steady_p = self.sal.get_power_w().unwrap_or(10.0);
|
||||
self.profile.r_theta = self.engine.calculate_r_theta(self.profile.ambient_temp, steady_t, steady_p);
|
||||
self.log(&format!("✓ Physical Model: Rθ = {:.3} K/W", self.profile.r_theta))?;
|
||||
|
||||
// 4. Physically-Aware Stability Sweep (Phase 2)
|
||||
self.ui_phase = BenchmarkPhase::StressTesting;
|
||||
self.log("Phase: Starting Physically-Aware Efficiency Sweep...")?;
|
||||
|
||||
let mut current_w = 12.0_f32;
|
||||
let mut previous_ops = 0.0;
|
||||
|
||||
loop {
|
||||
// Predict if this step is safe
|
||||
let pred_t = self.engine.predict_temp(current_w, self.profile.ambient_temp, self.profile.r_theta);
|
||||
if pred_t > 92.0 {
|
||||
self.log(&format!("Prediction: {:.1}W would result in {:.1}C (Too Hot). Finalizing...", current_w, pred_t))?;
|
||||
break;
|
||||
}
|
||||
|
||||
self.log(&format!("Step: {:.1}W (Predicted: {:.1}C)", current_w, pred_t))?;
|
||||
let pl = PowerLimitWatts::try_new(current_w)?;
|
||||
self.sal.set_sustained_power_limit(pl)?;
|
||||
self.sal.set_burst_power_limit(PowerLimitWatts::try_new(current_w + 2.0)?)?;
|
||||
|
||||
self.workload.run_workload(
|
||||
Duration::from_secs(60),
|
||||
IntensityProfile { threads: num_cpus::get_physical(), load_percentage: 100, vector: StressVector::CpuMatrix }
|
||||
)?;
|
||||
|
||||
self.workload.start(num_cpus::get(), 100)?;
|
||||
|
||||
// Wait for equilibrium: Hybrid approach (15s min, 45s max)
|
||||
let step_start = Instant::now();
|
||||
let mut step_temps = VecDeque::with_capacity(30); // Last 15s @ 500ms
|
||||
let mut step_temps = Vec::new();
|
||||
let mut previous_t = self.sal.get_temp().unwrap_or(0.0);
|
||||
|
||||
while step_start.elapsed() < Duration::from_secs(45) {
|
||||
self.check_abort()?;
|
||||
if self.watchdog.check_emergency()? {
|
||||
self.log("⚠ EMERGENCY ABORT: Watchdog triggered!")?;
|
||||
self.workload.stop()?;
|
||||
return Err(anyhow::anyhow!("Hardware Watchdog Triggered"));
|
||||
while step_start.elapsed() < Duration::from_secs(60) {
|
||||
self.check_safety_abort()?;
|
||||
self.send_telemetry(tick.get())?;
|
||||
|
||||
let t = self.sal.get_temp().unwrap_or(0.0);
|
||||
let dt_dt = (t - previous_t) / 0.5;
|
||||
|
||||
// Safety break: after a 2s settling window, abort this step if the temperature exceeds 95°C or is rising faster than 8°C/s.
|
||||
if step_start.elapsed() > Duration::from_secs(2) && (t > 95.0 || dt_dt > 8.0) {
|
||||
warn!("USA: Safety Break triggered! T={:.1}C, dT/dt={:.1}C/s", t, dt_dt);
|
||||
let _ = self.sal.set_sustained_power_limit(PowerLimitWatts::try_new(3.0)?);
|
||||
break; // Just break the sweep loop
|
||||
}
|
||||
|
||||
let t = self.sensors.get_temp().unwrap_or(0.0);
|
||||
step_temps.push_back(t);
|
||||
if step_temps.len() > 10 { step_temps.pop_front(); }
|
||||
step_temps.push(t);
|
||||
tick.set(tick.get() + 1);
|
||||
|
||||
self.send_telemetry(tick)?;
|
||||
tick += 1;
|
||||
|
||||
// Check for stability: Range < 0.5C over last 5s (10 ticks)
|
||||
if step_start.elapsed() > Duration::from_secs(15) && step_temps.len() == 10 {
|
||||
let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b));
|
||||
let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b));
|
||||
if (max - min) < 0.5 {
|
||||
self.log(&format!(" Equilibrium reached at {:.1}°C", t))?;
|
||||
break;
|
||||
}
|
||||
if step_start.elapsed() > Duration::from_secs(15) && self.engine.is_stable(&step_temps) {
|
||||
self.log(&format!(" Equilibrium reached at {:.1}°C", t))?;
|
||||
break;
|
||||
}
|
||||
previous_t = t;
|
||||
thread::sleep(Duration::from_millis(500));
|
||||
}
|
||||
|
||||
// Record data point
|
||||
let avg_p = self.sensors.get_power_w().unwrap_or(0.0);
|
||||
let avg_t = self.sensors.get_temp().unwrap_or(0.0);
|
||||
let avg_f = self.sensors.get_freq_mhz().unwrap_or(0.0);
|
||||
let fans = self.sensors.get_fan_rpms().unwrap_or_default();
|
||||
let primary_fan = fans.first().cloned().unwrap_or(0);
|
||||
let tp = self.workload.get_throughput().unwrap_or(0.0);
|
||||
|
||||
let metrics = self.workload.get_current_metrics().unwrap_or_default();
|
||||
self.profile.points.push(ThermalPoint {
|
||||
power_w: avg_p,
|
||||
temp_c: avg_t,
|
||||
freq_mhz: avg_f,
|
||||
fan_rpm: primary_fan,
|
||||
throughput: tp,
|
||||
power_w: self.sal.get_power_w().unwrap_or(current_w),
|
||||
temp_c: self.sal.get_temp().unwrap_or(0.0),
|
||||
freq_mhz: self.sal.get_freq_mhz().unwrap_or(0.0),
|
||||
fan_rpm: self.sal.get_fan_rpms().unwrap_or_default().first().cloned().unwrap_or(0),
|
||||
throughput: metrics.primary_ops_per_sec,
|
||||
});
|
||||
|
||||
self.workload.stop()?;
|
||||
self.log(" Step complete. Cooling down for 5s...")?;
|
||||
thread::sleep(Duration::from_secs(5));
|
||||
self.workload.stop_workload()?;
|
||||
|
||||
// Efficiency Break
|
||||
if previous_ops > 0.0 {
|
||||
let gain = ((metrics.primary_ops_per_sec - previous_ops) / previous_ops) * 100.0;
|
||||
if gain < 1.0 {
|
||||
self.log("Silicon Knee identified (gain < 1%). Finalizing...")?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
previous_ops = metrics.primary_ops_per_sec;
|
||||
current_w += 2.0;
|
||||
if current_w > 45.0 { break; }
|
||||
|
||||
self.log(&format!("Cooling down ({}s)...", _bench_cfg.cool_down_s))?;
|
||||
thread::sleep(Duration::from_secs(_bench_cfg.cool_down_s));
|
||||
}
|
||||
|
||||
// Phase 4: Physical Modeling
|
||||
self.phase = BenchmarkPhase::PhysicalModeling;
|
||||
self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?;
|
||||
// 5. Modeling Phase
|
||||
self.ui_phase = BenchmarkPhase::PhysicalModeling;
|
||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||
let analyst = HeuristicAnalyst::new();
|
||||
let matrix = analyst.analyze(&self.profile, self.profile.points.last().map(|p| p.power_w).unwrap_or(15.0));
|
||||
|
||||
let res = self.generate_result(false);
|
||||
|
||||
self.log(&format!("✓ Thermal Resistance (Rθ): {:.3} K/W", res.thermal_resistance_kw))?;
|
||||
self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?;
|
||||
let mut res = self.generate_result(false);
|
||||
res.optimization_matrix = Some(matrix.clone());
|
||||
res.silicon_knee_watts = knee;
|
||||
|
||||
thread::sleep(Duration::from_secs(3));
|
||||
// 6. Finalizing Phase
|
||||
self.ui_phase = BenchmarkPhase::Finalizing;
|
||||
let throttled_source = self.facts.paths.configs.get("throttled");
|
||||
if let Some(path) = self.optional_config_out.clone().or_else(|| throttled_source.cloned()) {
|
||||
let config = crate::engine::formatters::throttled::ThrottledConfig {
|
||||
pl1_limit: res.silicon_knee_watts,
|
||||
pl2_limit: res.silicon_knee_watts * 1.25,
|
||||
trip_temp: 90.0,
|
||||
};
|
||||
let _ = crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config, throttled_source);
|
||||
res.config_paths.insert("throttled".to_string(), path);
|
||||
}
|
||||
|
||||
// Phase 5: Finalizing
|
||||
self.phase = BenchmarkPhase::Finalizing;
|
||||
self.log("Benchmark sequence complete. Generating configurations...")?;
|
||||
|
||||
let config = crate::engine::formatters::throttled::ThrottledConfig {
|
||||
pl1_limit: res.silicon_knee_watts,
|
||||
pl2_limit: res.recommended_pl2,
|
||||
trip_temp: res.max_temp_c.max(95.0),
|
||||
};
|
||||
|
||||
// 1. Throttled (Merged if exists)
|
||||
let throttled_path = "throttled.conf";
|
||||
let existing_throttled = std::fs::read_to_string(throttled_path).unwrap_or_default();
|
||||
let throttled_content = if existing_throttled.is_empty() {
|
||||
crate::engine::formatters::throttled::ThrottledTranslator::generate_conf(&config)
|
||||
} else {
|
||||
crate::engine::formatters::throttled::ThrottledTranslator::merge_conf(&existing_throttled, &config)
|
||||
};
|
||||
std::fs::write(throttled_path, throttled_content)?;
|
||||
self.log("✓ Saved 'throttled.conf' (merged).")?;
|
||||
|
||||
// 2. i8kmon
|
||||
let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
|
||||
t_ambient: self.profile.ambient_temp,
|
||||
t_max_fan: res.max_temp_c - 5.0, // Aim to hit max fan before max temp
|
||||
};
|
||||
let i8k_content = crate::engine::formatters::i8kmon::I8kmonTranslator::generate_conf(&i8k_config);
|
||||
std::fs::write("i8kmon.conf", i8k_content)?;
|
||||
self.log("✓ Saved 'i8kmon.conf'.")?;
|
||||
|
||||
self.guard.restore()?;
|
||||
self.log("✓ Environment restored.")?;
|
||||
let base_out = self.optional_config_out.clone().unwrap_or_else(|| PathBuf::from("/etc"));
|
||||
let i8k_source = self.facts.paths.configs.get("i8kmon");
|
||||
let i8k_out = base_out.join("i8kmon.conf");
|
||||
if ServiceIntegrator::generate_i8kmon_config(&matrix, &i8k_out, i8k_source).is_ok() {
|
||||
res.config_paths.insert("i8kmon".to_string(), i8k_out);
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
|
||||
let r_theta = self.engine.calculate_thermal_resistance(&self.profile);
|
||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||
let max_t = self.engine.get_max_temp(&self.profile);
|
||||
|
||||
OptimizationResult {
|
||||
profile: self.profile.clone(),
|
||||
silicon_knee_watts: knee,
|
||||
thermal_resistance_kw: r_theta,
|
||||
recommended_pl1: knee,
|
||||
recommended_pl2: knee * 1.25,
|
||||
max_temp_c: max_t,
|
||||
is_partial,
|
||||
fn check_safety_abort(&self) -> Result<()> {
|
||||
if self.emergency_abort.load(Ordering::SeqCst) {
|
||||
let reason = self.emergency_reason.lock().unwrap().clone().unwrap_or_else(|| "Watchdog".to_string());
|
||||
bail!("EMERGENCY_ABORT: {}", reason);
|
||||
}
|
||||
}
|
||||
|
||||
fn check_abort(&self) -> Result<()> {
|
||||
if let Ok(cmd) = self.command_rx.try_recv() {
|
||||
match cmd {
|
||||
UiCommand::Abort => {
|
||||
return Err(anyhow::anyhow!("ABORTED"));
|
||||
}
|
||||
}
|
||||
if let UiCommand::Abort = cmd { bail!("ABORTED"); }
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -248,58 +340,66 @@ impl BenchmarkOrchestrator {
|
||||
cpu_model: self.cpu_model.clone(),
|
||||
total_ram_gb: self.total_ram_gb,
|
||||
tick: 0,
|
||||
cpu_temp: self.sensors.get_temp().unwrap_or(0.0),
|
||||
power_w: self.sensors.get_power_w().unwrap_or(0.0),
|
||||
current_freq: self.sensors.get_freq_mhz().unwrap_or(0.0),
|
||||
fans: self.sensors.get_fan_rpms().unwrap_or_default(),
|
||||
governor: "unknown".to_string(),
|
||||
pl1_limit: 0.0,
|
||||
pl2_limit: 0.0,
|
||||
fan_tier: "auto".to_string(),
|
||||
phase: self.phase,
|
||||
history_watts: Vec::new(),
|
||||
history_temp: Vec::new(),
|
||||
history_mhz: Vec::new(),
|
||||
cpu_temp: self.sal.get_temp().unwrap_or(0.0),
|
||||
power_w: self.sal.get_power_w().unwrap_or(0.0),
|
||||
current_freq: self.sal.get_freq_mhz().unwrap_or(0.0),
|
||||
fans: self.sal.get_fan_rpms().unwrap_or_default(),
|
||||
governor: "performance".to_string(),
|
||||
pl1_limit: 0.0, pl2_limit: 0.0, fan_tier: "auto".to_string(),
|
||||
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
|
||||
phase: self.ui_phase,
|
||||
history_watts: Vec::new(), history_temp: Vec::new(), history_mhz: Vec::new(),
|
||||
log_event: Some(msg.to_string()),
|
||||
metadata: std::collections::HashMap::new(),
|
||||
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||
};
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed"))
|
||||
}
|
||||
|
||||
fn send_telemetry(&mut self, tick: u64) -> Result<()> {
|
||||
let temp = self.sensors.get_temp().unwrap_or(0.0);
|
||||
let pwr = self.sensors.get_power_w().unwrap_or(0.0);
|
||||
let freq = self.sensors.get_freq_mhz().unwrap_or(0.0);
|
||||
|
||||
let temp = self.sal.get_temp().unwrap_or(0.0);
|
||||
let pwr = self.sal.get_power_w().unwrap_or(0.0);
|
||||
let freq = self.sal.get_freq_mhz().unwrap_or(0.0);
|
||||
self.history_temp.push_back(temp);
|
||||
self.history_watts.push_back(pwr);
|
||||
self.history_mhz.push_back(freq);
|
||||
|
||||
if self.history_temp.len() > 120 {
|
||||
self.history_temp.pop_front();
|
||||
self.history_watts.pop_front();
|
||||
self.history_mhz.pop_front();
|
||||
}
|
||||
if self.history_temp.len() > 120 { self.history_temp.pop_front(); self.history_watts.pop_front(); self.history_mhz.pop_front(); }
|
||||
|
||||
let state = TelemetryState {
|
||||
cpu_model: self.cpu_model.clone(),
|
||||
total_ram_gb: self.total_ram_gb,
|
||||
tick,
|
||||
cpu_temp: temp,
|
||||
power_w: pwr,
|
||||
current_freq: freq,
|
||||
fans: self.sensors.get_fan_rpms().unwrap_or_default(),
|
||||
cpu_temp: temp, power_w: pwr, current_freq: freq,
|
||||
fans: self.sal.get_fan_rpms().unwrap_or_default(),
|
||||
governor: "performance".to_string(),
|
||||
pl1_limit: 15.0,
|
||||
pl2_limit: 25.0,
|
||||
fan_tier: "max".to_string(),
|
||||
phase: self.phase,
|
||||
pl1_limit: 15.0, pl2_limit: 25.0, fan_tier: "max".to_string(),
|
||||
is_throttling: self.sal.get_throttling_status().unwrap_or(false),
|
||||
phase: self.ui_phase,
|
||||
history_watts: self.history_watts.iter().cloned().collect(),
|
||||
history_temp: self.history_temp.iter().cloned().collect(),
|
||||
history_mhz: self.history_mhz.iter().cloned().collect(),
|
||||
log_event: None,
|
||||
metadata: std::collections::HashMap::new(),
|
||||
is_emergency: self.emergency_abort.load(Ordering::SeqCst),
|
||||
emergency_reason: self.emergency_reason.lock().unwrap().clone(),
|
||||
};
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Telemetry channel closed"))
|
||||
self.telemetry_tx.send(state).map_err(|_| anyhow::anyhow!("Channel closed"))
|
||||
}
|
||||
|
||||
pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
|
||||
let r_theta = self.profile.r_theta;
|
||||
let knee = self.engine.find_silicon_knee(&self.profile);
|
||||
OptimizationResult {
|
||||
profile: self.profile.clone(),
|
||||
silicon_knee_watts: knee,
|
||||
thermal_resistance_kw: r_theta,
|
||||
recommended_pl1: knee,
|
||||
recommended_pl2: knee * 1.25,
|
||||
max_temp_c: self.profile.points.iter().map(|p| p.temp_c).max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)).unwrap_or(0.0),
|
||||
is_partial,
|
||||
config_paths: std::collections::HashMap::new(),
|
||||
optimization_matrix: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,219 +1,221 @@
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep};
|
||||
use anyhow::{Result, Context};
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditError, AuditStep, SafetyStatus, EnvironmentCtx};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use anyhow::{Result, Context, anyhow};
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::path::{PathBuf};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::thread;
|
||||
use std::sync::Mutex;
|
||||
use tracing::debug;
|
||||
use tracing::{info, debug};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
|
||||
/// Implementation of the System Abstraction Layer for the Dell XPS 13 9380.
|
||||
pub struct DellXps9380Sal {
|
||||
ctx: EnvironmentCtx,
|
||||
fact_sheet: SystemFactSheet,
|
||||
temp_path: PathBuf,
|
||||
pwr_path: PathBuf,
|
||||
fan_paths: Vec<PathBuf>,
|
||||
pwm_paths: Vec<PathBuf>,
|
||||
pwm_enable_paths: Vec<PathBuf>,
|
||||
pl1_paths: Vec<PathBuf>,
|
||||
pl2_paths: Vec<PathBuf>,
|
||||
freq_path: PathBuf,
|
||||
pl1_path: PathBuf,
|
||||
pl2_path: PathBuf,
|
||||
last_poll: Mutex<Instant>,
|
||||
last_temp: Mutex<f32>,
|
||||
last_fans: Mutex<Vec<u32>>,
|
||||
msr_file: Mutex<fs::File>,
|
||||
last_energy: Mutex<(u64, Instant)>,
|
||||
last_watts: Mutex<f32>,
|
||||
}
|
||||
|
||||
impl DellXps9380Sal {
|
||||
pub fn init() -> Result<Self> {
|
||||
let mut temp_path = None;
|
||||
let mut pwr_path = None;
|
||||
let mut fan_paths = Vec::new();
|
||||
let mut rapl_base_path = None;
|
||||
|
||||
// Dynamic hwmon discovery
|
||||
if let Ok(entries) = fs::read_dir("/sys/class/hwmon") {
|
||||
for entry in entries.flatten() {
|
||||
let p = entry.path();
|
||||
let name = fs::read_to_string(p.join("name")).unwrap_or_default().trim().to_string();
|
||||
/// Initializes the Dell SAL, opening the MSR interface and discovering sensors and PWM nodes.
|
||||
pub fn init(ctx: EnvironmentCtx, facts: SystemFactSheet) -> Result<Self> {
|
||||
let temp_path = facts.temp_path.clone().context("Dell SAL requires temperature sensor")?;
|
||||
let pwr_base = facts.rapl_paths.first().cloned().context("Dell SAL requires RAPL interface")?;
|
||||
let fan_paths = facts.fan_paths.clone();
|
||||
|
||||
// 1. Discover PWM and Enable nodes associated with the fan paths
|
||||
let mut pwm_paths = Vec::new();
|
||||
let mut pwm_enable_paths = Vec::new();
|
||||
for fan_p in &fan_paths {
|
||||
if let Some(parent) = fan_p.parent() {
|
||||
let fan_file = fan_p.file_name().and_then(|n| n.to_str()).unwrap_or("");
|
||||
let fan_idx = fan_file.chars().filter(|c| c.is_ascii_digit()).collect::<String>();
|
||||
let idx = if fan_idx.is_empty() { "1".to_string() } else { fan_idx };
|
||||
|
||||
if name == "dell_smm" {
|
||||
temp_path = Some(p.join("temp1_input"));
|
||||
// Discover all fans
|
||||
if let Ok(fan_entries) = fs::read_dir(&p) {
|
||||
for fan_entry in fan_entries.flatten() {
|
||||
let fan_p = fan_entry.path();
|
||||
if fan_p.file_name().unwrap_or_default().to_string_lossy().starts_with("fan") &&
|
||||
fan_p.file_name().unwrap_or_default().to_string_lossy().ends_with("_input") {
|
||||
fan_paths.push(fan_p);
|
||||
}
|
||||
}
|
||||
}
|
||||
fan_paths.sort();
|
||||
}
|
||||
let pwm_p = parent.join(format!("pwm{}", idx));
|
||||
if pwm_p.exists() { pwm_paths.push(pwm_p); }
|
||||
|
||||
if name == "intel_rapl" || name == "rapl" {
|
||||
pwr_path = Some(p.join("power1_average"));
|
||||
}
|
||||
let enable_p = parent.join(format!("pwm{}_enable", idx));
|
||||
if enable_p.exists() { pwm_enable_paths.push(enable_p); }
|
||||
}
|
||||
}
|
||||
|
||||
// Discovery for RAPL via powercap
|
||||
if let Ok(entries) = fs::read_dir("/sys/class/powercap") {
|
||||
for entry in entries.flatten() {
|
||||
let p = entry.path();
|
||||
if let Ok(name) = fs::read_to_string(p.join("name")) {
|
||||
if name.trim() == "package-0" {
|
||||
rapl_base_path = Some(p.clone());
|
||||
if pwr_path.is_none() {
|
||||
pwr_path = Some(p.join("energy_uj"));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// 2. Map all RAPL constraints
|
||||
let mut pl1_paths = Vec::new();
|
||||
let mut pl2_paths = Vec::new();
|
||||
for rapl_p in &facts.rapl_paths {
|
||||
pl1_paths.push(rapl_p.join("constraint_0_power_limit_uw"));
|
||||
pl2_paths.push(rapl_p.join("constraint_1_power_limit_uw"));
|
||||
}
|
||||
|
||||
let rapl_base = rapl_base_path.context("Could not find RAPL package-0 path in powercap")?;
|
||||
let freq_path = PathBuf::from("/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
||||
// 3. Physical Sensor Verification & Warm Cache Priming
|
||||
let mut initial_fans = Vec::new();
|
||||
for fan_p in &fan_paths {
|
||||
let mut rpm = 0;
|
||||
for _ in 0..3 {
|
||||
if let Ok(val) = fs::read_to_string(fan_p) {
|
||||
rpm = val.trim().parse::<u32>().unwrap_or(0);
|
||||
if rpm > 0 { break; }
|
||||
}
|
||||
thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
info!("SAL Warm-Start: Fan sensor {:?} -> {} RPM", fan_p, rpm);
|
||||
initial_fans.push(rpm);
|
||||
}
|
||||
|
||||
let freq_path = ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
||||
let msr_path = ctx.sysfs_base.join("dev/cpu/0/msr");
|
||||
|
||||
let msr_file = fs::OpenOptions::new().read(true).write(true).open(&msr_path)
|
||||
.with_context(|| format!("Failed to open {:?}. Is the 'msr' module loaded?", msr_path))?;
|
||||
|
||||
let initial_energy = fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0);
|
||||
|
||||
info!("SAL: Dell XPS 9380 Initialized. ({} fans, {} RAPL nodes found)",
|
||||
fan_paths.len(), facts.rapl_paths.len());
|
||||
|
||||
Ok(Self {
|
||||
temp_path: temp_path.context("Could not find dell_smm temperature path")?,
|
||||
pwr_path: pwr_path.context("Could not find RAPL power path")?,
|
||||
temp_path,
|
||||
pwr_path: pwr_base.join("power1_average"),
|
||||
fan_paths,
|
||||
pwm_paths,
|
||||
pwm_enable_paths,
|
||||
pl1_paths,
|
||||
pl2_paths,
|
||||
freq_path,
|
||||
pl1_path: rapl_base.join("constraint_0_power_limit_uw"),
|
||||
pl2_path: rapl_base.join("constraint_1_power_limit_uw"),
|
||||
last_poll: Mutex::new(Instant::now() - Duration::from_secs(2)),
|
||||
last_temp: Mutex::new(0.0),
|
||||
last_fans: Mutex::new(Vec::new()),
|
||||
last_fans: Mutex::new(initial_fans),
|
||||
msr_file: Mutex::new(msr_file),
|
||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||
last_watts: Mutex::new(0.0),
|
||||
fact_sheet: facts,
|
||||
ctx,
|
||||
})
|
||||
}
|
||||
|
||||
fn read_msr(&self, msr: u32) -> Result<u64> {
|
||||
use std::os::unix::fs::FileExt;
|
||||
let mut buf = [0u8; 8];
|
||||
let file = self.msr_file.lock().unwrap();
|
||||
file.read_at(&mut buf, msr as u64)?;
|
||||
Ok(u64::from_le_bytes(buf))
|
||||
}
|
||||
|
||||
fn write_msr(&self, msr: u32, val: u64) -> Result<()> {
|
||||
use std::os::unix::fs::FileExt;
|
||||
let file = self.msr_file.lock().unwrap();
|
||||
file.write_at(&val.to_le_bytes(), msr as u64)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl PreflightAuditor for DellXps9380Sal {
|
||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
||||
let mut steps = Vec::new();
|
||||
|
||||
// 1. Root check
|
||||
steps.push(AuditStep {
|
||||
description: "Root Privileges".to_string(),
|
||||
outcome: if unsafe { libc::getuid() } == 0 { Ok(()) } else { Err(AuditError::RootRequired) }
|
||||
});
|
||||
|
||||
// 2. Kernel modules check (simplified check via sysfs/proc)
|
||||
let rapl_lock = match self.read_msr(0x610) {
|
||||
Ok(val) => {
|
||||
if (val & (1 << 63)) != 0 {
|
||||
Err(AuditError::KernelIncompatible("RAPL Registers are locked by BIOS. Power limit tuning is impossible.".to_string()))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
},
|
||||
Err(e) => Err(AuditError::ToolMissing(format!("Cannot read MSR 0x610: {}", e))),
|
||||
};
|
||||
steps.push(AuditStep { description: "MSR 0x610 RAPL Lock Status".to_string(), outcome: rapl_lock });
|
||||
|
||||
let modules = ["dell_smm_hwmon", "msr", "intel_rapl_msr"];
|
||||
for mod_name in modules {
|
||||
let path = format!("/sys/module/{}", mod_name);
|
||||
let path = self.ctx.sysfs_base.join(format!("sys/module/{}", mod_name));
|
||||
steps.push(AuditStep {
|
||||
description: format!("Kernel Module: {}", mod_name),
|
||||
outcome: if PathBuf::from(path).exists() { Ok(()) } else {
|
||||
Err(AuditError::ToolMissing(format!("Module '{}' not loaded. Run 'sudo modprobe {}'", mod_name, mod_name)))
|
||||
}
|
||||
outcome: if path.exists() { Ok(()) } else { Err(AuditError::ToolMissing(format!("Module '{}' not loaded.", mod_name))) }
|
||||
});
|
||||
}
|
||||
|
||||
// 3. Kernel parameters check
|
||||
let cmdline = fs::read_to_string("/proc/cmdline").unwrap_or_default();
|
||||
steps.push(AuditStep {
|
||||
description: "Kernel Param: dell_smm_hwmon.ignore_dmi=1".to_string(),
|
||||
outcome: if cmdline.contains("dell_smm_hwmon.ignore_dmi=1") { Ok(()) } else {
|
||||
Err(AuditError::MissingKernelParam("dell_smm_hwmon.ignore_dmi=1".to_string()))
|
||||
}
|
||||
});
|
||||
steps.push(AuditStep {
|
||||
description: "Kernel Param: dell_smm_hwmon.restricted=0".to_string(),
|
||||
outcome: if cmdline.contains("dell_smm_hwmon.restricted=0") { Ok(()) } else {
|
||||
Err(AuditError::MissingKernelParam("dell_smm_hwmon.restricted=0".to_string()))
|
||||
}
|
||||
});
|
||||
steps.push(AuditStep {
|
||||
description: "Kernel Param: msr.allow_writes=on".to_string(),
|
||||
outcome: if cmdline.contains("msr.allow_writes=on") { Ok(()) } else {
|
||||
Err(AuditError::MissingKernelParam("msr.allow_writes=on".to_string()))
|
||||
}
|
||||
});
|
||||
let cmdline_path = self.ctx.sysfs_base.join("proc/cmdline");
|
||||
let cmdline = fs::read_to_string(cmdline_path).unwrap_or_default();
|
||||
let params = [
|
||||
("dell_smm_hwmon.ignore_dmi=1", "dell_smm_hwmon.ignore_dmi=1"),
|
||||
("dell_smm_hwmon.restricted=0", "dell_smm_hwmon.restricted=0"),
|
||||
("msr.allow_writes=on", "msr.allow_writes=on"),
|
||||
];
|
||||
for (label, p) in params {
|
||||
steps.push(AuditStep {
|
||||
description: format!("Kernel Param: {}", label),
|
||||
outcome: if cmdline.contains(p) { Ok(()) } else { Err(AuditError::MissingKernelParam(p.to_string())) }
|
||||
});
|
||||
}
|
||||
|
||||
// 4. Lockdown check
|
||||
let lockdown = fs::read_to_string("/sys/kernel/security/lockdown").unwrap_or_default();
|
||||
steps.push(AuditStep {
|
||||
description: "Kernel Lockdown Status".to_string(),
|
||||
outcome: if lockdown.contains("[none]") || lockdown.is_empty() { Ok(()) } else {
|
||||
Err(AuditError::KernelIncompatible("Kernel is in lockdown mode. Set to 'none' to allow MSR/SMM writes.".to_string()))
|
||||
}
|
||||
});
|
||||
|
||||
// 5. Check AC power
|
||||
let ac_status = fs::read_to_string("/sys/class/power_supply/AC/online").unwrap_or_else(|_| "0".to_string());
|
||||
let ac_status_path = self.ctx.sysfs_base.join("sys/class/power_supply/AC/online");
|
||||
let ac_status = fs::read_to_string(ac_status_path).unwrap_or_else(|_| "0".to_string());
|
||||
steps.push(AuditStep {
|
||||
description: "AC Power Connection".to_string(),
|
||||
outcome: if ac_status.trim() == "1" { Ok(()) } else {
|
||||
Err(AuditError::AcPowerMissing("System must be on AC power for benchmarking".to_string()))
|
||||
}
|
||||
outcome: if ac_status.trim() == "1" { Ok(()) } else { Err(AuditError::AcPowerMissing("System must be on AC power".to_string())) }
|
||||
});
|
||||
|
||||
Box::new(steps.into_iter())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DellXps9380Guard {
|
||||
stopped_services: Vec<String>,
|
||||
}
|
||||
|
||||
impl DellXps9380Guard {
|
||||
pub fn new() -> Self {
|
||||
Self { stopped_services: Vec::new() }
|
||||
}
|
||||
}
|
||||
|
||||
impl EnvironmentGuard for DellXps9380Guard {
|
||||
fn suppress(&mut self) -> Result<()> {
|
||||
let services = ["tlp", "thermald", "i8kmon"];
|
||||
for s in services {
|
||||
if Command::new("systemctl").args(["is-active", "--quiet", s]).status()?.success() {
|
||||
debug!("Suppressing service: {}", s);
|
||||
Command::new("systemctl").args(["stop", s]).status()?;
|
||||
self.stopped_services.push(s.to_string());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn restore(&mut self) -> Result<()> {
|
||||
for s in &self.stopped_services {
|
||||
let _ = Command::new("systemctl").args(["start", s]).status();
|
||||
}
|
||||
self.stopped_services.clear();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DellXps9380Guard {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.restore();
|
||||
}
|
||||
impl EnvironmentGuard for DellXps9380Sal {
|
||||
fn suppress(&self) -> Result<()> { Ok(()) }
|
||||
fn restore(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
impl SensorBus for DellXps9380Sal {
|
||||
fn get_temp(&self) -> Result<f32> {
|
||||
// Enforce 1000ms rate limit for Dell SMM as per GEMINI.md
|
||||
let mut last_poll = self.last_poll.lock().unwrap();
|
||||
let now = Instant::now();
|
||||
|
||||
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
||||
// # SAFETY: High frequency polling for watchdog
|
||||
if now.duration_since(*last_poll) < Duration::from_millis(100) {
|
||||
return Ok(*self.last_temp.lock().unwrap());
|
||||
}
|
||||
|
||||
let s = fs::read_to_string(&self.temp_path)?;
|
||||
let val = s.trim().parse::<f32>()? / 1000.0;
|
||||
|
||||
*self.last_temp.lock().unwrap() = val;
|
||||
*last_poll = now;
|
||||
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
fn get_power_w(&self) -> Result<f32> {
|
||||
if self.pwr_path.to_string_lossy().contains("energy_uj") {
|
||||
let e1 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
let e2 = fs::read_to_string(&self.pwr_path)?.trim().parse::<u64>()?;
|
||||
Ok((e2.saturating_sub(e1)) as f32 / 100000.0)
|
||||
let rapl_base = self.fact_sheet.rapl_paths.first().context("RAPL path error")?;
|
||||
let energy_path = rapl_base.join("energy_uj");
|
||||
|
||||
if energy_path.exists() {
|
||||
let mut last_energy = self.last_energy.lock().unwrap();
|
||||
let mut last_watts = self.last_watts.lock().unwrap();
|
||||
|
||||
let e2_str = fs::read_to_string(&energy_path)?;
|
||||
let e2 = e2_str.trim().parse::<u64>()?;
|
||||
let t2 = Instant::now();
|
||||
let (e1, t1) = *last_energy;
|
||||
let delta_e = e2.wrapping_sub(e1);
|
||||
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||
if delta_t < 0.1 { return Ok(*last_watts); }
|
||||
let watts = (delta_e as f32 / 1_000_000.0) / delta_t;
|
||||
*last_energy = (e2, t2);
|
||||
*last_watts = watts;
|
||||
Ok(watts)
|
||||
} else {
|
||||
let s = fs::read_to_string(&self.pwr_path)?;
|
||||
Ok(s.trim().parse::<f32>()? / 1000000.0)
|
||||
@@ -223,66 +225,114 @@ impl SensorBus for DellXps9380Sal {
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
||||
let mut last_poll = self.last_poll.lock().unwrap();
|
||||
let now = Instant::now();
|
||||
|
||||
if now.duration_since(*last_poll) < Duration::from_millis(1000) {
|
||||
return Ok(self.last_fans.lock().unwrap().clone());
|
||||
}
|
||||
|
||||
let mut fans = Vec::new();
|
||||
for path in &self.fan_paths {
|
||||
if let Ok(s) = fs::read_to_string(path) {
|
||||
if let Ok(rpm) = s.trim().parse::<u32>() {
|
||||
fans.push(rpm);
|
||||
let mut val = 0;
|
||||
for i in 0..5 {
|
||||
match fs::read_to_string(path) {
|
||||
Ok(s) => {
|
||||
if let Ok(rpm) = s.trim().parse::<u32>() {
|
||||
val = rpm;
|
||||
if rpm > 0 { break; }
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
debug!("SAL: Fan poll retry {} for {:?} failed: {}", i+1, path, e);
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_millis(150));
|
||||
}
|
||||
fans.push(val);
|
||||
}
|
||||
|
||||
|
||||
*self.last_fans.lock().unwrap() = fans.clone();
|
||||
*last_poll = now;
|
||||
|
||||
Ok(fans)
|
||||
}
|
||||
|
||||
fn get_freq_mhz(&self) -> Result<f32> {
|
||||
let s = fs::read_to_string(&self.freq_path)?;
|
||||
let val = s.trim().parse::<f32>()? / 1000.0;
|
||||
Ok(val)
|
||||
Ok(s.trim().parse::<f32>()? / 1000.0)
|
||||
}
|
||||
|
||||
/// Reports whether bit 0 of MSR `0x19C` is currently set.
///
/// NOTE(review): `0x19C` is presumably IA32_THERM_STATUS, whose bit 0 is the live
/// thermal-status flag — confirm against the Intel SDM.
///
/// # Errors
/// Propagates any failure from reading the MSR.
fn get_throttling_status(&self) -> Result<bool> {
    let therm_status = self.read_msr(0x19C)?;
    let throttled = (therm_status & 0x1) == 0x1;
    Ok(throttled)
}
|
||||
}
|
||||
|
||||
impl ActuatorBus for DellXps9380Sal {
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
||||
let tool_path = self.fact_sheet.paths.tools.get("dell_fan_ctrl")
|
||||
.ok_or_else(|| anyhow!("Dell fan control tool not found in PATH"))?;
|
||||
let tool_str = tool_path.to_string_lossy();
|
||||
|
||||
match mode {
|
||||
"max" | "Manual" => {
|
||||
Command::new("dell-bios-fan-control").arg("0").status()?;
|
||||
"max" | "Manual" => {
|
||||
self.ctx.runner.run(&tool_str, &["0"])?;
|
||||
// Disabling BIOS control requires immediate PWM override
|
||||
self.set_fan_speed(FanSpeedPercent::new(100)?)?;
|
||||
}
|
||||
"auto" | "Auto" => {
|
||||
Command::new("dell-bios-fan-control").arg("1").status()?;
|
||||
}
|
||||
_ => {
|
||||
debug!("Unknown fan mode requested: {}", mode);
|
||||
"auto" | "Auto" => { self.ctx.runner.run(&tool_str, &["1"])?; }
|
||||
_ => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> {
|
||||
let pwm_val = ((speed.get() as u32 * 255) / 100) as u8;
|
||||
for p in &self.pwm_enable_paths { let _ = fs::write(p, "1"); }
|
||||
for path in &self.pwm_paths { let _ = fs::write(path, pwm_val.to_string()); }
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for path in &self.pl1_paths {
|
||||
debug!("SAL: Applying PL1 ({:.1}W) to {:?}", limit.get(), path);
|
||||
fs::write(path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL1 to {:?}", path))?;
|
||||
if let Some(parent) = path.parent() {
|
||||
let enable_p = parent.join("constraint_0_enabled");
|
||||
let _ = fs::write(&enable_p, "1");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
||||
let uw = (watts * 1_000_000.0) as u64;
|
||||
fs::write(&self.pl1_path, uw.to_string())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
||||
let uw = (watts * 1_000_000.0) as u64;
|
||||
fs::write(&self.pl2_path, uw.to_string())?;
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for path in &self.pl2_paths {
|
||||
debug!("SAL: Applying PL2 ({:.1}W) to {:?}", limit.get(), path);
|
||||
fs::write(path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL2 to {:?}", path))?;
|
||||
if let Some(parent) = path.parent() {
|
||||
let enable_p = parent.join("constraint_1_enabled");
|
||||
let _ = fs::write(&enable_p, "1");
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl HardwareWatchdog for DellXps9380Sal {
|
||||
fn check_emergency(&self) -> Result<bool> {
|
||||
// Check for thermal throttling or BD PROCHOT
|
||||
// Simplified for now
|
||||
Ok(false)
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||
let temp = self.get_temp()?;
|
||||
if temp > 98.0 {
|
||||
return Ok(SafetyStatus::EmergencyAbort(format!("Thermal Runaway: {:.1}°C", temp)));
|
||||
}
|
||||
if let Ok(msr_val) = self.read_msr(0x1FC) {
|
||||
if (msr_val & 0x1) != 0 && temp < 85.0 {
|
||||
let _ = self.write_msr(0x1FC, msr_val & !0x1);
|
||||
return Ok(SafetyStatus::Warning("BD PROCHOT Latch Cleared".to_string()));
|
||||
}
|
||||
}
|
||||
Ok(SafetyStatus::Nominal)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DellXps9380Sal {
|
||||
fn drop(&mut self) { }
|
||||
}
|
||||
|
||||
148
src/sal/discovery.rs
Normal file
148
src/sal/discovery.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
//! # Hardware Discovery Engine (Agent Sentinel)
|
||||
//!
|
||||
//! This module provides dynamic traversal of `/sys/class/hwmon` and `/sys/class/powercap`
|
||||
//! to locate sensors and actuators without relying on hardcoded indices.
|
||||
|
||||
use anyhow::{Result, Context, anyhow};
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
/// Result of a successful hardware discovery.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveredHardware {
    /// Path to the primary package temperature sensor input.
    pub temp_input: PathBuf,
    /// Paths to all detected fan RPM inputs.
    pub fan_inputs: Vec<PathBuf>,
    /// Paths to all detected fan PWM control nodes.
    pub pwm_controls: Vec<PathBuf>,
    /// Paths to all detected fan PWM enable nodes.
    pub pwm_enables: Vec<PathBuf>,
    /// Paths to the matching RAPL powercap zone directories.
    ///
    /// These are the directories themselves, not individual files; callers join
    /// names such as `energy_uj` or a constraint file onto them.
    pub rapl_paths: Vec<PathBuf>,
}
|
||||
|
||||
pub struct DiscoveryEngine;
|
||||
|
||||
impl DiscoveryEngine {
|
||||
/// Performs a full traversal of the sysfs hardware tree.
|
||||
pub fn run(sysfs_root: &Path) -> Result<DiscoveredHardware> {
|
||||
info!("Sentinel: Starting dynamic hardware discovery...");
|
||||
|
||||
let hwmon_path = sysfs_root.join("sys/class/hwmon");
|
||||
let (temp_input, fan_info) = Self::discover_hwmon(&hwmon_path)?;
|
||||
|
||||
let powercap_path = sysfs_root.join("sys/class/powercap");
|
||||
let rapl_paths = Self::discover_rapl(&powercap_path)?;
|
||||
|
||||
let hardware = DiscoveredHardware {
|
||||
temp_input,
|
||||
fan_inputs: fan_info.rpm_inputs,
|
||||
pwm_controls: fan_info.pwm_controls,
|
||||
pwm_enables: fan_info.pwm_enables,
|
||||
rapl_paths,
|
||||
};
|
||||
|
||||
info!("Sentinel: Discovery complete. Found {} fans and {} RAPL nodes.",
|
||||
hardware.fan_inputs.len(), hardware.rapl_paths.len());
|
||||
|
||||
Ok(hardware)
|
||||
}
|
||||
|
||||
fn discover_hwmon(base: &Path) -> Result<(PathBuf, FanHardware)> {
|
||||
let mut best_temp: Option<(u32, PathBuf)> = None;
|
||||
let mut fans = FanHardware::default();
|
||||
|
||||
let entries = fs::read_dir(base)
|
||||
.with_context(|| format!("Failed to read hwmon base: {:?}", base))?;
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let driver_name = fs::read_to_string(path.join("name"))
|
||||
.map(|s| s.trim().to_string())
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
|
||||
debug!("Discovery: Probing hwmon node {:?} (driver: {})", path, driver_name);
|
||||
|
||||
// 1. Temperature Discovery
|
||||
let temp_priority = match driver_name.as_str() {
|
||||
"coretemp" | "zenpower" => 10,
|
||||
"k10temp" => 9,
|
||||
"dell_smm" => 8,
|
||||
"acpitz" => 1,
|
||||
_ => 5,
|
||||
};
|
||||
|
||||
if let Ok(hw_entries) = fs::read_dir(&path) {
|
||||
for hw_entry in hw_entries.flatten() {
|
||||
let file_name = hw_entry.file_name().to_string_lossy().to_string();
|
||||
|
||||
// Temperature Inputs
|
||||
if file_name.starts_with("temp") && file_name.ends_with("_input") {
|
||||
let label_path = path.join(file_name.replace("_input", "_label"));
|
||||
let label = fs::read_to_string(label_path).unwrap_or_default().trim().to_string();
|
||||
|
||||
let label_priority = if label.contains("Package") || label.contains("Tdie") {
|
||||
2
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
let total_priority = temp_priority + label_priority;
|
||||
if best_temp.is_none() || total_priority > best_temp.as_ref().unwrap().0 {
|
||||
best_temp = Some((total_priority, hw_entry.path()));
|
||||
}
|
||||
}
|
||||
|
||||
// Fan Inputs
|
||||
if file_name.starts_with("fan") && file_name.ends_with("_input") {
|
||||
fans.rpm_inputs.push(hw_entry.path());
|
||||
}
|
||||
|
||||
// PWM Controls
|
||||
if file_name.starts_with("pwm") && !file_name.contains("_") {
|
||||
fans.pwm_controls.push(hw_entry.path());
|
||||
}
|
||||
|
||||
// PWM Enables
|
||||
if file_name.starts_with("pwm") && file_name.ends_with("_enable") {
|
||||
fans.pwm_enables.push(hw_entry.path());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let temp_input = best_temp.map(|(_, p)| p)
|
||||
.ok_or_else(|| anyhow!("Failed to locate any valid temperature sensor in /sys/class/hwmon/"))?;
|
||||
|
||||
Ok((temp_input, fans))
|
||||
}
|
||||
|
||||
fn discover_rapl(base: &Path) -> Result<Vec<PathBuf>> {
|
||||
let mut paths = Vec::new();
|
||||
if !base.exists() {
|
||||
warn!("Discovery: /sys/class/powercap does not exist.");
|
||||
return Ok(paths);
|
||||
}
|
||||
|
||||
let entries = fs::read_dir(base)?;
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let name = fs::read_to_string(path.join("name")).unwrap_or_default().trim().to_string();
|
||||
|
||||
if name.contains("package") || name.contains("intel-rapl") {
|
||||
paths.push(path);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(paths)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fan-related sysfs nodes collected while scanning the hwmon tree.
#[derive(Debug, Clone, Default)]
struct FanHardware {
    /// `fanN_input` files (fan speed readings).
    rpm_inputs: Vec<PathBuf>,
    /// Bare `pwmN` files (duty-cycle controls).
    pwm_controls: Vec<PathBuf>,
    /// `pwmN_enable` files.
    pwm_enables: Vec<PathBuf>,
}
|
||||
207
src/sal/generic_linux.rs
Normal file
207
src/sal/generic_linux.rs
Normal file
@@ -0,0 +1,207 @@
|
||||
use anyhow::{Result, anyhow, Context};
|
||||
use std::path::{Path};
|
||||
use std::fs;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::sal::traits::{SensorBus, ActuatorBus, EnvironmentGuard, HardwareWatchdog, PreflightAuditor, AuditStep, AuditError, SafetyStatus, EnvironmentCtx};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use crate::sal::heuristic::discovery::SystemFactSheet;
|
||||
use crate::sal::heuristic::schema::HardwareDb;
|
||||
|
||||
pub struct GenericLinuxSal {
|
||||
ctx: EnvironmentCtx,
|
||||
fact_sheet: SystemFactSheet,
|
||||
db: HardwareDb,
|
||||
last_valid_temp: Mutex<(f32, Instant)>,
|
||||
current_pl1: Mutex<u64>,
|
||||
last_energy: Mutex<(u64, Instant)>,
|
||||
}
|
||||
|
||||
impl GenericLinuxSal {
|
||||
pub fn new(ctx: EnvironmentCtx, facts: SystemFactSheet, db: HardwareDb) -> Self {
|
||||
let initial_energy = if let Some(pwr_base) = facts.rapl_paths.first() {
|
||||
fs::read_to_string(pwr_base.join("energy_uj")).unwrap_or_default().trim().parse().unwrap_or(0)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
Self {
|
||||
db,
|
||||
last_valid_temp: Mutex::new((0.0, Instant::now())),
|
||||
current_pl1: Mutex::new(15_000_000),
|
||||
last_energy: Mutex::new((initial_energy, Instant::now())),
|
||||
fact_sheet: facts,
|
||||
ctx,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_dell(&self) -> bool {
|
||||
self.fact_sheet.vendor.to_lowercase().contains("dell")
|
||||
}
|
||||
|
||||
fn read_sysfs(&self, path: &Path) -> Result<String> {
|
||||
fs::read_to_string(path).map(|s| s.trim().to_string()).map_err(|e| anyhow!(e))
|
||||
}
|
||||
}
|
||||
|
||||
impl PreflightAuditor for GenericLinuxSal {
|
||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
||||
let mut steps = Vec::new();
|
||||
for check in &self.db.preflight_checks {
|
||||
let status = self.ctx.runner.run("sh", &["-c", &check.check_cmd]);
|
||||
steps.push(AuditStep {
|
||||
description: check.name.clone(),
|
||||
outcome: match status {
|
||||
Ok(_) => Ok(()),
|
||||
_ => Err(AuditError::KernelIncompatible(check.fail_help.clone())),
|
||||
}
|
||||
});
|
||||
}
|
||||
for conflict_id in &self.fact_sheet.active_conflicts {
|
||||
if let Some(conflict) = self.db.conflicts.iter().find(|c| &c.id == conflict_id) {
|
||||
if conflict.severity == "Critical" {
|
||||
steps.push(AuditStep {
|
||||
description: format!("Conflict: {}", conflict.id),
|
||||
outcome: Err(AuditError::ToolMissing(conflict.help_text.clone())),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
Box::new(steps.into_iter())
|
||||
}
|
||||
}
|
||||
|
||||
impl SensorBus for GenericLinuxSal {
|
||||
fn get_temp(&self) -> Result<f32> {
|
||||
let path = self.fact_sheet.temp_path.as_ref()
|
||||
.ok_or_else(|| anyhow!("No temperature sensor path found"))?;
|
||||
let content = self.read_sysfs(path)?;
|
||||
let temp = content.parse::<f32>()? / 1000.0;
|
||||
let mut last = self.last_valid_temp.lock().unwrap();
|
||||
if (temp - last.0).abs() > 0.01 { *last = (temp, Instant::now()); }
|
||||
Ok(temp)
|
||||
}
|
||||
|
||||
fn get_power_w(&self) -> Result<f32> {
|
||||
let rapl_path = self.fact_sheet.rapl_paths.first()
|
||||
.ok_or_else(|| anyhow!("No RAPL path found"))?;
|
||||
let energy_path = rapl_path.join("energy_uj");
|
||||
let mut last = self.last_energy.lock().unwrap();
|
||||
let e2: u64 = self.read_sysfs(&energy_path)?.parse()?;
|
||||
let t2 = Instant::now();
|
||||
let (e1, t1) = *last;
|
||||
let delta_e = e2.wrapping_sub(e1);
|
||||
let delta_t = t2.duration_since(t1).as_secs_f32();
|
||||
*last = (e2, t2);
|
||||
if delta_t < 0.05 { return Ok(0.0); }
|
||||
Ok((delta_e as f32 / 1_000_000.0) / delta_t)
|
||||
}
|
||||
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
||||
let mut rpms = Vec::new();
|
||||
for path in &self.fact_sheet.fan_paths {
|
||||
if let Ok(content) = self.read_sysfs(path) {
|
||||
if let Ok(rpm) = content.parse() { rpms.push(rpm); }
|
||||
}
|
||||
}
|
||||
Ok(rpms)
|
||||
}
|
||||
|
||||
fn get_freq_mhz(&self) -> Result<f32> {
|
||||
let path = self.ctx.sysfs_base.join("sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq");
|
||||
if path.exists() {
|
||||
Ok(self.read_sysfs(&path)?.parse::<f32>()? / 1000.0)
|
||||
} else {
|
||||
let cpuinfo_path = self.ctx.sysfs_base.join("proc/cpuinfo");
|
||||
let cpuinfo = fs::read_to_string(cpuinfo_path)?;
|
||||
for line in cpuinfo.lines() {
|
||||
if line.starts_with("cpu MHz") {
|
||||
if let Some((_, mhz)) = line.split_once(':') {
|
||||
return Ok(mhz.trim().parse()?);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(anyhow!("Could not determine CPU frequency"))
|
||||
}
|
||||
}
|
||||
|
||||
fn get_throttling_status(&self) -> Result<bool> {
|
||||
let cooling_base = self.ctx.sysfs_base.join("sys/class/thermal");
|
||||
if let Ok(entries) = fs::read_dir(cooling_base) {
|
||||
for entry in entries.flatten() {
|
||||
if entry.file_name().to_string_lossy().starts_with("cooling_device") {
|
||||
if let Ok(state) = fs::read_to_string(entry.path().join("cur_state")) {
|
||||
if state.trim().parse::<u32>().unwrap_or(0) > 0 {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
impl ActuatorBus for GenericLinuxSal {
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
||||
if self.is_dell() {
|
||||
let cmd = match mode {
|
||||
"manual" | "max" => self.db.ecosystems.get("dell").and_then(|e| e.fan_manual_mode_cmd.as_ref()),
|
||||
"auto" => self.db.ecosystems.get("dell").and_then(|e| e.fan_auto_mode_cmd.as_ref()),
|
||||
_ => return Err(anyhow!("Unsupported fan mode: {}", mode)),
|
||||
};
|
||||
if let Some(cmd_str) = cmd {
|
||||
let parts: Vec<&str> = cmd_str.split_whitespace().collect();
|
||||
self.ctx.runner.run(parts[0], &parts[1..])?;
|
||||
Ok(())
|
||||
} else { Err(anyhow!("Dell fan command missing")) }
|
||||
} else { Ok(()) }
|
||||
}
|
||||
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for rapl_path in &self.fact_sheet.rapl_paths {
|
||||
let limit_path = rapl_path.join("constraint_0_power_limit_uw");
|
||||
let enable_path = rapl_path.join("constraint_0_enabled");
|
||||
fs::write(&limit_path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL1 to {:?}", limit_path))?;
|
||||
let _ = fs::write(&enable_path, "1");
|
||||
}
|
||||
*self.current_pl1.lock().unwrap() = limit.as_microwatts();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> {
|
||||
for rapl_path in &self.fact_sheet.rapl_paths {
|
||||
let limit_path = rapl_path.join("constraint_1_power_limit_uw");
|
||||
let enable_path = rapl_path.join("constraint_1_enabled");
|
||||
fs::write(&limit_path, limit.as_microwatts().to_string())
|
||||
.with_context(|| format!("Failed to write PL2 to {:?}", limit_path))?;
|
||||
let _ = fs::write(&enable_path, "1");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl EnvironmentGuard for GenericLinuxSal {
    /// No-op: this implementation performs no environment suppression.
    fn suppress(&self) -> Result<()> {
        Ok(())
    }

    /// No-op counterpart of [`suppress`](Self::suppress).
    fn restore(&self) -> Result<()> {
        Ok(())
    }
}
|
||||
|
||||
impl HardwareWatchdog for GenericLinuxSal {
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||
let temp = self.get_temp()?;
|
||||
if temp > 100.0 {
|
||||
return Ok(SafetyStatus::EmergencyAbort(format!("Thermal runaway: {:.1}°C", temp)));
|
||||
}
|
||||
let last = self.last_valid_temp.lock().unwrap();
|
||||
if last.1.elapsed() > Duration::from_secs(5) {
|
||||
return Ok(SafetyStatus::EmergencyAbort("Temperature sensor stalled".to_string()));
|
||||
}
|
||||
Ok(SafetyStatus::Nominal)
|
||||
}
|
||||
}
|
||||
236
src/sal/heuristic/discovery.rs
Normal file
236
src/sal/heuristic/discovery.rs
Normal file
@@ -0,0 +1,236 @@
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::{Duration};
|
||||
use std::thread;
|
||||
use std::sync::mpsc;
|
||||
use std::collections::HashMap;
|
||||
use crate::sal::heuristic::schema::{SensorDiscovery, ActuatorDiscovery, Conflict, Discovery, Benchmarking};
|
||||
use crate::sys::SyscallRunner;
|
||||
use tracing::{debug, warn, info};
|
||||
|
||||
/// Lookup tables for paths located during discovery.
///
/// Both maps are keyed by the identifier used in the hardware database's
/// `discovery` section.
#[derive(Clone, Debug, Default)]
pub struct PathRegistry {
    /// Config identifier -> chosen config file path.
    pub configs: HashMap<String, PathBuf>,
    /// Tool identifier -> resolved binary path.
    pub tools: HashMap<String, PathBuf>,
}
|
||||
|
||||
/// Strongly-typed findings about the current system.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct SystemFactSheet {
|
||||
pub vendor: String,
|
||||
pub model: String,
|
||||
pub temp_path: Option<PathBuf>,
|
||||
pub fan_paths: Vec<PathBuf>,
|
||||
pub rapl_paths: Vec<PathBuf>,
|
||||
pub active_conflicts: Vec<String>,
|
||||
pub conflict_services: Vec<String>,
|
||||
pub paths: PathRegistry,
|
||||
pub bench_config: Option<Benchmarking>,
|
||||
}
|
||||
|
||||
/// Probes the system for hardware sensors, actuators, service conflicts, and paths.
|
||||
pub fn discover_facts(
|
||||
base_path: &Path,
|
||||
runner: &dyn SyscallRunner,
|
||||
discovery: &Discovery,
|
||||
conflicts: &[Conflict],
|
||||
bench_config: Benchmarking,
|
||||
) -> SystemFactSheet {
|
||||
let (vendor, model) = read_dmi_info(base_path);
|
||||
|
||||
debug!("DMI Identity: Vendor='{}', Model='{}'", vendor, model);
|
||||
|
||||
let (temp_path, fan_paths) = discover_hwmon(base_path, &discovery.sensors);
|
||||
let rapl_paths = discover_rapl(base_path, &discovery.actuators);
|
||||
|
||||
let mut active_conflicts = Vec::new();
|
||||
let mut conflict_services = Vec::new();
|
||||
for conflict in conflicts {
|
||||
let mut found_active = false;
|
||||
for service in &conflict.services {
|
||||
if is_service_active(runner, service) {
|
||||
if !found_active {
|
||||
debug!("Detected active conflict: {} (Service: {})", conflict.id, service);
|
||||
active_conflicts.push(conflict.id.clone());
|
||||
found_active = true;
|
||||
}
|
||||
conflict_services.push(service.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let paths = discover_paths(base_path, discovery);
|
||||
|
||||
SystemFactSheet {
|
||||
vendor, model, temp_path, fan_paths, rapl_paths, active_conflicts, conflict_services, paths,
|
||||
bench_config: Some(bench_config),
|
||||
}
|
||||
}
|
||||
|
||||
fn discover_paths(base_path: &Path, discovery: &Discovery) -> PathRegistry {
|
||||
let mut registry = PathRegistry::default();
|
||||
|
||||
for (id, binary_name) in &discovery.tools {
|
||||
if let Ok(path) = which::which(binary_name) {
|
||||
debug!("Discovered tool: {} -> {:?}", id, path);
|
||||
registry.tools.insert(id.clone(), path);
|
||||
}
|
||||
}
|
||||
|
||||
for (id, candidates) in &discovery.configs {
|
||||
for candidate in candidates {
|
||||
let path = if candidate.starts_with('/') {
|
||||
base_path.join(&candidate[1..])
|
||||
} else {
|
||||
base_path.join(candidate)
|
||||
};
|
||||
|
||||
if path.exists() {
|
||||
debug!("Discovered config: {} -> {:?}", id, path);
|
||||
registry.configs.insert(id.clone(), path);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if !registry.configs.contains_key(id) {
|
||||
if let Some(first) = candidates.first() {
|
||||
registry.configs.insert(id.clone(), PathBuf::from(first));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
registry
|
||||
}
|
||||
|
||||
/// Reads the DMI vendor and product name below `base_path`.
///
/// Returns `(vendor, model)` taken from `sys/class/dmi/id/sys_vendor` and
/// `sys/class/dmi/id/product_name`, whitespace-trimmed. A field that cannot
/// be read is reported as `"Unknown"`.
fn read_dmi_info(base_path: &Path) -> (String, String) {
    let dmi_dir = base_path.join("sys/class/dmi/id");
    let read_field = |field: &str| match fs::read_to_string(dmi_dir.join(field)) {
        Ok(raw) => raw.trim().to_string(),
        Err(_) => "Unknown".to_string(),
    };

    (read_field("sys_vendor"), read_field("product_name"))
}
|
||||
|
||||
/// Discovers hwmon sensors by matching labels and prioritizing drivers.
|
||||
fn discover_hwmon(base_path: &Path, cfg: &SensorDiscovery) -> (Option<PathBuf>, Vec<PathBuf>) {
|
||||
let mut temp_candidates = Vec::new();
|
||||
let mut fan_candidates = Vec::new();
|
||||
|
||||
let hwmon_base = base_path.join("sys/class/hwmon");
|
||||
let entries = fs::read_dir(&hwmon_base).map_err(|e| {
|
||||
warn!("Could not read {:?}: {}", hwmon_base, e);
|
||||
e
|
||||
}).ok();
|
||||
|
||||
if let Some(entries) = entries {
|
||||
for entry in entries.flatten() {
|
||||
let hwmon_path = entry.path();
|
||||
|
||||
// # SAFETY: Read driver name directly. This file is virtual and never blocks.
|
||||
// Using a timeout wrapper here was causing discovery to fail if the thread-pool lagged.
|
||||
let driver_name = fs::read_to_string(hwmon_path.join("name"))
|
||||
.map(|s| s.trim().to_string()).unwrap_or_default();
|
||||
|
||||
let priority = cfg.hwmon_priority
|
||||
.iter()
|
||||
.position(|p| driver_name.contains(p))
|
||||
.unwrap_or(usize::MAX);
|
||||
|
||||
if let Ok(hw_entries) = fs::read_dir(&hwmon_path) {
|
||||
for hw_entry in hw_entries.flatten() {
|
||||
let file_name = hw_entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
// 1. Temperatures
|
||||
if file_name.starts_with("temp") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) {
|
||||
if cfg.temp_labels.iter().any(|l| label.contains(l)) {
|
||||
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
|
||||
if input_path.exists() {
|
||||
temp_candidates.push((priority, input_path));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Fans (Label Match)
|
||||
if file_name.starts_with("fan") && file_name.ends_with("_label") {
|
||||
if let Some(label) = read_sysfs_with_timeout(&hw_entry.path(), Duration::from_millis(500)) {
|
||||
if cfg.fan_labels.iter().any(|l| label.contains(l)) {
|
||||
let input_path = hwmon_path.join(file_name.replace("_label", "_input"));
|
||||
if input_path.exists() {
|
||||
debug!("Discovered fan by label: {:?} (priority {})", input_path, priority);
|
||||
fan_candidates.push((priority, input_path));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Fans (Priority Fallback - CRITICAL FOR DELL 9380)
|
||||
// If we found a priority driver (e.g., dell_smm), we take every fan*_input we find.
|
||||
if priority < usize::MAX && file_name.starts_with("fan") && file_name.ends_with("_input") {
|
||||
if !fan_candidates.iter().any(|(_, p)| p == &hw_entry.path()) {
|
||||
info!("Heuristic Discovery: Force-adding unlabeled fan sensor from priority driver '{}': {:?}", driver_name, hw_entry.path());
|
||||
fan_candidates.push((priority, hw_entry.path()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
temp_candidates.sort_by_key(|(p, _)| *p);
|
||||
fan_candidates.sort_by_key(|(p, _)| *p);
|
||||
|
||||
let best_temp = temp_candidates.first().map(|(_, p)| p.clone());
|
||||
let best_fans: Vec<PathBuf> = fan_candidates.into_iter().map(|(_, p)| p).collect();
|
||||
|
||||
if best_fans.is_empty() {
|
||||
warn!("Heuristic Discovery: No fan RPM sensors found.");
|
||||
} else {
|
||||
info!("Heuristic Discovery: Final registry contains {} fan sensors.", best_fans.len());
|
||||
}
|
||||
|
||||
(best_temp, best_fans)
|
||||
}
|
||||
|
||||
fn discover_rapl(base_path: &Path, cfg: &ActuatorDiscovery) -> Vec<PathBuf> {
|
||||
let mut paths = Vec::new();
|
||||
let powercap_base = base_path.join("sys/class/powercap");
|
||||
|
||||
if let Ok(entries) = fs::read_dir(&powercap_base) {
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let dir_name = entry.file_name().into_string().unwrap_or_default();
|
||||
|
||||
if cfg.rapl_paths.contains(&dir_name) {
|
||||
paths.push(path);
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(name) = fs::read_to_string(path.join("name")) {
|
||||
if cfg.rapl_paths.iter().any(|p| p == name.trim()) {
|
||||
paths.push(path);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
paths
|
||||
}
|
||||
|
||||
pub fn is_service_active(runner: &dyn SyscallRunner, service: &str) -> bool {
|
||||
runner.run("systemctl", &["is-active", "--quiet", service]).is_ok()
|
||||
}
|
||||
|
||||
/// Reads `path` on a helper thread and waits at most `timeout` for it.
///
/// Returns the whitespace-trimmed content on success. Returns `None` when
/// the read fails or no result arrives within `timeout`; in the latter
/// case the helper thread is left to finish on its own and its result is
/// discarded.
fn read_sysfs_with_timeout(path: &Path, timeout: Duration) -> Option<String> {
    let (sender, receiver) = mpsc::channel();
    let target = path.to_path_buf();

    thread::spawn(move || {
        let outcome = match fs::read_to_string(target) {
            Ok(raw) => Ok(raw.trim().to_string()),
            Err(err) => Err(err),
        };
        // The receiver may already be gone after a timeout; that is fine.
        let _ = sender.send(outcome);
    });

    receiver
        .recv_timeout(timeout)
        .ok()
        .and_then(|outcome| outcome.ok())
}
|
||||
60
src/sal/heuristic/engine.rs
Normal file
60
src/sal/heuristic/engine.rs
Normal file
@@ -0,0 +1,60 @@
|
||||
use miette::{Result, IntoDiagnostic, Context};
|
||||
use std::fs;
|
||||
use regex::Regex;
|
||||
use tracing::{info, debug};
|
||||
|
||||
use crate::sal::traits::{PlatformSal, EnvironmentCtx};
|
||||
use crate::sal::dell_xps_9380::DellXps9380Sal;
|
||||
use crate::sal::generic_linux::GenericLinuxSal;
|
||||
use crate::sal::heuristic::schema::HardwareDb;
|
||||
use crate::sal::heuristic::discovery::{discover_facts, SystemFactSheet};
|
||||
|
||||
pub struct HeuristicEngine;
|
||||
|
||||
impl HeuristicEngine {
|
||||
/// Loads the hardware database, probes the system, and builds the appropriate SAL.
|
||||
pub fn detect_and_build(ctx: EnvironmentCtx) -> Result<(Box<dyn PlatformSal>, SystemFactSheet)> {
|
||||
// 1. Load Hardware DB
|
||||
let db_path = "assets/hardware_db.toml";
|
||||
let db_content = fs::read_to_string(db_path)
|
||||
.into_diagnostic()
|
||||
.with_context(|| format!("Failed to read hardware database at {}", db_path))?;
|
||||
let db: HardwareDb = toml::from_str(&db_content)
|
||||
.into_diagnostic()
|
||||
.context("Failed to parse hardware_db.toml")?;
|
||||
|
||||
// 2. Discover Facts
|
||||
let facts = discover_facts(&ctx.sysfs_base, ctx.runner.as_ref(), &db.discovery, &db.conflicts, db.benchmarking.clone());
|
||||
info!("System Identity: {} {}", facts.vendor, facts.model);
|
||||
|
||||
// 3. Routing Logic
|
||||
|
||||
// --- Special Case: Dell XPS 13 9380 ---
|
||||
if is_match(&facts.vendor, "(?i)Dell.*") && is_match(&facts.model, "(?i)XPS.*13.*9380.*") {
|
||||
info!("Specialized SAL Match Found: Dell XPS 13 9380");
|
||||
let sal = DellXps9380Sal::init(ctx, facts.clone()).map_err(|e| miette::miette!(e))?;
|
||||
return Ok((Box::new(sal), facts));
|
||||
}
|
||||
|
||||
// --- Fallback: Generic Linux SAL ---
|
||||
debug!("No specialized SAL match. Falling back to GenericLinuxSal with DB quirks.");
|
||||
|
||||
// Validation: Ensure we found at least a temperature sensor if required
|
||||
if facts.temp_path.is_none() {
|
||||
return Err(miette::miette!("No temperature sensor discovered. Generic fallback impossible."));
|
||||
}
|
||||
if facts.rapl_paths.is_empty() {
|
||||
return Err(miette::miette!("No RAPL power interface discovered. Generic fallback impossible."));
|
||||
}
|
||||
|
||||
Ok((Box::new(GenericLinuxSal::new(ctx, facts.clone(), db)), facts))
|
||||
}
|
||||
}
|
||||
|
||||
fn is_match(input: &str, pattern: &str) -> bool {
|
||||
if let Ok(re) = Regex::new(pattern) {
|
||||
re.is_match(input)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
3
src/sal/heuristic/mod.rs
Normal file
3
src/sal/heuristic/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
pub mod schema;
|
||||
pub mod discovery;
|
||||
pub mod engine;
|
||||
104
src/sal/heuristic/schema.rs
Normal file
104
src/sal/heuristic/schema.rs
Normal file
@@ -0,0 +1,104 @@
|
||||
use serde::Deserialize;
|
||||
use std::collections::HashMap;
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct HardwareDb {
|
||||
pub metadata: Metadata,
|
||||
pub conflicts: Vec<Conflict>,
|
||||
pub ecosystems: HashMap<String, Ecosystem>,
|
||||
pub quirks: Vec<Quirk>,
|
||||
pub discovery: Discovery,
|
||||
pub benchmarking: Benchmarking,
|
||||
pub preflight_checks: Vec<PreflightCheck>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Metadata {
|
||||
pub version: String,
|
||||
pub updated: String,
|
||||
pub description: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Conflict {
|
||||
pub id: String,
|
||||
pub services: Vec<String>,
|
||||
pub contention: String,
|
||||
pub severity: String,
|
||||
pub fix_action: String,
|
||||
pub help_text: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Ecosystem {
|
||||
pub vendor_regex: String,
|
||||
pub product_regex: Option<String>,
|
||||
pub polling_cap_ms: Option<u64>,
|
||||
pub drivers: Option<Vec<String>>,
|
||||
pub fan_manual_mode_cmd: Option<String>,
|
||||
pub fan_auto_mode_cmd: Option<String>,
|
||||
pub safety_register: Option<String>,
|
||||
pub lap_mode_path: Option<String>,
|
||||
pub profiles_path: Option<String>,
|
||||
pub ec_write_required: Option<bool>,
|
||||
pub thermal_policy_path: Option<String>,
|
||||
pub policy_map: Option<HashMap<String, i32>>,
|
||||
pub msr_lock_register: Option<String>,
|
||||
pub msr_lock_bit: Option<u32>,
|
||||
pub fan_boost_path: Option<String>,
|
||||
pub ec_tool: Option<String>,
|
||||
pub optimization: Option<String>,
|
||||
pub help_text: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Quirk {
|
||||
pub model_regex: String,
|
||||
pub id: String,
|
||||
pub issue: String,
|
||||
pub action: String,
|
||||
pub monitor_msr: Option<String>,
|
||||
pub reset_bit: Option<u32>,
|
||||
pub trigger_path: Option<String>,
|
||||
pub trigger_value: Option<String>,
|
||||
pub target_path: Option<String>,
|
||||
pub format: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Discovery {
|
||||
pub sensors: SensorDiscovery,
|
||||
pub actuators: ActuatorDiscovery,
|
||||
pub configs: HashMap<String, Vec<String>>,
|
||||
pub tools: HashMap<String, String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct Benchmarking {
|
||||
pub idle_duration_s: u64,
|
||||
pub stress_duration_min_s: u64,
|
||||
pub stress_duration_max_s: u64,
|
||||
pub cool_down_s: u64,
|
||||
pub power_steps_watts: Vec<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct SensorDiscovery {
|
||||
pub temp_labels: Vec<String>,
|
||||
pub fan_labels: Vec<String>,
|
||||
pub hwmon_priority: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct ActuatorDiscovery {
|
||||
pub rapl_paths: Vec<String>,
|
||||
pub amd_energy_paths: Vec<String>,
|
||||
pub governor_files: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize, Clone)]
|
||||
pub struct PreflightCheck {
|
||||
pub name: String,
|
||||
pub check_cmd: String,
|
||||
pub fail_help: String,
|
||||
}
|
||||
@@ -1,79 +1,52 @@
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep};
|
||||
use super::traits::{PreflightAuditor, EnvironmentGuard, SensorBus, ActuatorBus, HardwareWatchdog, AuditStep, SafetyStatus};
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
use anyhow::Result;
|
||||
|
||||
pub struct MockAuditor;
|
||||
impl PreflightAuditor for MockAuditor {
|
||||
pub struct MockSal {
|
||||
pub temperature_sequence: std::sync::atomic::AtomicUsize,
|
||||
}
|
||||
|
||||
impl MockSal {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
temperature_sequence: std::sync::atomic::AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PreflightAuditor for MockSal {
|
||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_> {
|
||||
let steps = vec![
|
||||
AuditStep {
|
||||
description: "Mock Root Privileges".to_string(),
|
||||
outcome: Ok(()),
|
||||
},
|
||||
AuditStep {
|
||||
description: "Mock AC Power Status".to_string(),
|
||||
outcome: Ok(()),
|
||||
},
|
||||
AuditStep { description: "Mock Root Privileges".to_string(), outcome: Ok(()) },
|
||||
AuditStep { description: "Mock AC Power Status".to_string(), outcome: Ok(()) },
|
||||
];
|
||||
Box::new(steps.into_iter())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MockGuard {
|
||||
pub suppressed: bool,
|
||||
}
|
||||
impl MockGuard {
|
||||
pub fn new() -> Self {
|
||||
Self { suppressed: false }
|
||||
}
|
||||
}
|
||||
impl EnvironmentGuard for MockGuard {
|
||||
fn suppress(&mut self) -> Result<()> {
|
||||
self.suppressed = true;
|
||||
Ok(())
|
||||
}
|
||||
fn restore(&mut self) -> Result<()> {
|
||||
self.suppressed = false;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
impl Drop for MockGuard {
|
||||
fn drop(&mut self) {
|
||||
let _ = self.restore();
|
||||
}
|
||||
impl EnvironmentGuard for MockSal {
|
||||
fn suppress(&self) -> Result<()> { Ok(()) }
|
||||
fn restore(&self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
pub struct MockSensorBus;
|
||||
impl SensorBus for MockSensorBus {
|
||||
impl SensorBus for MockSal {
|
||||
fn get_temp(&self) -> Result<f32> {
|
||||
Ok(42.0)
|
||||
}
|
||||
fn get_power_w(&self) -> Result<f32> {
|
||||
Ok(15.0)
|
||||
}
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>> {
|
||||
Ok(vec![2500])
|
||||
}
|
||||
fn get_freq_mhz(&self) -> Result<f32> {
|
||||
Ok(3200.0)
|
||||
let seq = self.temperature_sequence.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
Ok(40.0 + (seq as f32 * 0.5).min(55.0))
|
||||
}
|
||||
fn get_power_w(&self) -> Result<f32> { Ok(15.0) }
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>> { Ok(vec![2500, 2400]) }
|
||||
fn get_freq_mhz(&self) -> Result<f32> { Ok(3200.0) }
|
||||
fn get_throttling_status(&self) -> Result<bool> { Ok(false) }
|
||||
}
|
||||
|
||||
pub struct MockActuatorBus;
|
||||
impl ActuatorBus for MockActuatorBus {
|
||||
fn set_fan_mode(&self, _mode: &str) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn set_sustained_power_limit(&self, _watts: f32) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
fn set_burst_power_limit(&self, _watts: f32) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
impl ActuatorBus for MockSal {
|
||||
fn set_fan_mode(&self, _mode: &str) -> Result<()> { Ok(()) }
|
||||
fn set_fan_speed(&self, _speed: FanSpeedPercent) -> Result<()> { Ok(()) }
|
||||
fn set_sustained_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) }
|
||||
fn set_burst_power_limit(&self, _limit: PowerLimitWatts) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
pub struct MockWatchdog;
|
||||
impl HardwareWatchdog for MockWatchdog {
|
||||
fn check_emergency(&self) -> Result<bool> {
|
||||
Ok(false)
|
||||
}
|
||||
impl HardwareWatchdog for MockSal {
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus> { Ok(SafetyStatus::Nominal) }
|
||||
}
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
pub mod traits;
|
||||
pub mod mock;
|
||||
pub mod dell_xps_9380;
|
||||
pub mod generic_linux;
|
||||
pub mod heuristic;
|
||||
pub mod safety;
|
||||
pub mod discovery;
|
||||
|
||||
282
src/sal/safety.rs
Normal file
282
src/sal/safety.rs
Normal file
@@ -0,0 +1,282 @@
|
||||
//! # Hardware Safety & Universal Safeguard Architecture
|
||||
//!
|
||||
//! This module implements the core safety logic for `ember-tune`. It uses the Rust
|
||||
//! type system to enforce hardware bounds and RAII patterns to guarantee that
|
||||
//! the system is restored to a safe state even after a crash.
|
||||
|
||||
use anyhow::{Result, bail, Context};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::time::Duration;
|
||||
use std::thread;
|
||||
use tracing::{info, warn, error, debug};
|
||||
|
||||
use crate::sal::traits::SensorBus;
|
||||
|
||||
// --- 1. Type-Driven Bounds Checking ---
|
||||
|
||||
/// Represents a validated TDP limit in Watts.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
|
||||
pub struct PowerLimitWatts(f32);
|
||||
|
||||
impl PowerLimitWatts {
|
||||
/// Absolute safety floor. Setting TDP below 3W can induce system-wide
|
||||
/// CPU stalls and I/O deadlocks on certain Intel mobile chipsets.
|
||||
pub const MIN: f32 = 3.0;
|
||||
/// Safety ceiling for mobile thin-and-light chassis.
|
||||
pub const MAX: f32 = 100.0;
|
||||
|
||||
/// Validates and constructs a new PowerLimitWatts.
|
||||
pub fn try_new(watts: f32) -> Result<Self> {
|
||||
if watts < Self::MIN || watts > Self::MAX {
|
||||
bail!("HardwareSafetyError: Requested TDP {:.1}W is outside safe bounds ({:.1}W - {:.1}W).", watts, Self::MIN, Self::MAX);
|
||||
}
|
||||
Ok(Self(watts))
|
||||
}
|
||||
|
||||
pub fn from_watts(watts: f32) -> Result<Self> {
|
||||
Self::try_new(watts)
|
||||
}
|
||||
|
||||
pub fn get(&self) -> f32 { self.0 }
|
||||
pub fn as_microwatts(&self) -> u64 { (self.0 * 1_000_000.0) as u64 }
|
||||
}
|
||||
|
||||
/// Represents a validated fan speed percentage.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct FanSpeedPercent(u8);
|
||||
|
||||
impl FanSpeedPercent {
|
||||
pub fn try_new(percent: u8) -> Result<Self> {
|
||||
if percent > 100 {
|
||||
bail!("HardwareSafetyError: Fan speed {}% is invalid.", percent);
|
||||
}
|
||||
Ok(Self(percent))
|
||||
}
|
||||
|
||||
pub fn new(percent: u8) -> Result<Self> {
|
||||
Self::try_new(percent)
|
||||
}
|
||||
|
||||
pub fn get(&self) -> u8 { self.0 }
|
||||
}
|
||||
|
||||
/// Represents a thermal threshold in Celsius.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd)]
|
||||
pub struct ThermalThresholdCelsius(f32);
|
||||
|
||||
impl ThermalThresholdCelsius {
|
||||
pub const MAX_SAFE_C: f32 = 98.0;
|
||||
|
||||
pub fn try_new(celsius: f32) -> Result<Self> {
|
||||
if celsius > Self::MAX_SAFE_C {
|
||||
bail!("HardwareSafetyError: Thermal threshold {}C exceeds safe limit ({}C).", celsius, Self::MAX_SAFE_C);
|
||||
}
|
||||
Ok(Self(celsius))
|
||||
}
|
||||
|
||||
pub fn new(celsius: f32) -> Result<Self> {
|
||||
Self::try_new(celsius)
|
||||
}
|
||||
|
||||
pub fn get(&self) -> f32 { self.0 }
|
||||
}
|
||||
|
||||
// --- 2. The HardwareStateGuard (RAII Restorer) ---
|
||||
|
||||
/// Defines an arbitrary action to take during restoration.
|
||||
pub type RollbackAction = Box<dyn FnOnce() + Send + 'static>;
|
||||
|
||||
/// Holds a snapshot of the system state. Restores everything on Drop.
|
||||
/// This is the primary safety mechanism for Project Iron-Ember.
|
||||
pub struct HardwareStateGuard {
|
||||
/// Maps sysfs paths to their original string contents.
|
||||
snapshots: HashMap<PathBuf, String>,
|
||||
/// Services that were stopped and must be restarted.
|
||||
suppressed_services: Vec<String>,
|
||||
/// Arbitrary actions to perform on restoration (e.g., reset fan mode).
|
||||
rollback_actions: Vec<RollbackAction>,
|
||||
is_active: bool,
|
||||
}
|
||||
|
||||
impl HardwareStateGuard {
|
||||
/// Snapshots the requested files and neutralizes competing services.
|
||||
///
|
||||
/// # SAFETY:
|
||||
/// This MUST be acquired before any hardware mutation occurs.
|
||||
pub fn acquire(target_files: &[PathBuf], target_services: &[String]) -> Result<Self> {
|
||||
let mut snapshots = HashMap::new();
|
||||
let mut suppressed = Vec::new();
|
||||
|
||||
info!("USA: Arming HardwareStateGuard. Snapshotting critical registers...");
|
||||
|
||||
for path in target_files {
|
||||
if path.exists() {
|
||||
let content = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to snapshot {:?}", path))?;
|
||||
snapshots.insert(path.clone(), content.trim().to_string());
|
||||
} else {
|
||||
debug!("USA: Skipping snapshot for non-existent path {:?}", path);
|
||||
}
|
||||
}
|
||||
|
||||
for svc in target_services {
|
||||
// Check if service is active before stopping
|
||||
let status = std::process::Command::new("systemctl")
|
||||
.args(["is-active", "--quiet", svc])
|
||||
.status();
|
||||
|
||||
if let Ok(s) = status {
|
||||
if s.success() {
|
||||
info!("USA: Neutralizing service '{}'", svc);
|
||||
let _ = std::process::Command::new("systemctl").args(["stop", svc]).status();
|
||||
suppressed.push(svc.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
snapshots,
|
||||
suppressed_services: suppressed,
|
||||
rollback_actions: Vec::new(),
|
||||
is_active: true,
|
||||
})
|
||||
}
|
||||
|
||||
/// Registers a custom action to be performed when the guard is released.
|
||||
pub fn on_rollback(&mut self, action: RollbackAction) {
|
||||
self.rollback_actions.push(action);
|
||||
}
|
||||
|
||||
/// Explicitly release and restore the hardware state.
|
||||
pub fn release(&mut self) -> Result<()> {
|
||||
if !self.is_active { return Ok(()); }
|
||||
|
||||
info!("USA: Releasing guard. Restoring hardware to pre-flight state...");
|
||||
|
||||
// 1. Restore Power/Sysfs states
|
||||
for (path, content) in &self.snapshots {
|
||||
if let Err(e) = fs::write(path, content) {
|
||||
error!("CRITICAL: Failed to restore {:?}: {}", path, e);
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Restart Services
|
||||
for svc in &self.suppressed_services {
|
||||
let _ = std::process::Command::new("systemctl").args(["start", svc]).status();
|
||||
}
|
||||
|
||||
// 3. Perform Custom Rollback Actions
|
||||
for action in self.rollback_actions.drain(..) {
|
||||
(action)();
|
||||
}
|
||||
|
||||
self.is_active = false;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for HardwareStateGuard {
|
||||
fn drop(&mut self) {
|
||||
if self.is_active {
|
||||
warn!("USA: Guard dropped prematurely (panic/SIGTERM). Force-restoring system...");
|
||||
let _ = self.release();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- 3. The Active Watchdog ---
|
||||
|
||||
/// A standalone monitor that polls hardware thermals at high frequency.
|
||||
pub struct ThermalWatchdog {
|
||||
cancel_token: Arc<AtomicBool>,
|
||||
handle: Option<thread::JoinHandle<()>>,
|
||||
}
|
||||
|
||||
impl ThermalWatchdog {
|
||||
/// If temperature exceeds this ceiling, the watchdog triggers an emergency shutdown.
|
||||
pub const CRITICAL_TEMP: f32 = 95.0;
|
||||
/// High polling rate ensures we catch runaways before chassis saturation.
|
||||
pub const POLL_INTERVAL: Duration = Duration::from_millis(250);
|
||||
|
||||
/// Spawns the watchdog thread.
|
||||
pub fn spawn(sensors: Arc<dyn SensorBus>, cancel_token: Arc<AtomicBool>) -> Self {
|
||||
let ct = cancel_token.clone();
|
||||
let handle = thread::spawn(move || {
|
||||
let mut last_temp = 0.0;
|
||||
loop {
|
||||
if ct.load(Ordering::SeqCst) {
|
||||
debug!("Watchdog: Shutdown signal received.");
|
||||
break;
|
||||
}
|
||||
|
||||
match sensors.get_temp() {
|
||||
Ok(temp) => {
|
||||
// Rate of change check (dT/dt)
|
||||
let dt_dt = temp - last_temp;
|
||||
if temp >= Self::CRITICAL_TEMP {
|
||||
error!("WATCHDOG: CRITICAL THERMAL EVENT ({:.1}C). Triggering emergency abort!", temp);
|
||||
ct.store(true, Ordering::SeqCst);
|
||||
break;
|
||||
}
|
||||
|
||||
if dt_dt > 5.0 && temp > 85.0 {
|
||||
warn!("WATCHDOG: Dangerous thermal ramp detected (+{:.1}C in 250ms).", dt_dt);
|
||||
}
|
||||
|
||||
last_temp = temp;
|
||||
}
|
||||
Err(e) => {
|
||||
error!("WATCHDOG: Sensor read failure: {}. Aborting for safety!", e);
|
||||
ct.store(true, Ordering::SeqCst);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
thread::sleep(Self::POLL_INTERVAL);
|
||||
}
|
||||
});
|
||||
|
||||
Self {
|
||||
cancel_token,
|
||||
handle: Some(handle),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ThermalWatchdog {
|
||||
fn drop(&mut self) {
|
||||
self.cancel_token.store(true, Ordering::SeqCst);
|
||||
if let Some(h) = self.handle.take() {
|
||||
let _ = h.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// --- 4. Transactional Configuration ---
|
||||
|
||||
/// A staged set of changes to be applied to the hardware.
|
||||
#[derive(Default)]
|
||||
pub struct ConfigurationTransaction {
|
||||
changes: Vec<(PathBuf, String)>,
|
||||
}
|
||||
|
||||
impl ConfigurationTransaction {
|
||||
pub fn add_change(&mut self, path: PathBuf, value: String) {
|
||||
self.changes.push((path, value));
|
||||
}
|
||||
|
||||
/// # SAFETY:
|
||||
/// Commits all changes. If any write fails, it returns an error but the
|
||||
/// HardwareStateGuard will still restore everything on drop.
|
||||
pub fn commit(self) -> Result<()> {
|
||||
for (path, val) in self.changes {
|
||||
fs::write(&path, val)
|
||||
.with_context(|| format!("Failed to apply change to {:?}", path))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -1,43 +1,83 @@
|
||||
use anyhow::Result;
|
||||
use thiserror::Error;
|
||||
//! Core traits defining the System Abstraction Layer (SAL).
|
||||
//!
|
||||
//! This module provides a set of hardware-agnostic interfaces that the
|
||||
//! `BenchmarkOrchestrator` uses to interact with the underlying system.
|
||||
//! These traits allow `ember-tune` to support diverse hardware by abstracting
|
||||
//! away platform-specific details.
|
||||
|
||||
use miette::Diagnostic;
|
||||
use std::sync::Arc;
|
||||
use std::path::PathBuf;
|
||||
use crate::sys::SyscallRunner;
|
||||
use anyhow::Result;
|
||||
use thiserror::Error;
|
||||
|
||||
/// Context holding OS abstractions (filesystem base and syscall runner).
|
||||
///
|
||||
/// This is injected into SAL implementations to allow for a mocked "virtual"
|
||||
/// environment during testing, preventing `cargo test` from mutating the host system.
|
||||
#[derive(Clone)]
|
||||
pub struct EnvironmentCtx {
|
||||
pub sysfs_base: PathBuf,
|
||||
pub runner: Arc<dyn SyscallRunner>,
|
||||
}
|
||||
|
||||
impl EnvironmentCtx {
|
||||
/// Creates a production-ready context pointing to the real filesystem root.
|
||||
pub fn production() -> Self {
|
||||
Self {
|
||||
sysfs_base: PathBuf::from("/"),
|
||||
runner: Arc::new(crate::sys::RealSyscallRunner),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors that can occur during the pre-flight system audit.
|
||||
#[derive(Error, Diagnostic, Debug, Clone)]
|
||||
pub enum AuditError {
|
||||
/// The user does not have root privileges (`uid=0`).
|
||||
#[error("Missing root privileges.")]
|
||||
#[diagnostic(code(ember_tune::root_required), severity(error))]
|
||||
#[help("ember-tune requires direct hardware access (MSRs, sysfs). Please run with 'sudo'.")]
|
||||
RootRequired,
|
||||
|
||||
/// A required kernel parameter is missing from the boot command line.
|
||||
#[error("Missing kernel parameter: {0}")]
|
||||
#[diagnostic(code(ember_tune::missing_kernel_param), severity(error))]
|
||||
#[help("Add '{0}' to your GRUB_CMDLINE_LINUX_DEFAULT in /etc/default/grub, then run 'sudo update-grub' and reboot.")]
|
||||
MissingKernelParam(String),
|
||||
|
||||
/// The system is running on battery power.
|
||||
#[error("System is running on battery: {0}")]
|
||||
#[diagnostic(code(ember_tune::ac_power_missing), severity(error))]
|
||||
#[help("Thermal benchmarking requires a stable AC power source to ensure consistent PL limits. Please plug in your charger.")]
|
||||
AcPowerMissing(String),
|
||||
|
||||
/// The Linux kernel version is known to be incompatible.
|
||||
#[error("Incompatible kernel version: {0}")]
|
||||
#[diagnostic(code(ember_tune::kernel_incompatible), severity(error))]
|
||||
#[help("Your kernel version '{0}' may not support the required RAPL or SMM interfaces. Please upgrade to a recent LTS kernel (6.1+).")]
|
||||
KernelIncompatible(String),
|
||||
|
||||
/// A required kernel module or CLI tool is not available.
|
||||
#[error("Required tool missing: {0}")]
|
||||
#[diagnostic(code(ember_tune::tool_missing), severity(error))]
|
||||
#[help("The utility '{0}' is required for this SAL. Please install it using your package manager (e.g., 'sudo apt install {0}').")]
|
||||
ToolMissing(String),
|
||||
}
|
||||
|
||||
/// A single, verifiable step in the pre-flight audit process.
|
||||
pub struct AuditStep {
|
||||
/// Human-readable description of the check.
|
||||
pub description: String,
|
||||
/// The outcome of the check.
|
||||
pub outcome: Result<(), AuditError>,
|
||||
}
|
||||
|
||||
/// Evaluates immutable system states (e.g., kernel bootline parameters, AC power status).
|
||||
/// Evaluates immutable system states before the benchmark begins.
|
||||
pub trait PreflightAuditor: Send + Sync {
|
||||
/// Returns an iterator of [AuditStep] results.
|
||||
/// This allows the UI to show a live checklist of system verification steps.
|
||||
fn audit(&self) -> Box<dyn Iterator<Item = AuditStep> + '_>;
|
||||
}
|
||||
|
||||
@@ -47,61 +87,118 @@ impl<T: PreflightAuditor + ?Sized> PreflightAuditor for Arc<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Suppresses conflicting daemons (tlp, thermald).
|
||||
pub trait EnvironmentGuard {
|
||||
fn suppress(&mut self) -> Result<()>;
|
||||
fn restore(&mut self) -> Result<()>;
|
||||
/// Manages system services that conflict with the benchmark.
|
||||
///
|
||||
/// # Invariants
|
||||
/// The `Drop` trait is *not* used for guaranteed cleanup. The orchestrator must
|
||||
/// explicitly call `restore()` to ensure hardware state is reset.
|
||||
pub trait EnvironmentGuard: Send + Sync {
|
||||
/// Stops any conflicting system daemons (e.g., `tlp`, `thermald`).
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the `systemctl` command fails.
|
||||
fn suppress(&self) -> Result<()>;
|
||||
|
||||
/// Restarts any services that were stopped by `suppress`.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns an error if the `systemctl` command fails.
|
||||
fn restore(&self) -> Result<()>;
|
||||
}
|
||||
|
||||
/// Read-only interface for standardized metrics.
|
||||
impl<T: EnvironmentGuard + ?Sized> EnvironmentGuard for Arc<T> {
|
||||
fn suppress(&self) -> Result<()> {
|
||||
(**self).suppress()
|
||||
}
|
||||
fn restore(&self) -> Result<()> {
|
||||
(**self).restore()
|
||||
}
|
||||
}
|
||||
|
||||
use crate::sal::safety::{PowerLimitWatts, FanSpeedPercent};
|
||||
|
||||
/// Provides a read-only interface to system telemetry sensors.
|
||||
pub trait SensorBus: Send + Sync {
|
||||
/// Returns the current package temperature in degrees Celsius.
|
||||
fn get_temp(&self) -> Result<f32>;
|
||||
|
||||
/// Returns the current package power consumption in Watts.
|
||||
fn get_power_w(&self) -> Result<f32>;
|
||||
|
||||
/// Returns the current speed of all detected fans in RPM.
|
||||
fn get_fan_rpms(&self) -> Result<Vec<u32>>;
|
||||
|
||||
/// Returns the current average CPU frequency in MHz.
|
||||
fn get_freq_mhz(&self) -> Result<f32>;
|
||||
|
||||
/// Returns true if the system is currently thermally throttling.
|
||||
fn get_throttling_status(&self) -> Result<bool>;
|
||||
}
|
||||
|
||||
/// Forwards every [`SensorBus`] method through an `Arc` to the wrapped
/// implementation.
impl<T: SensorBus + ?Sized> SensorBus for Arc<T> {
    fn get_temp(&self) -> Result<f32> {
        (**self).get_temp()
    }

    fn get_power_w(&self) -> Result<f32> {
        (**self).get_power_w()
    }

    fn get_fan_rpms(&self) -> Result<Vec<u32>> {
        (**self).get_fan_rpms()
    }

    fn get_freq_mhz(&self) -> Result<f32> {
        (**self).get_freq_mhz()
    }

    fn get_throttling_status(&self) -> Result<bool> {
        (**self).get_throttling_status()
    }
}
|
||||
|
||||
/// Write-only interface for hardware commands.
|
||||
pub trait ActuatorBus {
|
||||
/// Provides a write-only interface for hardware actuators.
|
||||
pub trait ActuatorBus: Send + Sync {
|
||||
/// Sets the fan control mode (e.g., "auto" or "max").
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()>;
|
||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()>;
|
||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()>;
|
||||
|
||||
/// Sets the fan speed directly using a validated percentage.
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()>;
|
||||
|
||||
/// Sets the sustained power limit (PL1) using a validated wrapper.
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()>;
|
||||
|
||||
/// Sets the burst power limit (PL2) using a validated wrapper.
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()>;
|
||||
}
|
||||
|
||||
impl<T: ActuatorBus + ?Sized> ActuatorBus for Arc<T> {
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()> {
|
||||
(**self).set_fan_mode(mode)
|
||||
}
|
||||
fn set_sustained_power_limit(&self, watts: f32) -> Result<()> {
|
||||
(**self).set_sustained_power_limit(watts)
|
||||
}
|
||||
fn set_burst_power_limit(&self, watts: f32) -> Result<()> {
|
||||
(**self).set_burst_power_limit(watts)
|
||||
}
|
||||
fn set_fan_mode(&self, mode: &str) -> Result<()> { (**self).set_fan_mode(mode) }
|
||||
fn set_fan_speed(&self, speed: FanSpeedPercent) -> Result<()> { (**self).set_fan_speed(speed) }
|
||||
fn set_sustained_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_sustained_power_limit(limit) }
|
||||
fn set_burst_power_limit(&self, limit: PowerLimitWatts) -> Result<()> { (**self).set_burst_power_limit(limit) }
|
||||
}
|
||||
|
||||
/// Concurrent monitor for catastrophic states.
|
||||
pub trait HardwareWatchdog {
|
||||
fn check_emergency(&self) -> Result<bool>;
|
||||
/// Represents the high-level safety status of the system.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum SafetyStatus {
|
||||
/// The system is operating within normal parameters.
|
||||
Nominal,
|
||||
/// A non-critical issue was detected and may have been auto-corrected.
|
||||
Warning(String),
|
||||
/// A potentially dangerous state was detected, but is not yet an emergency.
|
||||
Critical(String),
|
||||
/// A critical failure has occurred, requiring an immediate shutdown of the benchmark.
|
||||
EmergencyAbort(String),
|
||||
}
|
||||
|
||||
/// A high-frequency monitor for catastrophic hardware states.
|
||||
pub trait HardwareWatchdog: Send + Sync {
|
||||
/// Returns the current [SafetyStatus] of the system.
|
||||
///
|
||||
/// # Errors
|
||||
/// This method can return an error if a sensor required for a safety check
|
||||
/// (e.g., the thermal sensor) fails to read. The orchestrator must treat
|
||||
/// this as an `EmergencyAbort` condition.
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus>;
|
||||
}
|
||||
|
||||
impl<T: HardwareWatchdog + ?Sized> HardwareWatchdog for Arc<T> {
|
||||
fn check_emergency(&self) -> Result<bool> {
|
||||
(**self).check_emergency()
|
||||
fn get_safety_status(&self) -> Result<SafetyStatus> {
|
||||
(**self).get_safety_status()
|
||||
}
|
||||
}
|
||||
|
||||
/// Aggregate trait for a complete platform implementation.
|
||||
///
|
||||
/// This "super-trait" combines all SAL interfaces into a single object-safe
|
||||
/// trait, simplifying dependency injection into the `BenchmarkOrchestrator`.
|
||||
pub trait PlatformSal: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog {}
|
||||
|
||||
impl<T: PreflightAuditor + SensorBus + ActuatorBus + EnvironmentGuard + HardwareWatchdog + ?Sized> PlatformSal for T {}
|
||||
|
||||
56
src/sys/cmd.rs
Normal file
56
src/sys/cmd.rs
Normal file
@@ -0,0 +1,56 @@
|
||||
use anyhow::{Result, anyhow};
|
||||
use std::process::Command;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
|
||||
/// Trait for executing system commands. Allows mocking for tests.
|
||||
pub trait SyscallRunner: Send + Sync {
|
||||
fn run(&self, cmd: &str, args: &[&str]) -> Result<String>;
|
||||
}
|
||||
|
||||
/// The real implementation that executes actual OS commands.
|
||||
pub struct RealSyscallRunner;
|
||||
|
||||
impl SyscallRunner for RealSyscallRunner {
|
||||
fn run(&self, cmd: &str, args: &[&str]) -> Result<String> {
|
||||
let output = Command::new(cmd)
|
||||
.args(args)
|
||||
.output()?;
|
||||
|
||||
if output.status.success() {
|
||||
Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
|
||||
} else {
|
||||
let err = String::from_utf8_lossy(&output.stderr).trim().to_string();
|
||||
Err(anyhow!("Command failed: {} {:?} -> {}", cmd, args, err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A mocked implementation for isolated unit and E2E testing.
///
/// Responses are looked up by the full command line, so no real process is
/// ever spawned.
#[derive(Default)]
pub struct MockSyscallRunner {
    /// Maps "cmd arg1 arg2" to stdout response.
    responses: Mutex<HashMap<String, String>>,
}

impl MockSyscallRunner {
    /// Creates a runner with no canned responses (same as `Default::default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Registers `response` as the output for the exact command line `full_cmd`.
    ///
    /// `full_cmd` is the command and its arguments joined by single spaces
    /// (e.g. `"systemctl stop tlp"`). Registering the same command line again
    /// replaces the earlier response.
    ///
    /// # Panics
    /// Panics if the internal mutex is poisoned.
    pub fn set_response(&self, full_cmd: &str, response: &str) {
        self.responses
            .lock()
            .unwrap()
            .insert(full_cmd.to_string(), response.to_string());
    }
}
|
||||
|
||||
impl SyscallRunner for MockSyscallRunner {
|
||||
fn run(&self, cmd: &str, args: &[&str]) -> Result<String> {
|
||||
let full_cmd = format!("{} {}", cmd, args.join(" ")).trim().to_string();
|
||||
let responses = self.responses.lock().unwrap();
|
||||
|
||||
responses.get(&full_cmd)
|
||||
.cloned()
|
||||
.ok_or_else(|| anyhow!("No mocked response for command: '{}'", full_cmd))
|
||||
}
|
||||
}
|
||||
3
src/sys/mod.rs
Normal file
3
src/sys/mod.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
pub mod cmd;
|
||||
|
||||
pub use cmd::{SyscallRunner, RealSyscallRunner, MockSyscallRunner};
|
||||
@@ -5,21 +5,31 @@ use ratatui::{
|
||||
widgets::{Block, Borders, List, ListItem, Paragraph, Chart, Dataset, Axis, BorderType, GraphType},
|
||||
symbols::Marker,
|
||||
Frame,
|
||||
prelude::Stylize,
|
||||
};
|
||||
use std::collections::VecDeque;
|
||||
use crate::mediator::TelemetryState;
|
||||
use crate::ui::theme::*;
|
||||
|
||||
/// DashboardState maintains UI-specific state that isn't part of the core telemetry,
/// such as the accumulated diagnostic logs.
pub struct DashboardState {
    /// Diagnostic log lines, oldest at the front and newest at the back.
    pub logs: VecDeque<String>,
}
|
||||
|
||||
impl DashboardState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
logs: vec!["ember-tune Initialized.".to_string()],
|
||||
let mut logs = VecDeque::with_capacity(100);
|
||||
logs.push_back("ember-tune Initialized.".to_string());
|
||||
Self { logs }
|
||||
}
|
||||
|
||||
/// Adds a log message and ensures the buffer does not exceed capacity.
|
||||
pub fn add_log(&mut self, msg: String) {
|
||||
if self.logs.len() >= 100 {
|
||||
self.logs.pop_front();
|
||||
}
|
||||
self.logs.push_back(msg);
|
||||
}
|
||||
|
||||
/// Updates the UI state based on new telemetry.
|
||||
@@ -83,6 +93,55 @@ pub fn draw_dashboard(
|
||||
draw_freq_graph(f, right_side_chunks[2], state);
|
||||
|
||||
draw_logs(f, chunks[3], ui_state);
|
||||
|
||||
if state.is_emergency {
|
||||
draw_emergency_overlay(f, area, state);
|
||||
}
|
||||
}
|
||||
|
||||
fn draw_emergency_overlay(f: &mut Frame, area: Rect, state: &TelemetryState) {
|
||||
let block = Block::default()
|
||||
.borders(Borders::ALL)
|
||||
.border_type(BorderType::Double)
|
||||
.border_style(Style::default().fg(Color::Red).add_modifier(Modifier::BOLD))
|
||||
.bg(Color::Black)
|
||||
.title(" 🚨 EMERGENCY ABORT 🚨 ");
|
||||
|
||||
let area = centered_rect(60, 20, area);
|
||||
let inner = block.inner(area);
|
||||
f.render_widget(block, area);
|
||||
|
||||
let reason = state.emergency_reason.as_deref().unwrap_or("Unknown safety trigger");
|
||||
let text = vec![
|
||||
Line::from(vec![Span::styled("CRITICAL SAFETY LIMIT TRIGGERED", Style::default().fg(Color::Red).add_modifier(Modifier::BOLD))]),
|
||||
Line::from(""),
|
||||
Line::from(vec![Span::raw("Reason: "), Span::styled(reason, Style::default().fg(Color::Yellow))]),
|
||||
Line::from(""),
|
||||
Line::from("Hardware has been restored to safe defaults."),
|
||||
Line::from("Exiting in 1 second..."),
|
||||
];
|
||||
|
||||
f.render_widget(Paragraph::new(text).alignment(ratatui::layout::Alignment::Center), inner);
|
||||
}
|
||||
|
||||
fn centered_rect(percent_x: u16, percent_y: u16, r: Rect) -> Rect {
|
||||
let popup_layout = Layout::default()
|
||||
.direction(Direction::Vertical)
|
||||
.constraints([
|
||||
Constraint::Percentage((100 - percent_y) / 2),
|
||||
Constraint::Percentage(percent_y),
|
||||
Constraint::Percentage((100 - percent_y) / 2),
|
||||
])
|
||||
.split(r);
|
||||
|
||||
Layout::default()
|
||||
.direction(Direction::Horizontal)
|
||||
.constraints([
|
||||
Constraint::Percentage((100 - percent_x) / 2),
|
||||
Constraint::Percentage(percent_x),
|
||||
Constraint::Percentage((100 - percent_x) / 2),
|
||||
])
|
||||
.split(popup_layout[1])[1]
|
||||
}
|
||||
|
||||
fn draw_header(f: &mut Frame, area: Rect, state: &TelemetryState) {
|
||||
|
||||
55
tests/common/fakesys.rs
Normal file
55
tests/common/fakesys.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
use tempfile::TempDir;
|
||||
|
||||
pub struct FakeSysBuilder {
|
||||
temp_dir: TempDir,
|
||||
}
|
||||
|
||||
impl FakeSysBuilder {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
temp_dir: TempDir::new().expect("Failed to create temporary directory"),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn base_path(&self) -> PathBuf {
|
||||
self.temp_dir.path().to_path_buf()
|
||||
}
|
||||
|
||||
pub fn add_dmi(&self, vendor: &str, product: &str) -> &Self {
|
||||
let dmi_path = self.base_path().join("sys/class/dmi/id");
|
||||
fs::create_dir_all(&dmi_path).expect("Failed to create DMI directory");
|
||||
|
||||
fs::write(dmi_path.join("sys_vendor"), vendor).expect("Failed to write sys_vendor");
|
||||
fs::write(dmi_path.join("product_name"), product).expect("Failed to write product_name");
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_hwmon(&self, name: &str, temp_label: &str, temp_input: &str) -> &Self {
|
||||
let hwmon_path = self.base_path().join("sys/class/hwmon/hwmon0");
|
||||
fs::create_dir_all(&hwmon_path).expect("Failed to create hwmon directory");
|
||||
|
||||
fs::write(hwmon_path.join("name"), name).expect("Failed to write hwmon name");
|
||||
fs::write(hwmon_path.join("temp1_label"), temp_label).expect("Failed to write temp label");
|
||||
fs::write(hwmon_path.join("temp1_input"), temp_input).expect("Failed to write temp input");
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_rapl(&self, name: &str, energy_uj: &str, pl1_uw: &str) -> &Self {
|
||||
let rapl_path = self.base_path().join("sys/class/powercap/intel-rapl:0");
|
||||
fs::create_dir_all(&rapl_path).expect("Failed to create RAPL directory");
|
||||
|
||||
fs::write(rapl_path.join("name"), name).expect("Failed to write RAPL name");
|
||||
fs::write(rapl_path.join("energy_uj"), energy_uj).expect("Failed to write energy_uj");
|
||||
fs::write(rapl_path.join("constraint_0_power_limit_uw"), pl1_uw).expect("Failed to write pl1_uw");
|
||||
self
|
||||
}
|
||||
|
||||
pub fn add_proc_cmdline(&self, cmdline: &str) -> &Self {
|
||||
let proc_path = self.base_path().join("proc");
|
||||
fs::create_dir_all(&proc_path).expect("Failed to create proc directory");
|
||||
fs::write(proc_path.join("cmdline"), cmdline).expect("Failed to write cmdline");
|
||||
self
|
||||
}
|
||||
}
|
||||
1
tests/common/mod.rs
Normal file
1
tests/common/mod.rs
Normal file
@@ -0,0 +1 @@
|
||||
pub mod fakesys;
|
||||
75
tests/config_merge_test.rs
Normal file
75
tests/config_merge_test.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
use ember_tune_rs::engine::formatters::throttled::{ThrottledConfig, ThrottledTranslator};
|
||||
use ember_tune_rs::agent_analyst::{OptimizationMatrix, SystemProfile, FanCurvePoint};
|
||||
use ember_tune_rs::agent_integrator::ServiceIntegrator;
|
||||
use std::fs;
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Merging new power limits into an existing throttled config must keep the
/// user's `[UNDERVOLT]` section while updating PL1/PL2 and the trip temperature.
#[test]
fn test_throttled_merge_preserves_undervolt() {
    let original_conf = r#"[GENERAL]
Update_Interval_ms: 1000

[UNDERVOLT]
# CPU core undervolt
CORE: -100
# GPU undervolt
GPU: -50

[AC]
PL1_Tdp_W: 15
PL2_Tdp_W: 25
"#;

    let tuned = ThrottledConfig {
        pl1_limit: 22.0,
        pl2_limit: 28.0,
        trip_temp: 95.0,
    };

    let merged = ThrottledTranslator::merge_conf(original_conf, &tuned);

    // Untouched user settings first, then the values the merge must rewrite.
    let expected_fragments = [
        "CORE: -100",
        "GPU: -50",
        "PL1_Tdp_W: 22",
        "PL2_Tdp_W: 28",
        "Trip_Temp_C: 95",
        "[UNDERVOLT]",
    ];
    for fragment in expected_fragments {
        assert!(merged.contains(fragment));
    }
}
|
||||
|
||||
/// Regenerating an i8kmon config in place must keep unrelated `set config(...)`
/// lines and replace the old fan-curve entry with the new one.
#[test]
fn test_i8kmon_merge_preserves_settings() {
    let dir = tempdir().unwrap();
    let config_path = dir.path().join("i8kmon.conf");

    let existing = r#"set config(gen_shadow) 1
set config(i8k_ignore_dmi) 1
set config(daemon) 1

set config(0) {0 0 60 50}
"#;
    fs::write(&config_path, existing).unwrap();

    let silent = SystemProfile {
        name: "Silent".to_string(),
        pl1_watts: 10.0,
        pl2_watts: 12.0,
        fan_curve: vec![],
    };
    let balanced = SystemProfile {
        name: "Balanced".to_string(),
        pl1_watts: 20.0,
        pl2_watts: 25.0,
        fan_curve: vec![FanCurvePoint {
            temp_on: 70.0,
            temp_off: 60.0,
            pwm_percent: 50,
        }],
    };
    let performance = SystemProfile {
        name: "Perf".to_string(),
        pl1_watts: 30.0,
        pl2_watts: 35.0,
        fan_curve: vec![],
    };
    let matrix = OptimizationMatrix {
        silent,
        balanced,
        performance,
        thermal_resistance_kw: 1.5,
        ambient_temp: 25.0,
    };

    // The same path is passed as output and as the optional second path argument.
    ServiceIntegrator::generate_i8kmon_config(&matrix, &config_path, Some(&config_path)).unwrap();

    let result = fs::read_to_string(&config_path).unwrap();

    assert!(result.contains("set config(gen_shadow) 1"));
    assert!(result.contains("set config(daemon) 1"));
    // The new fan-curve entry is present...
    assert!(result.contains("set config(0) {1 1 70 -}"));
    // ...and the old one is gone.
    assert!(!result.contains("set config(0) {0 0 60 50}"));
}
|
||||
48
tests/heuristic_discovery_test.rs
Normal file
48
tests/heuristic_discovery_test.rs
Normal file
@@ -0,0 +1,48 @@
|
||||
use ember_tune_rs::sal::heuristic::discovery::discover_facts;
|
||||
use ember_tune_rs::sal::heuristic::schema::{Discovery, SensorDiscovery, ActuatorDiscovery, Benchmarking};
|
||||
use ember_tune_rs::sys::MockSyscallRunner;
|
||||
use crate::common::fakesys::FakeSysBuilder;
|
||||
|
||||
mod common;
|
||||
|
||||
/// Runs fact discovery against a fake sysfs/procfs tree and checks that the
/// DMI identity, the temperature sensor path and the RAPL path are found.
#[test]
fn test_heuristic_discovery_with_fakesys() {
    let fake = FakeSysBuilder::new();
    fake.add_dmi("Dell Inc.", "XPS 13 9380")
        .add_hwmon("dell_smm", "Package id 0", "45000")
        .add_rapl("intel-rapl:0", "123456", "15000000")
        .add_proc_cmdline("quiet msr.allow_writes=on");

    let sensors = SensorDiscovery {
        temp_labels: vec!["Package id 0".to_string()],
        fan_labels: vec![],
        hwmon_priority: vec!["dell_smm".to_string()],
    };
    let actuators = ActuatorDiscovery {
        rapl_paths: vec!["intel-rapl:0".to_string()],
        amd_energy_paths: vec![],
        governor_files: vec![],
    };
    let discovery = Discovery {
        sensors,
        actuators,
        configs: std::collections::HashMap::new(),
        tools: std::collections::HashMap::new(),
    };

    let benchmarking = Benchmarking {
        idle_duration_s: 1,
        stress_duration_min_s: 1,
        stress_duration_max_s: 2,
        cool_down_s: 1,
        power_steps_watts: vec![10.0, 15.0],
    };

    // No command responses are registered on the mock runner.
    let runner = MockSyscallRunner::new();

    let facts = discover_facts(&fake.base_path(), &runner, &discovery, &[], benchmarking);

    assert_eq!(facts.vendor, "Dell Inc.");
    assert_eq!(facts.model, "XPS 13 9380");
    assert!(facts.temp_path.is_some());
    assert!(facts.temp_path.unwrap().to_string_lossy().contains("hwmon0/temp1_input"));
    assert_eq!(facts.rapl_paths.len(), 1);
}
|
||||
46
tests/orchestrator_e2e_test.rs
Normal file
46
tests/orchestrator_e2e_test.rs
Normal file
@@ -0,0 +1,46 @@
|
||||
use ember_tune_rs::orchestrator::BenchmarkOrchestrator;
|
||||
use ember_tune_rs::sal::mock::MockSal;
|
||||
use ember_tune_rs::sal::heuristic::discovery::SystemFactSheet;
|
||||
use ember_tune_rs::load::{Workload, IntensityProfile, WorkloadMetrics};
|
||||
use std::time::Duration;
|
||||
use anyhow::Result;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::Arc;
|
||||
|
||||
struct MockWorkload;
|
||||
impl Workload for MockWorkload {
|
||||
fn initialize(&mut self) -> Result<()> { Ok(()) }
|
||||
fn run_workload(&mut self, _duration: Duration, _profile: IntensityProfile) -> Result<()> { Ok(()) }
|
||||
fn get_current_metrics(&self) -> Result<WorkloadMetrics> {
|
||||
Ok(WorkloadMetrics {
|
||||
primary_ops_per_sec: 100.0,
|
||||
elapsed_time: Duration::from_secs(1),
|
||||
})
|
||||
}
|
||||
fn stop_workload(&mut self) -> Result<()> { Ok(()) }
|
||||
}
|
||||
|
||||
/// Builds a `BenchmarkOrchestrator` entirely from mocks and checks the
/// silicon-knee value of the result it generates.
#[test]
fn test_orchestrator_e2e_state_machine() {
    // The unused channel ends stay bound so the channels remain open.
    let (telemetry_tx, _telemetry_rx) = mpsc::channel();
    let (_command_tx, command_rx) = mpsc::channel();

    let orchestrator = BenchmarkOrchestrator::new(
        Arc::new(MockSal::new()),
        SystemFactSheet::default(),
        Box::new(MockWorkload),
        telemetry_tx,
        command_rx,
        None,
    );

    // This demonstrates dependency injection with mocks; the benchmark
    // itself is not run. Only the generated result is inspected.
    let result = orchestrator.generate_result(false);
    assert_eq!(result.silicon_knee_watts, 15.0);
}
|
||||
53
tests/safety_test.rs
Normal file
53
tests/safety_test.rs
Normal file
@@ -0,0 +1,53 @@
|
||||
use ember_tune_rs::sal::safety::{HardwareStateGuard, PowerLimitWatts};
|
||||
use crate::common::fakesys::FakeSysBuilder;
|
||||
use std::fs;
|
||||
|
||||
mod common;
|
||||
|
||||
/// A file modified while a `HardwareStateGuard` is alive must hold its
/// original contents again once the guard is dropped.
#[test]
fn test_hardware_state_guard_panic_restoration() {
    let fake = FakeSysBuilder::new();
    let pl1_path = fake
        .base_path()
        .join("sys/class/powercap/intel-rapl:0/constraint_0_power_limit_uw");

    // Original limit: 15 W, expressed in microwatts.
    fake.add_rapl("intel-rapl:0", "1000", "15000000");

    let target_files = vec![pl1_path.clone()];

    {
        let _guard = HardwareStateGuard::acquire(&target_files, &[]).expect("Failed to acquire guard");

        // Overwrite the limit while the guard is alive.
        fs::write(&pl1_path, "25000000").expect("Failed to write new value");
        let modified = fs::read_to_string(&pl1_path).unwrap();
        assert_eq!(modified.trim(), "25000000");

        // The guard is dropped at the end of this scope. No panic is actually
        // raised; the scope exit stands in for one.
    }

    let restored = fs::read_to_string(&pl1_path).expect("Failed to read restored file");
    assert_eq!(restored.trim(), "15000000");
}
|
||||
|
||||
/// `PowerLimitWatts::try_new` accepts an in-range value and rejects values
/// that are too low or too high with an "outside safe bounds" error.
#[test]
fn test_tdp_limit_bounds_checking() {
    // In range.
    assert!(PowerLimitWatts::try_new(15.0).is_ok());

    // First too low (dangerous 0 W territory / below 3 W), then too high (> 100 W).
    for out_of_range in [1.0, 150.0] {
        let rejected = PowerLimitWatts::try_new(out_of_range);
        assert!(rejected.is_err());
        assert!(rejected.unwrap_err().to_string().contains("outside safe bounds"));
    }
}
|
||||
|
||||
/// Regression guard: a 0 W power limit must never be constructible.
#[test]
fn test_0w_tdp_regression_prevention() {
    let rejected = PowerLimitWatts::try_new(0.0).is_err();
    assert!(rejected);
}
|
||||
Reference in New Issue
Block a user