// ember-tune-rs/src/orchestrator/mod.rs

//! The central state machine responsible for coordinating the thermal benchmark.
//!
//! It manages hardware interactions through the [PlatformSal], generates stress
//! using a [Workload], and feeds telemetry to the frontend via MPSC channels.

use anyhow::{Result, Context};
use std::sync::mpsc;
use std::time::{Duration, Instant};
use std::thread;
use std::collections::VecDeque;
use sysinfo::System;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;
use std::path::PathBuf;

use crate::sal::traits::{PlatformSal, AuditStep, SafetyStatus};
use crate::sal::heuristic::discovery::SystemFactSheet;
use crate::sal::safety::{HardwareStateGuard, TdpLimitMicroWatts};
use crate::load::{Workload, IntensityProfile};
use crate::mediator::{TelemetryState, UiCommand, BenchmarkPhase};
use crate::engine::{OptimizerEngine, ThermalProfile, ThermalPoint, OptimizationResult};

/// The central state machine responsible for coordinating the thermal benchmark.
///
/// It manages hardware interactions through the [PlatformSal], generates stress
/// using a [Workload], and feeds telemetry to the frontend via MPSC channels.
///
/// Build an instance with `new` and drive it with `run`; `run` performs the
/// benchmark phases and then the hardware cleanup.
pub struct BenchmarkOrchestrator {
    /// Injected hardware abstraction layer. Held in an `Arc` because the
    /// watchdog thread spawned by `spawn_watchdog_monitor` keeps its own clone.
    sal: Arc<dyn PlatformSal>,
    /// Discovered system facts and paths (RAPL paths, config file paths and
    /// the optional benchmark configuration read by `execute_benchmark`).
    facts: SystemFactSheet,
    /// Heat generation workload; started once per power step and stopped
    /// again both after each step and during cleanup in `run`.
    workload: Box<dyn Workload>,
    /// Channel for sending telemetry updates to the UI. The watchdog thread
    /// holds a clone so it can forward warnings on its own.
    telemetry_tx: mpsc::Sender<TelemetryState>,
    /// Channel for receiving commands from the UI; polled without blocking
    /// by `check_abort`.
    command_rx: mpsc::Receiver<UiCommand>,
    /// Current phase of the benchmark; copied into every outgoing
    /// `TelemetryState`.
    phase: BenchmarkPhase,
    /// Accumulated thermal data points (one per completed power step) plus
    /// the idle baseline temperature.
    profile: ThermalProfile,
    /// Mathematics engine for data smoothing and optimization.
    engine: OptimizerEngine,
    /// CLI override for the configuration output path. When set, it takes
    /// precedence over the discovered "throttled" config path.
    optional_config_out: Option<PathBuf>,

    /// The safety membrane protecting the system. `None` until
    /// `execute_benchmark` acquires it; `run` takes and releases it during
    /// cleanup.
    safeguard: Option<HardwareStateGuard>,

    /// Sliding window of power readings (Watts); `send_telemetry` keeps at
    /// most 120 samples.
    history_watts: VecDeque<f32>,
    /// Sliding window of temperature readings (Celsius); `send_telemetry`
    /// keeps at most 120 samples.
    history_temp: VecDeque<f32>,
    /// Sliding window of CPU frequency (MHz); `send_telemetry` keeps at most
    /// 120 samples.
    history_mhz: VecDeque<f32>,

    /// Detected CPU model string ("Unknown CPU" when no CPU is reported).
    cpu_model: String,
    /// Total system RAM in Gigabytes (integer division, so rounded down).
    total_ram_gb: u64,

    /// Atomic flag indicating a safety-triggered abort. Set by the watchdog
    /// thread and read by `check_abort`.
    emergency_abort: Arc<AtomicBool>,
    /// Human-readable reason for the emergency abort; written by the
    /// watchdog thread before it raises `emergency_abort`.
    emergency_reason: Arc<Mutex<Option<String>>>,
}

impl BenchmarkOrchestrator {
    /// Builds an orchestrator from its injected dependencies.
    ///
    /// The CPU brand string and total RAM are probed once via `sysinfo` and
    /// cached for later telemetry. The instance starts in
    /// [`BenchmarkPhase::Auditing`] with an empty thermal profile, empty
    /// history windows, no safeguard armed and the emergency flag cleared.
    pub fn new(
        sal: Arc<dyn PlatformSal>,
        facts: SystemFactSheet,
        workload: Box<dyn Workload>,
        telemetry_tx: mpsc::Sender<TelemetryState>,
        command_rx: mpsc::Receiver<UiCommand>,
        optional_config_out: Option<PathBuf>,
    ) -> Self {
        // Same result as dividing by 1024 three times in a row.
        const RAM_DIVISOR: u64 = 1024 * 1024 * 1024;
        // Initial allocation for each telemetry history window.
        const HISTORY_CAPACITY: usize = 120;

        let mut probe = System::new_all();
        probe.refresh_all();

        let cpu_model = match probe.cpus().first() {
            Some(cpu) => cpu.brand().to_string(),
            None => "Unknown CPU".to_string(),
        };
        let total_ram_gb = probe.total_memory() / RAM_DIVISOR;

        Self {
            sal,
            facts,
            workload,
            telemetry_tx,
            command_rx,
            phase: BenchmarkPhase::Auditing,
            profile: ThermalProfile::default(),
            engine: OptimizerEngine::new(5),
            optional_config_out,
            safeguard: None,
            history_watts: VecDeque::with_capacity(HISTORY_CAPACITY),
            history_temp: VecDeque::with_capacity(HISTORY_CAPACITY),
            history_mhz: VecDeque::with_capacity(HISTORY_CAPACITY),
            cpu_model,
            total_ram_gb,
            emergency_abort: Arc::new(AtomicBool::new(false)),
            emergency_reason: Arc::new(Mutex::new(None)),
        }
    }

    /// Executes the full benchmark sequence.
    ///
    /// This method guarantees that [crate::sal::traits::EnvironmentGuard::restore] and [Workload::stop_workload]
    /// are called regardless of whether the benchmark succeeds or fails.
    ///
    /// Once the benchmark has started, cleanup always runs in full: the
    /// workload is stopped, the safeguard (if armed) is released and the SAL
    /// is asked to restore the hardware state. Log messages emitted during
    /// cleanup are best-effort, so a closed telemetry channel cannot prevent
    /// the hardware from being restored.
    ///
    /// # Errors
    ///
    /// * The initial log message could not be delivered (nothing has been
    ///   started at that point).
    /// * Releasing the safeguard and/or restoring the hardware state failed;
    ///   these "CRITICAL" errors take precedence over the benchmark outcome.
    /// * Otherwise, whatever error the benchmark itself produced.
    pub fn run(&mut self) -> Result<OptimizationResult> {
        self.log("Starting ember-tune Benchmark Sequence.")?;

        // NOTE(review): the watchdog loop only exits once `emergency_abort`
        // is set, so after a normal run the detached thread keeps polling.
        // Stopping it needs a dedicated shutdown flag — confirm and add one.
        let _watchdog_handle = self.spawn_watchdog_monitor();

        let result = self.execute_benchmark();

        // --- MANDATORY CLEANUP ---
        // From here on nothing may return early before every restoration step
        // has been attempted. Logging in particular must not use `?`: the
        // benchmark may have failed precisely because the UI went away.
        let _ = self.log("Benchmark sequence finished. Restoring hardware defaults...");
        let _ = self.workload.stop_workload();

        let guard_failure = self
            .safeguard
            .take()
            .and_then(|mut sg| sg.release().err())
            .map(|e| format!("CRITICAL: USA Restoration Failure: {}", e));

        // Attempted even when the safeguard release failed, so that one
        // broken restoration path does not block the other.
        let restore_failure = self
            .sal
            .restore()
            .err()
            .map(|e| format!("CRITICAL: Failed to restore hardware state: {}", e));

        match (guard_failure, restore_failure) {
            (Some(guard_msg), Some(restore_msg)) => {
                anyhow::bail!("{}; {}", guard_msg, restore_msg)
            }
            (Some(msg), None) | (None, Some(msg)) => anyhow::bail!("{}", msg),
            (None, None) => {}
        }

        let _ = self.log("✓ Hardware state restored.");

        result
    }

    /// Internal execution logic for the benchmark phases.
    fn execute_benchmark(&mut self) -> Result<OptimizationResult> {
        let bench_cfg = self.facts.bench_config.clone().context("Benchmarking config missing in facts")?;

        // 1. Snapshot & Arm Safeguard
        let mut target_files = self.facts.rapl_paths.iter()
            .map(|p| p.join("constraint_0_power_limit_uw"))
            .collect::<Vec<_>>();
        target_files.extend(self.facts.rapl_paths.iter().map(|p| p.join("constraint_1_power_limit_uw")));
        if let Some(tp) = self.facts.paths.configs.get("throttled") {
            target_files.push(tp.clone());
        }

        let target_services = vec!["tlp.service".to_string(), "thermald.service".to_string(), "throttled.service".to_string()];
        self.safeguard = Some(HardwareStateGuard::acquire(&target_files, &target_services)?);

        // Phase 1: Audit & Baseline
        self.phase = BenchmarkPhase::Auditing;
        for step in self.sal.audit() {
            if let Err(e) = step.outcome {
                return Err(anyhow::anyhow!("Audit failed ({}): {:?}", step.description, e));
            }
        }

        self.workload.initialize().context("Failed to initialize workload")?;
        self.log("Suppressing background services (tlp, thermald)...")?;
        self.sal.suppress().context("Failed to suppress background services")?;

        // Baseline (Idle Calibration)
        self.phase = BenchmarkPhase::IdleCalibration;
        self.log(&format!("Phase 1: Recording Idle Baseline ({}s)...", bench_cfg.idle_duration_s))?;
        self.sal.set_fan_mode("auto")?;

        let mut idle_temps = Vec::new();
        let start = Instant::now();
        let mut tick = 0;
        while start.elapsed() < Duration::from_secs(bench_cfg.idle_duration_s) {
            self.check_abort()?;
            self.send_telemetry(tick)?;
            idle_temps.push(self.sal.get_temp().unwrap_or(0.0));
            tick += 1;
            thread::sleep(Duration::from_millis(500));
        }
        self.profile.ambient_temp = self.engine.smooth(&idle_temps).last().cloned().unwrap_or(0.0);
        self.log(&format!("✓ Idle Baseline: {:.1}°C", self.profile.ambient_temp))?;

        // Phase 2: Stress Stepping
        self.phase = BenchmarkPhase::StressTesting;
        self.log("Phase 2: Starting Synthetic Stress Matrix.")?;
        self.sal.set_fan_mode("max")?;

        let steps = bench_cfg.power_steps_watts.clone();
        for &pl in &steps {
            self.log(&format!("Testing PL1 = {:.0}W...", pl))?;

            let pl1_uw = crate::sal::safety::TdpLimitMicroWatts::new((pl * 1_000_000.0) as u64)?;
            let pl2_uw = crate::sal::safety::TdpLimitMicroWatts::new(((pl + 5.0) * 1_000_000.0) as u64)?;
            self.sal.set_sustained_power_limit(pl1_uw)?;
            self.sal.set_burst_power_limit(pl2_uw)?;

            self.workload.run_workload(
                Duration::from_secs(bench_cfg.stress_duration_max_s),
                IntensityProfile { threads: num_cpus::get(), load_percentage: 100 }
            )?;

            let step_start = Instant::now();
            let mut step_temps = VecDeque::with_capacity(30);

            while step_start.elapsed() < Duration::from_secs(bench_cfg.stress_duration_max_s) {
                self.check_abort()?;

                let t = self.sal.get_temp().unwrap_or(0.0);
                step_temps.push_back(t);
                if step_temps.len() > 10 { step_temps.pop_front(); }

                self.send_telemetry(tick)?;
                tick += 1;

                if step_start.elapsed() > Duration::from_secs(bench_cfg.stress_duration_min_s) && step_temps.len() == 10 {
                    let min = step_temps.iter().fold(f32::MAX, |a, &b| a.min(b));
                    let max = step_temps.iter().fold(f32::MIN, |a, &b| a.max(b));
                    if (max - min) < 0.5 {
                        self.log(&format!("  Equilibrium reached at {:.1}°C", t))?;
                        break;
                    }
                }
                thread::sleep(Duration::from_millis(500));
            }

            // Record data point
            let avg_p = self.sal.get_power_w().unwrap_or(0.0);
            let avg_t = self.sal.get_temp().unwrap_or(0.0);
            let avg_f = self.sal.get_freq_mhz().unwrap_or(0.0);
            let fans = self.sal.get_fan_rpms().unwrap_or_default();
            let primary_fan = fans.first().cloned().unwrap_or(0);
            let metrics = self.workload.get_current_metrics().unwrap_or_default();

            self.profile.points.push(ThermalPoint {
                power_w: avg_p,
                temp_c: avg_t,
                freq_mhz: avg_f,
                fan_rpm: primary_fan,
                throughput: metrics.primary_ops_per_sec,
            });

            self.workload.stop_workload()?;
            self.log(&format!("  Step complete. Cooling down for {}s...", bench_cfg.cool_down_s))?;
            thread::sleep(Duration::from_secs(bench_cfg.cool_down_s));
        }

        // Phase 4: Physical Modeling
        self.phase = BenchmarkPhase::PhysicalModeling;
        self.log("Phase 3: Calculating Silicon Physical Sweet Spot...")?;

        let mut res = self.generate_result(false);

        self.log(&format!("✓ Thermal Resistance (Rθ): {:.3} K/W", res.thermal_resistance_kw))?;
        self.log(&format!("✓ Silicon Knee Found: {:.1} W", res.silicon_knee_watts))?;

        thread::sleep(Duration::from_secs(3));

        // Phase 5: Finalizing
        self.phase = BenchmarkPhase::Finalizing;
        self.log("Benchmark sequence complete. Generating configurations...")?;

        let config = crate::engine::formatters::throttled::ThrottledConfig {
            pl1_limit: res.silicon_knee_watts,
            pl2_limit: res.recommended_pl2,
            trip_temp: res.max_temp_c.max(95.0),
        };

        let throttled_path = self.optional_config_out.clone()
            .or_else(|| self.facts.paths.configs.get("throttled").cloned());

        if let Some(path) = throttled_path {
            crate::engine::formatters::throttled::ThrottledTranslator::save(&path, &config)?;
            self.log(&format!("✓ Saved '{}'.", path.display()))?;
            res.config_paths.insert("throttled".to_string(), path.clone());
        }

        if let Some(i8k_path) = self.facts.paths.configs.get("i8kmon") {
            let i8k_config = crate::engine::formatters::i8kmon::I8kmonConfig {
                t_ambient: self.profile.ambient_temp,
                t_max_fan: res.max_temp_c - 5.0,
                thermal_resistance_kw: res.thermal_resistance_kw,
            };
            crate::engine::formatters::i8kmon::I8kmonTranslator::save(i8k_path, &i8k_config)?;
            self.log(&format!("✓ Saved '{}'.", i8k_path.display()))?;
            res.config_paths.insert("i8kmon".to_string(), i8k_path.clone());
        }

        Ok(res)
    }

    /// Spawns a concurrent monitor that polls the SAL safety status every 100ms.
    ///
    /// The thread runs until the shared abort flag is set. It sets the flag
    /// itself, after storing a reason, when the SAL reports an emergency or
    /// the safety query fails. Warnings and critical notices are forwarded to
    /// the frontend as log-only telemetry frames whose sensor fields are
    /// zeroed or empty.
    fn spawn_watchdog_monitor(&self) -> thread::JoinHandle<()> {
        let abort_flag = self.emergency_abort.clone();
        let reason_slot = self.emergency_reason.clone();
        let sal = self.sal.clone();
        let telemetry = self.telemetry_tx.clone();

        thread::spawn(move || loop {
            if abort_flag.load(Ordering::SeqCst) {
                break;
            }

            // `Some(reason)` means the benchmark must be aborted.
            let fatal: Option<String> = match sal.get_safety_status() {
                Ok(SafetyStatus::Nominal) => None,
                Ok(SafetyStatus::EmergencyAbort(reason)) => Some(reason),
                Err(e) => Some(format!("Watchdog Sensor Failure: {}", e)),
                Ok(SafetyStatus::Warning(msg)) | Ok(SafetyStatus::Critical(msg)) => {
                    // Delivery is best-effort: the watchdog keeps running
                    // even if the frontend has gone away.
                    let _ = telemetry.send(TelemetryState {
                        cpu_model: String::new(),
                        total_ram_gb: 0,
                        tick: 0,
                        cpu_temp: 0.0,
                        power_w: 0.0,
                        current_freq: 0.0,
                        fans: Vec::new(),
                        governor: String::new(),
                        pl1_limit: 0.0,
                        pl2_limit: 0.0,
                        fan_tier: String::new(),
                        phase: BenchmarkPhase::StressTesting,
                        history_watts: Vec::new(),
                        history_temp: Vec::new(),
                        history_mhz: Vec::new(),
                        log_event: Some(format!("WATCHDOG: {}", msg)),
                        metadata: std::collections::HashMap::new(),
                        is_emergency: false,
                        emergency_reason: None,
                    });
                    None
                }
            };

            if let Some(why) = fatal {
                // Publish the reason before raising the flag so a reader
                // that sees the flag also finds the reason.
                *reason_slot.lock().unwrap() = Some(why);
                abort_flag.store(true, Ordering::SeqCst);
                break;
            }

            thread::sleep(Duration::from_millis(100));
        })
    }

    /// Builds an [OptimizationResult] from the measurements collected so far.
    ///
    /// The silicon knee doubles as the recommended PL1, and the recommended
    /// PL2 is 1.25 times the knee. `is_partial` is passed through unchanged
    /// and `config_paths` starts out empty.
    pub fn generate_result(&self, is_partial: bool) -> OptimizationResult {
        let thermal_resistance_kw = self.engine.calculate_thermal_resistance(&self.profile);
        let silicon_knee_watts = self.engine.find_silicon_knee(&self.profile);
        let max_temp_c = self.engine.get_max_temp(&self.profile);
        let profile = self.profile.clone();

        OptimizationResult {
            profile,
            silicon_knee_watts,
            thermal_resistance_kw,
            recommended_pl1: silicon_knee_watts,
            recommended_pl2: silicon_knee_watts * 1.25,
            max_temp_c,
            is_partial,
            config_paths: Default::default(),
        }
    }

    /// Reports whether the benchmark must stop, as an `Err` if so.
    ///
    /// A watchdog emergency is checked first and yields
    /// `EMERGENCY_ABORT: <reason>`. Otherwise one pending UI command is
    /// consumed without blocking, and an abort command yields `ABORTED`.
    /// An empty or disconnected command channel is not treated as an abort.
    fn check_abort(&self) -> Result<()> {
        if self.emergency_abort.load(Ordering::SeqCst) {
            let stored = self.emergency_reason.lock().unwrap().clone();
            let reason = stored.unwrap_or_else(|| "Unknown safety trigger".to_string());
            anyhow::bail!("EMERGENCY_ABORT: {}", reason);
        }

        match self.command_rx.try_recv() {
            Ok(UiCommand::Abort) => Err(anyhow::anyhow!("ABORTED")),
            Err(_) => Ok(()),
        }
    }

    /// Sends `msg` to the frontend as a log event.
    ///
    /// The message travels in a [TelemetryState] frame that also carries
    /// fresh sensor readings (0 / empty when a read fails), the current
    /// phase and the emergency status. History vectors are left empty, and
    /// `tick`, the power limits, governor and fan tier are fixed placeholders.
    ///
    /// # Errors
    ///
    /// Fails with "Telemetry channel closed" when the receiver is gone.
    fn log(&self, msg: &str) -> Result<()> {
        let cpu_temp = self.sal.get_temp().unwrap_or(0.0);
        let power_w = self.sal.get_power_w().unwrap_or(0.0);
        let current_freq = self.sal.get_freq_mhz().unwrap_or(0.0);
        let fans = self.sal.get_fan_rpms().unwrap_or_default();

        let is_emergency = self.emergency_abort.load(Ordering::SeqCst);
        let emergency_reason = self.emergency_reason.lock().unwrap().clone();

        let frame = TelemetryState {
            cpu_model: self.cpu_model.clone(),
            total_ram_gb: self.total_ram_gb,
            tick: 0,
            cpu_temp,
            power_w,
            current_freq,
            fans,
            governor: "unknown".to_string(),
            pl1_limit: 0.0,
            pl2_limit: 0.0,
            fan_tier: "auto".to_string(),
            phase: self.phase,
            history_watts: Vec::new(),
            history_temp: Vec::new(),
            history_mhz: Vec::new(),
            log_event: Some(msg.to_string()),
            metadata: std::collections::HashMap::new(),
            is_emergency,
            emergency_reason,
        };

        match self.telemetry_tx.send(frame) {
            Ok(()) => Ok(()),
            Err(_) => Err(anyhow::anyhow!("Telemetry channel closed")),
        }
    }

    /// Samples the sensors, updates the history windows and sends a full
    /// [TelemetryState] frame to the frontend.
    ///
    /// Failed sensor reads are reported as 0 (or an empty fan list). Each
    /// history window keeps at most the 120 most recent samples.
    ///
    /// NOTE(review): governor, PL1/PL2 limits and fan tier are hard-coded
    /// here and not read from the hardware — confirm the frontend
    /// treats them as placeholders.
    ///
    /// # Errors
    ///
    /// Fails with "Telemetry channel closed" when the receiver is gone.
    fn send_telemetry(&mut self, tick: u64) -> Result<()> {
        const HISTORY_LIMIT: usize = 120;

        let cpu_temp = self.sal.get_temp().unwrap_or(0.0);
        let power_w = self.sal.get_power_w().unwrap_or(0.0);
        let current_freq = self.sal.get_freq_mhz().unwrap_or(0.0);

        self.history_temp.push_back(cpu_temp);
        self.history_watts.push_back(power_w);
        self.history_mhz.push_back(current_freq);

        // The three windows grow in lock-step, so the temperature window's
        // length decides when all of them drop their oldest sample.
        if self.history_temp.len() > HISTORY_LIMIT {
            self.history_temp.pop_front();
            self.history_watts.pop_front();
            self.history_mhz.pop_front();
        }

        let fans = self.sal.get_fan_rpms().unwrap_or_default();
        let is_emergency = self.emergency_abort.load(Ordering::SeqCst);
        let emergency_reason = self.emergency_reason.lock().unwrap().clone();

        let frame = TelemetryState {
            cpu_model: self.cpu_model.clone(),
            total_ram_gb: self.total_ram_gb,
            tick,
            cpu_temp,
            power_w,
            current_freq,
            fans,
            governor: "performance".to_string(),
            pl1_limit: 15.0,
            pl2_limit: 25.0,
            fan_tier: "max".to_string(),
            phase: self.phase,
            history_watts: self.history_watts.iter().copied().collect(),
            history_temp: self.history_temp.iter().copied().collect(),
            history_mhz: self.history_mhz.iter().copied().collect(),
            log_event: None,
            metadata: std::collections::HashMap::new(),
            is_emergency,
            emergency_reason,
        };

        if self.telemetry_tx.send(frame).is_err() {
            anyhow::bail!("Telemetry channel closed");
        }
        Ok(())
    }
}