diff --git a/src/audio.rs b/src/audio.rs index f2c2df5..273586d 100644 --- a/src/audio.rs +++ b/src/audio.rs @@ -66,6 +66,12 @@ const ACF_CONF_MIN: f32 = 0.15; // below this the ACF peak is noise -> ignore const ACF_SNAP: f32 = 0.30; // strong + wrong-octave IOI -> snap, don't glide const BPM_FOLD_LO: f32 = 88.0; const BPM_FOLD_HI: f32 = 176.0; +/// Mel filterbank size + count of MFCC cepstral coeffs exposed (timbre vec). +pub const MEL_N: usize = 32; +pub const MFCC_N: usize = 13; +const MEL_LO: f32 = 30.0; +const MEL_HI: f32 = 16_000.0; +const MFCC_SMOOTH: f32 = 0.25; // EMA on the bipolar cepstral vector /// Per-band level (AGC-normalised, smoothed) + onset spike + rich descriptors. /// All scalar fields are 0..~1. @@ -102,6 +108,15 @@ pub struct Bands { /// is *anchored* to this, so `beat_phase` no longer drifts an octave on /// syncopated breakcore fills — sync pulses/dolly-punches to this grid. pub bpm: f32, + /// Stereo width: side/mid RMS ratio, AGC-normalised + smoothed. 0 = mono / + /// dead-centre, ->1 = wide / strong L-R / anti-phase. Mono input -> 0. + /// Spatialise the visual to the mix's stereo field (spread/parallax). + pub width: f32, + /// Smoothed MFCC timbre fingerprint: cepstral coeffs c1.. (c0/energy + /// dropped), each per-coeff AGC'd to ~[-1,1]. Captures *texture* (saw vs + /// pad vs noise) independent of pitch & loudness -> morph palette/figure + /// by timbre, not just by note. `mfcc[0]` = c1 (spectral tilt). + pub mfcc: [f32; MFCC_N], /// Spectral flatness 0 (tonal/pad) .. 1 (noisy/break) -> smooth vs jagged. pub flatness: f32, /// Relative pitch-class energy (max-normalised) -> harmonic accent hues. @@ -128,6 +143,8 @@ impl Default for Bands { beat: 0.0, beat_phase: 0.0, bpm: 0.0, + width: 0.0, + mfcc: [0.0; MFCC_N], flatness: 0.0, chroma: [0.0; CHROMA_N], wave: [0.0; WAVE_N], @@ -227,17 +244,19 @@ fn pick_device(host: &cpal::Host, sel: &Source) -> anyhow::Result pub fn start(src: Source) -> anyhow::Result { let (input, out) = triple_buffer::triple_buffer(&Bands::default()); - let rb = HeapRb::::new(RING_CAP); + // Ring carries [mid, side] pairs: mid == old mono mean (spectral path + // unchanged, bit-identical), side == (L-R)/2 (stereo-width only). + let rb = HeapRb::<[f32; 2]>::new(RING_CAP); let (mut prod, cons) = rb.split(); - let mut push_mono = move |m: f32| { - let _ = prod.try_push(m); + let mut push_ms = move |mid: f32, side: f32| { + let _ = prod.try_push([mid, side]); }; let mut streams: Vec = Vec::new(); let host = cpal::default_host(); let sample_rate = match &src { - Source::File(path) => spawn_file_source(path, push_mono, &mut streams)?, + Source::File(path) => spawn_file_source(path, push_ms, &mut streams)?, _ => { let device = pick_device(&host, &src)?; let cfg = device.default_input_config()?; @@ -257,7 +276,7 @@ pub fn start(src: Source) -> anyhow::Result { device: &cpal::Device, cfg: &cpal::StreamConfig, channels: usize, - mut push: impl FnMut(f32) + Send + 'static, + mut push: impl FnMut(f32, f32) + Send + 'static, err_fn: impl FnMut(cpal::StreamError) + Send + 'static, ) -> Result where @@ -272,7 +291,13 @@ pub fn start(src: Source) -> anyhow::Result { for &v in f { s += f32::from_sample(v); } - push(s / f.len().max(1) as f32); + let mid = s / f.len().max(1) as f32; + let side = if f.len() >= 2 { + (f32::from_sample(f[0]) - f32::from_sample(f[1])) * 0.5 + } else { + 0.0 + }; + push(mid, side); } }, err_fn, @@ -285,16 +310,17 @@ pub fn start(src: Source) -> anyhow::Result { &scfg, move |data: &[f32], _| { for f in data.chunks(channels) { - let s: f32 = f.iter().sum::() / f.len().max(1) as f32; - push_mono(s); + let mid: f32 = f.iter().sum::() / f.len().max(1) as f32; + let side = if f.len() >= 2 { (f[0] - f[1]) * 0.5 } else { 0.0 }; + push_ms(mid, side); } }, err_fn, None, )?, - SampleFormat::I16 => run::(&device, &scfg, channels, push_mono, err_fn)?, - SampleFormat::U16 => run::(&device, &scfg, channels, push_mono, err_fn)?, - SampleFormat::I32 => run::(&device, &scfg, channels, push_mono, err_fn)?, + SampleFormat::I16 => run::(&device, &scfg, channels, push_ms, err_fn)?, + SampleFormat::U16 => run::(&device, &scfg, channels, push_ms, err_fn)?, + SampleFormat::I32 => run::(&device, &scfg, channels, push_ms, err_fn)?, other => anyhow::bail!("unsupported sample format: {other:?}"), }; stream.play()?; @@ -310,7 +336,7 @@ pub fn start(src: Source) -> anyhow::Result { }) } -/// Decode `path`, play it on the default output, tee mono into `push_mono`. +/// Decode `path`, play it on the default output, tee `(mid, side)` into `push_ms`. /// Returns the source sample rate. Falls back to the output device's native /// rate with linear resampling if the device rejects the file's rate. /// A probed file ready to decode: format reader + audio decoder + the @@ -361,7 +387,7 @@ fn open_file(path: &Path) -> anyhow::Result { fn spawn_file_source( path: &Path, - mut push_mono: impl FnMut(f32) + Send + 'static, + mut push_ms: impl FnMut(f32, f32) + Send + 'static, streams: &mut Vec, ) -> anyhow::Result { let DecodedFile { @@ -430,9 +456,11 @@ fn spawn_file_source( let resample = (out_sr / file_sr as f32).max(0.01); thread::spawn(move || { - // Linear-resample state per output channel (mono dup across out_ch). + // Linear-resample state (mid drives playback dup; side rides along + // resampled identically so width stays in lock-step with audio). let mut frac = 0.0f32; - let mut prev_mono = 0.0f32; + let mut prev_mid = 0.0f32; + let mut prev_side = 0.0f32; let mut ilv: Vec = Vec::new(); loop { @@ -452,23 +480,27 @@ fn spawn_file_source( decoded.copy_to_vec_interleaved::(&mut ilv); for frame in ilv.chunks(ch) { - let mono = frame.iter().sum::() / ch as f32; + let mid = frame.iter().sum::() / ch as f32; + let side = if ch >= 2 { (frame[0] - frame[1]) * 0.5 } else { 0.0 }; // Emit `resample` output frames per input frame (linear). frac += resample; while frac >= 1.0 { frac -= 1.0; let a = 1.0 - frac.min(1.0); - let s = prev_mono * (1.0 - a) + mono * a; - push_mono(s); + let s_mid = prev_mid * (1.0 - a) + mid * a; + let s_side = prev_side * (1.0 - a) + side * a; + push_ms(s_mid, s_side); // Block until playback ring has room (back-pressure == // play speed; keeps analysis in lock-step with audio). + // Playback stays mono (s_mid) — audible output unchanged. for _ in 0..out_ch { - while pb_prod.try_push(s).is_err() { + while pb_prod.try_push(s_mid).is_err() { thread::sleep(Duration::from_millis(1)); } } } - prev_mono = mono; + prev_mid = mid; + prev_side = side; } } }); @@ -476,7 +508,8 @@ fn spawn_file_source( Ok(file_sr as f32) } -/// Streaming STFT analyser. Feed mono samples; emits one [`Bands`] per hop. +/// Streaming STFT analyser. Feed `(mid, side)` pairs; emits one [`Bands`] per +/// hop. `mid` is the mono analysis signal; `side` only drives stereo width. /// /// Holds all envelope / AGC / onset state so the live thread and the offline /// batch produce bit-identical frames for the same input. @@ -523,6 +556,19 @@ pub struct Analyzer { acf_lag_min: usize, acf_lag_max: usize, bpm: f32, + // Stereo (Mid/Side) width: per-hop RMS-energy accumulators (zeroed each + // hop) + AGC ceiling. Fed bit-identically live/offline via push(mid,side); + // `mid` is the old mono mean so every pre-existing field is unchanged. + ms_mid_sq: f32, + ms_side_sq: f32, + ms_n: usize, + agc_width: f32, + // MFCC: precomputed mel triangular filterbank (start bin + weights) + the + // DCT-II cosine table (MFCC_N rows x MEL_N, row-major); per-coeff bipolar + // AGC ceilings give a stable ~[-1,1] timbre vector. + mel_filt: Vec<(usize, Vec)>, + dct: Vec, + agc_mfcc: [f32; MFCC_N], } fn norm(v: f32, c: &mut f32) -> f32 { @@ -530,6 +576,13 @@ fn norm(v: f32, c: &mut f32) -> f32 { (v / *c).clamp(0.0, 1.0) } +// Bipolar AGC: like `norm` but keeps sign (cepstral coeffs swing about 0). +fn norm_signed(v: f32, c: &mut f32) -> f32 { + let a = v.abs(); + *c = (*c * AGC_DECAY).max(AGC_FLOOR).max(a); + (v / *c).clamp(-1.0, 1.0) +} + fn follow(env: &mut f32, x: f32) { let coeff = if x > *env { ATTACK } else { RELEASE }; *env += (x - *env) * coeff; @@ -565,6 +618,46 @@ impl Analyzer { let acf_lag_max = (ACF_PERIOD_HI / hop_dt).round() as usize; let acf_n = ((ACF_WIN_SECS / hop_dt).round() as usize).max(acf_lag_max + 2); + // Mel triangular filterbank (MEL_N+2 mel-spaced edges -> bin space) + + // the DCT-II cosine table for the MFCCs. Built once; pure fn of sr. + let hz_to_mel = |f: f32| 2595.0 * (1.0 + f / 700.0).log10(); + let mel_to_hz = |m: f32| 700.0 * (10f32.powf(m / 2595.0) - 1.0); + let (m_lo, m_hi) = (hz_to_mel(MEL_LO), hz_to_mel(MEL_HI)); + let mut edges = [0.0f32; MEL_N + 2]; + for (i, e) in edges.iter_mut().enumerate() { + let m = m_lo + (m_hi - m_lo) * i as f32 / (MEL_N + 1) as f32; + *e = mel_to_hz(m) / bin_hz; // edge position in FFT bins + } + let mut mel_filt: Vec<(usize, Vec)> = Vec::with_capacity(MEL_N); + for j in 0..MEL_N { + let (f0, f1, f2) = (edges[j], edges[j + 1], edges[j + 2]); + let a = (f0.floor() as usize).min(half - 1); + let b = ((f2.ceil() as usize).max(a + 1)).min(half); + let w = (a..b) + .map(|bin| { + let x = bin as f32; + let g = if x <= f1 { + if f1 > f0 { (x - f0) / (f1 - f0) } else { 0.0 } + } else if f2 > f1 { + (f2 - x) / (f2 - f1) + } else { + 0.0 + }; + g.clamp(0.0, 1.0) + }) + .collect(); + mel_filt.push((a, w)); + } + let mut dct = vec![0.0f32; MFCC_N * MEL_N]; + for k in 1..=MFCC_N { + for j in 0..MEL_N { + dct[(k - 1) * MEL_N + j] = (std::f32::consts::PI * k as f32 + * (j as f32 + 0.5) + / MEL_N as f32) + .cos(); + } + } + Analyzer { hann, fft, @@ -599,13 +692,25 @@ impl Analyzer { acf_lag_min, acf_lag_max, bpm: 0.0, + ms_mid_sq: 0.0, + ms_side_sq: 0.0, + ms_n: 0, + agc_width: AGC_FLOOR, + mel_filt, + dct, + agc_mfcc: [AGC_FLOOR; MFCC_N], } } - /// Push one mono sample. Returns `Some(bands)` when a hop completes. - pub fn push(&mut self, s: f32) -> Option { + /// Push one `(mid, side)` sample pair. `mid` (= the old mono channel mean) + /// drives all spectral analysis unchanged; `side` (= (L-R)/2) only feeds + /// the new stereo-width metric. Returns `Some(bands)` when a hop completes. + pub fn push(&mut self, mid: f32, side: f32) -> Option { + self.ms_mid_sq += mid * mid; + self.ms_side_sq += side * side; + self.ms_n += 1; self.win.copy_within(1..FFT_SIZE, 0); - self.win[FFT_SIZE - 1] = s; + self.win[FFT_SIZE - 1] = mid; self.filled = (self.filled + 1).min(FFT_SIZE); self.since_hop += 1; if self.filled < FFT_SIZE || self.since_hop < HOP { @@ -722,6 +827,35 @@ impl Analyzer { let am = lin_sum / nbin; let flatness = if am > 1e-9 { (gm / am).clamp(0.0, 1.0) } else { 0.0 }; + // MFCC: mel-filterbank energies (magnitude) -> log -> DCT-II. c0 + // (overall energy) is dropped; c1.. = pitch-independent timbre. + let mut mel_log = [0.0f32; MEL_N]; + for (j, (a, w)) in self.mel_filt.iter().enumerate() { + let mut e = 0.0f32; + for (o, &g) in w.iter().enumerate() { + e += mags[a + o] * g; + } + mel_log[j] = (e + 1e-9).ln(); + } + let mut mfcc = [0.0f32; MFCC_N]; + for (k, mc) in mfcc.iter_mut().enumerate() { + let row = &self.dct[k * MEL_N..(k + 1) * MEL_N]; + let mut s = 0.0f32; + for (j, &r) in row.iter().enumerate() { + s += mel_log[j] * r; + } + *mc = norm_signed(s, &mut self.agc_mfcc[k]); + } + + // Stereo width from this hop's RMS energy (zero the accumulators). + let msn = self.ms_n.max(1) as f32; + let mid_rms = (self.ms_mid_sq / msn).sqrt(); + let side_rms = (self.ms_side_sq / msn).sqrt(); + self.ms_mid_sq = 0.0; + self.ms_side_sq = 0.0; + self.ms_n = 0; + let width = norm(side_rms / (mid_rms + 1e-6), &mut self.agc_width); + // Advance prev_mag now that flux is computed. self.prev_mag.copy_from_slice(&mags); @@ -781,6 +915,12 @@ impl Analyzer { self.env.flux = self.broad_pop; self.env.csd = self.csd_pop; follow(&mut self.env.flatness, flatness); + follow(&mut self.env.width, width); + // Bipolar cepstral vector: plain EMA (slowly-evolving fingerprint, not + // an attack — `follow`'s rise/fall asymmetry would distort it). + for (e, &m) in self.env.mfcc.iter_mut().zip(&mfcc) { + *e += (m - *e) * MFCC_SMOOTH; + } // Autocorrelation tempo: anchor the predictive IOI to the dominant // period in ~3 s of broadband-flux history *before* the beat block @@ -895,12 +1035,12 @@ impl Analyzer { } fn analysis_loop( - mut cons: impl Consumer + Observer, + mut cons: impl Consumer, // Observer is a Consumer supertrait sample_rate: f32, mut out: triple_buffer::Input, ) { let mut an = Analyzer::new(sample_rate); - let mut scratch = vec![0.0f32; HOP * 8]; + let mut scratch = vec![[0.0f32; 2]; HOP * 8]; loop { let avail = cons.occupied_len(); if avail == 0 { @@ -909,8 +1049,8 @@ fn analysis_loop( } let take = avail.min(scratch.len()); let got = cons.pop_slice(&mut scratch[..take]); - for &s in &scratch[..got] { - if let Some(b) = an.push(s) { + for &[mid, side] in &scratch[..got] { + if let Some(b) = an.push(mid, side) { out.write(b); } } @@ -951,9 +1091,10 @@ pub fn analyze_file(path: &Path) -> anyhow::Result { let ch = decoded.spec().channels().count().max(1); decoded.copy_to_vec_interleaved::(&mut ilv); for frame in ilv.chunks(ch) { - let mono = frame.iter().sum::() / ch as f32; + let mid = frame.iter().sum::() / ch as f32; + let side = if ch >= 2 { (frame[0] - frame[1]) * 0.5 } else { 0.0 }; samples += 1; - if let Some(b) = an.push(mono) { + if let Some(b) = an.push(mid, side) { frames.push(b); } } diff --git a/src/bin/sigil.rs b/src/bin/sigil.rs index 63f1938..c712101 100644 --- a/src/bin/sigil.rs +++ b/src/bin/sigil.rs @@ -328,16 +328,22 @@ fn main() { Ok(tl) => { let mut peak = Bands::default(); let mut bpms: Vec = Vec::new(); + let mut mfcc_abs = 0.0f32; // mean |c1| -> timbre vec is alive for b in &tl.frames { peak.low = peak.low.max(b.low); peak.loud = peak.loud.max(b.loud); peak.flux = peak.flux.max(b.flux); peak.csd = peak.csd.max(b.csd); peak.centroid = peak.centroid.max(b.centroid); + peak.width = peak.width.max(b.width); + mfcc_abs += b.mfcc[0].abs(); if b.bpm > 0.0 { bpms.push(b.bpm); } } + let nf = tl.frames.len().max(1) as f32; + mfcc_abs /= nf; + let last = tl.frames.last().copied().unwrap_or_default(); // Median BPM = the track's anchored tempo (ACF-stabilised). let med_bpm = if bpms.is_empty() { 0.0 @@ -346,7 +352,7 @@ fn main() { bpms[bpms.len() / 2] }; println!( - "ok: {} frames, {:.2}s, {} Hz, {:.1} fps\n peak low {:.2} loud {:.2} flux {:.2} csd {:.2} centroid {:.2}\n tempo {:.1} BPM (median of {} locked frames)", + "ok: {} frames, {:.2}s, {} Hz, {:.1} fps\n peak low {:.2} loud {:.2} flux {:.2} csd {:.2} centroid {:.2} width {:.2}\n tempo {:.1} BPM (median of {} locked frames)\n mfcc c1..c4 [{:+.2} {:+.2} {:+.2} {:+.2}] (last) mean|c1| {:.2}", tl.frames.len(), tl.duration(), tl.sample_rate as u32, @@ -356,8 +362,14 @@ fn main() { peak.flux, peak.csd, peak.centroid, + peak.width, med_bpm, bpms.len(), + last.mfcc[0], + last.mfcc[1], + last.mfcc[2], + last.mfcc[3], + mfcc_abs, ); } Err(e) => die(format!("analyze: {e}")),