final dsp improvement
This commit is contained in:
+171
-30
@@ -66,6 +66,12 @@ const ACF_CONF_MIN: f32 = 0.15; // below this the ACF peak is noise -> ignore
|
||||
const ACF_SNAP: f32 = 0.30; // strong + wrong-octave IOI -> snap, don't glide
|
||||
const BPM_FOLD_LO: f32 = 88.0;
|
||||
const BPM_FOLD_HI: f32 = 176.0;
|
||||
/// Mel filterbank size + count of MFCC cepstral coeffs exposed (timbre vec).
|
||||
pub const MEL_N: usize = 32;
|
||||
pub const MFCC_N: usize = 13;
|
||||
const MEL_LO: f32 = 30.0;
|
||||
const MEL_HI: f32 = 16_000.0;
|
||||
const MFCC_SMOOTH: f32 = 0.25; // EMA on the bipolar cepstral vector
|
||||
|
||||
/// Per-band level (AGC-normalised, smoothed) + onset spike + rich descriptors.
|
||||
/// All scalar fields are 0..~1.
|
||||
@@ -102,6 +108,15 @@ pub struct Bands {
|
||||
/// is *anchored* to this, so `beat_phase` no longer drifts an octave on
|
||||
/// syncopated breakcore fills — sync pulses/dolly-punches to this grid.
|
||||
pub bpm: f32,
|
||||
/// Stereo width: side/mid RMS ratio, AGC-normalised + smoothed. 0 = mono /
|
||||
/// dead-centre, ->1 = wide / strong L-R / anti-phase. Mono input -> 0.
|
||||
/// Spatialise the visual to the mix's stereo field (spread/parallax).
|
||||
pub width: f32,
|
||||
/// Smoothed MFCC timbre fingerprint: cepstral coeffs c1.. (c0/energy
|
||||
/// dropped), each per-coeff AGC'd to ~[-1,1]. Captures *texture* (saw vs
|
||||
/// pad vs noise) independent of pitch & loudness -> morph palette/figure
|
||||
/// by timbre, not just by note. `mfcc[0]` = c1 (spectral tilt).
|
||||
pub mfcc: [f32; MFCC_N],
|
||||
/// Spectral flatness 0 (tonal/pad) .. 1 (noisy/break) -> smooth vs jagged.
|
||||
pub flatness: f32,
|
||||
/// Relative pitch-class energy (max-normalised) -> harmonic accent hues.
|
||||
@@ -128,6 +143,8 @@ impl Default for Bands {
|
||||
beat: 0.0,
|
||||
beat_phase: 0.0,
|
||||
bpm: 0.0,
|
||||
width: 0.0,
|
||||
mfcc: [0.0; MFCC_N],
|
||||
flatness: 0.0,
|
||||
chroma: [0.0; CHROMA_N],
|
||||
wave: [0.0; WAVE_N],
|
||||
@@ -227,17 +244,19 @@ fn pick_device(host: &cpal::Host, sel: &Source) -> anyhow::Result<cpal::Device>
|
||||
pub fn start(src: Source) -> anyhow::Result<AudioHandle> {
|
||||
let (input, out) = triple_buffer::triple_buffer(&Bands::default());
|
||||
|
||||
let rb = HeapRb::<f32>::new(RING_CAP);
|
||||
// Ring carries [mid, side] pairs: mid == old mono mean (spectral path
|
||||
// unchanged, bit-identical), side == (L-R)/2 (stereo-width only).
|
||||
let rb = HeapRb::<[f32; 2]>::new(RING_CAP);
|
||||
let (mut prod, cons) = rb.split();
|
||||
let mut push_mono = move |m: f32| {
|
||||
let _ = prod.try_push(m);
|
||||
let mut push_ms = move |mid: f32, side: f32| {
|
||||
let _ = prod.try_push([mid, side]);
|
||||
};
|
||||
|
||||
let mut streams: Vec<cpal::Stream> = Vec::new();
|
||||
let host = cpal::default_host();
|
||||
|
||||
let sample_rate = match &src {
|
||||
Source::File(path) => spawn_file_source(path, push_mono, &mut streams)?,
|
||||
Source::File(path) => spawn_file_source(path, push_ms, &mut streams)?,
|
||||
_ => {
|
||||
let device = pick_device(&host, &src)?;
|
||||
let cfg = device.default_input_config()?;
|
||||
@@ -257,7 +276,7 @@ pub fn start(src: Source) -> anyhow::Result<AudioHandle> {
|
||||
device: &cpal::Device,
|
||||
cfg: &cpal::StreamConfig,
|
||||
channels: usize,
|
||||
mut push: impl FnMut(f32) + Send + 'static,
|
||||
mut push: impl FnMut(f32, f32) + Send + 'static,
|
||||
err_fn: impl FnMut(cpal::StreamError) + Send + 'static,
|
||||
) -> Result<cpal::Stream, cpal::BuildStreamError>
|
||||
where
|
||||
@@ -272,7 +291,13 @@ pub fn start(src: Source) -> anyhow::Result<AudioHandle> {
|
||||
for &v in f {
|
||||
s += f32::from_sample(v);
|
||||
}
|
||||
push(s / f.len().max(1) as f32);
|
||||
let mid = s / f.len().max(1) as f32;
|
||||
let side = if f.len() >= 2 {
|
||||
(f32::from_sample(f[0]) - f32::from_sample(f[1])) * 0.5
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
push(mid, side);
|
||||
}
|
||||
},
|
||||
err_fn,
|
||||
@@ -285,16 +310,17 @@ pub fn start(src: Source) -> anyhow::Result<AudioHandle> {
|
||||
&scfg,
|
||||
move |data: &[f32], _| {
|
||||
for f in data.chunks(channels) {
|
||||
let s: f32 = f.iter().sum::<f32>() / f.len().max(1) as f32;
|
||||
push_mono(s);
|
||||
let mid: f32 = f.iter().sum::<f32>() / f.len().max(1) as f32;
|
||||
let side = if f.len() >= 2 { (f[0] - f[1]) * 0.5 } else { 0.0 };
|
||||
push_ms(mid, side);
|
||||
}
|
||||
},
|
||||
err_fn,
|
||||
None,
|
||||
)?,
|
||||
SampleFormat::I16 => run::<i16>(&device, &scfg, channels, push_mono, err_fn)?,
|
||||
SampleFormat::U16 => run::<u16>(&device, &scfg, channels, push_mono, err_fn)?,
|
||||
SampleFormat::I32 => run::<i32>(&device, &scfg, channels, push_mono, err_fn)?,
|
||||
SampleFormat::I16 => run::<i16>(&device, &scfg, channels, push_ms, err_fn)?,
|
||||
SampleFormat::U16 => run::<u16>(&device, &scfg, channels, push_ms, err_fn)?,
|
||||
SampleFormat::I32 => run::<i32>(&device, &scfg, channels, push_ms, err_fn)?,
|
||||
other => anyhow::bail!("unsupported sample format: {other:?}"),
|
||||
};
|
||||
stream.play()?;
|
||||
@@ -310,7 +336,7 @@ pub fn start(src: Source) -> anyhow::Result<AudioHandle> {
|
||||
})
|
||||
}
|
||||
|
||||
/// Decode `path`, play it on the default output, tee mono into `push_mono`.
|
||||
/// Decode `path`, play it on the default output, tee `(mid, side)` into `push_ms`.
|
||||
/// Returns the source sample rate. Falls back to the output device's native
|
||||
/// rate with linear resampling if the device rejects the file's rate.
|
||||
/// A probed file ready to decode: format reader + audio decoder + the
|
||||
@@ -361,7 +387,7 @@ fn open_file(path: &Path) -> anyhow::Result<DecodedFile> {
|
||||
|
||||
fn spawn_file_source(
|
||||
path: &Path,
|
||||
mut push_mono: impl FnMut(f32) + Send + 'static,
|
||||
mut push_ms: impl FnMut(f32, f32) + Send + 'static,
|
||||
streams: &mut Vec<cpal::Stream>,
|
||||
) -> anyhow::Result<f32> {
|
||||
let DecodedFile {
|
||||
@@ -430,9 +456,11 @@ fn spawn_file_source(
|
||||
let resample = (out_sr / file_sr as f32).max(0.01);
|
||||
|
||||
thread::spawn(move || {
|
||||
// Linear-resample state per output channel (mono dup across out_ch).
|
||||
// Linear-resample state (mid drives playback dup; side rides along
|
||||
// resampled identically so width stays in lock-step with audio).
|
||||
let mut frac = 0.0f32;
|
||||
let mut prev_mono = 0.0f32;
|
||||
let mut prev_mid = 0.0f32;
|
||||
let mut prev_side = 0.0f32;
|
||||
let mut ilv: Vec<f32> = Vec::new();
|
||||
|
||||
loop {
|
||||
@@ -452,23 +480,27 @@ fn spawn_file_source(
|
||||
decoded.copy_to_vec_interleaved::<f32>(&mut ilv);
|
||||
|
||||
for frame in ilv.chunks(ch) {
|
||||
let mono = frame.iter().sum::<f32>() / ch as f32;
|
||||
let mid = frame.iter().sum::<f32>() / ch as f32;
|
||||
let side = if ch >= 2 { (frame[0] - frame[1]) * 0.5 } else { 0.0 };
|
||||
// Emit `resample` output frames per input frame (linear).
|
||||
frac += resample;
|
||||
while frac >= 1.0 {
|
||||
frac -= 1.0;
|
||||
let a = 1.0 - frac.min(1.0);
|
||||
let s = prev_mono * (1.0 - a) + mono * a;
|
||||
push_mono(s);
|
||||
let s_mid = prev_mid * (1.0 - a) + mid * a;
|
||||
let s_side = prev_side * (1.0 - a) + side * a;
|
||||
push_ms(s_mid, s_side);
|
||||
// Block until playback ring has room (back-pressure ==
|
||||
// play speed; keeps analysis in lock-step with audio).
|
||||
// Playback stays mono (s_mid) — audible output unchanged.
|
||||
for _ in 0..out_ch {
|
||||
while pb_prod.try_push(s).is_err() {
|
||||
while pb_prod.try_push(s_mid).is_err() {
|
||||
thread::sleep(Duration::from_millis(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
prev_mono = mono;
|
||||
prev_mid = mid;
|
||||
prev_side = side;
|
||||
}
|
||||
}
|
||||
});
|
||||
@@ -476,7 +508,8 @@ fn spawn_file_source(
|
||||
Ok(file_sr as f32)
|
||||
}
|
||||
|
||||
/// Streaming STFT analyser. Feed mono samples; emits one [`Bands`] per hop.
|
||||
/// Streaming STFT analyser. Feed `(mid, side)` pairs; emits one [`Bands`] per
|
||||
/// hop. `mid` is the mono analysis signal; `side` only drives stereo width.
|
||||
///
|
||||
/// Holds all envelope / AGC / onset state so the live thread and the offline
|
||||
/// batch produce bit-identical frames for the same input.
|
||||
@@ -523,6 +556,19 @@ pub struct Analyzer {
|
||||
acf_lag_min: usize,
|
||||
acf_lag_max: usize,
|
||||
bpm: f32,
|
||||
// Stereo (Mid/Side) width: per-hop RMS-energy accumulators (zeroed each
|
||||
// hop) + AGC ceiling. Fed bit-identically live/offline via push(mid,side);
|
||||
// `mid` is the old mono mean so every pre-existing field is unchanged.
|
||||
ms_mid_sq: f32,
|
||||
ms_side_sq: f32,
|
||||
ms_n: usize,
|
||||
agc_width: f32,
|
||||
// MFCC: precomputed mel triangular filterbank (start bin + weights) + the
|
||||
// DCT-II cosine table (MFCC_N rows x MEL_N, row-major); per-coeff bipolar
|
||||
// AGC ceilings give a stable ~[-1,1] timbre vector.
|
||||
mel_filt: Vec<(usize, Vec<f32>)>,
|
||||
dct: Vec<f32>,
|
||||
agc_mfcc: [f32; MFCC_N],
|
||||
}
|
||||
|
||||
fn norm(v: f32, c: &mut f32) -> f32 {
|
||||
@@ -530,6 +576,13 @@ fn norm(v: f32, c: &mut f32) -> f32 {
|
||||
(v / *c).clamp(0.0, 1.0)
|
||||
}
|
||||
|
||||
// Bipolar AGC: like `norm` but keeps sign (cepstral coeffs swing about 0).
|
||||
fn norm_signed(v: f32, c: &mut f32) -> f32 {
|
||||
let a = v.abs();
|
||||
*c = (*c * AGC_DECAY).max(AGC_FLOOR).max(a);
|
||||
(v / *c).clamp(-1.0, 1.0)
|
||||
}
|
||||
|
||||
fn follow(env: &mut f32, x: f32) {
|
||||
let coeff = if x > *env { ATTACK } else { RELEASE };
|
||||
*env += (x - *env) * coeff;
|
||||
@@ -565,6 +618,46 @@ impl Analyzer {
|
||||
let acf_lag_max = (ACF_PERIOD_HI / hop_dt).round() as usize;
|
||||
let acf_n = ((ACF_WIN_SECS / hop_dt).round() as usize).max(acf_lag_max + 2);
|
||||
|
||||
// Mel triangular filterbank (MEL_N+2 mel-spaced edges -> bin space) +
|
||||
// the DCT-II cosine table for the MFCCs. Built once; pure fn of sr.
|
||||
let hz_to_mel = |f: f32| 2595.0 * (1.0 + f / 700.0).log10();
|
||||
let mel_to_hz = |m: f32| 700.0 * (10f32.powf(m / 2595.0) - 1.0);
|
||||
let (m_lo, m_hi) = (hz_to_mel(MEL_LO), hz_to_mel(MEL_HI));
|
||||
let mut edges = [0.0f32; MEL_N + 2];
|
||||
for (i, e) in edges.iter_mut().enumerate() {
|
||||
let m = m_lo + (m_hi - m_lo) * i as f32 / (MEL_N + 1) as f32;
|
||||
*e = mel_to_hz(m) / bin_hz; // edge position in FFT bins
|
||||
}
|
||||
let mut mel_filt: Vec<(usize, Vec<f32>)> = Vec::with_capacity(MEL_N);
|
||||
for j in 0..MEL_N {
|
||||
let (f0, f1, f2) = (edges[j], edges[j + 1], edges[j + 2]);
|
||||
let a = (f0.floor() as usize).min(half - 1);
|
||||
let b = ((f2.ceil() as usize).max(a + 1)).min(half);
|
||||
let w = (a..b)
|
||||
.map(|bin| {
|
||||
let x = bin as f32;
|
||||
let g = if x <= f1 {
|
||||
if f1 > f0 { (x - f0) / (f1 - f0) } else { 0.0 }
|
||||
} else if f2 > f1 {
|
||||
(f2 - x) / (f2 - f1)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
g.clamp(0.0, 1.0)
|
||||
})
|
||||
.collect();
|
||||
mel_filt.push((a, w));
|
||||
}
|
||||
let mut dct = vec![0.0f32; MFCC_N * MEL_N];
|
||||
for k in 1..=MFCC_N {
|
||||
for j in 0..MEL_N {
|
||||
dct[(k - 1) * MEL_N + j] = (std::f32::consts::PI * k as f32
|
||||
* (j as f32 + 0.5)
|
||||
/ MEL_N as f32)
|
||||
.cos();
|
||||
}
|
||||
}
|
||||
|
||||
Analyzer {
|
||||
hann,
|
||||
fft,
|
||||
@@ -599,13 +692,25 @@ impl Analyzer {
|
||||
acf_lag_min,
|
||||
acf_lag_max,
|
||||
bpm: 0.0,
|
||||
ms_mid_sq: 0.0,
|
||||
ms_side_sq: 0.0,
|
||||
ms_n: 0,
|
||||
agc_width: AGC_FLOOR,
|
||||
mel_filt,
|
||||
dct,
|
||||
agc_mfcc: [AGC_FLOOR; MFCC_N],
|
||||
}
|
||||
}
|
||||
|
||||
/// Push one mono sample. Returns `Some(bands)` when a hop completes.
|
||||
pub fn push(&mut self, s: f32) -> Option<Bands> {
|
||||
/// Push one `(mid, side)` sample pair. `mid` (= the old mono channel mean)
|
||||
/// drives all spectral analysis unchanged; `side` (= (L-R)/2) only feeds
|
||||
/// the new stereo-width metric. Returns `Some(bands)` when a hop completes.
|
||||
pub fn push(&mut self, mid: f32, side: f32) -> Option<Bands> {
|
||||
self.ms_mid_sq += mid * mid;
|
||||
self.ms_side_sq += side * side;
|
||||
self.ms_n += 1;
|
||||
self.win.copy_within(1..FFT_SIZE, 0);
|
||||
self.win[FFT_SIZE - 1] = s;
|
||||
self.win[FFT_SIZE - 1] = mid;
|
||||
self.filled = (self.filled + 1).min(FFT_SIZE);
|
||||
self.since_hop += 1;
|
||||
if self.filled < FFT_SIZE || self.since_hop < HOP {
|
||||
@@ -722,6 +827,35 @@ impl Analyzer {
|
||||
let am = lin_sum / nbin;
|
||||
let flatness = if am > 1e-9 { (gm / am).clamp(0.0, 1.0) } else { 0.0 };
|
||||
|
||||
// MFCC: mel-filterbank energies (magnitude) -> log -> DCT-II. c0
|
||||
// (overall energy) is dropped; c1.. = pitch-independent timbre.
|
||||
let mut mel_log = [0.0f32; MEL_N];
|
||||
for (j, (a, w)) in self.mel_filt.iter().enumerate() {
|
||||
let mut e = 0.0f32;
|
||||
for (o, &g) in w.iter().enumerate() {
|
||||
e += mags[a + o] * g;
|
||||
}
|
||||
mel_log[j] = (e + 1e-9).ln();
|
||||
}
|
||||
let mut mfcc = [0.0f32; MFCC_N];
|
||||
for (k, mc) in mfcc.iter_mut().enumerate() {
|
||||
let row = &self.dct[k * MEL_N..(k + 1) * MEL_N];
|
||||
let mut s = 0.0f32;
|
||||
for (j, &r) in row.iter().enumerate() {
|
||||
s += mel_log[j] * r;
|
||||
}
|
||||
*mc = norm_signed(s, &mut self.agc_mfcc[k]);
|
||||
}
|
||||
|
||||
// Stereo width from this hop's RMS energy (zero the accumulators).
|
||||
let msn = self.ms_n.max(1) as f32;
|
||||
let mid_rms = (self.ms_mid_sq / msn).sqrt();
|
||||
let side_rms = (self.ms_side_sq / msn).sqrt();
|
||||
self.ms_mid_sq = 0.0;
|
||||
self.ms_side_sq = 0.0;
|
||||
self.ms_n = 0;
|
||||
let width = norm(side_rms / (mid_rms + 1e-6), &mut self.agc_width);
|
||||
|
||||
// Advance prev_mag now that flux is computed.
|
||||
self.prev_mag.copy_from_slice(&mags);
|
||||
|
||||
@@ -781,6 +915,12 @@ impl Analyzer {
|
||||
self.env.flux = self.broad_pop;
|
||||
self.env.csd = self.csd_pop;
|
||||
follow(&mut self.env.flatness, flatness);
|
||||
follow(&mut self.env.width, width);
|
||||
// Bipolar cepstral vector: plain EMA (slowly-evolving fingerprint, not
|
||||
// an attack — `follow`'s rise/fall asymmetry would distort it).
|
||||
for (e, &m) in self.env.mfcc.iter_mut().zip(&mfcc) {
|
||||
*e += (m - *e) * MFCC_SMOOTH;
|
||||
}
|
||||
|
||||
// Autocorrelation tempo: anchor the predictive IOI to the dominant
|
||||
// period in ~3 s of broadband-flux history *before* the beat block
|
||||
@@ -895,12 +1035,12 @@ impl Analyzer {
|
||||
}
|
||||
|
||||
fn analysis_loop(
|
||||
mut cons: impl Consumer<Item = f32> + Observer,
|
||||
mut cons: impl Consumer<Item = [f32; 2]>, // Observer is a Consumer supertrait
|
||||
sample_rate: f32,
|
||||
mut out: triple_buffer::Input<Bands>,
|
||||
) {
|
||||
let mut an = Analyzer::new(sample_rate);
|
||||
let mut scratch = vec![0.0f32; HOP * 8];
|
||||
let mut scratch = vec![[0.0f32; 2]; HOP * 8];
|
||||
loop {
|
||||
let avail = cons.occupied_len();
|
||||
if avail == 0 {
|
||||
@@ -909,8 +1049,8 @@ fn analysis_loop(
|
||||
}
|
||||
let take = avail.min(scratch.len());
|
||||
let got = cons.pop_slice(&mut scratch[..take]);
|
||||
for &s in &scratch[..got] {
|
||||
if let Some(b) = an.push(s) {
|
||||
for &[mid, side] in &scratch[..got] {
|
||||
if let Some(b) = an.push(mid, side) {
|
||||
out.write(b);
|
||||
}
|
||||
}
|
||||
@@ -951,9 +1091,10 @@ pub fn analyze_file(path: &Path) -> anyhow::Result<Timeline> {
|
||||
let ch = decoded.spec().channels().count().max(1);
|
||||
decoded.copy_to_vec_interleaved::<f32>(&mut ilv);
|
||||
for frame in ilv.chunks(ch) {
|
||||
let mono = frame.iter().sum::<f32>() / ch as f32;
|
||||
let mid = frame.iter().sum::<f32>() / ch as f32;
|
||||
let side = if ch >= 2 { (frame[0] - frame[1]) * 0.5 } else { 0.0 };
|
||||
samples += 1;
|
||||
if let Some(b) = an.push(mono) {
|
||||
if let Some(b) = an.push(mid, side) {
|
||||
frames.push(b);
|
||||
}
|
||||
}
|
||||
|
||||
+13
-1
@@ -328,16 +328,22 @@ fn main() {
|
||||
Ok(tl) => {
|
||||
let mut peak = Bands::default();
|
||||
let mut bpms: Vec<f32> = Vec::new();
|
||||
let mut mfcc_abs = 0.0f32; // mean |c1| -> timbre vec is alive
|
||||
for b in &tl.frames {
|
||||
peak.low = peak.low.max(b.low);
|
||||
peak.loud = peak.loud.max(b.loud);
|
||||
peak.flux = peak.flux.max(b.flux);
|
||||
peak.csd = peak.csd.max(b.csd);
|
||||
peak.centroid = peak.centroid.max(b.centroid);
|
||||
peak.width = peak.width.max(b.width);
|
||||
mfcc_abs += b.mfcc[0].abs();
|
||||
if b.bpm > 0.0 {
|
||||
bpms.push(b.bpm);
|
||||
}
|
||||
}
|
||||
let nf = tl.frames.len().max(1) as f32;
|
||||
mfcc_abs /= nf;
|
||||
let last = tl.frames.last().copied().unwrap_or_default();
|
||||
// Median BPM = the track's anchored tempo (ACF-stabilised).
|
||||
let med_bpm = if bpms.is_empty() {
|
||||
0.0
|
||||
@@ -346,7 +352,7 @@ fn main() {
|
||||
bpms[bpms.len() / 2]
|
||||
};
|
||||
println!(
|
||||
"ok: {} frames, {:.2}s, {} Hz, {:.1} fps\n peak low {:.2} loud {:.2} flux {:.2} csd {:.2} centroid {:.2}\n tempo {:.1} BPM (median of {} locked frames)",
|
||||
"ok: {} frames, {:.2}s, {} Hz, {:.1} fps\n peak low {:.2} loud {:.2} flux {:.2} csd {:.2} centroid {:.2} width {:.2}\n tempo {:.1} BPM (median of {} locked frames)\n mfcc c1..c4 [{:+.2} {:+.2} {:+.2} {:+.2}] (last) mean|c1| {:.2}",
|
||||
tl.frames.len(),
|
||||
tl.duration(),
|
||||
tl.sample_rate as u32,
|
||||
@@ -356,8 +362,14 @@ fn main() {
|
||||
peak.flux,
|
||||
peak.csd,
|
||||
peak.centroid,
|
||||
peak.width,
|
||||
med_bpm,
|
||||
bpms.len(),
|
||||
last.mfcc[0],
|
||||
last.mfcc[1],
|
||||
last.mfcc[2],
|
||||
last.mfcc[3],
|
||||
mfcc_abs,
|
||||
);
|
||||
}
|
||||
Err(e) => die(format!("analyze: {e}")),
|
||||
|
||||
Reference in New Issue
Block a user