X-Git-Url: https://git.sesse.net/?p=nageru;a=blobdiff_plain;f=nageru%2Faudio_clip.cpp;h=98f4e53e56261183e70b07b8e336ae3c8637fd0c;hp=ec97e6b0f04543920ead2028b5f08cabeb788aae;hb=4c23038d9c18d22a9bfe577244a2d0fc293a7e6a;hpb=5b1f88aa6f4e44f497e951f572a0fe2987389330 diff --git a/nageru/audio_clip.cpp b/nageru/audio_clip.cpp index ec97e6b..98f4e53 100644 --- a/nageru/audio_clip.cpp +++ b/nageru/audio_clip.cpp @@ -2,6 +2,15 @@ #include #include +#include + +#include + +extern "C" { +#include +} + +#include using namespace std; using namespace std::chrono; @@ -59,6 +68,269 @@ steady_clock::time_point AudioClip::get_first_sample() const return first_sample; } +namespace { + +float avg(const vector &vals) +{ + float sum = 0.0f; + for (float val : vals) { + sum += val; + } + return sum / vals.size(); +} + +void remove_dc(vector *vals) +{ + float dc = avg(*vals); + for (float &val : *vals) { + val -= dc; + } +} + +double find_sumsq(const vector &sum_xx, size_t first, size_t last) +{ + if (first == 0) { + return sum_xx[last - 1]; + } else { + return sum_xx[last - 1] - sum_xx[first - 1]; + } +} + +vector calc_sumsq_table(const vector &vals) +{ + vector ret; + ret.resize(vals.size()); + + double sum = 0.0; + for (size_t i = 0; i < vals.size(); ++i) { + sum += vals[i] * vals[i]; + ret[i] = sum; + } + return ret; +} + +vector resample(const vector &vals, float src_freq, float dst_freq) +{ + Resampler resampler; + resampler.setup(src_freq, dst_freq, /*num_channels=*/1, /*hlen=*/32, /*frel=*/1.0); + + // Prefill the sampler so that we don't get any extra delay from it. + resampler.inp_data = nullptr; + resampler.inp_count = resampler.inpsize() / 2 - 1; + resampler.out_data = nullptr; + resampler.out_count = 1024; + resampler.process(); + + vector out; + out.resize(lrint(vals.size() * dst_freq / src_freq)); + + // Do the actual resampling. + resampler.inp_data = const_cast(vals.data()); + resampler.inp_count = vals.size(); + resampler.out_data = &out[0]; + resampler.out_count = out.size(); + resampler.process(); + + return out; +} + +unique_ptr pad(const vector &x, size_t N) +{ + assert(x.size() <= N); + + // avfft requires AVX alignment. + void *ptr; + if (posix_memalign(&ptr, 32, N * sizeof(FFTSample)) != 0) { + perror("posix_memalign"); + abort(); + } + unique_ptr ret(reinterpret_cast(ptr), free); + for (size_t i = 0; i < x.size(); ++i) { + ret[i] = x[i]; + } + for (size_t i = x.size(); i < N; ++i) { + ret[i] = 0.0f; + } + return ret; +} + +unsigned round_up_to_pow2(unsigned x) +{ + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + ++x; + return x; +} + +// Calculate the unnormalized circular correlation between x and y, both zero-padded to length N. +unique_ptr pad_and_correlate(const vector &x, const vector &y, unsigned N) +{ + assert((N & (N - 1)) == 0); + + unsigned bits = ffs(N) - 1; + unique_ptr padded_x = pad(x, N); + unique_ptr padded_y = pad(y, N); + + RDFTContext *fft = av_rdft_init(bits, DFT_R2C); + av_rdft_calc(fft, &padded_x[0]); + av_rdft_calc(fft, &padded_y[0]); + av_rdft_end(fft); + + // Now that we have FFT(X) and FFT(Y), we can compute FFT(result) = conj(FFT(X)) * FFT(Y). + // We reuse the X array for the result. + + // The two first elements are the real values of the lowest (DC) and highest bin. + // (These have zero imaginary value, so that's not stored anywhere.) + padded_x[0] *= padded_y[0]; + padded_x[1] *= padded_y[1]; + + // The remaining elements are complex values for each bin. + for (size_t i = 1; i < N / 2; ++i) { + complex xc(padded_x[i * 2 + 0], padded_x[i * 2 + 1]); + complex yc(padded_y[i * 2 + 0], padded_y[i * 2 + 1]); + complex p = conj(xc) * yc; + padded_x[i * 2 + 0] = p.real(); + padded_x[i * 2 + 1] = p.imag(); + } + + RDFTContext *ifft = av_rdft_init(bits, IDFT_C2R); + av_rdft_calc(ifft, &padded_x[0]); + av_rdft_end(ifft); + + return padded_x; +} + +} // namespace + +AudioClip::BestCorrelation AudioClip::find_best_correlation(const AudioClip *reference) const +{ + // We estimate the delay between the two clips by way of (normalized) cross-correlation; + // if they are perfectly in sync, the cross-correlation will have its maximum at τ=0, + // if they're 100 samples off, the maximum will be at τ=-100 or τ=+100 + // (depending on which one is later), and so on. This gives us single-sample accuracy, + // which is good enough for our use, and it reasonably cheap and simple to compute. + // + // The definition of the cross-correlation (where all sums are over the entire range i) is + // + // R_xy(τ) = sum(x[i] y[i + τ]) / sqrt(sum(x[i]²) sum(y[i]²)) + // + // This assumes a zero mean; if not, the signals will need to be normalized first. + // (We do this below.) It also assumes real-valued signals. + // + // We want to evaluate this over all τ as long as there is reasonable overlap, + // truncating the signals to the part where there is overlap (so the sums of x[i]² + // and y[i]² will also depend on τ, even though it isn't apparent in the formula). + // We could have done this by brute force; it would take about 70 ms, or a bit less + // with AVX-optimized sums. However, we can optimize it significantly with some + // standard algorithmic trickery: + // + // First of all, for the squared sums, we can simply use sum tables; compute the + // cumulative sum of x[i]² and y[i]², and then we can compute sum(x[i]², i=a..b) + // for any a and b by just looking up in the table and doing a subtraction. + // + // Calculating the sum_xy[τ] = sum(x[i] y[i + τ]) part efficiently for all τ + // requires a little signal processing. We use the identity that + // + // FFT(sum_xy) = conj(FFT(x)) * FFT(y) + // + // or equivalently + // + // sum_xy = IFFT(conj(FFT(x)) * FFT(y)) + // + // where conj() is complex conjugate and * is (complex) pointwise multiplication. + // Since FFT and IFFT can be calculated in O(n log n) time, and we already link to ffmpeg, + // which supplies a reasonably efficient implementation, this gives us a significant + // speedup (down to 3–4 ms or less) and a convenient way to calculate sum_xy. + // FFT gives us _circular_ convolution, so sum_xy[0] is for τ=0, sum_xy[1] is for τ=1, + // sum_xy[N - 1] is for τ=-1 and so on. This also means we'll need to take care to + // zero-pad our signals so that we don't get values corrupted by wraparound. + // (We also need to pad to a power of two, since ffmpeg's FFT doesn't support other sizes.) + + // Resample to 16 kHz, so we have a consistent sample rate between the two. + // It also helps performance (and removes HF noise, although noise + // shouldn't be a big problem for the algorithm). Note that ffmpeg's + // FFT drops down to a less optimized implementation for N > 65536; + // with our choice of parameters, N = 32768 typically, so we should be fine. + // + // We don't do sub-sample precision, so this takes us to 0.06 ms, which + // is more than good enough. + constexpr float freq_hz = 16000.0; + vector x = resample(vals, sample_rate, freq_hz); + vector y = resample(reference->vals, sample_rate, freq_hz); + + // There should normally be no DC, but let's be sure. + remove_dc(&x); + remove_dc(&y); + + // Truncate the clips so that they start at the same point. + if (first_sample < reference->first_sample) { + int trunc_samples = lrintf(duration(reference->first_sample - first_sample).count() * freq_hz); + assert(trunc_samples >= 0); + if (size_t(trunc_samples) >= x.size()) { + return { 0.0, 0.0f / 0.0f }; + } + x.erase(x.begin(), x.begin() + trunc_samples); + } else if (reference->first_sample < first_sample) { + int trunc_samples = lrintf(duration(first_sample - reference->first_sample).count() * freq_hz); + assert(trunc_samples >= 0); + if (size_t(trunc_samples) >= y.size()) { + return { 0.0, 0.0f / 0.0f }; + } + y.erase(y.begin(), y.begin() + trunc_samples); + } + + // Truncate the clips to one second. + if (x.size() > int(freq_hz)) x.resize(int(freq_hz)); + if (y.size() > int(freq_hz)) y.resize(int(freq_hz)); + + // Find the cumulative sum of squares. Doing this once will allow us to + // find sum(x[i]², i=a..b) essentially for free in all the iterations below. + vector cumsum_xx = calc_sumsq_table(x); + vector cumsum_yy = calc_sumsq_table(y); + + unsigned N = round_up_to_pow2(x.size() + y.size() - 1); + unique_ptr padded_sum_xy = pad_and_correlate(x, y, N); + + // We only search for delays such that there's at least 100 ms overlap between them. + // Less than that, and the chance of a spurious match feels rather high + // (and 900 ms delay measured on a one-second window sounds pretty excessive!). + constexpr int min_overlap = freq_hz * 0.1; + BestCorrelation best_corr { 0.0, 0.0f / 0.0f }; // NaN. + + // First check candidates where y (the reference) has to be moved later to fit + // (ie., x, ourselves, is delayed -- positive delay). + for (size_t delay_y = 1; delay_y < x.size() - min_overlap; ++delay_y) { + size_t overlap = std::min(x.size() - delay_y, y.size()); + float sum_xy = padded_sum_xy[N - int(delay_y)] * (2.0 / N); + float sum_xx = find_sumsq(cumsum_xx, delay_y, delay_y + overlap); + float sum_yy = find_sumsq(cumsum_yy, 0, overlap); + float corr = sum_xy / sqrt(sum_xx * sum_yy); + + if (isnan(best_corr.correlation) || fabs(corr) > fabs(best_corr.correlation)) { + best_corr = BestCorrelation { int(delay_y) * 1000.0f / freq_hz, corr }; + } + } + + // Then where x (ourselves) has to be moved later to fit (ie., negative delay). + for (size_t delay_x = 0; delay_x < y.size() - min_overlap; ++delay_x) { + size_t overlap = std::min(x.size(), y.size() - delay_x); + float sum_xy = padded_sum_xy[delay_x] * (2.0 / N); + float sum_xx = find_sumsq(cumsum_xx, 0, overlap); + float sum_yy = find_sumsq(cumsum_yy, delay_x, delay_x + overlap); + float corr = sum_xy / sqrt(sum_xx * sum_yy); + + if (isnan(best_corr.correlation) || fabs(corr) > fabs(best_corr.correlation)) { + best_corr = BestCorrelation { -int(delay_x) * 1000.0f / freq_hz, corr }; + } + } + + return best_corr; +} + unique_ptr[]> AudioClip::get_min_max_peaks(unsigned width, steady_clock::time_point base) const { unique_ptr[]> min_max(new pair[width]);