git.sesse.net Git - nageru/commitdiff
Make an automated delay estimate, by way of cross-correlation.
author    Steinar H. Gunderson <steinar+nageru@gunderson.no>
          Sun, 11 Aug 2019 08:51:05 +0000 (10:51 +0200)
committer Steinar H. Gunderson <steinar+nageru@gunderson.no>
          Sun, 25 Aug 2019 22:39:08 +0000 (00:39 +0200)
nageru/audio_clip.cpp
nageru/audio_clip.h
nageru/delay_analyzer.cpp
nageru/delay_analyzer.h
nageru/delay_analyzer.ui

index ec97e6b0f04543920ead2028b5f08cabeb788aae..98f4e53e56261183e70b07b8e336ae3c8637fd0c 100644 (file)
@@ -2,6 +2,15 @@
 
 #include <assert.h>
 #include <math.h>
+#include <zita-resampler/resampler.h>
+
+#include <strings.h>
+
+extern "C" {
+#include <libavcodec/avfft.h>
+}
+
+#include <complex>
 
 using namespace std;
 using namespace std::chrono;
@@ -59,6 +68,269 @@ steady_clock::time_point AudioClip::get_first_sample() const
        return first_sample;
 }
 
+namespace {
+
+float avg(const vector<float> &vals)
+{
+       float sum = 0.0f;
+       for (float val : vals) {
+               sum += val;
+       }
+       return sum / vals.size();
+}
+
+void remove_dc(vector<float> *vals)
+{
+       float dc = avg(*vals);
+       for (float &val : *vals) {
+               val -= dc;
+       }
+}
+
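+// Given a cumulative sum-of-squares table (see calc_sumsq_table() below), return
+// the sum of squares over the half-open index range [first, last).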
+double find_sumsq(const vector<double> &sum_xx, size_t first, size_t last)
+{
+       if (first == 0) {
+               return sum_xx[last - 1];
+       } else {
+               return sum_xx[last - 1] - sum_xx[first - 1];
+       }
+}
+
+vector<double> calc_sumsq_table(const vector<float> &vals)
+{
+       vector<double> ret;
+       ret.resize(vals.size());
+
+       double sum = 0.0;
+       for (size_t i = 0; i < vals.size(); ++i) {
+               sum += vals[i] * vals[i];
+               ret[i] = sum;
+       }
+       return ret;
+}
+
+vector<float> resample(const vector<float> &vals, float src_freq, float dst_freq)
+{
+       Resampler resampler;
+       resampler.setup(src_freq, dst_freq, /*num_channels=*/1, /*hlen=*/32, /*frel=*/1.0);
+
+       // Prefill the resampler so that we don't get any extra delay from it.
+       resampler.inp_data = nullptr;
+       resampler.inp_count = resampler.inpsize() / 2 - 1;
+       resampler.out_data = nullptr;
+       resampler.out_count = 1024;
+       resampler.process();
+
+       vector<float> out;
+       out.resize(lrint(vals.size() * dst_freq / src_freq));
+
+       // Do the actual resampling.
+       resampler.inp_data = const_cast<float *>(vals.data());
+       resampler.inp_count = vals.size();
+       resampler.out_data = &out[0];
+       resampler.out_count = out.size();
+       resampler.process();
+
+       return out;
+}
+
+unique_ptr<FFTSample[], decltype(free)*> pad(const vector<float> &x, size_t N)
+{
+       assert(x.size() <= N);
+
+       // avfft requires AVX alignment.
+       void *ptr;
+       if (posix_memalign(&ptr, 32, N * sizeof(FFTSample)) != 0) {
+               perror("posix_memalign");
+               abort();
+       }
+       unique_ptr<FFTSample[], decltype(free)*> ret(reinterpret_cast<FFTSample *>(ptr), free);
+       for (size_t i = 0; i < x.size(); ++i) {
+               ret[i] = x[i];
+       }
+       for (size_t i = x.size(); i < N; ++i) {
+               ret[i] = 0.0f;
+       }
+       return ret;
+}
+
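+// Round x up to the smallest power of two >= x (e.g., 3000 -> 4096); powers of two
+// are returned unchanged. The shifts smear the highest set bit into all lower bits.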
+unsigned round_up_to_pow2(unsigned x)
+{
+       --x;
+       x |= x >> 1;
+       x |= x >> 2;
+       x |= x >> 4;
+       x |= x >> 8;
+       x |= x >> 16;
+       ++x;
+       return x;
+}
+
+// Calculate the unnormalized circular correlation between x and y, both zero-padded to length N.
+unique_ptr<FFTSample[], decltype(free)*> pad_and_correlate(const vector<float> &x, const vector<float> &y, unsigned N)
+{
+       assert((N & (N - 1)) == 0);
+
+       unsigned bits = ffs(N) - 1;
+       unique_ptr<FFTSample[], decltype(free)*> padded_x = pad(x, N);
+       unique_ptr<FFTSample[], decltype(free)*> padded_y = pad(y, N);
+
+       RDFTContext *fft = av_rdft_init(bits, DFT_R2C);
+       av_rdft_calc(fft, &padded_x[0]);
+       av_rdft_calc(fft, &padded_y[0]);
+       av_rdft_end(fft);
+
+       // Now that we have FFT(X) and FFT(Y), we can compute FFT(result) = conj(FFT(X)) * FFT(Y).
+       // We reuse the X array for the result.
+
+       // The first two elements are the real values of the lowest (DC) and highest bin.
+       // (These have zero imaginary value, so that's not stored anywhere.)
+       padded_x[0] *= padded_y[0];
+       padded_x[1] *= padded_y[1];
+
+       // The remaining elements are complex values for each bin.
+       for (size_t i = 1; i < N / 2; ++i) {
+               complex<float> xc(padded_x[i * 2 + 0], padded_x[i * 2 + 1]);
+               complex<float> yc(padded_y[i * 2 + 0], padded_y[i * 2 + 1]);
+               complex<float> p = conj(xc) * yc;
+               padded_x[i * 2 + 0] = p.real();
+               padded_x[i * 2 + 1] = p.imag();
+       }
+
+       RDFTContext *ifft = av_rdft_init(bits, IDFT_C2R);
+       av_rdft_calc(ifft, &padded_x[0]);
+       av_rdft_end(ifft);
+
+       return padded_x;
+}
+
+} // namespace
+
+AudioClip::BestCorrelation AudioClip::find_best_correlation(const AudioClip *reference) const
+{
+       // We estimate the delay between the two clips by way of (normalized) cross-correlation;
+       // if they are perfectly in sync, the cross-correlation will have its maximum at τ=0,
+       // if they're 100 samples off, the maximum will be at τ=-100 or τ=+100
+       // (depending on which one is later), and so on. This gives us single-sample accuracy,
+       // which is good enough for our use, and it is reasonably cheap and simple to compute.
+       //
+       // The definition of the cross-correlation (where all sums are over the entire range i) is
+       //
+       //  R_xy(τ) = sum(x[i] y[i + τ]) / sqrt(sum(x[i]²) sum(y[i]²))
+       //
+       // This assumes a zero mean; if not, the signals will need to have their DC offset removed first.
+       // (We do this below.) It also assumes real-valued signals.
+       //
+       // We want to evaluate this over all τ as long as there is reasonable overlap,
+       // truncating the signals to the part where there is overlap (so the sums of x[i]²
+       // and y[i]² will also depend on τ, even though it isn't apparent in the formula).
+       // We could have done this by brute force; it would take about 70 ms, or a bit less
+       // with AVX-optimized sums. However, we can optimize it significantly with some
+       // standard algorithmic trickery:
+       //
+       // First of all, for the squared sums, we can simply use sum tables; compute the
+       // cumulative sum of x[i]² and y[i]², and then we can compute sum(x[i]², i=a..b)
+       // for any a and b by just looking up in the table and doing a subtraction.
+       //
+       // Calculating the sum_xy[τ] = sum(x[i] y[i + τ]) part efficiently for all τ
+       // requires a little signal processing. We use the identity that
+       //
+       //    FFT(sum_xy) = conj(FFT(x)) * FFT(y)
+       //
+       // or equivalently
+       //
+       //    sum_xy = IFFT(conj(FFT(x)) * FFT(y))
+       //
+       // where conj() is complex conjugate and * is (complex) pointwise multiplication.
+       // Since FFT and IFFT can be calculated in O(n log n) time, and we already link to ffmpeg,
+       // which supplies a reasonably efficient implementation, this gives us a significant
+       // speedup (down to 3–4 ms or less) and a convenient way to calculate sum_xy.
+       // FFT gives us _circular_ convolution, so sum_xy[0] is for τ=0, sum_xy[1] is for τ=1,
+       // sum_xy[N - 1] is for τ=-1 and so on. This also means we'll need to take care to
+       // zero-pad our signals so that we don't get values corrupted by wraparound.
+       // (We also need to pad to a power of two, since ffmpeg's FFT doesn't support other sizes.)
+
+       // Resample to 16 kHz, so we have a consistent sample rate between the two.
+       // It also helps performance (and removes HF noise, although noise
+       // shouldn't be a big problem for the algorithm). Note that ffmpeg's
+       // FFT drops down to a less optimized implementation for N > 65536;
+       // with our choice of parameters, N = 32768 typically, so we should be fine.
+       //
+       // We don't do sub-sample precision, so this takes us to 0.06 ms, which
+       // is more than good enough.
+       constexpr float freq_hz = 16000.0;
+       vector<float> x = resample(vals, sample_rate, freq_hz);
+       vector<float> y = resample(reference->vals, sample_rate, freq_hz);
+
+       // There should normally be no DC, but let's be sure.
+       remove_dc(&x);
+       remove_dc(&y);
+
+       // Truncate the clips so that they start at the same point.
+       if (first_sample < reference->first_sample) {
+               int trunc_samples = lrintf(duration<double>(reference->first_sample - first_sample).count() * freq_hz);
+               assert(trunc_samples >= 0);
+               if (size_t(trunc_samples) >= x.size()) {
+                       return { 0.0, 0.0f / 0.0f };
+               }
+               x.erase(x.begin(), x.begin() + trunc_samples);
+       } else if (reference->first_sample < first_sample) {
+               int trunc_samples = lrintf(duration<double>(first_sample - reference->first_sample).count() * freq_hz);
+               assert(trunc_samples >= 0);
+               if (size_t(trunc_samples) >= y.size()) {
+                       return { 0.0, 0.0f / 0.0f };
+               }
+               y.erase(y.begin(), y.begin() + trunc_samples);
+       }
+
+       // Truncate the clips to one second.
+       if (x.size() > int(freq_hz)) x.resize(int(freq_hz));
+       if (y.size() > int(freq_hz)) y.resize(int(freq_hz));
+
+       // Find the cumulative sum of squares. Doing this once will allow us to
+       // find sum(x[i]², i=a..b) essentially for free in all the iterations below.
+       vector<double> cumsum_xx = calc_sumsq_table(x);
+       vector<double> cumsum_yy = calc_sumsq_table(y);
+
+       unsigned N = round_up_to_pow2(x.size() + y.size() - 1);
+       unique_ptr<FFTSample[], decltype(free)*> padded_sum_xy = pad_and_correlate(x, y, N);
+
+       // We only search for delays such that there's at least 100 ms overlap between them.
+       // Less than that, and the chance of a spurious match feels rather high
+       // (and 900 ms delay measured on a one-second window sounds pretty excessive!).
+       constexpr int min_overlap = freq_hz * 0.1;
+       BestCorrelation best_corr { 0.0, 0.0f / 0.0f };  // NaN.
+
+       // First check candidates where y (the reference) has to be moved later to fit
+       // (i.e., x, ourselves, is delayed -- positive delay).
+       for (size_t delay_y = 1; delay_y < x.size() - min_overlap; ++delay_y) {
+               size_t overlap = std::min(x.size() - delay_y, y.size());
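+               // The 2/N factor undoes the scaling from the unnormalized forward + inverse rdft round trip.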
+               float sum_xy = padded_sum_xy[N - int(delay_y)] * (2.0 / N);
+               float sum_xx = find_sumsq(cumsum_xx, delay_y, delay_y + overlap);
+               float sum_yy = find_sumsq(cumsum_yy, 0, overlap);
+               float corr = sum_xy / sqrt(sum_xx * sum_yy);
+
+               if (isnan(best_corr.correlation) || fabs(corr) > fabs(best_corr.correlation)) {
+                       best_corr = BestCorrelation { int(delay_y) * 1000.0f / freq_hz, corr };
+               }
+       }
+
+       // Then where x (ourselves) has to be moved later to fit (i.e., negative delay).
+       for (size_t delay_x = 0; delay_x < y.size() - min_overlap; ++delay_x) {
+               size_t overlap = std::min(x.size(), y.size() - delay_x);
+               float sum_xy = padded_sum_xy[delay_x] * (2.0 / N);
+               float sum_xx = find_sumsq(cumsum_xx, 0, overlap);
+               float sum_yy = find_sumsq(cumsum_yy, delay_x, delay_x + overlap);
+               float corr = sum_xy / sqrt(sum_xx * sum_yy);
+
+               if (isnan(best_corr.correlation) || fabs(corr) > fabs(best_corr.correlation)) {
+                       best_corr = BestCorrelation { -int(delay_x) * 1000.0f / freq_hz, corr };
+               }
+       }
+
+       return best_corr;
+}
+
 unique_ptr<pair<float, float>[]> AudioClip::get_min_max_peaks(unsigned width, steady_clock::time_point base) const
 {
        unique_ptr<pair<float, float>[]> min_max(new pair<float, float>[width]);
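
For reference, here is a minimal brute-force sketch (not part of the commit; all names are illustrative) of the same normalized cross-correlation that find_best_correlation() computes via the FFT shortcut above. It assumes the two signals have already been resampled to a common rate and had their DC removed, runs in O(n·m) rather than O(N log N), and is mainly useful for sanity-checking the fast path on short test signals — the delay and correlation it reports should match the FFT-based result to within rounding.

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

struct NaiveBestCorrelation {
	int delay_samples;   // Positive: x lags y (same sign convention as AudioClip::BestCorrelation).
	float correlation;   // Between -1 and +1.
};

// For every candidate delay with at least min_overlap samples of overlap,
// correlate x[i] against y[i - delay] and keep the delay with the largest |correlation|.
NaiveBestCorrelation naive_best_correlation(const std::vector<float> &x,
                                            const std::vector<float> &y,
                                            int min_overlap)
{
	NaiveBestCorrelation best { 0, std::numeric_limits<float>::quiet_NaN() };
	const int nx = int(x.size()), ny = int(y.size());
	for (int delay = -(ny - min_overlap); delay <= nx - min_overlap; ++delay) {
		// x[i] lines up with y[i - delay]; restrict i to where both samples exist.
		const int start = std::max(0, delay);
		const int end = std::min(nx, ny + delay);
		double sum_xy = 0.0, sum_xx = 0.0, sum_yy = 0.0;
		for (int i = start; i < end; ++i) {
			sum_xy += x[i] * y[i - delay];
			sum_xx += x[i] * x[i];
			sum_yy += y[i - delay] * y[i - delay];
		}
		const float corr = sum_xy / std::sqrt(sum_xx * sum_yy);
		if (std::isnan(best.correlation) || std::fabs(corr) > std::fabs(best.correlation)) {
			best = NaiveBestCorrelation { delay, corr };
		}
	}
	return best;
}
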
index 693dbbd4aa863e141a4ac619ea24e59d10c23f56..c087bdb2caa199035fb8e5f2e58e2d67e1abf574 100644 (file)
@@ -22,6 +22,12 @@ public:
        // Only valid if not empty.
        std::chrono::steady_clock::time_point get_first_sample() const;
 
+       struct BestCorrelation {
+               float delay_ms;  // Positive values mean this clip is delayed compared to the reference.
+               float correlation;  // Between -1 and +1 (+1 is a perfect match, -1 is a perfect inversion).
+       };
+       BestCorrelation find_best_correlation(const AudioClip *reference) const;
+
        std::unique_ptr<std::pair<float, float>[]> get_min_max_peaks(unsigned width, std::chrono::steady_clock::time_point base) const;
 
 private:
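
As a usage note (illustrative, not part of the commit): a caller holding two AudioClip objects captured at the same time could query the estimate roughly like this. A positive delay_ms means this clip is late relative to the reference, a negative value means it is early, and a negative correlation suggests the channel is polarity-inverted — the same interpretation DelayAnalyzer::estimate_delay() applies below.

#include <cmath>
#include <cstdio>
#include "audio_clip.h"

void print_offset(const AudioClip &clip, const AudioClip &reference)
{
	AudioClip::BestCorrelation result = clip.find_best_correlation(&reference);
	if (std::isnan(result.correlation)) {
		printf("No usable estimate (too little overlap, or digital silence).\n");
	} else {
		printf("Offset: %+.1f ms, correlation %.3f%s\n",
		       result.delay_ms, result.correlation,
		       result.correlation < 0.0f ? " (inverted)" : "");
	}
}
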
index 9e8206015de30a80ce74455a0efdd9d12ff6f19f..b3127c4630567db073871b7f1505218a1028999b 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "audio_mixer.h"
 #include "ui_delay_analyzer.h"
+#include "shared/post_to_main_thread.h"
 
 #include <bmusb/bmusb.h>
 
@@ -53,6 +54,7 @@ void DelayAnalyzer::grab_clicked()
        grab_timeout->start(milliseconds(2000));
        clip1.clear();
        clip2.clear();
+       ui->delay_estimate_label->clear();
        ui->peak_display_1->reset_base();
        ui->peak_display_2->reset_base();
        ui->peak_display_1->audio_clip_updated();
@@ -90,6 +92,7 @@ void DelayAnalyzer::card_selected(QComboBox *card_combo, int selected_index)
                clip2.clear();
                ui->peak_display_2->audio_clip_updated();
        }
+       ui->delay_estimate_label->clear();
 }
 
 void DelayAnalyzer::channel_selected(QComboBox *channel_combo)
@@ -101,6 +104,7 @@ void DelayAnalyzer::channel_selected(QComboBox *channel_combo)
                clip2.clear();
                ui->peak_display_2->audio_clip_updated();
        }
+       ui->delay_estimate_label->clear();
 }
 
 DeviceSpec DelayAnalyzer::get_selected_device(QComboBox *card_combo)
@@ -152,7 +156,29 @@ void DelayAnalyzer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsig
                        grab_timeout->stop();
                });
                grabbing = false;
+
+               // The delay estimation is cheap, but it's not free, and we're on the main mixer thread,
+               // so fire it off onto a background thread.
+               std::thread(&DelayAnalyzer::estimate_delay, this).detach();
+       }
+}
+
+void DelayAnalyzer::estimate_delay()
+{
+       AudioClip::BestCorrelation corr = clip1.find_best_correlation(&clip2);
+       char buf[256];
+       if (isnan(corr.correlation) || fabs(corr.correlation) < 0.1) {  // Very loose bound, basically to guard against digital silence.
+               snprintf(buf, sizeof(buf), "Could not estimate delay.\n");
+       } else if (corr.correlation < 0.0) {
+               snprintf(buf, sizeof(buf), "Best estimate: Channel is %.1f ms delayed compared to reference <em>and inverted</em> (correlation = %.3f)\n",
+                       corr.delay_ms, corr.correlation);
+       } else {
+               snprintf(buf, sizeof(buf), "Best estimate: Channel is %.1f ms delayed compared to reference (correlation = %.3f)\n",
+                       corr.delay_ms, corr.correlation);
        }
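+       // buf (a plain char array) is captured by value, so the posted lambda carries its own copy of the text.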
+       post_to_main_thread([this, buf] {
+               ui->delay_estimate_label->setText(buf);
+       });
 }
 
 void DelayAnalyzer::grab_timed_out()
index a9770533ef28e07a7ca0a218b4acdb6d45d54eac..5b22ca5013d64daca3ddffc5bd2f62c101189bd1 100644 (file)
@@ -34,6 +34,7 @@ public:
 
 private:
        void grab_timed_out();
+       void estimate_delay();
 
        Ui::DelayAnalyzer *ui;
        AudioClip clip1, clip2;
index ce3566a90a9e6e369999d0b5748d7f0bc1b9a8fa..e36a20a84150c3970e281c50c2a5ac0a14de5d67 100644 (file)
@@ -6,7 +6,7 @@
    <rect>
     <x>0</x>
     <y>0</y>
-    <width>1092</width>
+    <width>1200</width>
     <height>412</height>
    </rect>
   </property>
   </property>
   <widget class="QWidget" name="centralwidget">
    <layout class="QGridLayout" name="gridLayout_2" columnstretch="0,0,0,1">
-    <item row="6" column="0">
-     <widget class="QPushButton" name="grab_btn">
-      <property name="text">
-       <string>Grab</string>
-      </property>
-     </widget>
-    </item>
-    <item row="0" column="0">
-     <spacer name="verticalSpacer">
-      <property name="orientation">
-       <enum>Qt::Vertical</enum>
-      </property>
-      <property name="sizeHint" stdset="0">
-       <size>
-        <width>20</width>
-        <height>40</height>
-       </size>
-      </property>
-     </spacer>
-    </item>
-    <item row="4" column="1">
-     <widget class="QComboBox" name="card_combo_2"/>
-    </item>
-    <item row="5" column="0">
-     <spacer name="verticalSpacer_2">
-      <property name="orientation">
-       <enum>Qt::Vertical</enum>
-      </property>
-      <property name="sizeHint" stdset="0">
-       <size>
-        <width>20</width>
-        <height>40</height>
-       </size>
-      </property>
-     </spacer>
-    </item>
-    <item row="4" column="2">
-     <widget class="QComboBox" name="channel_combo_2"/>
-    </item>
-    <item row="2" column="1">
-     <widget class="QComboBox" name="card_combo_1"/>
-    </item>
-    <item row="4" column="0">
-     <widget class="QLabel" name="label">
-      <property name="text">
-       <string>Reference</string>
-      </property>
-     </widget>
-    </item>
-    <item row="2" column="2">
-     <widget class="QComboBox" name="channel_combo_1"/>
-    </item>
     <item row="2" column="3">
      <widget class="QFrame" name="frame">
       <property name="minimumSize">
       </layout>
      </widget>
     </item>
+    <item row="2" column="1">
+     <widget class="QComboBox" name="card_combo_1"/>
+    </item>
+    <item row="4" column="0">
+     <widget class="QLabel" name="label">
+      <property name="text">
+       <string>Reference</string>
+      </property>
+     </widget>
+    </item>
+    <item row="2" column="2">
+     <widget class="QComboBox" name="channel_combo_1"/>
+    </item>
     <item row="4" column="3">
      <widget class="QFrame" name="frame_2">
       <property name="frameShape">
       </layout>
      </widget>
     </item>
+    <item row="7" column="0">
+     <widget class="QPushButton" name="grab_btn">
+      <property name="text">
+       <string>Grab</string>
+      </property>
+     </widget>
+    </item>
+    <item row="0" column="0">
+     <spacer name="verticalSpacer">
+      <property name="orientation">
+       <enum>Qt::Vertical</enum>
+      </property>
+      <property name="sizeHint" stdset="0">
+       <size>
+        <width>20</width>
+        <height>40</height>
+       </size>
+      </property>
+     </spacer>
+    </item>
+    <item row="4" column="1">
+     <widget class="QComboBox" name="card_combo_2"/>
+    </item>
+    <item row="6" column="0">
+     <spacer name="verticalSpacer_2">
+      <property name="orientation">
+       <enum>Qt::Vertical</enum>
+      </property>
+      <property name="sizeHint" stdset="0">
+       <size>
+        <width>20</width>
+        <height>40</height>
+       </size>
+      </property>
+     </spacer>
+    </item>
+    <item row="4" column="2">
+     <widget class="QComboBox" name="channel_combo_2"/>
+    </item>
+    <item row="5" column="3">
+     <widget class="QLabel" name="delay_estimate_label">
+      <property name="minimumSize">
+       <size>
+        <width>0</width>
+        <height>40</height>
+       </size>
+      </property>
+      <property name="text">
+       <string/>
+      </property>
+      <property name="alignment">
+       <set>Qt::AlignRight|Qt::AlignTrailing|Qt::AlignVCenter</set>
+      </property>
+     </widget>
+    </item>
    </layout>
   </widget>
  </widget>