Rework the audio/video sync algorithm.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 2 Feb 2017 23:03:58 +0000 (00:03 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 2 Feb 2017 23:03:58 +0000 (00:03 +0100)
It turns out I've been misunderstanding parts of Fons' paper;
my estimation is different, and although it works surprisingly well
for something that's hardly supposed to work at all, it has some
significant problems with edge cases where the frame rates are
_nearly_ matched but slightly off (e.g. 59.94 Hz input on a 60 Hz
output); under the old algorithm, the estimated delay becomes a very
slow sawtooth, which isn't nice even after being passed through the
loop filter.

The new algorithm probably still isn't 100% identical to zita-ajbridge,
but it should be much closer to how the algorithm is intended to work.
In particular, it makes a real attempt to account for the fact that an
output frame can arrive between two input frames in time; this makes it
dependent on the system clock, but that is precisely the core that was
missing from the algorithm, so it's more a feature than a bug.

I've also made real attempts at making all the received timestamps
more stable; FakeCapture is still a bit odd (especially at startup),
since it delivers frames late instead of dropping them, but it
generally seems to work OK. For cases of frame rate mismatch (even
pretty benign ones), the correction rate is about two orders of
magnitude more stable, i.e., the maximum deviation from 1.0 during
normal operation is greatly reduced.
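
(For illustration, a minimal C++ sketch of the new estimation idea,
simplified from the ResamplingQueue changes below; it mirrors the new
add_input_samples()/get_output_samples() logic but is not the actual code:)

#include <chrono>

using namespace std::chrono;

// One known-good input point: a receive timestamp plus the total number
// of input samples received at that instant (t_a/k_a in Fons' paper).
struct InputPoint {
        steady_clock::time_point ts;
        size_t input_samples_received;
};

// Estimate the input frequency (in Hz) from two good points, as the new
// add_input_samples() does.
double estimate_freq_in(const InputPoint &a0, const InputPoint &a1)
{
        return (a1.input_samples_received - a0.input_samples_received) /
                duration<double>(a1.ts - a0.ts).count();
}

// Extrapolate how many input samples exist at an arbitrary output
// instant <ts>, which may fall between two input frames -- the core
// piece that was missing from the old algorithm.
double extrapolate_input_samples(const InputPoint &base, double freq_in,
                                 steady_clock::time_point ts)
{
        return base.input_samples_received +
                freq_in * duration<double>(ts - base.ts).count();
}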

13 files changed:
alsa_input.cpp
alsa_input.h
alsa_pool.cpp
audio_mixer.cpp
audio_mixer.h
benchmark_audio_mixer.cpp
decklink_capture.cpp
decklink_output.cpp
decklink_output.h
mixer.cpp
mixer.h
resampling_queue.cpp
resampling_queue.h

diff --git a/alsa_input.cpp b/alsa_input.cpp
index 158bfaa32e0e914d50a1a29efe52a6cec5fb7bd3..0d6901423b17085d65459708385a5fcb055218a4 100644
--- a/alsa_input.cpp
+++ b/alsa_input.cpp
@@ -12,6 +12,7 @@
 #include "timebase.h"
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 #define RETURN_ON_ERROR(msg, expr) do {                                                    \
@@ -243,10 +244,11 @@ ALSAInput::CaptureEndReason ALSAInput::do_capture()
 
                const int64_t prev_pts = frames_to_pts(num_frames_output);
                const int64_t pts = frames_to_pts(num_frames_output + frames);
+               const steady_clock::time_point now = steady_clock::now();
                bool success;
                do {
                        if (should_quit) return CaptureEndReason::REQUESTED_QUIT;
-                       success = audio_callback(buffer.get(), frames, audio_format, pts - prev_pts);
+                       success = audio_callback(buffer.get(), frames, audio_format, pts - prev_pts, now);
                } while (!success);
                num_frames_output += frames;
        }
diff --git a/alsa_input.h b/alsa_input.h
index 6bce9134cdcd2a3fb3d381988cc482d4419ccbc1..bae9fdf78413188cef9b9c2d86c06c67bda019dd 100644
--- a/alsa_input.h
+++ b/alsa_input.h
@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include <sys/types.h>
 #include <atomic>
+#include <chrono>
 #include <functional>
 #include <memory>
 #include <string>
@@ -24,7 +25,7 @@ class ALSAPool;
 
 class ALSAInput {
 public:
-       typedef std::function<bool(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length)> audio_callback_t;
+       typedef std::function<bool(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length, std::chrono::steady_clock::time_point ts)> audio_callback_t;
 
        ALSAInput(const char *device, unsigned sample_rate, unsigned num_channels, audio_callback_t audio_callback, ALSAPool *parent_pool, unsigned internal_dev_index);
        ~ALSAInput();
diff --git a/alsa_pool.cpp b/alsa_pool.cpp
index 3a4a5b0eddf74ff876a9c00f432f1cacb1fe5004..348c6232303ab561d3a4039ca69932261bd3e8bc 100644
--- a/alsa_pool.cpp
+++ b/alsa_pool.cpp
@@ -393,7 +393,7 @@ void ALSAPool::reset_device(unsigned index)
                inputs[index].reset();
        } else {
                // TODO: Put on a background thread instead of locking?
-               auto callback = bind(&AudioMixer::add_audio, global_audio_mixer, DeviceSpec{InputSourceType::ALSA_INPUT, index}, _1, _2, _3, _4);
+               auto callback = bind(&AudioMixer::add_audio, global_audio_mixer, DeviceSpec{InputSourceType::ALSA_INPUT, index}, _1, _2, _3, _4, _5);
                inputs[index].reset(new ALSAInput(device->address.c_str(), OUTPUT_FREQUENCY, device->num_channels, callback, this, index));
                inputs[index]->start_capture_thread();
        }
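
(For clarity: the extra _5 placeholder simply forwards the new timestamp
argument. Given the audio_callback_t signature from alsa_input.h, the bind
expression above behaves like this lambda sketch:)

auto callback = [index](const uint8_t *data, unsigned num_samples,
                        bmusb::AudioFormat audio_format, int64_t frame_length,
                        std::chrono::steady_clock::time_point ts) {
        return global_audio_mixer->add_audio(
                DeviceSpec{InputSourceType::ALSA_INPUT, index},
                data, num_samples, audio_format, frame_length, ts);
};
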
diff --git a/audio_mixer.cpp b/audio_mixer.cpp
index 01fed687b47a8d2ae77a752896f925d11c176c37..e4d95a49422250e44dced2a84f11dc866ac878f6 100644
--- a/audio_mixer.cpp
+++ b/audio_mixer.cpp
@@ -25,6 +25,7 @@
 
 using namespace bmusb;
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 namespace {
@@ -231,10 +232,9 @@ void AudioMixer::reset_resampler_mutex_held(DeviceSpec device_spec)
                        device_spec.index, device->capture_frequency, OUTPUT_FREQUENCY, device->interesting_channels.size(),
                        global_flags.audio_queue_length_ms * 0.001));
        }
-       device->next_local_pts = 0;
 }
 
-bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, AudioFormat audio_format, int64_t frame_length)
+bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, AudioFormat audio_format, int64_t frame_length, steady_clock::time_point frame_time)
 {
        AudioDevice *device = find_audio_device(device_spec);
 
@@ -274,9 +274,7 @@ bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned
        }
 
        // Now add it.
-       int64_t local_pts = device->next_local_pts;
-       device->resampling_queue->add_input_samples(local_pts / double(TIMEBASE), audio.get(), num_samples);
-       device->next_local_pts = local_pts + frame_length;
+       device->resampling_queue->add_input_samples(frame_time, audio.get(), num_samples, ResamplingQueue::ADJUST_RATE);
        return true;
 }
 
@@ -298,11 +296,7 @@ bool AudioMixer::add_silence(DeviceSpec device_spec, unsigned samples_per_frame,
 
        vector<float> silence(samples_per_frame * num_channels, 0.0f);
        for (unsigned i = 0; i < num_frames; ++i) {
-               device->resampling_queue->add_input_samples(device->next_local_pts / double(TIMEBASE), silence.data(), samples_per_frame);
-               // Note that if the format changed in the meantime, we have
-               // no way of detecting that; we just have to assume the frame length
-               // is always the same.
-               device->next_local_pts += frame_length;
+               device->resampling_queue->add_input_samples(steady_clock::now(), silence.data(), samples_per_frame, ResamplingQueue::DO_NOT_ADJUST_RATE);
        }
        return true;
 }
@@ -475,7 +469,7 @@ void apply_gain(float db, float last_db, vector<float> *samples)
 
 }  // namespace
 
-vector<float> AudioMixer::get_output(double pts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
+vector<float> AudioMixer::get_output(steady_clock::time_point ts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
        map<DeviceSpec, vector<float>> samples_card;
        vector<float> samples_bus;
@@ -490,7 +484,7 @@ vector<float> AudioMixer::get_output(double pts, unsigned num_samples, Resamplin
                        memset(&samples_card[device_spec][0], 0, samples_card[device_spec].size() * sizeof(float));
                } else {
                        device->resampling_queue->get_output_samples(
-                               pts,
+                               ts,
                                &samples_card[device_spec][0],
                                num_samples,
                                rate_adjustment_policy);
diff --git a/audio_mixer.h b/audio_mixer.h
index 85f9aedf96a77779b31e3e9a5433710076659208..8d4f15c7f2020b399fb726f4ce5b025ac22d65fb 100644
--- a/audio_mixer.h
+++ b/audio_mixer.h
@@ -12,6 +12,7 @@
 #include <stdint.h>
 #include <zita-resampler/resampler.h>
 #include <atomic>
+#include <chrono>
 #include <functional>
 #include <map>
 #include <memory>
@@ -54,7 +55,7 @@ public:
        // (This is to avoid a deadlock where a card hangs on the mutex in add_audio()
        // while we are trying to shut it down from another thread that also holds
        // the mutex.) frame_length is in TIMEBASE units.
-       bool add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length);
+       bool add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length, std::chrono::steady_clock::time_point frame_time);
        bool add_silence(DeviceSpec device_spec, unsigned samples_per_frame, unsigned num_frames, int64_t frame_length);
 
        // If a given device is offline for whatever reason and cannot deliver audio
@@ -64,7 +65,7 @@ public:
        // affect it. Same true/false behavior as add_audio().
        bool silence_card(DeviceSpec device_spec, bool silence);
 
-       std::vector<float> get_output(double pts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy);
+       std::vector<float> get_output(std::chrono::steady_clock::time_point ts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy);
 
        float get_fader_volume(unsigned bus_index) const { return fader_volume_db[bus_index]; }
        void set_fader_volume(unsigned bus_index, float level_db) { fader_volume_db[bus_index] = level_db; }
@@ -301,7 +302,6 @@ public:
 private:
        struct AudioDevice {
                std::unique_ptr<ResamplingQueue> resampling_queue;
-               int64_t next_local_pts = 0;
                std::string display_name;
                unsigned capture_frequency = OUTPUT_FREQUENCY;
                // Which channels we consider interesting (ie., are part of some input_mapping).
diff --git a/benchmark_audio_mixer.cpp b/benchmark_audio_mixer.cpp
index 4b8f84a93ef0458dffa25d8806a54564afa2400e..3327179c3518e6dc492b794c636d7d24bafde7a7 100644
--- a/benchmark_audio_mixer.cpp
+++ b/benchmark_audio_mixer.cpp
@@ -61,6 +61,10 @@ void callback(float level_lufs, float peak_db,
 
 vector<float> process_frame(unsigned frame_num, AudioMixer *mixer)
 {
+       duration<int64_t, ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> frame_duration(frame_num);
+       steady_clock::time_point ts = steady_clock::time_point::min() +
+               duration_cast<steady_clock::duration>(frame_duration);
+
        // Feed the inputs.
        for (unsigned card_index = 0; card_index < NUM_BENCHMARK_CARDS; ++card_index) {
                bmusb::AudioFormat audio_format;
@@ -70,12 +74,11 @@ vector<float> process_frame(unsigned frame_num, AudioMixer *mixer)
                unsigned num_samples = NUM_SAMPLES + (lcgrand() % 9) - 5;
                bool ok = mixer->add_audio(DeviceSpec{InputSourceType::CAPTURE_CARD, card_index},
                        card_index == 3 ? samples24 : samples16, num_samples, audio_format,
-                       NUM_SAMPLES * TIMEBASE / OUTPUT_FREQUENCY);
+                       NUM_SAMPLES * TIMEBASE / OUTPUT_FREQUENCY, ts);
                assert(ok);
        }
 
-       double pts = double(frame_num) * NUM_SAMPLES / OUTPUT_FREQUENCY;
-       return mixer->get_output(pts, NUM_SAMPLES, ResamplingQueue::ADJUST_RATE);
+       return mixer->get_output(ts, NUM_SAMPLES, ResamplingQueue::ADJUST_RATE);
 }
 
 void init_mapping(AudioMixer *mixer)
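
(The duration<int64_t, ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> trick above
synthesizes deterministic timestamps for the benchmark. A self-contained
sketch of the same arithmetic; the constants are stand-ins, not necessarily
the benchmark's actual values:)

#include <chrono>
#include <ratio>

using namespace std::chrono;

constexpr int64_t NUM_SAMPLES = 1024;        // stand-in value
constexpr int64_t OUTPUT_FREQUENCY = 48000;  // output rate in Hz

steady_clock::time_point timestamp_for_frame(int64_t frame_num)
{
        // One tick of this duration type is exactly one frame's worth of
        // audio (NUM_SAMPLES / OUTPUT_FREQUENCY seconds), so frame_num ticks
        // is the elapsed audio time at the start of frame <frame_num>.
        duration<int64_t, std::ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> frame_duration(frame_num);
        return steady_clock::time_point::min() +
                duration_cast<steady_clock::duration>(frame_duration);
}

Anchoring at steady_clock::time_point::min() rather than now() keeps the
timestamps reproducible from run to run.
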
diff --git a/decklink_capture.cpp b/decklink_capture.cpp
index d38cc73a3ae432f62d9c368f766a74861f17622b..a0e890fcbcdd4fdc7c8bbb55e8212e8039980192 100644
--- a/decklink_capture.cpp
+++ b/decklink_capture.cpp
@@ -311,6 +311,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                done_init = true;
        }
 
+       steady_clock::time_point now = steady_clock::now();
+
        FrameAllocator::Frame current_video_frame, current_audio_frame;
        VideoFormat video_format;
        AudioFormat audio_format;
@@ -357,7 +359,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                        video_format.width = width;
                        video_format.height = height;
 
-                       current_video_frame.received_timestamp = steady_clock::now();
+                       current_video_frame.received_timestamp = now;
                }
        }
 
@@ -375,7 +377,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                        audio_format.bits_per_sample = 32;
                        audio_format.num_channels = 2;
 
-                       current_audio_frame.received_timestamp = steady_clock::now();
+                       current_audio_frame.received_timestamp = now;
                }
        }
 
diff --git a/decklink_output.cpp b/decklink_output.cpp
index a826dfe22ee95c5588f86edebec1c2ad1ce64ed3..38f44ee96c4d614e2d9ed2729ad2cc42e185eec8 100644
--- a/decklink_output.cpp
+++ b/decklink_output.cpp
@@ -242,7 +242,7 @@ void DeckLinkOutput::send_audio(int64_t pts, const std::vector<float> &samples)
        }
 }
 
-void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll)
+void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll, steady_clock::time_point *frame_timestamp)
 {
        assert(!should_quit);
 
@@ -277,11 +277,12 @@ void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *f
        double playback_speed;
        output->GetScheduledStreamTime(TIMEBASE, &stream_frame_time, &playback_speed);
 
+       *frame_timestamp = steady_clock::now() +
+               nanoseconds((target_time - stream_frame_time) * 1000000000 / TIMEBASE);
+
        // If we're ahead of time, wait for the frame to (approximately) start.
        if (stream_frame_time < target_time) {
-               steady_clock::time_point t = steady_clock::now() +
-                       nanoseconds((target_time - stream_frame_time) * 1000000000 / TIMEBASE);
-               this_thread::sleep_until(t);
+               this_thread::sleep_until(*frame_timestamp);
                return;
        }
 
@@ -296,6 +297,8 @@ void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *f
        // Oops, we missed by more than one frame. Return immediately,
        // but drop so that we catch up.
        *dropped_frames = (stream_frame_time - target_time + *frame_duration - 1) / *frame_duration;
+       const int64_t ns_per_frame = this->frame_duration * 1000000000 / TIMEBASE;
+       *frame_timestamp += nanoseconds(*dropped_frames * ns_per_frame);
        fprintf(stderr, "Dropped %d output frames; skipping.\n", *dropped_frames);
 }
 
diff --git a/decklink_output.h b/decklink_output.h
index 7bc9850d1ba8d5bd25ba016dedd6ba756ec235d9..d3bafcae60ae46b78521cdfc057e366f866afece 100644
--- a/decklink_output.h
+++ b/decklink_output.h
@@ -4,6 +4,7 @@
 #include <epoxy/gl.h>
 #include <stdint.h>
 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
@@ -41,7 +42,16 @@ public:
 
        void send_frame(GLuint y_tex, GLuint cbcr_tex, const std::vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration);
        void send_audio(int64_t pts, const std::vector<float> &samples);
-       void wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll);
+
+       // NOTE: The returned timestamp is undefined for preroll.
+       // Otherwise, it is the timestamp of the output frame as it should have been,
+       // even if we're overshooting. E.g. at 50 fps (0.02 spf), assuming the
+       // last frame was at t=0.980:
+       //
+       //   If we're at t=0.999, we wait until t=1.000 and return that.
+       //   If we're at t=1.001, we return t=1.000 immediately (small overshoot).
+       //   If we're at t=1.055, we drop two frames and return t=1.040 immediately.
+       void wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll, std::chrono::steady_clock::time_point *frame_timestamp);
 
        // Analogous to CaptureInterface. Will only return modes that have the right width/height.
        std::map<uint32_t, bmusb::VideoMode> get_available_video_modes() const { return video_modes; }
diff --git a/mixer.cpp b/mixer.cpp
index f77f6d48f7eb0fa4da4bbeb189d40ff24d5a85b6..b28da27e90f3781180dab3068b433135b739a1f2 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -383,7 +383,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
                } while (!success);
        }
 
-       audio_mixer.add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, frame_length);
+       audio_mixer.add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, frame_length, audio_frame.received_timestamp);
 
        // Done with the audio, so release it.
        if (audio_frame.owner) {
@@ -613,7 +613,7 @@ void Mixer::thread_func()
                }
 
                OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame);
-               schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll);
+               schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll, output_frame_info.frame_timestamp);
                stats_dropped_frames += output_frame_info.dropped_frames;
 
                handle_hotplugged_cards();
@@ -738,7 +738,7 @@ start:
        unique_lock<mutex> lock(card_mutex, defer_lock);
        if (master_card_is_output) {
                // Clocked to the output, so wait for it to be ready for the next frame.
-               cards[master_card_index].output->wait_for_frame(pts_int, &output_frame_info.dropped_frames, &output_frame_info.frame_duration, &output_frame_info.is_preroll);
+               cards[master_card_index].output->wait_for_frame(pts_int, &output_frame_info.dropped_frames, &output_frame_info.frame_duration, &output_frame_info.is_preroll, &output_frame_info.frame_timestamp);
                lock.lock();
        } else {
                // Wait for the master card to have a new frame.
@@ -758,6 +758,11 @@ start:
                goto start;
        }
 
+       if (!master_card_is_output) {
+               output_frame_info.frame_timestamp =
+                       cards[master_card_index].new_frames.front().received_timestamp;
+       }
+
        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                CaptureCard *card = &cards[card_index];
                if (card->new_frames.empty()) {
@@ -847,7 +852,7 @@ void Mixer::handle_hotplugged_cards()
 }
 
 
-void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll)
+void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll, steady_clock::time_point frame_timestamp)
 {
        // Resample the audio as needed, including from previously dropped frames.
        assert(num_cards > 0);
@@ -869,7 +874,7 @@ void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_sam
                        // better to just wait until we have a slightly more normal situation).
                        unique_lock<mutex> lock(audio_mutex);
                        bool adjust_rate = !dropped_frame && !is_preroll;
-                       audio_task_queue.push(AudioTask{pts_int, num_samples_per_frame, adjust_rate});
+                       audio_task_queue.push(AudioTask{pts_int, num_samples_per_frame, adjust_rate, frame_timestamp});
                        audio_task_queue_changed.notify_one();
                }
                if (dropped_frame) {
@@ -962,7 +967,7 @@ void Mixer::audio_thread_func()
                ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy =
                        task.adjust_rate ? ResamplingQueue::ADJUST_RATE : ResamplingQueue::DO_NOT_ADJUST_RATE;
                vector<float> samples_out = audio_mixer.get_output(
-                       double(task.pts_int) / TIMEBASE,
+                       task.frame_timestamp,
                        task.num_samples,
                        rate_adjustment_policy);
 
diff --git a/mixer.h b/mixer.h
index 045e9b1212fccb0f1392cd80fe87903c4079e898..d70c65b8a70ec71e900914119316535fac26772a 100644
--- a/mixer.h
+++ b/mixer.h
@@ -323,7 +323,7 @@ private:
        void place_rectangle(movit::Effect *resample_effect, movit::Effect *padding_effect, float x0, float y0, float x1, float y1);
        void thread_func();
        void handle_hotplugged_cards();
-       void schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll);
+       void schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll, std::chrono::steady_clock::time_point frame_timestamp);
        void render_one_frame(int64_t duration);
        void audio_thread_func();
        void release_display_frame(DisplayFrame *frame);
@@ -403,6 +403,7 @@ private:
                int num_samples;  // Audio samples needed for this output frame.
                int64_t frame_duration;  // In TIMEBASE units.
                bool is_preroll;
+               std::chrono::steady_clock::time_point frame_timestamp;
        };
        OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]);
 
@@ -452,6 +453,7 @@ private:
                int64_t pts_int;
                int num_samples;
                bool adjust_rate;
+               std::chrono::steady_clock::time_point frame_timestamp;
        };
        std::mutex audio_mutex;
        std::condition_variable audio_task_queue_changed;
diff --git a/resampling_queue.cpp b/resampling_queue.cpp
index 025fa5041ab4950eaf854f66b0fca603161e5862..188bf7d2d57ccdb98f249f86c59e0360e65b470b 100644
--- a/resampling_queue.cpp
+++ b/resampling_queue.cpp
 #include <cmath>
 
 using namespace std;
+using namespace std::chrono;
 
 ResamplingQueue::ResamplingQueue(unsigned card_num, unsigned freq_in, unsigned freq_out, unsigned num_channels, double expected_delay_seconds)
        : card_num(card_num), freq_in(freq_in), freq_out(freq_out), num_channels(num_channels),
+         current_estimated_freq_in(freq_in),
          ratio(double(freq_out) / double(freq_in)), expected_delay(expected_delay_seconds * OUTPUT_FREQUENCY)
 {
        vresampler.setup(ratio, num_channels, /*hlen=*/32);
@@ -41,54 +43,55 @@ ResamplingQueue::ResamplingQueue(unsigned card_num, unsigned freq_in, unsigned f
         vresampler.process ();
 }
 
-void ResamplingQueue::add_input_samples(double pts, const float *samples, ssize_t num_samples)
+void ResamplingQueue::add_input_samples(steady_clock::time_point ts, const float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
        if (num_samples == 0) {
                return;
        }
-       if (first_input) {
-               // Synthesize a fake length.
-               last_input_len = double(num_samples) / freq_in;
-               first_input = false;
-       } else {
-               last_input_len = pts - last_input_pts;
-       }
 
-       last_input_pts = pts;
-
-       k_a0 = k_a1;
-       k_a1 += num_samples;
+       bool good_sample = (rate_adjustment_policy == ADJUST_RATE);
+       if (good_sample && a1.good_sample) {
+               a0 = a1;
+       }
+       a1.ts = ts;
+       a1.input_samples_received += num_samples;
+       a1.good_sample = good_sample;
+       if (a0.good_sample && a1.good_sample) {
+               current_estimated_freq_in = (a1.input_samples_received - a0.input_samples_received) / duration<double>(a1.ts - a0.ts).count();
+               assert(current_estimated_freq_in >= 0.0);
+
+               // Bound the frequency, so that a single wild result won't throw the filter off guard.
+               current_estimated_freq_in = min(current_estimated_freq_in, 1.2 * freq_in);
+               current_estimated_freq_in = max(current_estimated_freq_in, 0.8 * freq_in);
+       }
 
        buffer.insert(buffer.end(), samples, samples + num_samples * num_channels);
 }
 
-bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
+bool ResamplingQueue::get_output_samples(steady_clock::time_point ts, float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
        assert(num_samples > 0);
-       if (first_input) {
+       if (a1.input_samples_received == 0) {
                // No data yet, just return zeros.
                memset(samples, 0, num_samples * num_channels * sizeof(float));
                return true;
        }
 
-       double rcorr = -1.0;
-       if (rate_adjustment_policy == ADJUST_RATE) {
-               double last_output_len;
-               if (first_output) {
-                       // Synthesize a fake length.
-                       last_output_len = double(num_samples) / freq_out;
-               } else {
-                       last_output_len = pts - last_output_pts;
-               }
-               last_output_pts = pts;
-
-               // Using the time point since just before the last call to add_input_samples() as a base,
-               // estimate actual delay based on activity since then, measured in number of input samples:
-               double actual_delay = 0.0;
-               assert(last_input_len != 0);
-               actual_delay += (k_a1 - k_a0) * last_output_len / last_input_len;    // Inserted samples since k_a0, rescaled for the different time periods.
-               actual_delay += k_a0 - total_consumed_samples;                       // Samples inserted before k_a0 but not consumed yet.
-               actual_delay += vresampler.inpdist();                                // Delay in the resampler itself.
+       if (rate_adjustment_policy == ADJUST_RATE && (a0.good_sample || a1.good_sample)) {
+               // Estimate the current number of input samples produced at
+               // this instant in time, by extrapolating from the last known
+               // good point. Note that we could be extrapolating backward or
+               // forward, depending on the timing of the calls.
+               const InputPoint &base_point = a1.good_sample ? a1 : a0;
+               const double input_samples_received = base_point.input_samples_received +
+                       current_estimated_freq_in * duration<double>(ts - base_point.ts).count();
+
+               // Estimate the number of input samples _consumed_ after we've run the resampler.
+               const double input_samples_consumed = total_consumed_samples +
+                       num_samples / (ratio * rcorr);
+
+               double actual_delay = input_samples_received - input_samples_consumed;
+               actual_delay += vresampler.inpdist();    // Delay in the resampler itself.
                double err = actual_delay - expected_delay;
                if (first_output && err < 0.0) {
                        // Before the very first block, insert artificial delay based on our initial estimate,
@@ -97,7 +100,7 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
                        for (ssize_t i = 0; i < delay_samples_to_add * num_channels; ++i) {
                                buffer.push_front(0.0f);
                        }
-                       total_consumed_samples -= delay_samples_to_add;  // Equivalent to increasing k_a0 and k_a1.
+                       total_consumed_samples -= delay_samples_to_add;  // Equivalent to increasing input_samples_received on a0 and a1.
                        err += delay_samples_to_add;
                }
                first_output = false;
@@ -105,10 +108,18 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
                // Compute loop filter coefficients for the two filters. We need to compute them
                // every time, since they depend on the number of samples the user asked for.
                //
-               // The loop bandwidth is at 0.02 Hz; we trust the initial estimate quite well,
-               // and our jitter is pretty large since none of the threads involved run at
-               // real-time priority.
-               double loop_bandwidth_hz = 0.02;
+               // The loop bandwidth is at 0.02 Hz; our jitter is pretty large
+               // since none of the threads involved run at real-time priority.
+               // However, for the first four seconds, we use a larger loop bandwidth (0.2 Hz),
+               // because there's a lot going on during startup, and thus the
+               // initial estimate might be tainted by jitter during that phase,
+               // and we want to converge faster.
+               //
+               // NOTE: The above logic might only hold during Nageru startup
+               // (we start ResamplingQueues also when we e.g. switch sound sources),
+               // but in general, a little bit of increased timing jitter is acceptable
+               // right after a setup change like this.
+               double loop_bandwidth_hz = (total_consumed_samples < 4 * freq_in) ? 0.2 : 0.02;
 
                // Set filters. The first filter is much wider than the second one (20x as wide).
                double w = (2.0 * M_PI) * loop_bandwidth_hz * num_samples / freq_out;
@@ -127,9 +138,9 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
                vresampler.set_rratio(rcorr);
        } else {
                assert(rate_adjustment_policy == DO_NOT_ADJUST_RATE);
-       };
+       }
 
-       // Finally actually resample, consuming exactly <num_samples> output samples.
+       // Finally actually resample, producing exactly <num_samples> output samples.
        vresampler.out_data = samples;
        vresampler.out_count = num_samples;
        while (vresampler.out_count > 0) {
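
(To make the new delay estimate concrete, a numeric sketch using the same
quantities as get_output_samples(); all figures are invented for
illustration:)

#include <cstdio>

int main()
{
        // A nominally 48 kHz input whose clock runs slightly fast, as
        // estimated from two good InputPoints.
        const double current_estimated_freq_in = 48005.0;
        const double base_samples_received = 96000.0;  // samples at the base point
        const double dt = 0.010;  // output instant, 10 ms after the base point

        // Extrapolated number of input samples produced at the output instant.
        const double input_samples_received =
                base_samples_received + current_estimated_freq_in * dt;  // 96480.05

        // Input samples consumed once 480 output samples have been produced.
        const double total_consumed_samples = 96000.0;
        const double ratio = 1.0, rcorr = 1.0;  // 48 kHz -> 48 kHz, no correction yet
        const double input_samples_consumed =
                total_consumed_samples + 480.0 / (ratio * rcorr);  // 96480.00

        // err = actual_delay - expected_delay (plus vresampler.inpdist())
        // then drives the loop filter, which nudges rcorr so the delay
        // drifts back toward expected_delay.
        const double actual_delay = input_samples_received - input_samples_consumed;
        printf("actual_delay = %.2f input samples\n", actual_delay);  // 0.05
        return 0;
}
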
diff --git a/resampling_queue.h b/resampling_queue.h
index 662a837262e8b84fcf3680ceae65bdf38f0e0871..b46086df9d0954a4aebd3af0acb3937ca6e314de 100644
--- a/resampling_queue.h
+++ b/resampling_queue.h
@@ -4,8 +4,8 @@
 // Takes in samples from an input source, possibly with jitter, and outputs a fixed number
 // of samples every iteration. Used to a) change sample rates if needed, and b) deal with
 // input sources that don't have audio locked to video. For every input video
-// frame, you call add_input_samples() with the pts (measured in seconds) of the video frame,
-// taken to be the start point of the frame's audio. When you want to _output_ a finished
+// frame, you call add_input_samples() with the received time point of the video frame,
+// taken to be the _end_ point of the frame's audio. When you want to _output_ a finished
 // frame with audio, you call get_output_samples() with the number of samples you want, and will
 // get exactly that number of samples back. If the input and output clocks are not in sync,
 // the audio will be stretched for you. (If they are _very_ out of sync, this will come through
@@ -40,6 +40,7 @@
 
 #include <sys/types.h>
 #include <zita-resampler/vresampler.h>
+#include <chrono>
 #include <deque>
 #include <memory>
 
@@ -58,10 +59,9 @@ public:
                ADJUST_RATE
        };
 
-       // Note: pts is always in seconds.
-       void add_input_samples(double pts, const float *samples, ssize_t num_samples);
+       void add_input_samples(std::chrono::steady_clock::time_point ts, const float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
        // Returns false if underrun.
-       bool get_output_samples(double pts, float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
+       bool get_output_samples(std::chrono::steady_clock::time_point ts, float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
 
 private:
        void init_loop_filter(double bandwidth_hz);
@@ -71,28 +71,43 @@ private:
        unsigned card_num;
        unsigned freq_in, freq_out, num_channels;
 
-       bool first_input = true, first_output = true;
-       double last_input_pts;   // Start of last input block, in seconds.
-       double last_output_pts;
+       bool first_output = true;
 
-       ssize_t k_a0 = 0;  // Total amount of samples inserted _before_ the last call to add_input_samples().
-       ssize_t k_a1 = 0;  // Total amount of samples inserted _after_ the last call to add_input_samples().
+       struct InputPoint {
+               // Equivalent to t_a0 or t_a1 in the paper.
+               std::chrono::steady_clock::time_point ts;
 
-       ssize_t total_consumed_samples = 0;
+               // Number of samples that have been written to the queue (in total)
+               // at this time point. Equivalent to k_a0 or k_a1 in the paper.
+               size_t input_samples_received = 0;
+
+               // Set to false if we should not use the timestamp from this sample
+               // (e.g. if it is from a dropped frame and thus bad). In particular,
+               // we will not use it for updating current_estimated_freq_in.
+               bool good_sample = false;
+       };
+       InputPoint a0, a1;
 
-       // Duration of last input block, in seconds.
-       double last_input_len;
+       // The current rate at which we seem to get input samples, in Hz.
+       // For an ideal input, identical to freq_in.
+       double current_estimated_freq_in;
+
+       ssize_t total_consumed_samples = 0;
 
        // Filter state for the loop filter.
        double z1 = 0.0, z2 = 0.0, z3 = 0.0;
 
        // Ratio between the two frequencies.
-       double ratio;
+       const double ratio;
+
+       // Current correction ratio. ratio * rcorr gives the true ratio,
+       // so a value above 1.0 means to pitch down (consume input samples slower).
+       double rcorr = 1.0;
 
        // How much delay we are expected to have, in input samples.
        // If actual delay drifts too much away from this, we will start
        // changing the resampling ratio to compensate.
-       double expected_delay;
+       const double expected_delay;
 
        // Input samples not yet fed into the resampler.
        // TODO: Use a circular buffer instead, for efficiency.
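
(A usage sketch of the new interface, following the header comment above;
the channel count and sample counts are illustrative only:)

#include <chrono>
#include <vector>

#include "resampling_queue.h"

void example(ResamplingQueue *queue)
{
        using std::chrono::steady_clock;

        // For every captured input frame: pass the frame's receive time point,
        // taken to be the _end_ point of the frame's audio.
        std::vector<float> in(1024 * 2);  // e.g. 1024 stereo input samples
        queue->add_input_samples(steady_clock::now(), in.data(), 1024,
                                 ResamplingQueue::ADJUST_RATE);

        // For every output frame: ask for exactly the number of samples you
        // need, stamped with the output frame's time point; returns false
        // on underrun.
        std::vector<float> out(480 * 2);
        bool ok = queue->get_output_samples(steady_clock::now(), out.data(), 480,
                                            ResamplingQueue::ADJUST_RATE);
        (void)ok;
}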