Rework the audio/video sync algorithm.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 2 Feb 2017 23:03:58 +0000 (00:03 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 2 Feb 2017 23:03:58 +0000 (00:03 +0100)
It turns out I've been misunderstanding parts of Fons' paper;
my estimation is different, and although it works surprisingly well
for something that's hardly supposed to work at all, it has some
significant problems with edge cases where the frame rates are
_nearly_ matched but slightly off (e.g. 59.94 Hz input on a 60 Hz
output); under the old algorithm, the estimated delay becomes a very
slow sawtooth, which isn't nice even after being passed through the
loop filter.

The new algorithm probably still isn't 100% identical to zita-ajbridge,
but it should be much closer to how the algorithm is intended to work.
In particular, it makes a real attempt to account for the fact that an
output frame can arrive between two input frames in time; this makes it
dependent on the system clock, but that is precisely the core that was
missing from the algorithm, so it's more a feature than a bug.

I've also made real attempts at making all the received timestamps
more stable; FakeCapture is still a bit odd (especially at startup),
since it delivers frames late instead of dropping them, but it
generally seems to work OK. For cases of frame rate mismatch (even
pretty benign ones), the correction rate is about two orders of
magnitude more stable, i.e., the maximum deviation from 1.0 during
normal operation is greatly reduced.
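
(For illustration, a minimal C++ sketch of the new estimation idea,
simplified from the ResamplingQueue changes below; it mirrors the new
add_input_samples()/get_output_samples() logic but is not the actual code:)

#include <chrono>

using namespace std::chrono;

// One known-good input point: a receive timestamp plus the total number
// of input samples received at that instant (t_a/k_a in Fons' paper).
struct InputPoint {
        steady_clock::time_point ts;
        size_t input_samples_received;
};

// Estimate the input frequency (in Hz) from two good points, as the new
// add_input_samples() does.
double estimate_freq_in(const InputPoint &a0, const InputPoint &a1)
{
        return (a1.input_samples_received - a0.input_samples_received) /
                duration<double>(a1.ts - a0.ts).count();
}

// Extrapolate how many input samples exist at an arbitrary output
// instant <ts>, which may fall between two input frames -- the core
// piece that was missing from the old algorithm.
double extrapolate_input_samples(const InputPoint &base, double freq_in,
                                 steady_clock::time_point ts)
{
        return base.input_samples_received +
                freq_in * duration<double>(ts - base.ts).count();
}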

13 files changed:
alsa_input.cpp
alsa_input.h
alsa_pool.cpp
audio_mixer.cpp
audio_mixer.h
benchmark_audio_mixer.cpp
decklink_capture.cpp
decklink_output.cpp
decklink_output.h
mixer.cpp
mixer.h
resampling_queue.cpp
resampling_queue.h

diff --git a/alsa_input.cpp b/alsa_input.cpp
index 158bfaa32e0e914d50a1a29efe52a6cec5fb7bd3..0d6901423b17085d65459708385a5fcb055218a4 100644
--- a/alsa_input.cpp
+++ b/alsa_input.cpp
@@ -12,6 +12,7 @@
 #include "timebase.h"
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 #define RETURN_ON_ERROR(msg, expr) do {                                                    \
@@ -243,10 +244,11 @@ ALSAInput::CaptureEndReason ALSAInput::do_capture()
 
                const int64_t prev_pts = frames_to_pts(num_frames_output);
                const int64_t pts = frames_to_pts(num_frames_output + frames);
+               const steady_clock::time_point now = steady_clock::now();
                bool success;
                do {
                        if (should_quit) return CaptureEndReason::REQUESTED_QUIT;
-                       success = audio_callback(buffer.get(), frames, audio_format, pts - prev_pts);
+                       success = audio_callback(buffer.get(), frames, audio_format, pts - prev_pts, now);
                } while (!success);
                num_frames_output += frames;
        }
diff --git a/alsa_input.h b/alsa_input.h
index 6bce9134cdcd2a3fb3d381988cc482d4419ccbc1..bae9fdf78413188cef9b9c2d86c06c67bda019dd 100644
--- a/alsa_input.h
+++ b/alsa_input.h
@@ -13,6 +13,7 @@
 #include <stdint.h>
 #include <sys/types.h>
 #include <atomic>
+#include <chrono>
 #include <functional>
 #include <memory>
 #include <string>
@@ -24,7 +25,7 @@ class ALSAPool;
 
 class ALSAInput {
 public:
-       typedef std::function<bool(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length)> audio_callback_t;
+       typedef std::function<bool(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length, std::chrono::steady_clock::time_point ts)> audio_callback_t;
 
        ALSAInput(const char *device, unsigned sample_rate, unsigned num_channels, audio_callback_t audio_callback, ALSAPool *parent_pool, unsigned internal_dev_index);
        ~ALSAInput();
diff --git a/alsa_pool.cpp b/alsa_pool.cpp
index 3a4a5b0eddf74ff876a9c00f432f1cacb1fe5004..348c6232303ab561d3a4039ca69932261bd3e8bc 100644
--- a/alsa_pool.cpp
+++ b/alsa_pool.cpp
@@ -393,7 +393,7 @@ void ALSAPool::reset_device(unsigned index)
                inputs[index].reset();
        } else {
                // TODO: Put on a background thread instead of locking?
-               auto callback = bind(&AudioMixer::add_audio, global_audio_mixer, DeviceSpec{InputSourceType::ALSA_INPUT, index}, _1, _2, _3, _4);
+               auto callback = bind(&AudioMixer::add_audio, global_audio_mixer, DeviceSpec{InputSourceType::ALSA_INPUT, index}, _1, _2, _3, _4, _5);
                inputs[index].reset(new ALSAInput(device->address.c_str(), OUTPUT_FREQUENCY, device->num_channels, callback, this, index));
                inputs[index]->start_capture_thread();
        }
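
(For clarity: the extra _5 placeholder simply forwards the new timestamp
argument. Given the audio_callback_t signature from alsa_input.h, the bind
expression above behaves like this lambda sketch:)

auto callback = [index](const uint8_t *data, unsigned num_samples,
                        bmusb::AudioFormat audio_format, int64_t frame_length,
                        std::chrono::steady_clock::time_point ts) {
        return global_audio_mixer->add_audio(
                DeviceSpec{InputSourceType::ALSA_INPUT, index},
                data, num_samples, audio_format, frame_length, ts);
};
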
diff --git a/audio_mixer.cpp b/audio_mixer.cpp
index 01fed687b47a8d2ae77a752896f925d11c176c37..e4d95a49422250e44dced2a84f11dc866ac878f6 100644
--- a/audio_mixer.cpp
+++ b/audio_mixer.cpp
@@ -25,6 +25,7 @@
 
 using namespace bmusb;
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 namespace {
@@ -231,10 +232,9 @@ void AudioMixer::reset_resampler_mutex_held(DeviceSpec device_spec)
                        device_spec.index, device->capture_frequency, OUTPUT_FREQUENCY, device->interesting_channels.size(),
                        global_flags.audio_queue_length_ms * 0.001));
        }
-       device->next_local_pts = 0;
 }
 
-bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, AudioFormat audio_format, int64_t frame_length)
+bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, AudioFormat audio_format, int64_t frame_length, steady_clock::time_point frame_time)
 {
        AudioDevice *device = find_audio_device(device_spec);
 
@@ -274,9 +274,7 @@ bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned
        }
 
        // Now add it.
-       int64_t local_pts = device->next_local_pts;
-       device->resampling_queue->add_input_samples(local_pts / double(TIMEBASE), audio.get(), num_samples);
-       device->next_local_pts = local_pts + frame_length;
+       device->resampling_queue->add_input_samples(frame_time, audio.get(), num_samples, ResamplingQueue::ADJUST_RATE);
        return true;
 }
 
@@ -298,11 +296,7 @@ bool AudioMixer::add_silence(DeviceSpec device_spec, unsigned samples_per_frame,
 
        vector<float> silence(samples_per_frame * num_channels, 0.0f);
        for (unsigned i = 0; i < num_frames; ++i) {
-               device->resampling_queue->add_input_samples(device->next_local_pts / double(TIMEBASE), silence.data(), samples_per_frame);
-               // Note that if the format changed in the meantime, we have
-               // no way of detecting that; we just have to assume the frame length
-               // is always the same.
-               device->next_local_pts += frame_length;
+               device->resampling_queue->add_input_samples(steady_clock::now(), silence.data(), samples_per_frame, ResamplingQueue::DO_NOT_ADJUST_RATE);
        }
        return true;
 }
@@ -475,7 +469,7 @@ void apply_gain(float db, float last_db, vector<float> *samples)
 
 }  // namespace
 
-vector<float> AudioMixer::get_output(double pts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
+vector<float> AudioMixer::get_output(steady_clock::time_point ts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
        map<DeviceSpec, vector<float>> samples_card;
        vector<float> samples_bus;
@@ -490,7 +484,7 @@ vector<float> AudioMixer::get_output(double pts, unsigned num_samples, Resamplin
                        memset(&samples_card[device_spec][0], 0, samples_card[device_spec].size() * sizeof(float));
                } else {
                        device->resampling_queue->get_output_samples(
-                               pts,
+                               ts,
                                &samples_card[device_spec][0],
                                num_samples,
                                rate_adjustment_policy);
diff --git a/audio_mixer.h b/audio_mixer.h
index 85f9aedf96a77779b31e3e9a5433710076659208..8d4f15c7f2020b399fb726f4ce5b025ac22d65fb 100644
--- a/audio_mixer.h
+++ b/audio_mixer.h
@@ -12,6 +12,7 @@
 #include <stdint.h>
 #include <zita-resampler/resampler.h>
 #include <atomic>
+#include <chrono>
 #include <functional>
 #include <map>
 #include <memory>
@@ -54,7 +55,7 @@ public:
        // (This is to avoid a deadlock where a card hangs on the mutex in add_audio()
        // while we are trying to shut it down from another thread that also holds
        // the mutex.) frame_length is in TIMEBASE units.
-       bool add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length);
+       bool add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length, std::chrono::steady_clock::time_point frame_time);
        bool add_silence(DeviceSpec device_spec, unsigned samples_per_frame, unsigned num_frames, int64_t frame_length);
 
        // If a given device is offline for whatever reason and cannot deliver audio
@@ -64,7 +65,7 @@ public:
        // affect it. Same true/false behavior as add_audio().
        bool silence_card(DeviceSpec device_spec, bool silence);
 
-       std::vector<float> get_output(double pts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy);
+       std::vector<float> get_output(std::chrono::steady_clock::time_point ts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy);
 
        float get_fader_volume(unsigned bus_index) const { return fader_volume_db[bus_index]; }
        void set_fader_volume(unsigned bus_index, float level_db) { fader_volume_db[bus_index] = level_db; }
@@ -301,7 +302,6 @@ public:
 private:
        struct AudioDevice {
                std::unique_ptr<ResamplingQueue> resampling_queue;
-               int64_t next_local_pts = 0;
                std::string display_name;
                unsigned capture_frequency = OUTPUT_FREQUENCY;
                // Which channels we consider interesting (ie., are part of some input_mapping).
diff --git a/benchmark_audio_mixer.cpp b/benchmark_audio_mixer.cpp
index 4b8f84a93ef0458dffa25d8806a54564afa2400e..3327179c3518e6dc492b794c636d7d24bafde7a7 100644
--- a/benchmark_audio_mixer.cpp
+++ b/benchmark_audio_mixer.cpp
@@ -61,6 +61,10 @@ void callback(float level_lufs, float peak_db,
 
 vector<float> process_frame(unsigned frame_num, AudioMixer *mixer)
 {
+       duration<int64_t, ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> frame_duration(frame_num);
+       steady_clock::time_point ts = steady_clock::time_point::min() +
+               duration_cast<steady_clock::duration>(frame_duration);
+
        // Feed the inputs.
        for (unsigned card_index = 0; card_index < NUM_BENCHMARK_CARDS; ++card_index) {
                bmusb::AudioFormat audio_format;
@@ -70,12 +74,11 @@ vector<float> process_frame(unsigned frame_num, AudioMixer *mixer)
                unsigned num_samples = NUM_SAMPLES + (lcgrand() % 9) - 5;
                bool ok = mixer->add_audio(DeviceSpec{InputSourceType::CAPTURE_CARD, card_index},
                        card_index == 3 ? samples24 : samples16, num_samples, audio_format,
-                       NUM_SAMPLES * TIMEBASE / OUTPUT_FREQUENCY);
+                       NUM_SAMPLES * TIMEBASE / OUTPUT_FREQUENCY, ts);
                assert(ok);
        }
 
-       double pts = double(frame_num) * NUM_SAMPLES / OUTPUT_FREQUENCY;
-       return mixer->get_output(pts, NUM_SAMPLES, ResamplingQueue::ADJUST_RATE);
+       return mixer->get_output(ts, NUM_SAMPLES, ResamplingQueue::ADJUST_RATE);
 }
 
 void init_mapping(AudioMixer *mixer)
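
(The duration<int64_t, ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> trick above
synthesizes deterministic timestamps for the benchmark. A self-contained
sketch of the same arithmetic; the constants are stand-ins, not necessarily
the benchmark's actual values:)

#include <chrono>
#include <ratio>

using namespace std::chrono;

constexpr int64_t NUM_SAMPLES = 1024;        // stand-in value
constexpr int64_t OUTPUT_FREQUENCY = 48000;  // output rate in Hz

steady_clock::time_point timestamp_for_frame(int64_t frame_num)
{
        // One tick of this duration type is exactly one frame's worth of
        // audio (NUM_SAMPLES / OUTPUT_FREQUENCY seconds), so frame_num ticks
        // is the elapsed audio time at the start of frame <frame_num>.
        duration<int64_t, std::ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> frame_duration(frame_num);
        return steady_clock::time_point::min() +
                duration_cast<steady_clock::duration>(frame_duration);
}

Anchoring at steady_clock::time_point::min() rather than now() keeps the
timestamps reproducible from run to run.
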
diff --git a/decklink_capture.cpp b/decklink_capture.cpp
index d38cc73a3ae432f62d9c368f766a74861f17622b..a0e890fcbcdd4fdc7c8bbb55e8212e8039980192 100644
--- a/decklink_capture.cpp
+++ b/decklink_capture.cpp
@@ -311,6 +311,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                done_init = true;
        }
 
+       steady_clock::time_point now = steady_clock::now();
+
        FrameAllocator::Frame current_video_frame, current_audio_frame;
        VideoFormat video_format;
        AudioFormat audio_format;
@@ -357,7 +359,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                        video_format.width = width;
                        video_format.height = height;
 
-                       current_video_frame.received_timestamp = steady_clock::now();
+                       current_video_frame.received_timestamp = now;
                }
        }
 
@@ -375,7 +377,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                        audio_format.bits_per_sample = 32;
                        audio_format.num_channels = 2;
 
-                       current_audio_frame.received_timestamp = steady_clock::now();
+                       current_audio_frame.received_timestamp = now;
                }
        }
 
diff --git a/decklink_output.cpp b/decklink_output.cpp
index a826dfe22ee95c5588f86edebec1c2ad1ce64ed3..38f44ee96c4d614e2d9ed2729ad2cc42e185eec8 100644
--- a/decklink_output.cpp
+++ b/decklink_output.cpp
@@ -242,7 +242,7 @@ void DeckLinkOutput::send_audio(int64_t pts, const std::vector<float> &samples)
        }
 }
 
-void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll)
+void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll, steady_clock::time_point *frame_timestamp)
 {
        assert(!should_quit);
 
@@ -277,11 +277,12 @@ void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *f
        double playback_speed;
        output->GetScheduledStreamTime(TIMEBASE, &stream_frame_time, &playback_speed);
 
+       *frame_timestamp = steady_clock::now() +
+               nanoseconds((target_time - stream_frame_time) * 1000000000 / TIMEBASE);
+
        // If we're ahead of time, wait for the frame to (approximately) start.
        if (stream_frame_time < target_time) {
-               steady_clock::time_point t = steady_clock::now() +
-                       nanoseconds((target_time - stream_frame_time) * 1000000000 / TIMEBASE);
-               this_thread::sleep_until(t);
+               this_thread::sleep_until(*frame_timestamp);
                return;
        }
 
@@ -296,6 +297,8 @@ void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *f
        // Oops, we missed by more than one frame. Return immediately,
        // but drop so that we catch up.
        *dropped_frames = (stream_frame_time - target_time + *frame_duration - 1) / *frame_duration;
+       const int64_t ns_per_frame = this->frame_duration * 1000000000 / TIMEBASE;
+       *frame_timestamp += nanoseconds(*dropped_frames * ns_per_frame);
        fprintf(stderr, "Dropped %d output frames; skipping.\n", *dropped_frames);
 }
 
diff --git a/decklink_output.h b/decklink_output.h
index 7bc9850d1ba8d5bd25ba016dedd6ba756ec235d9..d3bafcae60ae46b78521cdfc057e366f866afece 100644
--- a/decklink_output.h
+++ b/decklink_output.h
@@ -4,6 +4,7 @@
 #include <epoxy/gl.h>
 #include <stdint.h>
 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
@@ -41,7 +42,16 @@ public:
 
        void send_frame(GLuint y_tex, GLuint cbcr_tex, const std::vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration);
        void send_audio(int64_t pts, const std::vector<float> &samples);
-       void wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll);
+
+       // NOTE: The returned timestamp is undefined for preroll.
+       // Otherwise, it is the timestamp of the output frame as it should have been,
+       // even if we're overshooting. E.g. at 50 fps (0.02 spf), assuming the
+       // last frame was at t=0.980:
+       //
+       //   If we're at t=0.999, we wait until t=1.000 and return that.
+       //   If we're at t=1.001, we return t=1.000 immediately (small overshoot).
+       //   If we're at t=1.055, we drop two frames and return t=1.040 immediately.
+       void wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll, std::chrono::steady_clock::time_point *frame_timestamp);
 
        // Analogous to CaptureInterface. Will only return modes that have the right width/height.
        std::map<uint32_t, bmusb::VideoMode> get_available_video_modes() const { return video_modes; }
diff --git a/mixer.cpp b/mixer.cpp
index f77f6d48f7eb0fa4da4bbeb189d40ff24d5a85b6..b28da27e90f3781180dab3068b433135b739a1f2 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -383,7 +383,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
                } while (!success);
        }
 
-       audio_mixer.add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, frame_length);
+       audio_mixer.add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, frame_length, audio_frame.received_timestamp);
 
        // Done with the audio, so release it.
        if (audio_frame.owner) {
@@ -613,7 +613,7 @@ void Mixer::thread_func()
                }
 
                OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame);
-               schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll);
+               schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll, output_frame_info.frame_timestamp);
                stats_dropped_frames += output_frame_info.dropped_frames;
 
                handle_hotplugged_cards();
@@ -738,7 +738,7 @@ start:
        unique_lock<mutex> lock(card_mutex, defer_lock);
        if (master_card_is_output) {
                // Clocked to the output, so wait for it to be ready for the next frame.
-               cards[master_card_index].output->wait_for_frame(pts_int, &output_frame_info.dropped_frames, &output_frame_info.frame_duration, &output_frame_info.is_preroll);
+               cards[master_card_index].output->wait_for_frame(pts_int, &output_frame_info.dropped_frames, &output_frame_info.frame_duration, &output_frame_info.is_preroll, &output_frame_info.frame_timestamp);
                lock.lock();
        } else {
                // Wait for the master card to have a new frame.
@@ -758,6 +758,11 @@ start:
                goto start;
        }
 
+       if (!master_card_is_output) {
+               output_frame_info.frame_timestamp =
+                       cards[master_card_index].new_frames.front().received_timestamp;
+       }
+
        for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
                CaptureCard *card = &cards[card_index];
                if (card->new_frames.empty()) {
@@ -847,7 +852,7 @@ void Mixer::handle_hotplugged_cards()
 }
 
 
-void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll)
+void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll, steady_clock::time_point frame_timestamp)
 {
        // Resample the audio as needed, including from previously dropped frames.
        assert(num_cards > 0);
@@ -869,7 +874,7 @@ void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_sam
                        // better to just wait until we have a slightly more normal situation).
                        unique_lock<mutex> lock(audio_mutex);
                        bool adjust_rate = !dropped_frame && !is_preroll;
-                       audio_task_queue.push(AudioTask{pts_int, num_samples_per_frame, adjust_rate});
+                       audio_task_queue.push(AudioTask{pts_int, num_samples_per_frame, adjust_rate, frame_timestamp});
                        audio_task_queue_changed.notify_one();
                }
                if (dropped_frame) {
@@ -962,7 +967,7 @@ void Mixer::audio_thread_func()
                ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy =
                        task.adjust_rate ? ResamplingQueue::ADJUST_RATE : ResamplingQueue::DO_NOT_ADJUST_RATE;
                vector<float> samples_out = audio_mixer.get_output(
-                       double(task.pts_int) / TIMEBASE,
+                       task.frame_timestamp,
                        task.num_samples,
                        rate_adjustment_policy);
 
diff --git a/mixer.h b/mixer.h
index 045e9b1212fccb0f1392cd80fe87903c4079e898..d70c65b8a70ec71e900914119316535fac26772a 100644
--- a/mixer.h
+++ b/mixer.h
@@ -323,7 +323,7 @@ private:
        void place_rectangle(movit::Effect *resample_effect, movit::Effect *padding_effect, float x0, float y0, float x1, float y1);
        void thread_func();
        void handle_hotplugged_cards();
-       void schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll);
+       void schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll, std::chrono::steady_clock::time_point frame_timestamp);
        void render_one_frame(int64_t duration);
        void audio_thread_func();
        void release_display_frame(DisplayFrame *frame);
@@ -403,6 +403,7 @@ private:
                int num_samples;  // Audio samples needed for this output frame.
                int64_t frame_duration;  // In TIMEBASE units.
                bool is_preroll;
+               std::chrono::steady_clock::time_point frame_timestamp;
        };
        OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]);
 
@@ -452,6 +453,7 @@ private:
                int64_t pts_int;
                int num_samples;
                bool adjust_rate;
+               std::chrono::steady_clock::time_point frame_timestamp;
        };
        std::mutex audio_mutex;
        std::condition_variable audio_task_queue_changed;
diff --git a/resampling_queue.cpp b/resampling_queue.cpp
index 025fa5041ab4950eaf854f66b0fca603161e5862..188bf7d2d57ccdb98f249f86c59e0360e65b470b 100644
--- a/resampling_queue.cpp
+++ b/resampling_queue.cpp
 #include <cmath>
 
 using namespace std;
+using namespace std::chrono;
 
 ResamplingQueue::ResamplingQueue(unsigned card_num, unsigned freq_in, unsigned freq_out, unsigned num_channels, double expected_delay_seconds)
        : card_num(card_num), freq_in(freq_in), freq_out(freq_out), num_channels(num_channels),
+         current_estimated_freq_in(freq_in),
          ratio(double(freq_out) / double(freq_in)), expected_delay(expected_delay_seconds * OUTPUT_FREQUENCY)
 {
        vresampler.setup(ratio, num_channels, /*hlen=*/32);
@@ -41,54 +43,55 @@ ResamplingQueue::ResamplingQueue(unsigned card_num, unsigned freq_in, unsigned f
         vresampler.process ();
 }
 
-void ResamplingQueue::add_input_samples(double pts, const float *samples, ssize_t num_samples)
+void ResamplingQueue::add_input_samples(steady_clock::time_point ts, const float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
        if (num_samples == 0) {
                return;
        }
-       if (first_input) {
-               // Synthesize a fake length.
-               last_input_len = double(num_samples) / freq_in;
-               first_input = false;
-       } else {
-               last_input_len = pts - last_input_pts;
-       }
 
-       last_input_pts = pts;
-
-       k_a0 = k_a1;
-       k_a1 += num_samples;
+       bool good_sample = (rate_adjustment_policy == ADJUST_RATE);
+       if (good_sample && a1.good_sample) {
+               a0 = a1;
+       }
+       a1.ts = ts;
+       a1.input_samples_received += num_samples;
+       a1.good_sample = good_sample;
+       if (a0.good_sample && a1.good_sample) {
+               current_estimated_freq_in = (a1.input_samples_received - a0.input_samples_received) / duration<double>(a1.ts - a0.ts).count();
+               assert(current_estimated_freq_in >= 0.0);
+
+               // Bound the frequency, so that a single wild result won't throw the filter off guard.
+               current_estimated_freq_in = min(current_estimated_freq_in, 1.2 * freq_in);
+               current_estimated_freq_in = max(current_estimated_freq_in, 0.8 * freq_in);
+       }
 
        buffer.insert(buffer.end(), samples, samples + num_samples * num_channels);
 }
 
-bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
+bool ResamplingQueue::get_output_samples(steady_clock::time_point ts, float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
        assert(num_samples > 0);
-       if (first_input) {
+       if (a1.input_samples_received == 0) {
                // No data yet, just return zeros.
                memset(samples, 0, num_samples * num_channels * sizeof(float));
                return true;
        }
 
-       double rcorr = -1.0;
-       if (rate_adjustment_policy == ADJUST_RATE) {
-               double last_output_len;
-               if (first_output) {
-                       // Synthesize a fake length.
-                       last_output_len = double(num_samples) / freq_out;
-               } else {
-                       last_output_len = pts - last_output_pts;
-               }
-               last_output_pts = pts;
-
-               // Using the time point since just before the last call to add_input_samples() as a base,
-               // estimate actual delay based on activity since then, measured in number of input samples:
-               double actual_delay = 0.0;
-               assert(last_input_len != 0);
-               actual_delay += (k_a1 - k_a0) * last_output_len / last_input_len;    // Inserted samples since k_a0, rescaled for the different time periods.
-               actual_delay += k_a0 - total_consumed_samples;                       // Samples inserted before k_a0 but not consumed yet.
-               actual_delay += vresampler.inpdist();                                // Delay in the resampler itself.
+       if (rate_adjustment_policy == ADJUST_RATE && (a0.good_sample || a1.good_sample)) {
+               // Estimate the current number of input samples produced at
+               // this instant in time, by extrapolating from the last known
+               // good point. Note that we could be extrapolating backward or
+               // forward, depending on the timing of the calls.
+               const InputPoint &base_point = a1.good_sample ? a1 : a0;
+               const double input_samples_received = base_point.input_samples_received +
+                       current_estimated_freq_in * duration<double>(ts - base_point.ts).count();
+
+               // Estimate the number of input samples _consumed_ after we've run the resampler.
+               const double input_samples_consumed = total_consumed_samples +
+                       num_samples / (ratio * rcorr);
+
+               double actual_delay = input_samples_received - input_samples_consumed;
+               actual_delay += vresampler.inpdist();    // Delay in the resampler itself.
                double err = actual_delay - expected_delay;
                if (first_output && err < 0.0) {
                        // Before the very first block, insert artificial delay based on our initial estimate,
@@ -97,7 +100,7 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
                        for (ssize_t i = 0; i < delay_samples_to_add * num_channels; ++i) {
                                buffer.push_front(0.0f);
                        }
-                       total_consumed_samples -= delay_samples_to_add;  // Equivalent to increasing k_a0 and k_a1.
+                       total_consumed_samples -= delay_samples_to_add;  // Equivalent to increasing input_samples_received on a0 and a1.
                        err += delay_samples_to_add;
                }
                first_output = false;
@@ -105,10 +108,18 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
                // Compute loop filter coefficients for the two filters. We need to compute them
                // every time, since they depend on the number of samples the user asked for.
                //
-               // The loop bandwidth is at 0.02 Hz; we trust the initial estimate quite well,
-               // and our jitter is pretty large since none of the threads involved run at
-               // real-time priority.
-               double loop_bandwidth_hz = 0.02;
+               // The loop bandwidth is at 0.02 Hz; our jitter is pretty large
+               // since none of the threads involved run at real-time priority.
+               // However, for the first four seconds, we use a larger loop bandwidth (0.2 Hz),
+               // because there's a lot going on during startup, and thus the
+               // initial estimate might be tainted by jitter during that phase,
+               // and we want to converge faster.
+               //
+               // NOTE: The above logic might only hold during Nageru startup
+               // (we start ResamplingQueues also when we e.g. switch sound sources),
+               // but in general, a little bit of increased timing jitter is acceptable
+               // right after a setup change like this.
+               double loop_bandwidth_hz = (total_consumed_samples < 4 * freq_in) ? 0.2 : 0.02;
 
                // Set filters. The first filter is much wider than the second one (20x as wide).
                double w = (2.0 * M_PI) * loop_bandwidth_hz * num_samples / freq_out;
@@ -127,9 +138,9 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
                vresampler.set_rratio(rcorr);
        } else {
                assert(rate_adjustment_policy == DO_NOT_ADJUST_RATE);
-       };
+       }
 
-       // Finally actually resample, consuming exactly <num_samples> output samples.
+       // Finally actually resample, producing exactly <num_samples> output samples.
        vresampler.out_data = samples;
        vresampler.out_count = num_samples;
        while (vresampler.out_count > 0) {
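
(To make the new delay estimate concrete, a numeric sketch using the same
quantities as get_output_samples(); all figures are invented for
illustration:)

#include <cstdio>

int main()
{
        // A nominally 48 kHz input whose clock runs slightly fast, as
        // estimated from two good InputPoints.
        const double current_estimated_freq_in = 48005.0;
        const double base_samples_received = 96000.0;  // samples at the base point
        const double dt = 0.010;  // output instant, 10 ms after the base point

        // Extrapolated number of input samples produced at the output instant.
        const double input_samples_received =
                base_samples_received + current_estimated_freq_in * dt;  // 96480.05

        // Input samples consumed once 480 output samples have been produced.
        const double total_consumed_samples = 96000.0;
        const double ratio = 1.0, rcorr = 1.0;  // 48 kHz -> 48 kHz, no correction yet
        const double input_samples_consumed =
                total_consumed_samples + 480.0 / (ratio * rcorr);  // 96480.00

        // err = actual_delay - expected_delay (plus vresampler.inpdist())
        // then drives the loop filter, which nudges rcorr so the delay
        // drifts back toward expected_delay.
        const double actual_delay = input_samples_received - input_samples_consumed;
        printf("actual_delay = %.2f input samples\n", actual_delay);  // 0.05
        return 0;
}
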
diff --git a/resampling_queue.h b/resampling_queue.h
index 662a837262e8b84fcf3680ceae65bdf38f0e0871..b46086df9d0954a4aebd3af0acb3937ca6e314de 100644
--- a/resampling_queue.h
+++ b/resampling_queue.h
@@ -4,8 +4,8 @@
 // Takes in samples from an input source, possibly with jitter, and outputs a fixed number
 // of samples every iteration. Used to a) change sample rates if needed, and b) deal with
 // input sources that don't have audio locked to video. For every input video
-// frame, you call add_input_samples() with the pts (measured in seconds) of the video frame,
-// taken to be the start point of the frame's audio. When you want to _output_ a finished
+// frame, you call add_input_samples() with the received time point of the video frame,
+// taken to be the _end_ point of the frame's audio. When you want to _output_ a finished
 // frame with audio, you call get_output_samples() with the number of samples you want, and will
 // get exactly that number of samples back. If the input and output clocks are not in sync,
 // the audio will be stretched for you. (If they are _very_ out of sync, this will come through
@@ -40,6 +40,7 @@
 
 #include <sys/types.h>
 #include <zita-resampler/vresampler.h>
+#include <chrono>
 #include <deque>
 #include <memory>
 
@@ -58,10 +59,9 @@ public:
                ADJUST_RATE
        };
 
-       // Note: pts is always in seconds.
-       void add_input_samples(double pts, const float *samples, ssize_t num_samples);
+       void add_input_samples(std::chrono::steady_clock::time_point ts, const float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
        // Returns false if underrun.
-       bool get_output_samples(double pts, float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
+       bool get_output_samples(std::chrono::steady_clock::time_point ts, float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
 
 private:
        void init_loop_filter(double bandwidth_hz);
@@ -71,28 +71,43 @@ private:
        unsigned card_num;
        unsigned freq_in, freq_out, num_channels;
 
-       bool first_input = true, first_output = true;
-       double last_input_pts;   // Start of last input block, in seconds.
-       double last_output_pts;
+       bool first_output = true;
 
-       ssize_t k_a0 = 0;  // Total amount of samples inserted _before_ the last call to add_input_samples().
-       ssize_t k_a1 = 0;  // Total amount of samples inserted _after_ the last call to add_input_samples().
+       struct InputPoint {
+               // Equivalent to t_a0 or t_a1 in the paper.
+               std::chrono::steady_clock::time_point ts;
 
-       ssize_t total_consumed_samples = 0;
+               // Number of samples that have been written to the queue (in total)
+               // at this time point. Equivalent to k_a0 or k_a1 in the paper.
+               size_t input_samples_received = 0;
+
+               // Set to false if we should not use the timestamp from this sample
+               // (e.g. if it is from a dropped frame and thus bad). In particular,
+               // we will not use it for updating current_estimated_freq_in.
+               bool good_sample = false;
+       };
+       InputPoint a0, a1;
 
-       // Duration of last input block, in seconds.
-       double last_input_len;
+       // The current rate at which we seem to get input samples, in Hz.
+       // For an ideal input, identical to freq_in.
+       double current_estimated_freq_in;
+
+       ssize_t total_consumed_samples = 0;
 
        // Filter state for the loop filter.
        double z1 = 0.0, z2 = 0.0, z3 = 0.0;
 
        // Ratio between the two frequencies.
-       double ratio;
+       const double ratio;
+
+       // Current correction ratio. ratio * rcorr gives the true ratio,
+       // so a value above 1.0 means to pitch down (consume input samples slower).
+       double rcorr = 1.0;
 
        // How much delay we are expected to have, in input samples.
        // If actual delay drifts too much away from this, we will start
        // changing the resampling ratio to compensate.
-       double expected_delay;
+       const double expected_delay;
 
        // Input samples not yet fed into the resampler.
        // TODO: Use a circular buffer instead, for efficiency.
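
(A usage sketch of the new interface, following the header comment above;
the channel count and sample counts are illustrative only:)

#include <chrono>
#include <vector>

#include "resampling_queue.h"

void example(ResamplingQueue *queue)
{
        using std::chrono::steady_clock;

        // For every captured input frame: pass the frame's receive time point,
        // taken to be the _end_ point of the frame's audio.
        std::vector<float> in(1024 * 2);  // e.g. 1024 stereo input samples
        queue->add_input_samples(steady_clock::now(), in.data(), 1024,
                                 ResamplingQueue::ADJUST_RATE);

        // For every output frame: ask for exactly the number of samples you
        // need, stamped with the output frame's time point; returns false
        // on underrun.
        std::vector<float> out(480 * 2);
        bool ok = queue->get_output_samples(steady_clock::now(), out.data(), 480,
                                            ResamplingQueue::ADJUST_RATE);
        (void)ok;
}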