From: Steinar H. Gunderson
Date: Thu, 2 Feb 2017 23:03:58 +0000 (+0100)
Subject: Rework the audio/video sync algorithm.
X-Git-Tag: 1.5.0~52
X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=f0dacf505189f0cadcd89a2b632000fd9d012bd2

Rework the audio/video sync algorithm.

It turns out I've been misunderstanding parts of Fons' paper; my
estimation is different, and although it works surprisingly well for
something that's hardly supposed to work at all, it has some significant
problems with edge cases where the frame rates are _nearly_ matched but
not quite (e.g. 59.94 Hz input on a 60 Hz output); the estimated delay
under the old algorithm becomes a very slow sawtooth, which isn't nice
even after being passed through the loop filter.

The new algorithm probably still isn't 100% identical to zita-ajbridge,
but it should be much closer to how the algorithm is intended to work.
In particular, it properly accounts for the fact that an output frame
can fall between two input frames in time; this makes it dependent on
the system clock, but that dependency is exactly the piece that was
missing from the old algorithm, so it's more a feature than a bug.

I've made a real attempt at making all the received timestamps more
stable; FakeCapture is still a bit odd (especially at startup), since it
delivers frames late instead of dropping them, but it generally seems to
work OK.

For cases of frame rate mismatch (even pretty benign ones), the
correction ratio is now about two orders of magnitude more stable, i.e.,
its maximum deviation from 1.0 during normal operation is greatly
reduced.
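The core of the new estimation, as a minimal standalone sketch (the
names mirror the patch below, but this is an illustration of the idea,
not the patch itself):

	#include <algorithm>
	#include <chrono>

	using namespace std::chrono;

	struct InputPoint {
		steady_clock::time_point ts;    // When a batch of input samples arrived.
		double samples_received = 0.0;  // Total samples received up to that point.
	};

	// The two last trusted input points give a running estimate of the
	// true input rate (e.g. slightly below nominal if the sender's clock
	// runs slow).
	double estimate_freq_in(const InputPoint &a0, const InputPoint &a1,
	                        double nominal_freq_in)
	{
		double hz = (a1.samples_received - a0.samples_received) /
			duration<double>(a1.ts - a0.ts).count();
		// Bound the estimate so that one wild measurement cannot
		// throw the loop filter off guard.
		return std::min(std::max(hz, 0.8 * nominal_freq_in),
		                1.2 * nominal_freq_in);
	}

	// At output time <now>, which may fall _between_ two input frames
	// (exactly what the old pts-based bookkeeping could not express),
	// extrapolate how many input samples should have arrived by now.
	double extrapolate_samples_received(const InputPoint &base,
	                                    double est_freq_in,
	                                    steady_clock::time_point now)
	{
		return base.samples_received +
			est_freq_in * duration<double>(now - base.ts).count();
	}

The difference between this extrapolated figure and the number of input
samples actually consumed (plus the resampler's internal delay) is the
delay error that the loop filter turns into the correction ratio.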
---

diff --git a/alsa_input.cpp b/alsa_input.cpp
index 158bfaa..0d69014 100644
--- a/alsa_input.cpp
+++ b/alsa_input.cpp
@@ -12,6 +12,7 @@
 #include "timebase.h"
 
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 #define RETURN_ON_ERROR(msg, expr) do { \
@@ -243,10 +244,11 @@ ALSAInput::CaptureEndReason ALSAInput::do_capture()
 		const int64_t prev_pts = frames_to_pts(num_frames_output);
 		const int64_t pts = frames_to_pts(num_frames_output + frames);
+		const steady_clock::time_point now = steady_clock::now();
 		bool success;
 		do {
 			if (should_quit) return CaptureEndReason::REQUESTED_QUIT;
-			success = audio_callback(buffer.get(), frames, audio_format, pts - prev_pts);
+			success = audio_callback(buffer.get(), frames, audio_format, pts - prev_pts, now);
 		} while (!success);
 		num_frames_output += frames;
 	}
diff --git a/alsa_input.h b/alsa_input.h
index 6bce913..bae9fdf 100644
--- a/alsa_input.h
+++ b/alsa_input.h
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include <chrono>
 #include
 #include
 #include
@@ -24,7 +25,7 @@ class ALSAPool;
 
 class ALSAInput {
 public:
-	typedef std::function<bool(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length)> audio_callback_t;
+	typedef std::function<bool(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length, std::chrono::steady_clock::time_point frame_time)> audio_callback_t;
 
 	ALSAInput(const char *device, unsigned sample_rate, unsigned num_channels, audio_callback_t audio_callback, ALSAPool *parent_pool, unsigned internal_dev_index);
 	~ALSAInput();
diff --git a/alsa_pool.cpp b/alsa_pool.cpp
index 3a4a5b0..348c623 100644
--- a/alsa_pool.cpp
+++ b/alsa_pool.cpp
@@ -393,7 +393,7 @@ void ALSAPool::reset_device(unsigned index)
 		inputs[index].reset();
 	} else {
 		// TODO: Put on a background thread instead of locking?
-		auto callback = bind(&AudioMixer::add_audio, global_audio_mixer, DeviceSpec{InputSourceType::ALSA_INPUT, index}, _1, _2, _3, _4);
+		auto callback = bind(&AudioMixer::add_audio, global_audio_mixer, DeviceSpec{InputSourceType::ALSA_INPUT, index}, _1, _2, _3, _4, _5);
 		inputs[index].reset(new ALSAInput(device->address.c_str(), OUTPUT_FREQUENCY, device->num_channels, callback, this, index));
 		inputs[index]->start_capture_thread();
 	}
diff --git a/audio_mixer.cpp b/audio_mixer.cpp
index 01fed68..e4d95a4 100644
--- a/audio_mixer.cpp
+++ b/audio_mixer.cpp
@@ -25,6 +25,7 @@
 
 using namespace bmusb;
 using namespace std;
+using namespace std::chrono;
 using namespace std::placeholders;
 
 namespace {
@@ -231,10 +232,9 @@ void AudioMixer::reset_resampler_mutex_held(DeviceSpec device_spec)
 			device_spec.index, device->capture_frequency, OUTPUT_FREQUENCY, device->interesting_channels.size(),
 			global_flags.audio_queue_length_ms * 0.001));
 	}
-	device->next_local_pts = 0;
 }
 
-bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, AudioFormat audio_format, int64_t frame_length)
+bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, AudioFormat audio_format, int64_t frame_length, steady_clock::time_point frame_time)
 {
 	AudioDevice *device = find_audio_device(device_spec);
 
@@ -274,9 +274,7 @@ bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned
 	}
 
 	// Now add it.
-	int64_t local_pts = device->next_local_pts;
-	device->resampling_queue->add_input_samples(local_pts / double(TIMEBASE), audio.get(), num_samples);
-	device->next_local_pts = local_pts + frame_length;
+	device->resampling_queue->add_input_samples(frame_time, audio.get(), num_samples, ResamplingQueue::ADJUST_RATE);
 	return true;
 }
 
@@ -298,11 +296,7 @@ bool AudioMixer::add_silence(DeviceSpec device_spec, unsigned samples_per_frame,
 	vector<float> silence(samples_per_frame * num_channels, 0.0f);
 	for (unsigned i = 0; i < num_frames; ++i) {
-		device->resampling_queue->add_input_samples(device->next_local_pts / double(TIMEBASE), silence.data(), samples_per_frame);
-		// Note that if the format changed in the meantime, we have
-		// no way of detecting that; we just have to assume the frame length
-		// is always the same.
-		device->next_local_pts += frame_length;
+		device->resampling_queue->add_input_samples(steady_clock::now(), silence.data(), samples_per_frame, ResamplingQueue::DO_NOT_ADJUST_RATE);
 	}
 	return true;
 }
@@ -475,7 +469,7 @@ void apply_gain(float db, float last_db, vector<float> *samples)
 
 }  // namespace
 
-vector<float> AudioMixer::get_output(double pts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
+vector<float> AudioMixer::get_output(steady_clock::time_point ts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
 	map<DeviceSpec, vector<float>> samples_card;
 	vector<float> samples_bus;
@@ -490,7 +484,7 @@ vector<float> AudioMixer::get_output(double pts, unsigned num_samples, Resamplin
 			memset(&samples_card[device_spec][0], 0, samples_card[device_spec].size() * sizeof(float));
 		} else {
 			device->resampling_queue->get_output_samples(
-				pts,
+				ts,
 				&samples_card[device_spec][0],
 				num_samples,
 				rate_adjustment_policy);
diff --git a/audio_mixer.h b/audio_mixer.h
index 85f9aed..8d4f15c 100644
--- a/audio_mixer.h
+++ b/audio_mixer.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include <chrono>
 #include
 #include
 #include
@@ -54,7 +55,7 @@ public:
 	// (This is to avoid a deadlock where a card hangs on the mutex in add_audio()
 	// while we are trying to shut it down from another thread that also holds
 	// the mutex.) frame_length is in TIMEBASE units.
-	bool add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length);
+	bool add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, int64_t frame_length, std::chrono::steady_clock::time_point frame_time);
 	bool add_silence(DeviceSpec device_spec, unsigned samples_per_frame, unsigned num_frames, int64_t frame_length);
 
 	// If a given device is offline for whatever reason and cannot deliver audio
@@ -64,7 +65,7 @@ public:
 	// affect it. Same true/false behavior as add_audio().
 	bool silence_card(DeviceSpec device_spec, bool silence);
 
-	std::vector<float> get_output(double pts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy);
+	std::vector<float> get_output(std::chrono::steady_clock::time_point ts, unsigned num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy);
 
 	float get_fader_volume(unsigned bus_index) const { return fader_volume_db[bus_index]; }
 	void set_fader_volume(unsigned bus_index, float level_db) { fader_volume_db[bus_index] = level_db; }
@@ -301,7 +302,6 @@ public:
 private:
 	struct AudioDevice {
 		std::unique_ptr<ResamplingQueue> resampling_queue;
-		int64_t next_local_pts = 0;
 		std::string display_name;
 		unsigned capture_frequency = OUTPUT_FREQUENCY;
 		// Which channels we consider interesting (ie., are part of some input_mapping).
diff --git a/benchmark_audio_mixer.cpp b/benchmark_audio_mixer.cpp
index 4b8f84a..3327179 100644
--- a/benchmark_audio_mixer.cpp
+++ b/benchmark_audio_mixer.cpp
@@ -61,6 +61,10 @@ void callback(float level_lufs, float peak_db, vector
 
 vector<float> process_frame(unsigned frame_num, AudioMixer *mixer)
 {
+	duration<int64_t, ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> frame_duration(frame_num);
+	steady_clock::time_point ts = steady_clock::time_point::min() +
+		duration_cast<steady_clock::duration>(frame_duration);
+
 	// Feed the inputs.
 	for (unsigned card_index = 0; card_index < NUM_BENCHMARK_CARDS; ++card_index) {
 		bmusb::AudioFormat audio_format;
@@ -70,12 +74,11 @@ vector<float> process_frame(unsigned frame_num, AudioMixer *mixer)
 		unsigned num_samples = NUM_SAMPLES + (lcgrand() % 9) - 5;
 		bool ok = mixer->add_audio(DeviceSpec{InputSourceType::CAPTURE_CARD, card_index},
 			card_index == 3 ? samples24 : samples16, num_samples, audio_format,
-			NUM_SAMPLES * TIMEBASE / OUTPUT_FREQUENCY);
+			NUM_SAMPLES * TIMEBASE / OUTPUT_FREQUENCY, ts);
 		assert(ok);
 	}
 
-	double pts = double(frame_num) * NUM_SAMPLES / OUTPUT_FREQUENCY;
-	return mixer->get_output(pts, NUM_SAMPLES, ResamplingQueue::ADJUST_RATE);
+	return mixer->get_output(ts, NUM_SAMPLES, ResamplingQueue::ADJUST_RATE);
 }
 
 void init_mapping(AudioMixer *mixer)
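The benchmark has no real capture clock, so it synthesizes one: frame
<frame_num> is stamped exactly frame_num * NUM_SAMPLES / OUTPUT_FREQUENCY
seconds after an arbitrary epoch (steady_clock::time_point::min()). A
standalone illustration of the same trick; the constants here are
stand-ins for the benchmark's own:

	#include <chrono>
	#include <cstdint>
	#include <ratio>

	using namespace std::chrono;

	constexpr std::intmax_t NUM_SAMPLES = 1024;        // stand-in value
	constexpr std::intmax_t OUTPUT_FREQUENCY = 48000;  // stand-in value

	steady_clock::time_point synthetic_ts(std::int64_t frame_num)
	{
		// A duration whose period is exactly NUM_SAMPLES/OUTPUT_FREQUENCY
		// seconds avoids floating-point rounding entirely, so the
		// resampler sees a perfectly regular synthetic clock.
		duration<std::int64_t, std::ratio<NUM_SAMPLES, OUTPUT_FREQUENCY>> frame_duration(frame_num);
		return steady_clock::time_point::min() +
			duration_cast<steady_clock::duration>(frame_duration);
	}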
diff --git a/decklink_capture.cpp b/decklink_capture.cpp
index d38cc73..a0e890f 100644
--- a/decklink_capture.cpp
+++ b/decklink_capture.cpp
@@ -311,6 +311,8 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 		done_init = true;
 	}
 
+	steady_clock::time_point now = steady_clock::now();
+
 	FrameAllocator::Frame current_video_frame, current_audio_frame;
 	VideoFormat video_format;
 	AudioFormat audio_format;
@@ -357,7 +359,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 			video_format.width = width;
 			video_format.height = height;
 
-			current_video_frame.received_timestamp = steady_clock::now();
+			current_video_frame.received_timestamp = now;
 		}
 	}
 
@@ -375,7 +377,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
 			audio_format.bits_per_sample = 32;
 			audio_format.num_channels = 2;
 
-			current_audio_frame.received_timestamp = steady_clock::now();
+			current_audio_frame.received_timestamp = now;
 		}
 	}
diff --git a/decklink_output.cpp b/decklink_output.cpp
index a826dfe..38f44ee 100644
--- a/decklink_output.cpp
+++ b/decklink_output.cpp
@@ -242,7 +242,7 @@ void DeckLinkOutput::send_audio(int64_t pts, const std::vector<float> &samples)
 	}
 }
 
-void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll)
+void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll, steady_clock::time_point *frame_timestamp)
 {
 	assert(!should_quit);
 
@@ -277,11 +277,12 @@ void DeckLinkOutput::wait_for_frame(int64_t pts, int *dropped_frames, int64_t *f
 	double playback_speed;
 	output->GetScheduledStreamTime(TIMEBASE, &stream_frame_time, &playback_speed);
 
+	*frame_timestamp = steady_clock::now() +
+		nanoseconds((target_time - stream_frame_time) * 1000000000 / TIMEBASE);
+
 	// If we're ahead of time, wait for the frame to (approximately) start.
 	if (stream_frame_time < target_time) {
-		steady_clock::time_point t = steady_clock::now() +
-			nanoseconds((target_time - stream_frame_time) * 1000000000 / TIMEBASE);
-		this_thread::sleep_until(t);
+		this_thread::sleep_until(*frame_timestamp);
 		return;
 	}
 
@@ -296,6 +297,8 @@
 	// Oops, we missed by more than one frame. Return immediately,
 	// but drop so that we catch up.
 	*dropped_frames = (stream_frame_time - target_time + *frame_duration - 1) / *frame_duration;
+	const int64_t ns_per_frame = this->frame_duration * 1000000000 / TIMEBASE;
+	*frame_timestamp += nanoseconds(*dropped_frames * ns_per_frame);
 	fprintf(stderr, "Dropped %d output frames; skipping.\n", *dropped_frames);
 }
diff --git a/decklink_output.h b/decklink_output.h
index 7bc9850..d3bafca 100644
--- a/decklink_output.h
+++ b/decklink_output.h
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include <chrono>
 #include
 #include
 #include
@@ -41,7 +42,16 @@ public:
 	void send_frame(GLuint y_tex, GLuint cbcr_tex, const std::vector<RefCountedFrame> &input_frames, int64_t pts, int64_t duration);
 	void send_audio(int64_t pts, const std::vector<float> &samples);
-	void wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll);
+
+	// NOTE: The returned timestamp is undefined for preroll.
+	// Otherwise, it is the timestamp of the output frame as it should have been,
+	// even if we're overshooting. E.g. at 50 fps (0.02 spf), assuming the
+	// last frame was at t=0.980:
+	//
+	// If we're at t=0.999, we wait until t=1.000 and return that.
+	// If we're at t=1.001, we return t=1.000 immediately (small overshoot).
+	// If we're at t=1.055, we drop two frames and return t=1.040 immediately.
+	void wait_for_frame(int64_t pts, int *dropped_frames, int64_t *frame_duration, bool *is_preroll, std::chrono::steady_clock::time_point *frame_timestamp);
 
 	// Analogous to CaptureInterface. Will only return modes that have the right width/height.
 	std::map<uint32_t, bmusb::VideoMode> get_available_video_modes() const { return video_modes; }
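A quick worked check of the returned timestamp, using the 50 fps example
from the comment above and treating the scheduled stream time as ideal
wall-clock time (an assumption made only for this illustration; both are
in TIMEBASE units in the code). The extrapolation line computes

	frame_timestamp = now + (target_time - stream_frame_time)

so at now = 0.999 with target_time = 1.000, it yields 0.999 + 0.001 =
1.000, and since the stream time is still short of the target, we sleep
until exactly that point. At now = 1.001 (same target), it yields
1.001 - 0.001 = 1.000 and returns immediately. In the drop case, the
timestamp is further advanced by dropped_frames whole frame durations,
so it stays on the 0.020-second output grid instead of inheriting
scheduling jitter.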
diff --git a/mixer.cpp b/mixer.cpp
index f77f6d4..b28da27 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -383,7 +383,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 		} while (!success);
 	}
 
-	audio_mixer.add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, frame_length);
+	audio_mixer.add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, frame_length, audio_frame.received_timestamp);
 
 	// Done with the audio, so release it.
 	if (audio_frame.owner) {
@@ -613,7 +613,7 @@ void Mixer::thread_func()
 		}
 
 		OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame);
-		schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll);
+		schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll, output_frame_info.frame_timestamp);
 		stats_dropped_frames += output_frame_info.dropped_frames;
 
 		handle_hotplugged_cards();
@@ -738,7 +738,7 @@ start:
 	unique_lock<mutex> lock(card_mutex, defer_lock);
 	if (master_card_is_output) {
 		// Clocked to the output, so wait for it to be ready for the next frame.
-		cards[master_card_index].output->wait_for_frame(pts_int, &output_frame_info.dropped_frames, &output_frame_info.frame_duration, &output_frame_info.is_preroll);
+		cards[master_card_index].output->wait_for_frame(pts_int, &output_frame_info.dropped_frames, &output_frame_info.frame_duration, &output_frame_info.is_preroll, &output_frame_info.frame_timestamp);
 		lock.lock();
 	} else {
 		// Wait for the master card to have a new frame.
@@ -758,6 +758,11 @@ start:
 		goto start;
 	}
 
+	if (!master_card_is_output) {
+		output_frame_info.frame_timestamp =
+			cards[master_card_index].new_frames.front().received_timestamp;
+	}
+
 	for (unsigned card_index = 0; card_index < num_cards; ++card_index) {
 		CaptureCard *card = &cards[card_index];
 		if (card->new_frames.empty()) {
@@ -847,7 +852,7 @@ void Mixer::handle_hotplugged_cards()
 }
 
-void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll)
+void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll, steady_clock::time_point frame_timestamp)
 {
 	// Resample the audio as needed, including from previously dropped frames.
 	assert(num_cards > 0);
@@ -869,7 +874,7 @@ void Mixer::schedule_audio_resampling_tasks(unsigned dropped_frames, int num_sam
 			// better to just wait until we have a slightly more normal situation).
 			unique_lock<mutex> lock(audio_mutex);
 			bool adjust_rate = !dropped_frame && !is_preroll;
-			audio_task_queue.push(AudioTask{pts_int, num_samples_per_frame, adjust_rate});
+			audio_task_queue.push(AudioTask{pts_int, num_samples_per_frame, adjust_rate, frame_timestamp});
 			audio_task_queue_changed.notify_one();
 		}
 		if (dropped_frame) {
@@ -962,7 +967,7 @@ void Mixer::audio_thread_func()
 		ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy =
 			task.adjust_rate ? ResamplingQueue::ADJUST_RATE : ResamplingQueue::DO_NOT_ADJUST_RATE;
 		vector<float> samples_out = audio_mixer.get_output(
-			double(task.pts_int) / TIMEBASE,
+			task.frame_timestamp,
 			task.num_samples,
 			rate_adjustment_policy);
diff --git a/mixer.h b/mixer.h
index 045e9b1..d70c65b 100644
--- a/mixer.h
+++ b/mixer.h
@@ -323,7 +323,7 @@ private:
 	void place_rectangle(movit::Effect *resample_effect, movit::Effect *padding_effect, float x0, float y0, float x1, float y1);
 	void thread_func();
 	void handle_hotplugged_cards();
-	void schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll);
+	void schedule_audio_resampling_tasks(unsigned dropped_frames, int num_samples_per_frame, int length_per_frame, bool is_preroll, std::chrono::steady_clock::time_point frame_timestamp);
 	void render_one_frame(int64_t duration);
 	void audio_thread_func();
 	void release_display_frame(DisplayFrame *frame);
@@ -403,6 +403,7 @@ private:
 		int num_samples;  // Audio samples needed for this output frame.
 		int64_t frame_duration;  // In TIMEBASE units.
 		bool is_preroll;
+		std::chrono::steady_clock::time_point frame_timestamp;
 	};
 	OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]);
 
@@ -452,6 +453,7 @@ private:
 		int64_t pts_int;
 		int num_samples;
 		bool adjust_rate;
+		std::chrono::steady_clock::time_point frame_timestamp;
 	};
 	std::mutex audio_mutex;
 	std::condition_variable audio_task_queue_changed;
diff --git a/resampling_queue.cpp b/resampling_queue.cpp
index 025fa50..188bf7d 100644
--- a/resampling_queue.cpp
+++ b/resampling_queue.cpp
@@ -28,9 +28,11 @@
 #include
 
 using namespace std;
+using namespace std::chrono;
 
 ResamplingQueue::ResamplingQueue(unsigned card_num, unsigned freq_in, unsigned freq_out, unsigned num_channels, double expected_delay_seconds)
 	: card_num(card_num), freq_in(freq_in), freq_out(freq_out), num_channels(num_channels),
+	  current_estimated_freq_in(freq_in),
 	  ratio(double(freq_out) / double(freq_in)),
 	  expected_delay(expected_delay_seconds * OUTPUT_FREQUENCY)
 {
 	vresampler.setup(ratio, num_channels, /*hlen=*/32);
@@ -41,54 +43,55 @@ ResamplingQueue::ResamplingQueue(unsigned card_num, unsigned freq_in, unsigned f
 	vresampler.process ();
 }
 
-void ResamplingQueue::add_input_samples(double pts, const float *samples, ssize_t num_samples)
+void ResamplingQueue::add_input_samples(steady_clock::time_point ts, const float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
 	if (num_samples == 0) {
 		return;
 	}
 
-	if (first_input) {
-		// Synthesize a fake length.
-		last_input_len = double(num_samples) / freq_in;
-		first_input = false;
-	} else {
-		last_input_len = pts - last_input_pts;
-	}
-	last_input_pts = pts;
-
-	k_a0 = k_a1;
-	k_a1 += num_samples;
+	bool good_sample = (rate_adjustment_policy == ADJUST_RATE);
+	if (good_sample && a1.good_sample) {
+		a0 = a1;
+	}
+	a1.ts = ts;
+	a1.input_samples_received += num_samples;
+	a1.good_sample = good_sample;
+	if (a0.good_sample && a1.good_sample) {
+		current_estimated_freq_in = (a1.input_samples_received - a0.input_samples_received) / duration<double>(a1.ts - a0.ts).count();
+		assert(current_estimated_freq_in >= 0.0);
+
+		// Bound the frequency, so that a single wild result won't throw the filter off guard.
+		current_estimated_freq_in = min(current_estimated_freq_in, 1.2 * freq_in);
+		current_estimated_freq_in = max(current_estimated_freq_in, 0.8 * freq_in);
+	}
 
 	buffer.insert(buffer.end(), samples, samples + num_samples * num_channels);
 }
 
-bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
+bool ResamplingQueue::get_output_samples(steady_clock::time_point ts, float *samples, ssize_t num_samples, ResamplingQueue::RateAdjustmentPolicy rate_adjustment_policy)
 {
 	assert(num_samples > 0);
 
-	if (first_input) {
+	if (a1.input_samples_received == 0) {
 		// No data yet, just return zeros.
 		memset(samples, 0, num_samples * num_channels * sizeof(float));
 		return true;
 	}
 
-	double rcorr = -1.0;
-	if (rate_adjustment_policy == ADJUST_RATE) {
-		double last_output_len;
-		if (first_output) {
-			// Synthesize a fake length.
-			last_output_len = double(num_samples) / freq_out;
-		} else {
-			last_output_len = pts - last_output_pts;
-		}
-		last_output_pts = pts;
-
-		// Using the time point since just before the last call to add_input_samples() as a base,
-		// estimate actual delay based on activity since then, measured in number of input samples:
-		double actual_delay = 0.0;
-		assert(last_input_len != 0);
-		actual_delay += (k_a1 - k_a0) * last_output_len / last_input_len;  // Inserted samples since k_a0, rescaled for the different time periods.
-		actual_delay += k_a0 - total_consumed_samples;  // Samples inserted before k_a0 but not consumed yet.
-		actual_delay += vresampler.inpdist();  // Delay in the resampler itself.
+	if (rate_adjustment_policy == ADJUST_RATE && (a0.good_sample || a1.good_sample)) {
+		// Estimate the current number of input samples produced at
+		// this instant in time, by extrapolating from the last known
+		// good point. Note that we could be extrapolating backward or
+		// forward, depending on the timing of the calls.
+		const InputPoint &base_point = a1.good_sample ? a1 : a0;
+		const double input_samples_received = base_point.input_samples_received +
+			current_estimated_freq_in * duration<double>(ts - base_point.ts).count();
+
+		// Estimate the number of input samples _consumed_ after we've run the resampler.
+		const double input_samples_consumed = total_consumed_samples +
+			num_samples / (ratio * rcorr);
+
+		double actual_delay = input_samples_received - input_samples_consumed;
+		actual_delay += vresampler.inpdist();  // Delay in the resampler itself.
 		double err = actual_delay - expected_delay;
 		if (first_output && err < 0.0) {
 			// Before the very first block, insert artificial delay based on our initial estimate,
@@ -97,7 +100,7 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
 			for (ssize_t i = 0; i < delay_samples_to_add * num_channels; ++i) {
 				buffer.push_front(0.0f);
 			}
-			total_consumed_samples -= delay_samples_to_add;  // Equivalent to increasing k_a0 and k_a1.
+			total_consumed_samples -= delay_samples_to_add;  // Equivalent to increasing input_samples_received on a0 and a1.
 			err += delay_samples_to_add;
 		}
 		first_output = false;
@@ -105,10 +108,18 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
 		// Compute loop filter coefficients for the two filters. We need to compute them
 		// every time, since they depend on the number of samples the user asked for.
 		//
-		// The loop bandwidth is at 0.02 Hz; we trust the initial estimate quite well,
-		// and our jitter is pretty large since none of the threads involved run at
-		// real-time priority.
-		double loop_bandwidth_hz = 0.02;
+		// The loop bandwidth is at 0.02 Hz; our jitter is pretty large
+		// since none of the threads involved run at real-time priority.
+		// However, the first four seconds, we use a larger loop bandwidth (0.2 Hz),
+		// because there's a lot going on during startup, and thus the
+		// initial estimate might be tainted by jitter during that phase,
+		// and we want to converge faster.
+		//
+		// NOTE: The above logic might only hold during Nageru startup
+		// (we start ResamplingQueues also when we e.g. switch sound sources),
+		// but in general, a little bit of increased timing jitter is acceptable
+		// right after a setup change like this.
+		double loop_bandwidth_hz = (total_consumed_samples < 4 * freq_in) ? 0.2 : 0.02;
 
 		// Set filters. The first filter is much wider than the second one (20x as wide).
 		double w = (2.0 * M_PI) * loop_bandwidth_hz * num_samples / freq_out;
@@ -127,9 +138,9 @@ bool ResamplingQueue::get_output_samples(double pts, float *samples, ssize_t num
 		vresampler.set_rratio(rcorr);
 	} else {
 		assert(rate_adjustment_policy == DO_NOT_ADJUST_RATE);
-	};
+	}
 
-	// Finally actually resample, consuming exactly <num_samples> output samples.
+	// Finally actually resample, producing exactly <num_samples> output samples.
 	vresampler.out_data = samples;
 	vresampler.out_count = num_samples;
 	while (vresampler.out_count > 0) {
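To put the loop bandwidth in perspective, a worked number (assuming,
purely for illustration, 48 kHz output and 800-sample blocks, i.e.
60 fps):

	w = 2π · 0.02 Hz · 800 / 48000 ≈ 0.0021   (steady state)
	w = 2π · 0.20 Hz · 800 / 48000 ≈ 0.021    (startup)

The startup filter is thus exactly ten times as wide and converges
roughly ten times as fast; the switchover condition,
total_consumed_samples < 4 * freq_in, corresponds to the first four
seconds of consumed audio.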
diff --git a/resampling_queue.h b/resampling_queue.h
index 662a837..b46086d 100644
--- a/resampling_queue.h
+++ b/resampling_queue.h
@@ -4,8 +4,8 @@
 // Takes in samples from an input source, possibly with jitter, and outputs a fixed number
 // of samples every iteration. Used to a) change sample rates if needed, and b) deal with
 // input sources that don't have audio locked to video. For every input video
-// frame, you call add_input_samples() with the pts (measured in seconds) of the video frame,
-// taken to be the start point of the frame's audio. When you want to _output_ a finished
+// frame, you call add_input_samples() with the received time point of the video frame,
+// taken to be the _end_ point of the frame's audio. When you want to _output_ a finished
 // frame with audio, you call get_output_samples() with the number of samples you want, and will
 // get exactly that number of samples back. If the input and output clocks are not in sync,
 // the audio will be stretched for you. (If they are _very_ out of sync, this will come through
@@ -40,6 +40,7 @@
 #include
 #include
+#include <chrono>
 #include
 #include
 
@@ -58,10 +59,9 @@ public:
 		ADJUST_RATE
 	};
 
-	// Note: pts is always in seconds.
-	void add_input_samples(double pts, const float *samples, ssize_t num_samples);
+	void add_input_samples(std::chrono::steady_clock::time_point ts, const float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
 	// Returns false if underrun.
-	bool get_output_samples(double pts, float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
+	bool get_output_samples(std::chrono::steady_clock::time_point ts, float *samples, ssize_t num_samples, RateAdjustmentPolicy rate_adjustment_policy);
 
 private:
 	void init_loop_filter(double bandwidth_hz);
@@ -71,28 +71,43 @@ private:
 	unsigned card_num;
 	unsigned freq_in, freq_out, num_channels;
 
-	bool first_input = true, first_output = true;
-	double last_input_pts;  // Start of last input block, in seconds.
-	double last_output_pts;
+	bool first_output = true;
 
-	ssize_t k_a0 = 0;  // Total amount of samples inserted _before_ the last call to add_input_samples().
-	ssize_t k_a1 = 0;  // Total amount of samples inserted _after_ the last call to add_input_samples().
+	struct InputPoint {
+		// Equivalent to t_a0 or t_a1 in the paper.
+		std::chrono::steady_clock::time_point ts;
 
-	ssize_t total_consumed_samples = 0;
+		// Number of samples that have been written to the queue (in total)
+		// at this time point. Equivalent to k_a0 or k_a1 in the paper.
+		size_t input_samples_received = 0;
+
+		// Set to false if we should not use the timestamp from this sample
+		// (e.g. if it is from a dropped frame and thus bad). In particular,
+		// we will not use it for updating current_estimated_freq_in.
+		bool good_sample = false;
+	};
+	InputPoint a0, a1;
 
-	// Duration of last input block, in seconds.
-	double last_input_len;
+	// The current rate at which we seem to get input samples, in Hz.
+	// For an ideal input, identical to freq_in.
+	double current_estimated_freq_in;
+
+	ssize_t total_consumed_samples = 0;
 
 	// Filter state for the loop filter.
 	double z1 = 0.0, z2 = 0.0, z3 = 0.0;
 
 	// Ratio between the two frequencies.
-	double ratio;
+	const double ratio;
+
+	// Current correction ratio. ratio * rcorr gives the true ratio,
+	// so values above 1.0 mean to pitch down (consume input samples slower).
+	double rcorr = 1.0;
 
 	// How much delay we are expected to have, in input samples.
 	// If actual delay drifts too much away from this, we will start
 	// changing the resampling ratio to compensate.
-	double expected_delay;
+	const double expected_delay;
 
 	// Input samples not yet fed into the resampler.
 	// TODO: Use a circular buffer instead, for efficiency.
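For reference, the new calling convention end to end: a minimal usage
sketch, not part of the patch (the constructor arguments and block sizes
are placeholders):

	#include <chrono>
	#include <vector>
	#include "resampling_queue.h"

	using namespace std::chrono;

	void example()
	{
		// 48 kHz in and out, stereo, 100 ms expected queue delay (placeholders).
		ResamplingQueue queue(/*card_num=*/0, /*freq_in=*/48000, /*freq_out=*/48000,
		                      /*num_channels=*/2, /*expected_delay_seconds=*/0.1);

		// Input is stamped with the wall-clock arrival time of its frame;
		// pass DO_NOT_ADJUST_RATE instead if the timestamp is untrustworthy
		// (dropped frames, preroll, synthesized silence).
		std::vector<float> in(2 * 800);  // one 60 fps frame of stereo audio
		queue.add_input_samples(steady_clock::now(), in.data(), 800,
		                        ResamplingQueue::ADJUST_RATE);

		// Output is requested at the time point the output frame represents.
		std::vector<float> out(2 * 800);
		queue.get_output_samples(steady_clock::now(), out.data(), 800,
		                         ResamplingQueue::ADJUST_RATE);
	}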