From adf28dcc8d96304785b05034c323e4c854c76896 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sat, 30 Mar 2019 14:45:58 +0100
Subject: [PATCH] Add audio output when playing at 100% speed.

Fairly untested, but should work both on single-track export and
on realtime output. There is no audio stretching or pitch shift, so audio
only plays when we're at regular speed. Note: There's no monitor output yet,
so the Futatabi operator will be deaf. There are also no VU bars.
---
 futatabi/export.cpp          |  2 +-
 futatabi/frame_on_disk.cpp   |  8 +++-
 futatabi/frame_on_disk.h     |  2 +-
 futatabi/jpeg_frame_view.cpp |  2 +-
 futatabi/player.cpp          | 23 +++++++---
 futatabi/player.h            |  2 +-
 futatabi/video_stream.cpp    | 86 +++++++++++++++++++++++++++++++++---
 futatabi/video_stream.h      | 27 ++++++++---
 8 files changed, 128 insertions(+), 24 deletions(-)
diff --git a/futatabi/export.cpp b/futatabi/export.cpp
index 1b7c59c..7833f91 100644
--- a/futatabi/export.cpp
+++ b/futatabi/export.cpp
@@ -182,7 +182,7 @@ void export_multitrack_clip(const string &filename, const Clip &clip)
 			}
 		}
 
-		FrameReader::Frame frame = readers[first_frame_stream_idx].read_frame(first_frame, /*read_audio=*/true);
+		FrameReader::Frame frame = readers[first_frame_stream_idx].read_frame(first_frame, /*read_video=*/true, /*read_audio=*/true);
 
 		// Write audio. (Before video, since that's what we expect on input.)
 		if (!frame.audio.empty()) {
diff --git a/futatabi/frame_on_disk.cpp b/futatabi/frame_on_disk.cpp
index 6bdaf23..f9a5639 100644
--- a/futatabi/frame_on_disk.cpp
+++ b/futatabi/frame_on_disk.cpp
@@ -4,6 +4,7 @@
 
 #include <atomic>
 #include <chrono>
+#include <assert.h>
 #include <fcntl.h>
 #include <mutex>
 #include <unistd.h>
@@ -68,8 +69,9 @@ string read_string(int fd, size_t size, off_t offset)
 
 }  // namespace
 
-FrameReader::Frame FrameReader::read_frame(FrameOnDisk frame, bool read_audio)
+FrameReader::Frame FrameReader::read_frame(FrameOnDisk frame, bool read_video, bool read_audio)
 {
+	assert(read_video || read_audio);
 	steady_clock::time_point start = steady_clock::now();
 
 	if (int(frame.filename_idx) != last_filename_idx) {
@@ -98,7 +100,9 @@ FrameReader::Frame FrameReader::read_frame(FrameOnDisk frame, bool read_audio)
 	}
 
 	Frame ret;
-	ret.video = read_string(fd, frame.size, frame.offset);
+	if (read_video) {
+		ret.video = read_string(fd, frame.size, frame.offset);
+	}
 	if (read_audio) {
 		ret.audio = read_string(fd, frame.audio_size, frame.offset + frame.size);
 	}
diff --git a/futatabi/frame_on_disk.h b/futatabi/frame_on_disk.h
index 360bd23..35f375b 100644
--- a/futatabi/frame_on_disk.h
+++ b/futatabi/frame_on_disk.h
@@ -46,7 +46,7 @@ public:
 		std::string video;
 		std::string audio;
 	};
-	Frame read_frame(FrameOnDisk frame, bool read_audio);
+	Frame read_frame(FrameOnDisk frame, bool read_video, bool read_audio);
 
 private:
 	int fd = -1;
diff --git a/futatabi/jpeg_frame_view.cpp b/futatabi/jpeg_frame_view.cpp
index c1afafd..943b3e1 100644
--- a/futatabi/jpeg_frame_view.cpp
+++ b/futatabi/jpeg_frame_view.cpp
@@ -238,7 +238,7 @@ shared_ptr<Frame> decode_jpeg_with_cache(FrameOnDisk frame_spec, CacheMissBehavi
 	++metric_jpeg_cache_miss_frames;
 
 	*did_decode = true;
-	shared_ptr<Frame> frame = decode_jpeg(frame_reader->read_frame(frame_spec, /*read_audio=*/false).video);
+	shared_ptr<Frame> frame = decode_jpeg(frame_reader->read_frame(frame_spec, /*read_video=*/true, /*read_audio=*/false).video);
 
 	lock_guard<mutex> lock(cache_mu);
 	cache_bytes_used += frame_size(*frame);
diff --git a/futatabi/player.cpp b/futatabi/player.cpp
index 611f0ab..779e685 100644
--- a/futatabi/player.cpp
+++ b/futatabi/player.cpp
@@ -139,7 +139,14 @@ void Player::play_playlist_once()
 	}
 
 	steady_clock::duration time_slept = steady_clock::now() - before_sleep;
-	pts += duration_cast<duration<size_t, TimebaseRatio>>(time_slept).count();
+	int64_t slept_pts = duration_cast<duration<size_t, TimebaseRatio>>(time_slept).count();
+	if (slept_pts > 0) {
+		if (video_stream != nullptr) {
+			// Add silence for the time we're waiting.
+			video_stream->schedule_silence(steady_clock::now(), pts, slept_pts, QueueSpotHolder());
+		}
+		pts += slept_pts;
+	}
 
 	if (!clip_ready) {
 		if (video_stream != nullptr) {
@@ -205,6 +212,10 @@ void Player::play_playlist_once()
 				break;
 			}
 
+			// Only play audio if we're within 0.1% of normal speed. We could do
+			// stretching or pitch shift later if it becomes needed.
+			bool play_audio = clip->speed * master_speed >= 0.999 && clip->speed * master_speed <= 1.001;
+
 			{
 				lock_guard<mutex> lock(queue_state_mu);
 				if (splice_ready) {
@@ -349,7 +360,7 @@ void Player::play_playlist_once()
 			if (frame_lower.pts == frame_upper.pts || global_flags.interpolation_quality == 0 || video_stream == nullptr) {
 				display_single_frame(primary_stream_idx, frame_lower, secondary_stream_idx,
 				                     secondary_frame, fade_alpha, next_frame_start, /*snapped=*/false,
-				                     subtitle);
+				                     subtitle, play_audio);
 				continue;
 			}
 
@@ -362,7 +373,7 @@ void Player::play_playlist_once()
 				if (fabs(snap_frame.pts - in_pts) < pts_snap_tolerance) {
 					display_single_frame(primary_stream_idx, snap_frame, secondary_stream_idx,
 					                     secondary_frame, fade_alpha, next_frame_start, /*snapped=*/true,
-					                     subtitle);
+					                     subtitle, play_audio);
 					in_pts_origin += snap_frame.pts - in_pts;
 					snapped = true;
 					break;
@@ -418,7 +429,7 @@ void Player::play_playlist_once()
 			video_stream->schedule_interpolated_frame(
 				next_frame_start, pts, display_func, QueueSpotHolder(this),
 				frame_lower, frame_upper, alpha,
-				secondary_frame, fade_alpha, subtitle);
+				secondary_frame, fade_alpha, subtitle, play_audio);
 			last_pts_played = in_pts;  // Not really needed; only previews use last_pts_played.
 		}
 
@@ -439,7 +450,7 @@ void Player::play_playlist_once()
 	}
 }
 
-void Player::display_single_frame(int primary_stream_idx, const FrameOnDisk &primary_frame, int secondary_stream_idx, const FrameOnDisk &secondary_frame, double fade_alpha, steady_clock::time_point frame_start, bool snapped, const std::string &subtitle)
+void Player::display_single_frame(int primary_stream_idx, const FrameOnDisk &primary_frame, int secondary_stream_idx, const FrameOnDisk &secondary_frame, double fade_alpha, steady_clock::time_point frame_start, bool snapped, const std::string &subtitle, bool play_audio)
 {
 	auto display_func = [this, primary_stream_idx, primary_frame, secondary_frame, fade_alpha] {
 		if (destination != nullptr) {
@@ -458,7 +469,7 @@ void Player::display_single_frame(int primary_stream_idx, const FrameOnDisk &pri
 			}
 			video_stream->schedule_original_frame(
 				frame_start, pts, display_func, QueueSpotHolder(this),
-				primary_frame, subtitle);
+				primary_frame, subtitle, play_audio);
 		} else {
 			assert(secondary_frame.pts != -1);
 			// NOTE: We could be increasing unused metrics for previews, but that's harmless.
diff --git a/futatabi/player.h b/futatabi/player.h
index da5a443..b912b8c 100644
--- a/futatabi/player.h
+++ b/futatabi/player.h
@@ -94,7 +94,7 @@ public:
 private:
 	void thread_func(AVFormatContext *file_avctx);
 	void play_playlist_once();
-	void display_single_frame(int primary_stream_idx, const FrameOnDisk &primary_frame, int secondary_stream_idx, const FrameOnDisk &secondary_frame, double fade_alpha, std::chrono::steady_clock::time_point frame_start, bool snapped, const std::string &subtitle);
+	void display_single_frame(int primary_stream_idx, const FrameOnDisk &primary_frame, int secondary_stream_idx, const FrameOnDisk &secondary_frame, double fade_alpha, std::chrono::steady_clock::time_point frame_start, bool snapped, const std::string &subtitle, bool play_audio);
 	void open_output_stream();
 	static int write_packet2_thunk(void *opaque, uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
 	int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
diff --git a/futatabi/video_stream.cpp b/futatabi/video_stream.cpp
index 06acfd2..9647836 100644
--- a/futatabi/video_stream.cpp
+++ b/futatabi/video_stream.cpp
@@ -13,6 +13,7 @@ extern "C" {
 #include "player.h"
 #include "shared/context.h"
 #include "shared/httpd.h"
+#include "shared/shared_defs.h"
 #include "shared/mux.h"
 #include "util.h"
 #include "ycbcr_converter.h"
@@ -286,10 +287,19 @@ void VideoStream::start()
 		avctx->flags = AVFMT_FLAG_CUSTOM_IO;
 	}
 
+	AVCodecParameters *audio_codecpar = avcodec_parameters_alloc();
+
+	audio_codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
+	audio_codecpar->codec_id = AV_CODEC_ID_PCM_S32LE;
+	audio_codecpar->channel_layout = AV_CH_LAYOUT_STEREO;
+	audio_codecpar->channels = 2;
+	audio_codecpar->sample_rate = OUTPUT_FREQUENCY;
+
 	size_t width = global_flags.width, height = global_flags.height;  // Doesn't matter for MJPEG.
-	mux.reset(new Mux(avctx, width, height, Mux::CODEC_MJPEG, /*video_extradata=*/"", /*audio_codec_parameters=*/nullptr,
+	mux.reset(new Mux(avctx, width, height, Mux::CODEC_MJPEG, /*video_extradata=*/"", audio_codecpar,
 	                  AVCOL_SPC_BT709, COARSE_TIMEBASE, /*write_callback=*/nullptr, Mux::WRITE_FOREGROUND, {}, Mux::WITH_SUBTITLES));
 
+	avcodec_parameters_free(&audio_codecpar);
 	encode_thread = thread(&VideoStream::encode_thread_func, this);
 }
 
@@ -331,12 +341,10 @@ void VideoStream::clear_queue()
 void VideoStream::schedule_original_frame(steady_clock::time_point local_pts,
                                           int64_t output_pts, function<void()> &&display_func,
                                           QueueSpotHolder &&queue_spot_holder,
-                                          FrameOnDisk frame, const string &subtitle)
+                                          FrameOnDisk frame, const string &subtitle, bool include_audio)
 {
 	fprintf(stderr, "output_pts=%" PRId64 "  original      input_pts=%" PRId64 "\n", output_pts, frame.pts);
 
-	// TODO: Write audio if at the right speed.
-
 	QueuedFrame qf;
 	qf.local_pts = local_pts;
 	qf.type = QueuedFrame::ORIGINAL;
@@ -344,7 +352,9 @@ void VideoStream::schedule_original_frame(steady_clock::time_point local_pts,
 	qf.display_func = move(display_func);
 	qf.queue_spot_holder = move(queue_spot_holder);
 	qf.subtitle = subtitle;
-	qf.encoded_jpeg.reset(new string(frame_reader.read_frame(frame, /*read_audio=*/false).video));
+	FrameReader::Frame read_frame = frame_reader.read_frame(frame, /*read_video=*/true, include_audio);
+	qf.encoded_jpeg.reset(new string(move(read_frame.video)));
+	qf.audio = move(read_frame.audio);
 
 	lock_guard<mutex> lock(queue_lock);
 	frame_queue.push_back(move(qf));
@@ -424,7 +434,8 @@ void VideoStream::schedule_interpolated_frame(steady_clock::time_point local_pts
                                               int64_t output_pts, function<void(shared_ptr<Frame>)> &&display_func,
                                               QueueSpotHolder &&queue_spot_holder,
                                               FrameOnDisk frame1, FrameOnDisk frame2,
-                                              float alpha, FrameOnDisk secondary_frame, float fade_alpha, const string &subtitle)
+                                              float alpha, FrameOnDisk secondary_frame, float fade_alpha, const string &subtitle,
+                                              bool play_audio)
 {
 	if (secondary_frame.pts != -1) {
 		fprintf(stderr, "output_pts=%" PRId64 "  interpolated  input_pts1=%" PRId64 " input_pts2=%" PRId64 " alpha=%.3f  secondary_pts=%" PRId64 "  fade_alpha=%.2f\n", output_pts, frame1.pts, frame2.pts, alpha, secondary_frame.pts, fade_alpha);
@@ -452,6 +463,10 @@ void VideoStream::schedule_interpolated_frame(steady_clock::time_point local_pts
 	qf.local_pts = local_pts;
 	qf.subtitle = subtitle;
 
+	if (play_audio) {
+		qf.audio = frame_reader.read_frame(frame1, /*read_video=*/false, /*read_audio=*/true).audio;
+	}
+
 	check_error();
 
 	// Convert frame0 and frame1 to OpenGL textures.
@@ -563,6 +578,20 @@ void VideoStream::schedule_refresh_frame(steady_clock::time_point local_pts,
 	queue_changed.notify_all();
 }
 
+void VideoStream::schedule_silence(steady_clock::time_point local_pts, int64_t output_pts,
+                                   int64_t length_pts, QueueSpotHolder &&queue_spot_holder)
+{
+	QueuedFrame qf;  // Audio-only queue entry: <length_pts> of silence starting at <output_pts>.
+	qf.type = QueuedFrame::SILENCE;
+	qf.output_pts = output_pts;
+	qf.queue_spot_holder = move(queue_spot_holder);
+	qf.silence_length_pts = length_pts;  // NOTE(review): <local_pts> is never copied into qf.local_pts; confirm that is intended.
+	// Append to frame_queue (guarded by queue_lock) and wake anyone waiting on queue_changed.
+	lock_guard<mutex> lock(queue_lock);
+	frame_queue.push_back(move(qf));
+	queue_changed.notify_all();
+}
+
 namespace {
 
 shared_ptr<Frame> frame_from_pbo(void *contents, size_t width, size_t height)
@@ -662,6 +691,8 @@ void VideoStream::encode_thread_func()
 			pkt.flags = AV_PKT_FLAG_KEY;
 			mux->add_packet(pkt, qf.output_pts, qf.output_pts);
 			last_frame = move(jpeg);
+
+			add_audio_or_silence(qf);
 		} else if (qf.type == QueuedFrame::FADED) {
 			glClientWaitSync(qf.fence.get(), /*flags=*/0, GL_TIMEOUT_IGNORED);
 
@@ -678,6 +709,8 @@ void VideoStream::encode_thread_func()
 			pkt.flags = AV_PKT_FLAG_KEY;
 			mux->add_packet(pkt, qf.output_pts, qf.output_pts);
 			last_frame = move(jpeg);
+
+			add_audio_or_silence(qf);
 		} else if (qf.type == QueuedFrame::INTERPOLATED || qf.type == QueuedFrame::FADED_INTERPOLATED) {
 			glClientWaitSync(qf.fence.get(), /*flags=*/0, GL_TIMEOUT_IGNORED);
 
@@ -705,6 +738,8 @@ void VideoStream::encode_thread_func()
 			pkt.flags = AV_PKT_FLAG_KEY;
 			mux->add_packet(pkt, qf.output_pts, qf.output_pts);
 			last_frame = move(jpeg);
+
+			add_audio_or_silence(qf);
 		} else if (qf.type == QueuedFrame::REFRESH) {
 			AVPacket pkt;
 			av_init_packet(&pkt);
@@ -713,6 +748,10 @@ void VideoStream::encode_thread_func()
 			pkt.size = last_frame.size();
 			pkt.flags = AV_PKT_FLAG_KEY;
 			mux->add_packet(pkt, qf.output_pts, qf.output_pts);
+
+			add_audio_or_silence(qf);  // Definitely silence.
+		} else if (qf.type == QueuedFrame::SILENCE) {
+			add_silence(qf.output_pts, qf.silence_length_pts);
 		} else {
 			assert(false);
 		}
@@ -746,3 +785,38 @@ int VideoStream::write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType ty
 	}
 	return buf_size;
 }
+
+void VideoStream::add_silence(int64_t pts, int64_t length_pts)
+{
+	// Muxes <length_pts> worth of all-zero 32-bit stereo samples at <pts>. At 59.94, this will
+	// never quite add up (even discounting refresh frames, which have unpredictable length), but
+	// hopefully, the player at the other end should be able to stretch silence easily enough.
+	long num_samples = lrint(length_pts * double(OUTPUT_FREQUENCY) / double(TIMEBASE)) * 2;  // x2: two interleaved stereo channels.
+	uint8_t *zero = (uint8_t *)calloc(num_samples, sizeof(int32_t));  // NOTE(review): result is not checked for nullptr.
+
+	AVPacket pkt;
+	av_init_packet(&pkt);
+	pkt.stream_index = 1;  // Presumably the audio stream; confirm against the Mux stream layout.
+	pkt.data = zero;
+	pkt.size = num_samples * sizeof(int32_t);  // Size in bytes.
+	pkt.flags = AV_PKT_FLAG_KEY;
+	mux->add_packet(pkt, pts, pts);  // Same value passed for both timestamp arguments.
+
+	free(zero);  // NOTE(review): assumes add_packet() keeps no reference to pkt.data after returning; confirm.
+}
+
+void VideoStream::add_audio_or_silence(const QueuedFrame &qf)
+{
+	if (qf.audio.empty()) {  // No audio attached to this entry: synthesize one output frame's worth of silence.
+		int64_t frame_length = lrint(double(TIMEBASE) / global_flags.output_framerate);
+		add_silence(qf.output_pts, frame_length);
+	} else {  // Audio attached: mux its bytes unchanged, timestamped with the entry's output_pts.
+		AVPacket pkt;
+		av_init_packet(&pkt);
+		pkt.stream_index = 1;  // Presumably the audio stream; confirm against the Mux stream layout.
+		pkt.data = (uint8_t *)qf.audio.data();  // Cast drops const; the packet points into qf.audio, no copy is made here.
+		pkt.size = qf.audio.size();
+		pkt.flags = AV_PKT_FLAG_KEY;
+		mux->add_packet(pkt, qf.output_pts, qf.output_pts);
+	}
+}
diff --git a/futatabi/video_stream.h b/futatabi/video_stream.h
index 26cb7c8..f156be9 100644
--- a/futatabi/video_stream.h
+++ b/futatabi/video_stream.h
@@ -47,23 +47,29 @@ public:
 	void schedule_original_frame(std::chrono::steady_clock::time_point,
 	                             int64_t output_pts, std::function<void()> &&display_func,
 	                             QueueSpotHolder &&queue_spot_holder,
-	                             FrameOnDisk frame, const std::string &subtitle);
+	                             FrameOnDisk frame, const std::string &subtitle,
+	                             bool include_audio);
 	void schedule_faded_frame(std::chrono::steady_clock::time_point, int64_t output_pts,
 	                          std::function<void()> &&display_func,
 	                          QueueSpotHolder &&queue_spot_holder,
 	                          FrameOnDisk frame1, FrameOnDisk frame2,
-	                          float fade_alpha, const std::string &subtitle);
+	                          float fade_alpha, const std::string &subtitle);  // Always no audio.
 	void schedule_interpolated_frame(std::chrono::steady_clock::time_point, int64_t output_pts,
 	                                 std::function<void(std::shared_ptr<Frame>)> &&display_func,
 	                                 QueueSpotHolder &&queue_spot_holder,
 	                                 FrameOnDisk frame1, FrameOnDisk frame2,
 	                                 float alpha, FrameOnDisk secondary_frame,  // Empty = no secondary (fade) frame.
-	                                 float fade_alpha, const std::string &subtitle);
+	                                 float fade_alpha, const std::string &subtitle,
+	                                 bool include_audio);
 	void schedule_refresh_frame(std::chrono::steady_clock::time_point, int64_t output_pts,
 	                            std::function<void()> &&display_func,
-	                            QueueSpotHolder &&queue_spot_holder, const std::string &subtitle);
+	                            QueueSpotHolder &&queue_spot_holder, const std::string &subtitle);  // Always no audio.
+	void schedule_silence(std::chrono::steady_clock::time_point, int64_t output_pts,
+	                      int64_t length_pts, QueueSpotHolder &&queue_spot_holder);
 
 private:
+	struct QueuedFrame;
+
 	FrameReader frame_reader;
 
 	void encode_thread_func();
@@ -72,6 +78,8 @@ private:
 
 	static int write_packet2_thunk(void *opaque, uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
 	int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
+	void add_silence(int64_t pts, int64_t length_pts);
+	void add_audio_or_silence(const QueuedFrame &qf);
 
 	// Allocated at the very start; if we're empty, we start dropping frames
 	// (so that we don't build up an infinite interpolation backlog).
@@ -110,13 +118,13 @@ private:
 		std::chrono::steady_clock::time_point local_pts;
 
 		int64_t output_pts;
-		enum Type { ORIGINAL, FADED, INTERPOLATED, FADED_INTERPOLATED, REFRESH } type;
+		enum Type { ORIGINAL, FADED, INTERPOLATED, FADED_INTERPOLATED, REFRESH, SILENCE } type;
 
 		// For original frames only. Made move-only so we know explicitly
 		// we don't copy these ~200 kB files around inadvertedly.
 		std::unique_ptr<std::string> encoded_jpeg;
 
-		// For everything except original frames.
+		// For everything except original frames and silence.
 		FrameOnDisk frame1;
 
 		// For fades only (including fades against interpolated frames).
@@ -135,6 +143,13 @@ private:
 
 		std::string subtitle;  // Blank for none.
 
+		// Audio, in stereo interleaved 32-bit PCM. If empty and not of type SILENCE, one frame's worth of silence samples
+		// is synthesized.
+		std::string audio;
+
+		// For silence frames only.
+		int64_t silence_length_pts;
+
 		QueueSpotHolder queue_spot_holder;
 	};
 	std::deque<QueuedFrame> frame_queue;  // Under <queue_lock>.
-- 
2.39.2