From b0ce4383b7d64760bbfccf4e0e769b293f0db0cd Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Sat, 8 Jul 2017 18:43:39 +0200
Subject: [PATCH] Add support for transcoding the audio in Kaeru (on by default).

---
 ffmpeg_capture.cpp | 43 +++++++++++++++++++++++-----------
 ffmpeg_capture.h   | 11 +++++----
 flags.cpp          | 14 ++++++++++++
 flags.h            |  1 +
 kaeru.cpp          | 57 ++++++++++++++++++++++++++++++++++++++--------
 5 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/ffmpeg_capture.cpp b/ffmpeg_capture.cpp
index 5e6104f..3de7d5e 100644
--- a/ffmpeg_capture.cpp
+++ b/ffmpeg_capture.cpp
@@ -228,7 +228,8 @@ void FFmpegCapture::configure_card()
 		set_video_frame_allocator(owned_video_frame_allocator.get());
 	}
 	if (audio_frame_allocator == nullptr) {
-		owned_audio_frame_allocator.reset(new MallocFrameAllocator(65536, NUM_QUEUED_AUDIO_FRAMES));
+		// Audio can come out in pretty large chunks, so increase the buffer from 64 kB to 1 MB.
+		owned_audio_frame_allocator.reset(new MallocFrameAllocator(1 << 20, NUM_QUEUED_AUDIO_FRAMES));
 		set_audio_frame_allocator(owned_audio_frame_allocator.get());
 	}
 }
@@ -319,7 +320,7 @@ void FFmpegCapture::send_disconnected_frame()
 		video_frame.len = width * height * 4;
 		memset(video_frame.data, 0, video_frame.len);
 
-		frame_callback(-1, AVRational{1, TIMEBASE}, timecode++,
+		frame_callback(-1, AVRational{1, TIMEBASE}, -1, AVRational{1, TIMEBASE}, timecode++,
 			video_frame, /*video_offset=*/0, video_format,
 			FrameAllocator::Frame(), /*audio_offset=*/0, AudioFormat());
 	}
@@ -410,9 +411,10 @@ bool FFmpegCapture::play_video(const string &pathname)
 
 		FrameAllocator::Frame audio_frame = audio_frame_allocator->alloc_frame();
 		AudioFormat audio_format;
+		int64_t audio_pts;
 		bool error;
 		AVFrameWithDeleter frame = decode_frame(format_ctx.get(), video_codec_ctx.get(), audio_codec_ctx.get(),
-			pathname, video_stream_index, audio_stream_index, &audio_frame, &audio_format, &error);
+			pathname, video_stream_index, audio_stream_index, &audio_frame, &audio_format, &audio_pts, &error);
 		if (error) {
 			return false;
 		}
@@ -447,7 +449,10 @@ bool FFmpegCapture::play_video(const string &pathname)
 			video_frame.received_timestamp = next_frame_start;
 			bool finished_wakeup = producer_thread_should_quit.sleep_until(next_frame_start);
 			if (finished_wakeup) {
-				frame_callback(frame->pts, video_timebase, timecode++,
+				if (audio_frame.len > 0) {
+					assert(audio_pts != -1);
+				}
+				frame_callback(frame->pts, video_timebase, audio_pts, audio_timebase, timecode++,
 					video_frame, 0, video_format,
 					audio_frame, 0, audio_format);
 				break;
@@ -527,7 +532,9 @@ namespace {
 
 }  // namespace
 
-AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCodecContext *video_codec_ctx, AVCodecContext *audio_codec_ctx, const std::string &pathname, int video_stream_index, int audio_stream_index, FrameAllocator::Frame *audio_frame, AudioFormat *audio_format, bool *error)
+AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCodecContext *video_codec_ctx, AVCodecContext *audio_codec_ctx,
+                                               const std::string &pathname, int video_stream_index, int audio_stream_index,
+                                               FrameAllocator::Frame *audio_frame, AudioFormat *audio_format, int64_t *audio_pts, bool *error)
 {
 	*error = false;
 
@@ -536,6 +543,7 @@ AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCo
 	AVFrameWithDeleter audio_avframe = av_frame_alloc_unique();
 	AVFrameWithDeleter video_avframe = av_frame_alloc_unique();
 	bool eof = false;
+	*audio_pts = -1;
 	do {
 		AVPacket pkt;
 		unique_ptr pkt_cleanup(
@@ -554,6 +562,9 @@ AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCo
 				return AVFrameWithDeleter(nullptr);
 			}
 		} else if (pkt.stream_index == audio_stream_index) {
+			if (*audio_pts == -1) {
+				*audio_pts = pkt.pts;
+			}
 			if (avcodec_send_packet(audio_codec_ctx, &pkt) < 0) {
 				fprintf(stderr, "%s: Cannot send packet to audio codec.\n", pathname.c_str());
 				*error = true;
@@ -565,17 +576,23 @@ AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCo
 		}
 
 		// Decode audio, if any.
-		int err = avcodec_receive_frame(audio_codec_ctx, audio_avframe.get());
-		if (err == 0) {
-			convert_audio(audio_avframe.get(), audio_frame, audio_format);
-		} else if (err != AVERROR(EAGAIN)) {
-			fprintf(stderr, "%s: Cannot receive frame from audio codec.\n", pathname.c_str());
-			*error = true;
-			return AVFrameWithDeleter(nullptr);
+		if (*audio_pts != -1) {
+			for ( ;; ) {
+				int err = avcodec_receive_frame(audio_codec_ctx, audio_avframe.get());
+				if (err == 0) {
+					convert_audio(audio_avframe.get(), audio_frame, audio_format);
+				} else if (err == AVERROR(EAGAIN)) {
+					break;
+				} else {
+					fprintf(stderr, "%s: Cannot receive frame from audio codec.\n", pathname.c_str());
+					*error = true;
+					return AVFrameWithDeleter(nullptr);
+				}
+			}
 		}
 
 		// Decode video, if we have a frame.
-		err = avcodec_receive_frame(video_codec_ctx, video_avframe.get());
+		int err = avcodec_receive_frame(video_codec_ctx, video_avframe.get());
 		if (err == 0) {
 			frame_finished = true;
 			break;
diff --git a/ffmpeg_capture.h b/ffmpeg_capture.h
index eb377f7..c507715 100644
--- a/ffmpeg_capture.h
+++ b/ffmpeg_capture.h
@@ -106,7 +106,8 @@ public:
 
 	// FFmpegCapture-specific overload of set_frame_callback that also gives
 	// the raw original pts from the video. Negative pts means a dummy frame.
-	typedef std::function frame_callback_t;
@@ -119,13 +120,13 @@ public:
 	{
 		frame_callback = std::bind(
 			callback,
-			std::placeholders::_3,
-			std::placeholders::_4,
 			std::placeholders::_5,
 			std::placeholders::_6,
 			std::placeholders::_7,
 			std::placeholders::_8,
-			std::placeholders::_9);
+			std::placeholders::_9,
+			std::placeholders::_10,
+			std::placeholders::_11);
 	}
 
 	// FFmpegCapture-specific callback that gives the raw audio.
@@ -198,7 +199,7 @@ private:
 	// Returns nullptr if no frame was decoded (e.g. EOF).
 	AVFrameWithDeleter decode_frame(AVFormatContext *format_ctx, AVCodecContext *video_codec_ctx, AVCodecContext *audio_codec_ctx,
 		const std::string &pathname, int video_stream_index, int audio_stream_index,
-		bmusb::FrameAllocator::Frame *audio_frame, bmusb::AudioFormat *audio_format, bool *error);
+		bmusb::FrameAllocator::Frame *audio_frame, bmusb::AudioFormat *audio_format, int64_t *audio_pts, bool *error);
 	void convert_audio(const AVFrame *audio_avframe, bmusb::FrameAllocator::Frame *audio_frame, bmusb::AudioFormat *audio_format);
 
 	bmusb::VideoFormat construct_video_format(const AVFrame *frame, AVRational video_timebase);
diff --git a/flags.cpp b/flags.cpp
index 773750d..c65903a 100644
--- a/flags.cpp
+++ b/flags.cpp
@@ -33,6 +33,7 @@ enum LongOption {
 	OPTION_HTTP_COARSE_TIMEBASE,
 	OPTION_HTTP_AUDIO_CODEC,
 	OPTION_HTTP_AUDIO_BITRATE,
+	OPTION_NO_TRANSCODE_AUDIO,
 	OPTION_FLAT_AUDIO,
 	OPTION_GAIN_STAGING,
 	OPTION_DISABLE_LOCUT,
@@ -107,6 +108,10 @@ void usage(Program program)
 	fprintf(stderr, "      --http-audio-bitrate=KBITS      audio codec bit rate to use for HTTP streams\n");
 	fprintf(stderr, "                                        (default is %d, ignored unless --http-audio-codec is set)\n",
 		DEFAULT_AUDIO_OUTPUT_BIT_RATE / 1000);
+	if (program == PROGRAM_KAERU) {
+		fprintf(stderr, "      --no-transcode-audio            copy encoded audio raw from the source stream\n");
+		fprintf(stderr, "                                        (requires --http-audio-codec= to be set)\n");
+	}
 	fprintf(stderr, "      --http-coarse-timebase          use less timebase for HTTP (recommended for muxers\n");
 	fprintf(stderr, "                                        that handle large pts poorly, like e.g. MP4)\n");
 	if (program == PROGRAM_NAGERU) {
@@ -182,6 +187,7 @@ void parse_flags(Program program, int argc, char * const argv[])
 		{ "http-coarse-timebase", no_argument, 0, OPTION_HTTP_COARSE_TIMEBASE },
 		{ "http-audio-codec", required_argument, 0, OPTION_HTTP_AUDIO_CODEC },
 		{ "http-audio-bitrate", required_argument, 0, OPTION_HTTP_AUDIO_BITRATE },
+		{ "no-transcode-audio", no_argument, 0, OPTION_NO_TRANSCODE_AUDIO },
 		{ "flat-audio", no_argument, 0, OPTION_FLAT_AUDIO },
 		{ "gain-staging", required_argument, 0, OPTION_GAIN_STAGING },
 		{ "disable-locut", no_argument, 0, OPTION_DISABLE_LOCUT },
@@ -288,6 +294,9 @@ void parse_flags(Program program, int argc, char * const argv[])
 		case OPTION_HTTP_AUDIO_BITRATE:
 			global_flags.stream_audio_codec_bitrate = atoi(optarg) * 1000;
 			break;
+		case OPTION_NO_TRANSCODE_AUDIO:
+			global_flags.transcode_audio = false;
+			break;
 		case OPTION_HTTP_X264_VIDEO:
 			global_flags.x264_video_to_http = true;
 			break;
@@ -476,6 +485,11 @@ void parse_flags(Program program, int argc, char * const argv[])
 		fprintf(stderr, "ERROR: --output-card points to a nonexistant card\n");
 		exit(1);
 	}
+	if (!global_flags.transcode_audio && global_flags.stream_audio_codec_name.empty()) {
+		fprintf(stderr, "ERROR: If not transcoding audio, you must specify ahead of time what audio codec is in use\n");
+		fprintf(stderr, "       (using --http-audio-codec).\n");
+		exit(1);
+	}
 	if (global_flags.x264_speedcontrol) {
 		if (!global_flags.x264_preset.empty() && global_flags.x264_preset != "faster") {
 			fprintf(stderr, "WARNING: --x264-preset is overridden by --x264-speedcontrol (implicitly uses \"faster\" as base preset)\n");
diff --git a/flags.h b/flags.h
index 87b05b1..e6bf08e 100644
--- a/flags.h
+++ b/flags.h
@@ -59,6 +59,7 @@ struct Flags {
 	bool ten_bit_input = false;
 	bool ten_bit_output = false;  // Implies x264_video_to_disk == true and x264_bit_depth == 10.
 	YCbCrInterpretation ycbcr_interpretation[MAX_VIDEO_CARDS];
+	bool transcode_audio = true;  // Kaeru only.
 	int x264_bit_depth = 8;  // Not user-settable.
 	bool use_zerocopy = false;  // Not user-settable.
 	bool can_disable_srgb_decoder = false;  // Not user-settable.
diff --git a/kaeru.cpp b/kaeru.cpp
index bb1b08f..a82ee51 100644
--- a/kaeru.cpp
+++ b/kaeru.cpp
@@ -64,15 +64,45 @@ unique_ptr<Mux> create_mux(HTTPD *httpd, AVOutputFormat *oformat, X264Encoder *x
 	return mux;
 }
 
-void video_frame_callback(FFmpegCapture *video, X264Encoder *x264_encoder, int64_t pts, AVRational timebase, uint16_t timecode,
+void video_frame_callback(FFmpegCapture *video, X264Encoder *x264_encoder, AudioEncoder *audio_encoder,
+                          int64_t video_pts, AVRational video_timebase,
+                          int64_t audio_pts, AVRational audio_timebase,
+                          uint16_t timecode,
 	FrameAllocator::Frame video_frame, size_t video_offset, VideoFormat video_format,
 	FrameAllocator::Frame audio_frame, size_t audio_offset, AudioFormat audio_format)
 {
-	if (pts >= 0 && video_frame.len > 0) {
-		pts = av_rescale_q(pts, timebase, AVRational{ 1, TIMEBASE });
+	if (video_pts >= 0 && video_frame.len > 0) {
+		video_pts = av_rescale_q(video_pts, video_timebase, AVRational{ 1, TIMEBASE });
 		int64_t frame_duration = TIMEBASE * video_format.frame_rate_nom / video_format.frame_rate_den;
-		x264_encoder->add_frame(pts, frame_duration, video->get_current_frame_ycbcr_format().luma_coefficients, video_frame.data + video_offset, ReceivedTimestamps());
+		x264_encoder->add_frame(video_pts, frame_duration, video->get_current_frame_ycbcr_format().luma_coefficients, video_frame.data + video_offset, ReceivedTimestamps());
 	}
+	if (audio_frame.len > 0) {
+		// FFmpegCapture takes care of this for us.
+		assert(audio_format.num_channels == 2);
+		assert(audio_format.sample_rate == OUTPUT_FREQUENCY);
+
+		// TODO: Reduce some duplication against AudioMixer here.
+		size_t num_samples = audio_frame.len / (audio_format.bits_per_sample / 8);
+		vector<float> float_samples;
+		float_samples.resize(num_samples);
+		if (audio_format.bits_per_sample == 16) {
+			const int16_t *src = (const int16_t *)audio_frame.data;
+			float *dst = &float_samples[0];
+			for (size_t i = 0; i < num_samples; ++i) {
+				*dst++ = le16toh(*src++) * (1.0f / 32768.0f);
+			}
+		} else if (audio_format.bits_per_sample == 32) {
+			const int32_t *src = (const int32_t *)audio_frame.data;
+			float *dst = &float_samples[0];
+			for (size_t i = 0; i < num_samples; ++i) {
+				*dst++ = le32toh(*src++) * (1.0f / 2147483648.0f);
+			}
+		} else {
+			assert(false);
+		}
+		audio_pts = av_rescale_q(audio_pts, audio_timebase, AVRational{ 1, TIMEBASE });
+		audio_encoder->encode_audio(float_samples, audio_pts);
+	}
 
 	if (video_frame.owner) {
 		video_frame.owner->release_frame(video_frame);
@@ -104,20 +134,27 @@ int main(int argc, char *argv[])
 	assert(oformat != nullptr);
 
 	unique_ptr<AudioEncoder> audio_encoder;
-	if (global_flags.stream_audio_codec_name.empty()) {
-		audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
-	} else {
-		audio_encoder.reset(new AudioEncoder(global_flags.stream_audio_codec_name, global_flags.stream_audio_codec_bitrate, oformat));
+	if (global_flags.transcode_audio) {
+		if (global_flags.stream_audio_codec_name.empty()) {
+			audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
+		} else {
+			audio_encoder.reset(new AudioEncoder(global_flags.stream_audio_codec_name, global_flags.stream_audio_codec_bitrate, oformat));
+		}
 	}
 
 	X264Encoder x264_encoder(oformat);
 	unique_ptr<Mux> http_mux = create_mux(&httpd, oformat, &x264_encoder, audio_encoder.get());
+	if (global_flags.transcode_audio) {
+		audio_encoder->add_mux(http_mux.get());
+	}
 	x264_encoder.add_mux(http_mux.get());
 
 	FFmpegCapture video(argv[optind], global_flags.width, global_flags.height);
 	video.set_pixel_format(FFmpegCapture::PixelFormat_NV12);
-	video.set_frame_callback(bind(video_frame_callback, &video, &x264_encoder, _1, _2, _3, _4, _5, _6, _7, _8, _9));
-	video.set_audio_callback(bind(audio_frame_callback, http_mux.get(), _1, _2));
+	video.set_frame_callback(bind(video_frame_callback, &video, &x264_encoder, audio_encoder.get(), _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11));
+	if (!global_flags.transcode_audio) {
+		video.set_audio_callback(bind(audio_frame_callback, http_mux.get(), _1, _2));
+	}
 	video.configure_card();
 	video.start_bm_capture();
 	video.change_rate(2.0);  // Be sure never to really fall behind, but also don't dump huge amounts of stuff onto x264.
-- 
2.39.2
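
Note (appended below the patch, not part of it): the core of the new audio path in kaeru.cpp is a plain PCM-to-float conversion before the samples are handed to AudioEncoder::encode_audio(). The standalone sketch below shows just that normalization for the 16-bit case, assuming little-endian input as in the patch; the function name s16le_to_float and the sample values in main() are invented for illustration and do not appear in Nageru.

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <vector>

// Convert interleaved signed 16-bit little-endian PCM to float,
// mirroring the scaling used in video_frame_callback() above.
std::vector<float> s16le_to_float(const int16_t *src, size_t num_samples)
{
	std::vector<float> out(num_samples);
	for (size_t i = 0; i < num_samples; ++i) {
		// 1/32768 maps INT16_MIN..INT16_MAX onto [-1.0, 1.0).
		out[i] = le16toh(src[i]) * (1.0f / 32768.0f);
	}
	return out;
}

int main()
{
	const int16_t samples[] = { 0, 16384, -32768, 32767 };
	for (float f : s16le_to_float(samples, 4)) {
		printf("%f\n", f);
	}
}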