From d205e9c826b4a4e1290cc4160067aad818e70081 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sat, 8 Jul 2017 13:45:13 +0200
Subject: [PATCH] Add support to FFmpegCapture to decode the audio.

---
 ffmpeg_capture.cpp | 177 +++++++++++++++++++++++++++++++++++++--------
 ffmpeg_capture.h   |  18 ++++-
 2 files changed, 162 insertions(+), 33 deletions(-)

diff --git a/ffmpeg_capture.cpp b/ffmpeg_capture.cpp
index 48a395e..5e6104f 100644
--- a/ffmpeg_capture.cpp
+++ b/ffmpeg_capture.cpp
@@ -18,6 +18,7 @@ extern "C" {
 #include <libavutil/imgutils.h>
 #include <libavutil/mem.h>
 #include <libavutil/pixfmt.h>
+#include <libavutil/opt.h>
 #include <libswscale/swscale.h>
 }
 
@@ -217,6 +218,7 @@ FFmpegCapture::~FFmpegCapture()
 	if (has_dequeue_callbacks) {
 		dequeue_cleanup_callback();
 	}
+	avresample_free(&resampler);
 }
 
 void FFmpegCapture::configure_card()
@@ -356,24 +358,47 @@ bool FFmpegCapture::play_video(const string &pathname)
 
 	int audio_stream_index = find_stream_index(format_ctx.get(), AVMEDIA_TYPE_AUDIO);
 
-	const AVCodecParameters *codecpar = format_ctx->streams[video_stream_index]->codecpar;
+	// Open video decoder.
+	const AVCodecParameters *video_codecpar = format_ctx->streams[video_stream_index]->codecpar;
+	AVCodec *video_codec = avcodec_find_decoder(video_codecpar->codec_id);
 	video_timebase = format_ctx->streams[video_stream_index]->time_base;
-	AVCodecContextWithDeleter codec_ctx = avcodec_alloc_context3_unique(nullptr);
-	if (avcodec_parameters_to_context(codec_ctx.get(), codecpar) < 0) {
-		fprintf(stderr, "%s: Cannot fill codec parameters\n", pathname.c_str());
+	AVCodecContextWithDeleter video_codec_ctx = avcodec_alloc_context3_unique(nullptr);
+	if (avcodec_parameters_to_context(video_codec_ctx.get(), video_codecpar) < 0) {
+		fprintf(stderr, "%s: Cannot fill video codec parameters\n", pathname.c_str());
 		return false;
 	}
-	AVCodec *codec = avcodec_find_decoder(codecpar->codec_id);
-	if (codec == nullptr) {
-		fprintf(stderr, "%s: Cannot find decoder\n", pathname.c_str());
+	if (video_codec == nullptr) {
+		fprintf(stderr, "%s: Cannot find video decoder\n", pathname.c_str());
 		return false;
 	}
-	if (avcodec_open2(codec_ctx.get(), codec, nullptr) < 0) {
-		fprintf(stderr, "%s: Cannot open decoder\n", pathname.c_str());
+	if (avcodec_open2(video_codec_ctx.get(), video_codec, nullptr) < 0) {
+		fprintf(stderr, "%s: Cannot open video decoder\n", pathname.c_str());
 		return false;
 	}
-	unique_ptr<AVCodecContext, decltype(avcodec_close)*> codec_ctx_cleanup(
-		codec_ctx.get(), avcodec_close);
+	unique_ptr<AVCodecContext, decltype(avcodec_close)*> video_codec_ctx_cleanup(
+		video_codec_ctx.get(), avcodec_close);
+
+	// Open audio decoder, if we have audio.
+	AVCodecContextWithDeleter audio_codec_ctx = avcodec_alloc_context3_unique(nullptr);
+	if (audio_stream_index != -1) {
+		const AVCodecParameters *audio_codecpar = format_ctx->streams[audio_stream_index]->codecpar;
+		audio_timebase = format_ctx->streams[audio_stream_index]->time_base;
+		if (avcodec_parameters_to_context(audio_codec_ctx.get(), audio_codecpar) < 0) {
+			fprintf(stderr, "%s: Cannot fill audio codec parameters\n", pathname.c_str());
+			return false;
+		}
+		AVCodec *audio_codec = avcodec_find_decoder(audio_codecpar->codec_id);
+		if (audio_codec == nullptr) {
+			fprintf(stderr, "%s: Cannot find audio decoder\n", pathname.c_str());
+			return false;
+		}
+		if (avcodec_open2(audio_codec_ctx.get(), audio_codec, nullptr) < 0) {
+			fprintf(stderr, "%s: Cannot open audio decoder\n", pathname.c_str());
+			return false;
+		}
+	}
+	unique_ptr<AVCodecContext, decltype(avcodec_close)*> audio_codec_ctx_cleanup(
+		audio_codec_ctx.get(), avcodec_close);
 
 	internal_rewind();
 
@@ -382,9 +407,12 @@ bool FFmpegCapture::play_video(const string &pathname)
 		if (process_queued_commands(format_ctx.get(), pathname, last_modified, /*rewound=*/nullptr)) {
 			return true;
 		}
+		FrameAllocator::Frame audio_frame = audio_frame_allocator->alloc_frame();
+		AudioFormat audio_format;
 
 		bool error;
-		AVFrameWithDeleter frame = decode_frame(format_ctx.get(), codec_ctx.get(), pathname, video_stream_index, audio_stream_index, &error);
+		AVFrameWithDeleter frame = decode_frame(format_ctx.get(), video_codec_ctx.get(), audio_codec_ctx.get(),
+			pathname, video_stream_index, audio_stream_index, &audio_frame, &audio_format, &error);
 		if (error) {
 			return false;
 		}
@@ -411,11 +439,6 @@ bool FFmpegCapture::play_video(const string &pathname)
 			return false;
 		}
 
-		FrameAllocator::Frame audio_frame;
-		AudioFormat audio_format;
-                audio_format.bits_per_sample = 32;
-                audio_format.num_channels = 8;
-
 		for ( ;; ) {
 			if (last_pts == 0 && pts_origin == 0) {
 				pts_origin = frame->pts;	
@@ -500,13 +523,18 @@ bool FFmpegCapture::process_queued_commands(AVFormatContext *format_ctx, const s
 	return false;
 }
 
-AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCodecContext *codec_ctx, const std::string &pathname, int video_stream_index, int audio_stream_index, bool *error)
+namespace {
+
+}  // namespace
+
+AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCodecContext *video_codec_ctx, AVCodecContext *audio_codec_ctx, const std::string &pathname, int video_stream_index, int audio_stream_index, FrameAllocator::Frame *audio_frame, AudioFormat *audio_format, bool *error)
 {
 	*error = false;
 
 	// Read packets until we have a frame or there are none left.
 	bool frame_finished = false;
-	AVFrameWithDeleter frame = av_frame_alloc_unique();
+	AVFrameWithDeleter audio_avframe = av_frame_alloc_unique();
+	AVFrameWithDeleter video_avframe = av_frame_alloc_unique();
 	bool eof = false;
 	do {
 		AVPacket pkt;
@@ -519,36 +547,125 @@ AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCo
 			if (pkt.stream_index == audio_stream_index && audio_callback != nullptr) {
 				audio_callback(&pkt, format_ctx->streams[audio_stream_index]->time_base);
 			}
-			if (pkt.stream_index != video_stream_index) {
-				// Ignore audio for now.
-				continue;
-			}
-			if (avcodec_send_packet(codec_ctx, &pkt) < 0) {
-				fprintf(stderr, "%s: Cannot send packet to codec.\n", pathname.c_str());
-				*error = true;
-				return AVFrameWithDeleter(nullptr);
+			if (pkt.stream_index == video_stream_index) {
+				if (avcodec_send_packet(video_codec_ctx, &pkt) < 0) {
+					fprintf(stderr, "%s: Cannot send packet to video codec.\n", pathname.c_str());
+					*error = true;
+					return AVFrameWithDeleter(nullptr);
+				}
+			} else if (pkt.stream_index == audio_stream_index) {
+				if (avcodec_send_packet(audio_codec_ctx, &pkt) < 0) {
+					fprintf(stderr, "%s: Cannot send packet to audio codec.\n", pathname.c_str());
+					*error = true;
+					return AVFrameWithDeleter(nullptr);
+				}
 			}
 		} else {
 			eof = true;  // Or error, but ignore that for the time being.
 		}
 
-		int err = avcodec_receive_frame(codec_ctx, frame.get());
+		// Decode audio, if any.
+		int err = avcodec_receive_frame(audio_codec_ctx, audio_avframe.get());
+		if (err == 0) {
+			convert_audio(audio_avframe.get(), audio_frame, audio_format);
+		} else if (err != AVERROR(EAGAIN)) {
+			fprintf(stderr, "%s: Cannot receive frame from audio codec.\n", pathname.c_str());
+			*error = true;
+			return AVFrameWithDeleter(nullptr);
+		}
+
+		// Decode video, if we have a frame.
+		err = avcodec_receive_frame(video_codec_ctx, video_avframe.get());
 		if (err == 0) {
 			frame_finished = true;
 			break;
 		} else if (err != AVERROR(EAGAIN)) {
-			fprintf(stderr, "%s: Cannot receive frame from codec.\n", pathname.c_str());
+			fprintf(stderr, "%s: Cannot receive frame from video codec.\n", pathname.c_str());
 			*error = true;
 			return AVFrameWithDeleter(nullptr);
 		}
 	} while (!eof);
 
 	if (frame_finished)
-		return frame;
+		return video_avframe;
 	else
 		return AVFrameWithDeleter(nullptr);
 }
 
+void FFmpegCapture::convert_audio(const AVFrame *audio_avframe, FrameAllocator::Frame *audio_frame, AudioFormat *audio_format)
+{
+	// Decide on a format. If there already is one in this audio frame,
+	// we're pretty much forced to use it. If not, we try to find an exact match.
+	// If that still doesn't work, we default to 32-bit signed chunked
+	// (float would be nice, but there's really no way to signal that yet).
+	AVSampleFormat dst_format;
+	if (audio_format->bits_per_sample == 0) {
+		switch (audio_avframe->format) {
+		case AV_SAMPLE_FMT_S16:
+		case AV_SAMPLE_FMT_S16P:
+			audio_format->bits_per_sample = 16;
+			dst_format = AV_SAMPLE_FMT_S16;
+			break;
+		case AV_SAMPLE_FMT_S32:
+		case AV_SAMPLE_FMT_S32P:
+		default:
+			audio_format->bits_per_sample = 32;
+			dst_format = AV_SAMPLE_FMT_S32;
+			break;
+		}
+	} else if (audio_format->bits_per_sample == 16) {
+		dst_format = AV_SAMPLE_FMT_S16;
+	} else if (audio_format->bits_per_sample == 32) {
+		dst_format = AV_SAMPLE_FMT_S32;
+	} else {
+		assert(false);
+	}
+	audio_format->num_channels = 2;
+
+	if (resampler == nullptr ||
+	    audio_avframe->format != last_src_format ||
+	    dst_format != last_dst_format ||
+	    av_frame_get_channel_layout(audio_avframe) != last_channel_layout ||
+	    av_frame_get_sample_rate(audio_avframe) != last_sample_rate) {
+		avresample_free(&resampler);
+		resampler = avresample_alloc_context();
+		if (resampler == nullptr) {
+			fprintf(stderr, "Allocating resampler failed.\n");
+			exit(1);
+		}
+
+		av_opt_set_int(resampler, "in_channel_layout",  av_frame_get_channel_layout(audio_avframe), 0);
+		av_opt_set_int(resampler, "out_channel_layout", AV_CH_LAYOUT_STEREO,                        0);
+		av_opt_set_int(resampler, "in_sample_rate",     av_frame_get_sample_rate(audio_avframe),    0);
+		av_opt_set_int(resampler, "out_sample_rate",    OUTPUT_FREQUENCY,                           0);
+		av_opt_set_int(resampler, "in_sample_fmt",      audio_avframe->format,                      0);
+		av_opt_set_int(resampler, "out_sample_fmt",     dst_format,                                 0);
+
+		if (avresample_open(resampler) < 0) {
+			fprintf(stderr, "Could not open resample context.\n");
+			exit(1);
+		}
+
+		last_src_format = AVSampleFormat(audio_avframe->format);
+		last_dst_format = dst_format;
+		last_channel_layout = av_frame_get_channel_layout(audio_avframe);
+		last_sample_rate = av_frame_get_sample_rate(audio_avframe);
+	}
+
+	size_t bytes_per_sample = (audio_format->bits_per_sample / 8) * 2;
+	size_t num_samples_room = (audio_frame->size - audio_frame->len) / bytes_per_sample;
+
+	uint8_t *data = audio_frame->data + audio_frame->len;
+	int out_samples = avresample_convert(resampler, &data, 0, num_samples_room,
+		audio_avframe->data, audio_avframe->linesize[0], audio_avframe->nb_samples);
+	if (out_samples < 0) {
+                fprintf(stderr, "Audio conversion failed.\n");
+                exit(1);
+        }
+
+	audio_frame->len += out_samples * bytes_per_sample;
+}
+
 VideoFormat FFmpegCapture::construct_video_format(const AVFrame *frame, AVRational video_timebase)
 {
 	VideoFormat video_format;
diff --git a/ffmpeg_capture.h b/ffmpeg_capture.h
index afca641..eb377f7 100644
--- a/ffmpeg_capture.h
+++ b/ffmpeg_capture.h
@@ -16,7 +16,8 @@
 // but it would require some more plumbing, and it would also fail if the file
 // changes parameters midway, which is allowed in some formats.
 //
-// There is currently no audio support.
+// You can get out the audio either as decoded or in raw form (Kaeru uses this).
+// However, the rest of Nageru can't really use the audio for anything yet.
 
 #include <assert.h>
 #include <stdint.h>
@@ -31,6 +32,7 @@
 #include <movit/ycbcr.h>
 
 extern "C" {
+#include <libavresample/avresample.h>
 #include <libavutil/pixfmt.h>
 #include <libavutil/rational.h>
 }
@@ -194,7 +196,10 @@ private:
 	bool process_queued_commands(AVFormatContext *format_ctx, const std::string &pathname, timespec last_modified, bool *rewound);
 
 	// Returns nullptr if no frame was decoded (e.g. EOF).
-	AVFrameWithDeleter decode_frame(AVFormatContext *format_ctx, AVCodecContext *codec_ctx, const std::string &pathname, int video_stream_index, int audio_stream_index, bool *error);
+	AVFrameWithDeleter decode_frame(AVFormatContext *format_ctx, AVCodecContext *video_codec_ctx, AVCodecContext *audio_codec_ctx,
+	                                const std::string &pathname, int video_stream_index, int audio_stream_index,
+	                                bmusb::FrameAllocator::Frame *audio_frame, bmusb::AudioFormat *audio_format, bool *error);
+	void convert_audio(const AVFrame *audio_avframe, bmusb::FrameAllocator::Frame *audio_frame, bmusb::AudioFormat *audio_format);
 
 	bmusb::VideoFormat construct_video_format(const AVFrame *frame, AVRational video_timebase);
 	bmusb::FrameAllocator::Frame make_video_frame(const AVFrame *frame, const std::string &pathname, bool *error);
@@ -222,7 +227,7 @@ private:
 	SwsContextWithDeleter sws_ctx;
 	int sws_last_width = -1, sws_last_height = -1, sws_last_src_format = -1;
 	AVPixelFormat sws_dst_format = AVPixelFormat(-1);  // In practice, always initialized.
-	AVRational video_timebase;
+	AVRational video_timebase, audio_timebase;
 
 	QuittableSleeper producer_thread_should_quit;
 	std::thread producer_thread;
@@ -236,6 +241,13 @@ private:
 		double new_rate;  // For CHANGE_RATE.
 	};
 	std::vector<QueuedCommand> command_queue;  // Protected by <queue_mu>.
+
+	// Audio resampler.
+	AVAudioResampleContext *resampler = nullptr;
+	AVSampleFormat last_src_format, last_dst_format;
+	int64_t last_channel_layout;
+	int last_sample_rate;
+
 };
 
 #endif  // !defined(_FFMPEG_CAPTURE_H)
-- 
2.39.2