From 009816de6e071c6a35c74b0954d04cf61005b971 Mon Sep 17 00:00:00 2001
From: Helge Norberg <helge.norberg@svt.se>
Date: Tue, 11 Oct 2016 19:07:54 +0200
Subject: [PATCH] [ffmpeg] Reimplemented support for playing all audio streams
 in a clip and treating all audio channels in each stream as if they were
 part of one single stream.

---
 .../producer/framerate/framerate_producer.cpp |   6 +-
 .../decklink/producer/decklink_producer.cpp   |  27 ++++-
 modules/ffmpeg/audio_channel_remapper.cpp     |  21 +---
 .../ffmpeg/producer/audio/audio_decoder.cpp   |  19 ++-
 modules/ffmpeg/producer/audio/audio_decoder.h |   4 +-
 modules/ffmpeg/producer/ffmpeg_producer.cpp   | 108 ++++++++++++-----
 .../ffmpeg/producer/filter/audio_filter.cpp   |  35 +++++-
 modules/ffmpeg/producer/filter/audio_filter.h |  10 +-
 modules/ffmpeg/producer/muxer/frame_muxer.cpp |  79 ++++++++++--
 modules/ffmpeg/producer/muxer/frame_muxer.h   |   4 +-
 modules/ffmpeg/producer/util/util.cpp         | 112 +++++++++---------
 modules/ffmpeg/producer/util/util.h           |   2 +-
 12 files changed, 292 insertions(+), 135 deletions(-)
diff --git a/core/producer/framerate/framerate_producer.cpp b/core/producer/framerate/framerate_producer.cpp
index be7451078..75d103a1b 100644
--- a/core/producer/framerate/framerate_producer.cpp
+++ b/core/producer/framerate/framerate_producer.cpp
@@ -193,8 +193,8 @@ class framerate_producer : public frame_producer_base
 	std::function<boost::rational<int>()>				get_source_framerate_;
 	boost::rational<int>								source_framerate_				= -1;
 	audio_channel_layout								source_channel_layout_			= audio_channel_layout::invalid();
-	boost::rational<int>								original_destination_framerate_;
-	field_mode											original_destination_fieldmode_;
+	const boost::rational<int>							original_destination_framerate_;
+	const field_mode									original_destination_fieldmode_;
 	field_mode											destination_fieldmode_			= field_mode::empty;
 	std::vector<int>									destination_audio_cadence_;
 	boost::rational<std::int64_t>						speed_;
@@ -485,6 +485,8 @@ private:
 		if (source_framerate_ == source_framerate)
 			return;
 
+		output_repeat_				= 0;
+		output_frame_				= 0;
 		source_framerate_			= source_framerate;
 		auto destination_framerate	= original_destination_framerate_;
 		destination_fieldmode_		= original_destination_fieldmode_;
diff --git a/modules/decklink/producer/decklink_producer.cpp b/modules/decklink/producer/decklink_producer.cpp
index e08b3a6d5..0349f5bc1 100644
--- a/modules/decklink/producer/decklink_producer.cpp
+++ b/modules/decklink/producer/decklink_producer.cpp
@@ -93,6 +93,15 @@ std::wstring to_string(const T& cadence)
 	return boost::join(cadence | boost::adaptors::transformed([](size_t i) { return boost::lexical_cast<std::wstring>(i); }), L", ");
 }
 
+ffmpeg::audio_input_pad create_input_pad(const core::video_format_desc& in_format, int num_channels)
+{
+	// One time base tick per sample, signed 32-bit samples, and FFmpeg's default
+	// channel layout for the requested number of channels.
+	const auto sample_rate	= in_format.audio_sample_rate;
+	const auto layout		= av_get_default_channel_layout(num_channels);
+	return ffmpeg::audio_input_pad(boost::rational<int>(1, sample_rate), sample_rate, AVSampleFormat::AV_SAMPLE_FMT_S32, layout);
+}
+
 class decklink_producer : boost::noncopyable, public IDeckLinkInputCallback
 {
 	const int										device_index_;
@@ -113,7 +122,15 @@ class decklink_producer : boost::noncopyable, public IDeckLinkInputCallback
 	boost::circular_buffer<size_t>					sync_buffer_		{ audio_cadence_.size() };
 	spl::shared_ptr<core::frame_factory>			frame_factory_;
 	core::audio_channel_layout						channel_layout_;
-	ffmpeg::frame_muxer								muxer_				{ in_format_desc_.framerate, frame_factory_, out_format_desc_, channel_layout_, filter_, ffmpeg::filter::is_deinterlacing(filter_) };
+	ffmpeg::frame_muxer								muxer_				{
+																			in_format_desc_.framerate,
+																			{ create_input_pad(in_format_desc_, channel_layout_.num_channels) },
+																			frame_factory_,
+																			out_format_desc_,
+																			channel_layout_,
+																			filter_,
+																			ffmpeg::filter::is_deinterlacing(filter_)
+																		};
 
 	core::constraints								constraints_		{ in_format_desc_.width, in_format_desc_.height };
 
@@ -171,6 +188,12 @@ public:
 									<< msg_info(print() + L" Failed to start input stream.")
 									<< boost::errinfo_api_function("StartStreams"));
 
+		// Wait for the first frame before returning, but give up after 2 seconds.
+		caspar::timer timeout_timer;
+
+		while (frame_buffer_.size() < 1 && timeout_timer.elapsed() < 2.0)
+			boost::this_thread::sleep_for(boost::chrono::milliseconds(1));
+
 		CASPAR_LOG(info) << print() << L" Initialized";
 	}
 
@@ -269,7 +292,7 @@ public:
 
 			// PUSH
 
-			muxer_.push(audio_buffer);
+			muxer_.push({ audio_buffer });
 			muxer_.push(static_cast<std::shared_ptr<AVFrame>>(video_frame));
 
 			// POLL
diff --git a/modules/ffmpeg/audio_channel_remapper.cpp b/modules/ffmpeg/audio_channel_remapper.cpp
index 920aa7203..5b632690e 100644
--- a/modules/ffmpeg/audio_channel_remapper.cpp
+++ b/modules/ffmpeg/audio_channel_remapper.cpp
@@ -166,25 +166,8 @@ struct audio_channel_remapper::impl
 
 		auto num_samples			=	input.size() / input_layout_.num_channels;
 		auto expected_output_size	=	num_samples * output_layout_.num_channels;
-		auto input_frame			=	ffmpeg::create_frame();
-
-		input_frame->channels		=	input_layout_.num_channels;
-		input_frame->channel_layout	=	ffmpeg::create_channel_layout_bitmask(input_layout_.num_channels);
-		input_frame->sample_rate	=	48000;
-		input_frame->nb_samples		=	static_cast<int>(num_samples);
-		input_frame->format			=	AV_SAMPLE_FMT_S32;
-		input_frame->pts			=	0;
-
-		av_samples_fill_arrays(
-				input_frame->extended_data,
-				input_frame->linesize,
-				reinterpret_cast<const std::uint8_t*>(input.data()),
-				input_frame->channels,
-				input_frame->nb_samples,
-				static_cast<AVSampleFormat>(input_frame->format),
-				16);
-
-		filter_->push(0, input_frame);
+
+		filter_->push(0, boost::make_iterator_range(input));
 
 		auto frames = filter_->poll_all(0);
 
diff --git a/modules/ffmpeg/producer/audio/audio_decoder.cpp b/modules/ffmpeg/producer/audio/audio_decoder.cpp
index ac7b44e42..0a1008876 100644
--- a/modules/ffmpeg/producer/audio/audio_decoder.cpp
+++ b/modules/ffmpeg/producer/audio/audio_decoder.cpp
@@ -51,7 +51,7 @@ namespace caspar { namespace ffmpeg {
 
 struct audio_decoder::implementation : boost::noncopyable
 {
-	int										index_				= -1;
+	int										index_;
 	const spl::shared_ptr<AVCodecContext>	codec_context_;
 	const int								out_samplerate_;
 
@@ -81,8 +81,9 @@ struct audio_decoder::implementation : boost::noncopyable
 																};
 
 public:
-	explicit implementation(const spl::shared_ptr<AVFormatContext>& context, int out_samplerate)
-		: codec_context_(open_codec(*context, AVMEDIA_TYPE_AUDIO, index_, false))
+	explicit implementation(int stream_index, const spl::shared_ptr<AVFormatContext>& context, int out_samplerate)
+		: index_(stream_index)
+		, codec_context_(open_codec(*context, AVMEDIA_TYPE_AUDIO, index_, false))
 		, out_samplerate_(out_samplerate)
 		, buffer_(10 * out_samplerate_ * codec_context_->channels) // 10 seconds of audio
 	{
@@ -168,14 +169,22 @@ public:
 	{
 		return L"[audio-decoder] " + u16(codec_context_->codec->long_name);
 	}
+
+	// Layout reported by the codec context; when it reports none (0), FFmpeg's default
+	// layout for the codec's channel count is used instead.
+	uint64_t ffmpeg_channel_layout() const
+	{
+		const auto declared = codec_context_->channel_layout;
+		return declared != 0 ? declared : static_cast<uint64_t>(av_get_default_channel_layout(codec_context_->channels));
+	}
 };
 
-audio_decoder::audio_decoder(const spl::shared_ptr<AVFormatContext>& context, int out_samplerate) : impl_(new implementation(context, out_samplerate)){}
+audio_decoder::audio_decoder(int stream_index, const spl::shared_ptr<AVFormatContext>& context, int out_samplerate) : impl_(new implementation(stream_index, context, out_samplerate)){}
 void audio_decoder::push(const std::shared_ptr<AVPacket>& packet){impl_->push(packet);}
 bool audio_decoder::ready() const{return impl_->ready();}
 std::shared_ptr<core::mutable_audio_buffer> audio_decoder::poll() { return impl_->poll(); }
 int	audio_decoder::num_channels() const { return impl_->codec_context_->channels; }
-uint64_t audio_decoder::ffmpeg_channel_layout() const { return impl_->codec_context_->channel_layout; }
+uint64_t audio_decoder::ffmpeg_channel_layout() const { return impl_->ffmpeg_channel_layout(); }
 std::wstring audio_decoder::print() const{return impl_->print();}
 
 }}
diff --git a/modules/ffmpeg/producer/audio/audio_decoder.h b/modules/ffmpeg/producer/audio/audio_decoder.h
index 0deb82920..6c06ea941 100644
--- a/modules/ffmpeg/producer/audio/audio_decoder.h
+++ b/modules/ffmpeg/producer/audio/audio_decoder.h
@@ -36,8 +36,8 @@ namespace caspar { namespace ffmpeg {
 class audio_decoder : boost::noncopyable
 {
 public:
-	explicit audio_decoder(const spl::shared_ptr<AVFormatContext>& context, int out_samplerate);
-	
+	explicit audio_decoder(int stream_index, const spl::shared_ptr<AVFormatContext>& context, int out_samplerate);
+
 	bool ready() const;
 	void push(const std::shared_ptr<AVPacket>& packet);
 	std::shared_ptr<core::mutable_audio_buffer> poll();
diff --git a/modules/ffmpeg/producer/ffmpeg_producer.cpp b/modules/ffmpeg/producer/ffmpeg_producer.cpp
index 8c098e267..a2d4afef0 100644
--- a/modules/ffmpeg/producer/ffmpeg_producer.cpp
+++ b/modules/ffmpeg/producer/ffmpeg_producer.cpp
@@ -30,6 +30,7 @@
 #include "audio/audio_decoder.h"
 #include "video/video_decoder.h"
 #include "muxer/frame_muxer.h"
+#include "filter/audio_filter.h"
 
 #include <common/param.h>
 #include <common/diagnostics/graph.h>
@@ -90,7 +91,7 @@ struct ffmpeg_producer : public core::frame_producer_base
 
 	input												input_;
 	std::unique_ptr<video_decoder>						video_decoder_;
-	std::unique_ptr<audio_decoder>						audio_decoder_;
+	std::vector<std::unique_ptr<audio_decoder>>			audio_decoders_;
 	std::unique_ptr<frame_muxer>						muxer_;
 
 	const boost::rational<int>							framerate_;
@@ -154,33 +155,63 @@ public:
 		}
 
 		auto channel_layout = core::audio_channel_layout::invalid();
+		std::vector<audio_input_pad> audio_input_pads;
 
 		if (!thumbnail_mode_)
 		{
-			try
+			for (unsigned stream_index = 0; stream_index < input_.context()->nb_streams; ++stream_index)
 			{
-				audio_decoder_.reset(new audio_decoder(input_.context(), format_desc.audio_sample_rate));
-				channel_layout = get_audio_channel_layout(
-						audio_decoder_->num_channels(),
-						audio_decoder_->ffmpeg_channel_layout(),
-						custom_channel_order);
-				CASPAR_LOG(info) << print() << L" " << audio_decoder_->print();
+				auto stream = input_.context()->streams[stream_index];
+
+				if (stream->codec->codec_type != AVMediaType::AVMEDIA_TYPE_AUDIO)
+					continue;
+
+				try
+				{
+					audio_decoders_.push_back(std::unique_ptr<audio_decoder>(new audio_decoder(stream_index, input_.context(), format_desc.audio_sample_rate)));
+					audio_input_pads.emplace_back(
+							boost::rational<int>(1, format_desc.audio_sample_rate),
+							format_desc.audio_sample_rate,
+							AVSampleFormat::AV_SAMPLE_FMT_S32,
+							audio_decoders_.back()->ffmpeg_channel_layout());
+					CASPAR_LOG(info) << print() << L" " << audio_decoders_.back()->print();
+				}
+				catch (averror_stream_not_found&)
+				{
+					//CASPAR_LOG(warning) << print() << " No audio-stream found. Running without audio.";
+				}
+				catch (...)
+				{
+					CASPAR_LOG_CURRENT_EXCEPTION();
+					CASPAR_LOG(warning) << print() << " Failed to open audio-stream. Running without audio.";
+				}
 			}
-			catch (averror_stream_not_found&)
+
+			if (audio_decoders_.size() == 1)
 			{
-				//CASPAR_LOG(warning) << print() << " No audio-stream found. Running without audio.";
+				channel_layout = get_audio_channel_layout(
+						audio_decoders_.at(0)->num_channels(),
+						audio_decoders_.at(0)->ffmpeg_channel_layout(),
+						custom_channel_order);
 			}
-			catch (...)
+			else if (audio_decoders_.size() > 1)
 			{
-				CASPAR_LOG_CURRENT_EXCEPTION();
-				CASPAR_LOG(warning) << print() << " Failed to open audio-stream. Running without audio.";
+				auto num_channels = cpplinq::from(audio_decoders_)
+					.select(std::mem_fn(&audio_decoder::num_channels))
+					.aggregate(0, std::plus<int>());
+				auto ffmpeg_channel_layout = av_get_default_channel_layout(num_channels);
+
+				channel_layout = get_audio_channel_layout(
+						num_channels,
+						ffmpeg_channel_layout,
+						custom_channel_order);
 			}
 		}
 
-		if (!video_decoder_ && !audio_decoder_)
+		if (!video_decoder_ && audio_decoders_.empty())
 			CASPAR_THROW_EXCEPTION(averror_stream_not_found() << msg_info("No streams found"));
 
-		muxer_.reset(new frame_muxer(framerate_, frame_factory, format_desc, channel_layout, filter, true));
+		muxer_.reset(new frame_muxer(framerate_, std::move(audio_input_pads), frame_factory, format_desc, channel_layout, filter, true));
 	}
 
 	// frame_producer
@@ -490,20 +521,30 @@ public:
 				!video_decoder_->is_progressive()) : L"";
 	}
 
+	// True as soon as one audio decoder reports that it is not ready; false when there are none.
+	bool not_all_audio_decoders_ready() const
+	{
+		auto pending = false;
+		for (auto it = audio_decoders_.begin(); it != audio_decoders_.end() && !pending; ++it)
+			pending = !(*it)->ready();
+		return pending;
+	}
+
 	void try_decode_frame()
 	{
 		std::shared_ptr<AVPacket> pkt;
 
-		for (int n = 0; n < 32 && ((video_decoder_ && !video_decoder_->ready()) || (audio_decoder_ && !audio_decoder_->ready())) && input_.try_pop(pkt); ++n)
+		for (int n = 0; n < 32 && ((video_decoder_ && !video_decoder_->ready()) || not_all_audio_decoders_ready()) && input_.try_pop(pkt); ++n)
 		{
 			if (video_decoder_)
 				video_decoder_->push(pkt);
-			if (audio_decoder_)
-				audio_decoder_->push(pkt);
+
+			for (auto& audio_decoder : audio_decoders_)
+				audio_decoder->push(pkt);
 		}
 
-		std::shared_ptr<AVFrame>					video;
-		std::shared_ptr<core::mutable_audio_buffer>	audio;
+		std::shared_ptr<AVFrame>									video;
+		std::vector<std::shared_ptr<core::mutable_audio_buffer>>	audio;
 
 		tbb::parallel_invoke(
 		[&]
@@ -513,32 +554,39 @@ public:
 		},
 		[&]
 		{
-			if (!muxer_->audio_ready() && audio_decoder_)
-				audio = audio_decoder_->poll();
+			if (!muxer_->audio_ready())
+			{
+				for (auto& audio_decoder : audio_decoders_)
+				{
+					auto audio_for_stream = audio_decoder->poll();
+
+					if (audio_for_stream)
+						audio.push_back(audio_for_stream);
+				}
+			}
 		});
 
 		muxer_->push(video);
 		muxer_->push(audio);
 
-		if (!audio_decoder_)
+		if (audio_decoders_.empty())
 		{
-			if(video == flush_video())
-				muxer_->push(flush_audio());
-			else if(!muxer_->audio_ready())
-				muxer_->push(empty_audio());
+			if (video == flush_video())
+				muxer_->push({ flush_audio() });
+			else if (!muxer_->audio_ready())
+				muxer_->push({ empty_audio() });
 		}
 
 		if (!video_decoder_)
 		{
-			if(audio == flush_audio())
+			if (boost::count_if(audio, [](std::shared_ptr<core::mutable_audio_buffer> a) { return a == flush_audio(); }) > 0)
 				muxer_->push(flush_video());
-			else if(!muxer_->video_ready())
+			else if (!muxer_->video_ready())
 				muxer_->push(empty_video());
 		}
 
 		uint32_t file_frame_number = 0;
 		file_frame_number = std::max(file_frame_number, video_decoder_ ? video_decoder_->file_frame_number() : 0);
-		//file_frame_number = std::max(file_frame_number, audio_decoder_ ? audio_decoder_->file_frame_number() : 0);
 
 		for (auto frame = muxer_->poll(); frame != core::draw_frame::empty(); frame = muxer_->poll())
 			frame_buffer_.push(std::make_pair(frame, file_frame_number));
diff --git a/modules/ffmpeg/producer/filter/audio_filter.cpp b/modules/ffmpeg/producer/filter/audio_filter.cpp
index ad6a34ff1..ac70f04c0 100644
--- a/modules/ffmpeg/producer/filter/audio_filter.cpp
+++ b/modules/ffmpeg/producer/filter/audio_filter.cpp
@@ -77,7 +77,7 @@ std::string create_filter_list(const std::vector<std::string>& items)
 	return boost::join(items, "|");
 }
 
-std::string channel_layout_to_string(int64_t channel_layout)
+std::string channel_layout_to_string(uint64_t channel_layout)
 {
 	return (boost::format("0x%|1$x|") % channel_layout).str();
 }
@@ -108,13 +108,16 @@ struct audio_filter::implementation
 	std::vector<AVFilterContext*>	audio_graph_inputs_;
 	std::vector<AVFilterContext*>	audio_graph_outputs_;
 
+	std::vector<audio_input_pad>	input_pads_;
+
 	implementation(
 		std::vector<audio_input_pad> input_pads,
 		std::vector<audio_output_pad> output_pads,
 		const std::string& filtergraph)
 		: filtergraph_(boost::to_lower_copy(filtergraph))
+		, input_pads_(std::move(input_pads))
 	{
-		if (input_pads.empty())
+		if (input_pads_.empty())
 			CASPAR_THROW_EXCEPTION(invalid_argument() << msg_info("input_pads cannot be empty"));
 
 		if (output_pads.empty())
@@ -131,7 +134,7 @@ struct audio_filter::implementation
 
 		{
 			int i = 0;
-			for (auto& input_pad : input_pads)
+			for (auto& input_pad : input_pads_)
 				complete_filter_graph.push_back(create_sourcefilter_str(input_pad, "a:" + boost::lexical_cast<std::string>(i++)));
 		}
 
@@ -211,6 +214,31 @@ struct audio_filter::implementation
 			src_av_frame.get()));
 	}
 
+	// Describes raw int32 samples as an AVFrame using the settings of the given input pad
+	// (channel layout, sample rate, sample format) and pushes it to that pad; pts is always 0.
+	void push(int input_pad_id, const boost::iterator_range<const int32_t*>& frame_samples)
+	{
+		const auto& pad			= input_pads_.at(input_pad_id); // throws std::out_of_range for an unknown pad
+		const auto num_channels	= av_get_channel_layout_nb_channels(pad.audio_channel_layout);
+		auto frame				= ffmpeg::create_frame();
+
+		frame->channels			= num_channels;
+		frame->channel_layout	= pad.audio_channel_layout;
+		frame->sample_rate		= pad.sample_rate;
+		frame->nb_samples		= static_cast<int>(frame_samples.size() / num_channels);
+		frame->format			= pad.sample_fmt;
+		frame->pts				= 0;
+
+		// Point the frame's data planes at the caller's samples instead of copying them.
+		av_samples_fill_arrays(
+				frame->extended_data, frame->linesize,
+				reinterpret_cast<const std::uint8_t*>(frame_samples.begin()),
+				num_channels, frame->nb_samples,
+				pad.sample_fmt, 16);
+
+		push(input_pad_id, frame);
+	}
+
 	std::shared_ptr<AVFrame> poll(int output_pad_id)
 	{
 		auto filt_frame = create_frame();
@@ -238,6 +266,7 @@ audio_filter::audio_filter(
 audio_filter::audio_filter(audio_filter&& other) : impl_(std::move(other.impl_)){}
 audio_filter& audio_filter::operator=(audio_filter&& other){impl_ = std::move(other.impl_); return *this;}
 void audio_filter::push(int input_pad_id, const std::shared_ptr<AVFrame>& frame){impl_->push(input_pad_id, frame);}
+void audio_filter::push(int input_pad_id, const boost::iterator_range<const int32_t*>& frame_samples) { impl_->push(input_pad_id, frame_samples); }
 std::shared_ptr<AVFrame> audio_filter::poll(int output_pad_id){return impl_->poll(output_pad_id);}
 std::wstring audio_filter::filter_str() const{return u16(impl_->filtergraph_);}
 std::vector<spl::shared_ptr<AVFrame>> audio_filter::poll_all(int output_pad_id)
diff --git a/modules/ffmpeg/producer/filter/audio_filter.h b/modules/ffmpeg/producer/filter/audio_filter.h
index 370bed045..25e05d4a7 100644
--- a/modules/ffmpeg/producer/filter/audio_filter.h
+++ b/modules/ffmpeg/producer/filter/audio_filter.h
@@ -25,6 +25,7 @@
 
 #include <boost/rational.hpp>
 #include <boost/noncopyable.hpp>
+#include <boost/range/iterator_range.hpp>
 
 #include <string>
 #include <vector>
@@ -50,13 +51,13 @@ struct audio_input_pad
 	boost::rational<int>	time_base;
 	int						sample_rate;
 	AVSampleFormat			sample_fmt;
-	std::int64_t			audio_channel_layout;
+	std::uint64_t			audio_channel_layout;
 
 	audio_input_pad(
 			boost::rational<int> time_base,
 			int sample_rate,
 			AVSampleFormat sample_fmt,
-			std::int64_t audio_channel_layout)
+			std::uint64_t audio_channel_layout)
 		: time_base(std::move(time_base))
 		, sample_rate(sample_rate)
 		, sample_fmt(sample_fmt)
@@ -69,12 +70,12 @@ struct audio_output_pad
 {
 	std::vector<int>			sample_rates;
 	std::vector<AVSampleFormat>	sample_fmts;
-	std::vector<std::int64_t>	audio_channel_layouts;
+	std::vector<std::uint64_t>	audio_channel_layouts;
 
 	audio_output_pad(
 			std::vector<int> sample_rates,
 			std::vector<AVSampleFormat> sample_fmts,
-			std::vector<std::int64_t> audio_channel_layouts)
+			std::vector<std::uint64_t> audio_channel_layouts)
 		: sample_rates(std::move(sample_rates))
 		, sample_fmts(std::move(sample_fmts))
 		, audio_channel_layouts(std::move(audio_channel_layouts))
@@ -93,6 +94,7 @@ public:
 	audio_filter& operator=(audio_filter&& other);
 
 	void push(int input_pad_id, const std::shared_ptr<AVFrame>& frame);
+	void push(int input_pad_id, const boost::iterator_range<const int32_t*>& frame_samples);
 	std::shared_ptr<AVFrame> poll(int output_pad_id);
 	std::vector<spl::shared_ptr<AVFrame>> poll_all(int output_pad_id);
 
diff --git a/modules/ffmpeg/producer/muxer/frame_muxer.cpp b/modules/ffmpeg/producer/muxer/frame_muxer.cpp
index 80a11b70f..6d7b6342e 100644
--- a/modules/ffmpeg/producer/muxer/frame_muxer.cpp
+++ b/modules/ffmpeg/producer/muxer/frame_muxer.cpp
@@ -24,6 +24,7 @@
 #include "frame_muxer.h"
 
 #include "../filter/filter.h"
+#include "../filter/audio_filter.h"
 #include "../util/util.h"
 #include "../../ffmpeg.h"
 
@@ -67,6 +68,7 @@ extern "C"
 using namespace caspar::core;
 
 namespace caspar { namespace ffmpeg {
+
 struct av_frame_format
 {
 	int										pix_format;
@@ -96,31 +98,62 @@ struct av_frame_format
 	}
 };
 
+// Creates the audio filter that turns the given input pads into a single output pad:
+// signed 32-bit samples at 48 kHz in FFmpeg's default layout for layout.num_channels.
+// With several input pads the filter description is "[a:0][a:1]...amerge=inputs=N[aout:0]";
+// with a single input pad the description is left empty.
+std::unique_ptr<audio_filter> create_amerge_filter(std::vector<audio_input_pad> input_pads, const core::audio_channel_layout& layout)
+{
+	std::vector<audio_output_pad> output_pads;
+
+	output_pads.emplace_back(
+			std::vector<int>			{ 48000 },
+			std::vector<AVSampleFormat>	{ AVSampleFormat::AV_SAMPLE_FMT_S32 },
+			std::vector<uint64_t>		{ static_cast<uint64_t>(av_get_default_channel_layout(layout.num_channels)) });
+
+	std::wstring afilter;
+
+	if (input_pads.size() > 1)
+	{
+		// Unsigned index: avoids the signed/unsigned comparison against size().
+		for (std::size_t i = 0; i < input_pads.size(); ++i)
+			afilter += L"[a:" + boost::lexical_cast<std::wstring>(i) + L"]";
+
+		afilter += L"amerge=inputs=" + boost::lexical_cast<std::wstring>(input_pads.size());
+		afilter += L"[aout:0]";
+	}
+
+	// The pad vectors are not used again, so hand them over instead of copying them.
+	return std::unique_ptr<audio_filter>(new audio_filter(std::move(input_pads), std::move(output_pads), u8(afilter)));
+}
+
 struct frame_muxer::impl : boost::noncopyable
 {
 	std::queue<std::queue<core::mutable_frame>>		video_streams_;
 	std::queue<core::mutable_audio_buffer>			audio_streams_;
 	std::queue<core::draw_frame>					frame_buffer_;
-	display_mode									display_mode_			= display_mode::invalid;
+	display_mode									display_mode_				= display_mode::invalid;
 	const boost::rational<int>						in_framerate_;
 	const video_format_desc							format_desc_;
 	const audio_channel_layout						audio_channel_layout_;
 
-	std::vector<int>								audio_cadence_			= format_desc_.audio_cadence;
+	std::vector<int>								audio_cadence_				= format_desc_.audio_cadence;
 
 	spl::shared_ptr<core::frame_factory>			frame_factory_;
 	boost::optional<av_frame_format>				previously_filtered_frame_;
 
 	std::unique_ptr<filter>							filter_;
 	const std::wstring								filter_str_;
+	std::unique_ptr<audio_filter>					audio_filter_;
 	const bool										multithreaded_filter_;
-	bool											force_deinterlacing_	= env::properties().get(L"configuration.force-deinterlace", false);
+	bool											force_deinterlacing_		= env::properties().get(L"configuration.force-deinterlace", false);
 
 	mutable boost::mutex							out_framerate_mutex_;
 	boost::rational<int>							out_framerate_;
 
 	impl(
 			boost::rational<int> in_framerate,
+			std::vector<audio_input_pad> audio_input_pads,
 			const spl::shared_ptr<core::frame_factory>& frame_factory,
 			const core::video_format_desc& format_desc,
 			const core::audio_channel_layout& channel_layout,
@@ -137,6 +170,11 @@ struct frame_muxer::impl : boost::noncopyable
 		audio_streams_.push(core::mutable_audio_buffer());
 
 		set_out_framerate(in_framerate_);
+
+		if (!audio_input_pads.empty())
+		{
+			audio_filter_ = create_amerge_filter(std::move(audio_input_pads), audio_channel_layout_);
+		}
 	}
 
 	void push(const std::shared_ptr<AVFrame>& video_frame)
@@ -187,22 +225,42 @@ struct frame_muxer::impl : boost::noncopyable
 			CASPAR_THROW_EXCEPTION(invalid_operation() << source_info("frame_muxer") << msg_info("video-stream overflow. This can be caused by incorrect frame-rate. Check clip meta-data."));
 	}
 
-	void push(const std::shared_ptr<core::mutable_audio_buffer>& audio)
+	void push(const std::vector<std::shared_ptr<core::mutable_audio_buffer>>& audio_samples_per_stream)
 	{
-		if (!audio)
+		if (audio_samples_per_stream.empty())
 			return;
 
-		if (audio == flush_audio())
+		bool is_flush = boost::count_if(
+				audio_samples_per_stream,
+				[](std::shared_ptr<core::mutable_audio_buffer> a) { return a == flush_audio(); }) > 0;
+
+		if (is_flush)
 		{
 			audio_streams_.push(core::mutable_audio_buffer());
 		}
-		else if (audio == empty_audio())
+		else if (audio_samples_per_stream.at(0) == empty_audio())
 		{
 			boost::range::push_back(audio_streams_.back(), core::mutable_audio_buffer(audio_cadence_.front() * audio_channel_layout_.num_channels, 0));
 		}
 		else
 		{
-			boost::range::push_back(audio_streams_.back(), *audio);
+			for (int i = 0; i < audio_samples_per_stream.size(); ++i)
+			{
+				auto range = boost::make_iterator_range_n(
+						audio_samples_per_stream.at(i)->data(),
+						audio_samples_per_stream.at(i)->size());
+
+				audio_filter_->push(i, range);
+			}
+
+			for (auto frame : audio_filter_->poll_all(0))
+			{
+				auto audio = boost::make_iterator_range_n(
+						reinterpret_cast<std::int32_t*>(frame->extended_data[0]),
+						frame->nb_samples * frame->channels);
+
+				boost::range::push_back(audio_streams_.back(), audio);
+			}
 		}
 
 		if (audio_streams_.back().size() > 32 * audio_cadence_.front() * audio_channel_layout_.num_channels)
@@ -398,14 +456,15 @@ private:
 
 frame_muxer::frame_muxer(
 		boost::rational<int> in_framerate,
+		std::vector<audio_input_pad> audio_input_pads,
 		const spl::shared_ptr<core::frame_factory>& frame_factory,
 		const core::video_format_desc& format_desc,
 		const core::audio_channel_layout& channel_layout,
 		const std::wstring& filter,
 		bool multithreaded_filter)
-	: impl_(new impl(in_framerate, frame_factory, format_desc, channel_layout, filter, multithreaded_filter)){}
+	: impl_(new impl(std::move(in_framerate), std::move(audio_input_pads), frame_factory, format_desc, channel_layout, filter, multithreaded_filter)){}
 void frame_muxer::push(const std::shared_ptr<AVFrame>& video){impl_->push(video);}
-void frame_muxer::push(const std::shared_ptr<core::mutable_audio_buffer>& audio){impl_->push(audio);}
+void frame_muxer::push(const std::vector<std::shared_ptr<core::mutable_audio_buffer>>& audio_samples_per_stream){impl_->push(audio_samples_per_stream);}
 core::draw_frame frame_muxer::poll(){return impl_->poll();}
 uint32_t frame_muxer::calc_nb_frames(uint32_t nb_frames) const {return impl_->calc_nb_frames(nb_frames);}
 bool frame_muxer::video_ready() const{return impl_->video_ready();}
diff --git a/modules/ffmpeg/producer/muxer/frame_muxer.h b/modules/ffmpeg/producer/muxer/frame_muxer.h
index 97984427a..8364a282b 100644
--- a/modules/ffmpeg/producer/muxer/frame_muxer.h
+++ b/modules/ffmpeg/producer/muxer/frame_muxer.h
@@ -22,6 +22,7 @@
 #pragma once
 
 #include "display_mode.h"
+#include "../filter/audio_filter.h"
 
 #include <common/forward.h>
 #include <common/memory.h>
@@ -44,6 +45,7 @@ class frame_muxer : boost::noncopyable
 public:
 	frame_muxer(
 			boost::rational<int> in_framerate,
+			std::vector<audio_input_pad> audio_input_pads,
 			const spl::shared_ptr<core::frame_factory>& frame_factory,
 			const core::video_format_desc& format_desc,
 			const core::audio_channel_layout& channel_layout,
@@ -51,7 +53,7 @@ public:
 			bool multithreaded_filter);
 
 	void push(const std::shared_ptr<AVFrame>& video_frame);
-	void push(const std::shared_ptr<core::mutable_audio_buffer>& audio_samples);
+	void push(const std::vector<std::shared_ptr<core::mutable_audio_buffer>>& audio_samples_per_stream);
 
 	bool video_ready() const;
 	bool audio_ready() const;
diff --git a/modules/ffmpeg/producer/util/util.cpp b/modules/ffmpeg/producer/util/util.cpp
index d57b9b60d..04a237415 100644
--- a/modules/ffmpeg/producer/util/util.cpp
+++ b/modules/ffmpeg/producer/util/util.cpp
@@ -58,7 +58,7 @@
 #pragma warning (push)
 #pragma warning (disable : 4244)
 #endif
-extern "C" 
+extern "C"
 {
 	#include <libswscale/swscale.h>
 	#include <libavcodec/avcodec.h>
@@ -69,7 +69,7 @@ extern "C"
 #endif
 
 namespace caspar { namespace ffmpeg {
-		
+
 core::field_mode get_mode(const AVFrame& frame)
 {
 	if(!frame.interlaced_frame)
@@ -102,23 +102,23 @@ core::pixel_format get_pixel_format(PixelFormat pix_fmt)
 core::pixel_format_desc pixel_format_desc(PixelFormat pix_fmt, int width, int height)
 {
 	// Get linesizes
-	AVPicture dummy_pict;	
+	AVPicture dummy_pict;
 	avpicture_fill(&dummy_pict, nullptr, pix_fmt, width, height);
 
 	core::pixel_format_desc desc = get_pixel_format(pix_fmt);
-		
+
 	switch(desc.format)
 	{
 	case core::pixel_format::gray:
 	case core::pixel_format::luma:
 		{
-			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0], height, 1));						
+			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0], height, 1));
 			return desc;
 		}
 	case core::pixel_format::bgr:
 	case core::pixel_format::rgb:
 		{
-			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0]/3, height, 3));						
+			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0]/3, height, 3));
 			return desc;
 		}
 	case core::pixel_format::bgra:
@@ -126,41 +126,41 @@ core::pixel_format_desc pixel_format_desc(PixelFormat pix_fmt, int width, int he
 	case core::pixel_format::rgba:
 	case core::pixel_format::abgr:
 		{
-			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0]/4, height, 4));						
+			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0]/4, height, 4));
 			return desc;
 		}
 	case core::pixel_format::ycbcr:
 	case core::pixel_format::ycbcra:
-		{		
+		{
 			// Find chroma height
 			int size2 = static_cast<int>(dummy_pict.data[2] - dummy_pict.data[1]);
-			int h2 = size2/dummy_pict.linesize[1];			
+			int h2 = size2/dummy_pict.linesize[1];
 
 			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[0], height, 1));
 			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[1], h2, 1));
 			desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[2], h2, 1));
 
-			if(desc.format == core::pixel_format::ycbcra)						
-				desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[3], height, 1));	
+			if(desc.format == core::pixel_format::ycbcra)
+				desc.planes.push_back(core::pixel_format_desc::plane(dummy_pict.linesize[3], height, 1));
 			return desc;
-		}		
-	default:		
+		}
+	default:
 		desc.format = core::pixel_format::invalid;
 		return desc;
 	}
 }
 
 core::mutable_frame make_frame(const void* tag, const spl::shared_ptr<AVFrame>& decoded_frame, core::frame_factory& frame_factory, const core::audio_channel_layout& channel_layout)
-{			
+{
 	static tbb::concurrent_unordered_map<int64_t, tbb::concurrent_queue<std::shared_ptr<SwsContext>>> sws_contvalid_exts_;
-	
+
 	if(decoded_frame->width < 1 || decoded_frame->height < 1)
 		return frame_factory.create_frame(tag, core::pixel_format_desc(core::pixel_format::invalid), core::audio_channel_layout::invalid());
 
 	const auto width  = decoded_frame->width;
 	const auto height = decoded_frame->height;
 	auto desc		  = pixel_format_desc(static_cast<PixelFormat>(decoded_frame->format), width, height);
-		
+
 	if(desc.format == core::pixel_format::invalid)
 	{
 		auto pix_fmt = static_cast<PixelFormat>(decoded_frame->format);
@@ -178,7 +178,7 @@ core::mutable_frame make_frame(const void* tag, const spl::shared_ptr<AVFrame>&
 			target_pix_fmt = PIX_FMT_YUV422P;
 		else if(pix_fmt == PIX_FMT_YUV444P10)
 			target_pix_fmt = PIX_FMT_YUV444P;
-		
+
 		auto target_desc = pixel_format_desc(target_pix_fmt, width, height);
 
 		auto write = frame_factory.create_frame(tag, target_desc, channel_layout);
@@ -186,31 +186,31 @@ core::mutable_frame make_frame(const void* tag, const spl::shared_ptr<AVFrame>&
 		std::shared_ptr<SwsContext> sws_context;
 
 		//CASPAR_LOG(warning) << "Hardware accelerated color transform not supported.";
-		
-		int64_t key = ((static_cast<int64_t>(width)			 << 32) & 0xFFFF00000000) | 
-					  ((static_cast<int64_t>(height)		 << 16) & 0xFFFF0000) | 
-					  ((static_cast<int64_t>(pix_fmt)		 <<  8) & 0xFF00) | 
+
+		int64_t key = ((static_cast<int64_t>(width)			 << 32) & 0xFFFF00000000) |
+					  ((static_cast<int64_t>(height)		 << 16) & 0xFFFF0000) |
+					  ((static_cast<int64_t>(pix_fmt)		 <<  8) & 0xFF00) |
 					  ((static_cast<int64_t>(target_pix_fmt) <<  0) & 0xFF);
-			
+
 		auto& pool = sws_contvalid_exts_[key];
-						
+
 		if(!pool.try_pop(sws_context))
 		{
 			double param;
 			sws_context.reset(sws_getContext(width, height, pix_fmt, width, height, target_pix_fmt, SWS_BILINEAR, nullptr, nullptr, &param), sws_freeContext);
 		}
-			
+
 		if(!sws_context)
 		{
-			CASPAR_THROW_EXCEPTION(operation_failed() << msg_info("Could not create software scaling context.") << 
+			CASPAR_THROW_EXCEPTION(operation_failed() << msg_info("Could not create software scaling context.") <<
 									boost::errinfo_api_function("sws_getContext"));
-		}	
-		
+		}
+
 		auto av_frame = create_frame();
 		if(target_pix_fmt == PIX_FMT_BGRA)
 		{
 			auto size = avpicture_fill(reinterpret_cast<AVPicture*>(av_frame.get()), write.image_data(0).begin(), PIX_FMT_BGRA, width, height);
-			CASPAR_VERIFY(size == write.image_data(0).size()); 
+			CASPAR_VERIFY(size == write.image_data(0).size());
 		}
 		else
 		{
@@ -223,22 +223,22 @@ core::mutable_frame make_frame(const void* tag, const spl::shared_ptr<AVFrame>&
 			}
 		}
 
-		sws_scale(sws_context.get(), decoded_frame->data, decoded_frame->linesize, 0, height, av_frame->data, av_frame->linesize);	
-		pool.push(sws_context);	
+		sws_scale(sws_context.get(), decoded_frame->data, decoded_frame->linesize, 0, height, av_frame->data, av_frame->linesize);
+		pool.push(sws_context);
 
 		return std::move(write);
 	}
 	else
 	{
 		auto write = frame_factory.create_frame(tag, desc, channel_layout);
-		
+
 		for(int n = 0; n < static_cast<int>(desc.planes.size()); ++n)
 		{
 			auto plane            = desc.planes[n];
 			auto result           = write.image_data(n).begin();
 			auto decoded          = decoded_frame->data[n];
 			auto decoded_linesize = decoded_frame->linesize[n];
-			
+
 			CASPAR_ASSERT(decoded);
 			CASPAR_ASSERT(write.image_data(n).begin());
 
@@ -257,7 +257,7 @@ core::mutable_frame make_frame(const void* tag, const spl::shared_ptr<AVFrame>&
 				fast_memcpy(result, decoded, plane.size);
 			}
 		}
-	
+
 		return std::move(write);
 	}
 }
@@ -274,16 +274,16 @@ spl::shared_ptr<AVFrame> make_av_frame(core::mutable_frame& frame)
 spl::shared_ptr<AVFrame> make_av_frame(std::array<uint8_t*, 4> data, const core::pixel_format_desc& pix_desc)
 {
 	auto av_frame = create_frame();
-	
+
 	auto planes		 = pix_desc.planes;
 	auto format		 = pix_desc.format;
 
 	av_frame->width  = planes[0].width;
 	av_frame->height = planes[0].height;
-	for(int n = 0; n < planes.size(); ++n)	
+	for(int n = 0; n < planes.size(); ++n)
 	{
 		av_frame->data[n]	  = data[n];
-		av_frame->linesize[n] = planes[n].linesize;	
+		av_frame->linesize[n] = planes[n].linesize;
 	}
 
 	switch(format)
@@ -295,19 +295,19 @@ spl::shared_ptr<AVFrame> make_av_frame(std::array<uint8_t*, 4> data, const core:
 		av_frame->format = PIX_FMT_BGR24;
 		break;
 	case core::pixel_format::rgba:
-		av_frame->format = PIX_FMT_RGBA; 
+		av_frame->format = PIX_FMT_RGBA;
 		break;
 	case core::pixel_format::argb:
-		av_frame->format = PIX_FMT_ARGB; 
+		av_frame->format = PIX_FMT_ARGB;
 		break;
 	case core::pixel_format::bgra:
-		av_frame->format = PIX_FMT_BGRA; 
+		av_frame->format = PIX_FMT_BGRA;
 		break;
 	case core::pixel_format::abgr:
-		av_frame->format = PIX_FMT_ABGR; 
+		av_frame->format = PIX_FMT_ABGR;
 		break;
 	case core::pixel_format::gray:
-		av_frame->format = PIX_FMT_GRAY8; 
+		av_frame->format = PIX_FMT_GRAY8;
 		break;
 	case core::pixel_format::ycbcr:
 	{
@@ -345,8 +345,8 @@ bool is_sane_fps(AVRational time_base)
 AVRational fix_time_base(AVRational time_base)
 {
 	if(time_base.num == 1)
-		time_base.num = static_cast<int>(std::pow(10.0, static_cast<int>(std::log10(static_cast<float>(time_base.den)))-1));	
-			
+		time_base.num = static_cast<int>(std::pow(10.0, static_cast<int>(std::log10(static_cast<float>(time_base.den)))-1));
+
 	if(!is_sane_fps(time_base))
 	{
 		auto tmp = time_base;
@@ -361,7 +361,7 @@ AVRational fix_time_base(AVRational time_base)
 double read_fps(AVFormatContext& context, double fail_value)
 {
 	auto framerate = read_framerate(context, boost::rational<int>(static_cast<int>(fail_value * 1000000.0), 1000000));
-	
+
 	return static_cast<double>(framerate.numerator()) / static_cast<double>(framerate.denominator());
 }
 
@@ -446,7 +446,7 @@ void fix_meta_data(AVFormatContext& context)
 	{
 		auto video_stream   = context.streams[video_index];
 		auto video_context  = context.streams[video_index]->codec;
-						
+
 		if(boost::filesystem::path(context.filename).extension().string() == ".flv")
 		{
 			try
@@ -465,7 +465,7 @@ void fix_meta_data(AVFormatContext& context)
 			auto ticks		 = video_context->ticks_per_frame;
 
 			if(video_stream->nb_frames == 0)
-				video_stream->nb_frames = (duration*stream_time.num*codec_time.den)/(stream_time.den*codec_time.num*ticks);	
+				video_stream->nb_frames = (duration*stream_time.num*codec_time.den)/(stream_time.den*codec_time.num*ticks);
 		}
 	}
 }
@@ -477,13 +477,13 @@ spl::shared_ptr<AVPacket> create_packet()
 		av_free_packet(p);
 		delete p;
 	});
-	
+
 	av_init_packet(packet.get());
 	return packet;
 }
 
 spl::shared_ptr<AVFrame> create_frame()
-{	
+{
 	spl::shared_ptr<AVFrame> frame(av_frame_alloc(), [](AVFrame* p)
 	{
 		av_frame_free(&p);
@@ -516,7 +516,7 @@ std::shared_ptr<AVFrame> empty_video()
 }
 
 spl::shared_ptr<AVCodecContext> open_codec(AVFormatContext& context, enum AVMediaType type, int& index, bool single_threaded)
-{	
+{
 	AVCodec* decoder;
 	index = THROW_ON_ERROR2(av_find_best_stream(&context, type, index, -1, &decoder, 0), "");
 	//if(strcmp(decoder->name, "prores") == 0 && decoder->next && strcmp(decoder->next->name, "prores_lgpl") == 0)
@@ -548,7 +548,7 @@ std::wstring print_mode(int width, int height, double fps, bool interlaced)
 }
 
 bool is_valid_file(const std::wstring& filename, bool only_video)
-{				
+{
 	static const auto invalid_exts = {
 		L".png",
 		L".tga",
@@ -583,21 +583,21 @@ bool is_valid_file(const std::wstring& filename, bool only_video)
 	};
 
 	auto ext = boost::to_lower_copy(boost::filesystem::path(filename).extension().wstring());
-		
+
 	if(std::find(valid_exts.begin(), valid_exts.end(), ext) != valid_exts.end())
 		return true;
 
 	if (!only_video && std::find(only_audio.begin(), only_audio.end(), ext) != only_audio.end())
 		return true;
-	
+
 	if(std::find(invalid_exts.begin(), invalid_exts.end(), ext) != invalid_exts.end())
-		return false;	
+		return false;
 
 	if (only_video && std::find(only_audio.begin(), only_audio.end(), ext) != only_audio.end())
 		return false;
 
 	auto u8filename = u8(filename);
-	
+
 	int score = 0;
 	AVProbeData pb = {};
 	pb.filename = u8filename.c_str();
@@ -772,7 +772,7 @@ core::audio_channel_layout get_audio_channel_layout(int num_channels, std::uint6
 }
 
 // av_get_default_channel_layout does not work for layouts not predefined in ffmpeg. This is needed to support > 8 channels.
-std::int64_t create_channel_layout_bitmask(int num_channels)
+std::uint64_t create_channel_layout_bitmask(int num_channels)
 {
 	if (num_channels > 63)
 		CASPAR_THROW_EXCEPTION(invalid_argument() << msg_info(L"FFmpeg cannot handle more than 63 audio channels"));
@@ -782,7 +782,7 @@ std::int64_t create_channel_layout_bitmask(int num_channels)
 	auto to_shift = 63 - num_channels;
 	auto result = ALL_63_CHANNELS >> to_shift;
 
-	return static_cast<std::int64_t>(result);
+	return static_cast<std::uint64_t>(result);
 }
 
 std::string to_string(const boost::rational<int>& framerate)
diff --git a/modules/ffmpeg/producer/util/util.h b/modules/ffmpeg/producer/util/util.h
index 1d2c647db..4c1f45c06 100644
--- a/modules/ffmpeg/producer/util/util.h
+++ b/modules/ffmpeg/producer/util/util.h
@@ -93,7 +93,7 @@ bool try_get_duration(const std::wstring filename, std::int64_t& duration, boost
 core::audio_channel_layout get_audio_channel_layout(int num_channels, std::uint64_t layout, const std::wstring& channel_layout_spec);
 
 // av_get_default_channel_layout does not work for layouts not predefined in ffmpeg. This is needed to support > 8 channels.
-std::int64_t create_channel_layout_bitmask(int num_channels);
+std::uint64_t create_channel_layout_bitmask(int num_channels);
 
 std::vector<int> find_audio_cadence(const boost::rational<int>& framerate);
 
-- 
2.39.2