From 25326c82bda01dfa1b86fb4f074d7697705239f8 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 28 Mar 2019 18:49:01 +0100 Subject: [PATCH] Include the raw audio in the MJPEG output. --- nageru/audio_mixer.cpp | 94 ++++++++++++++++++++++++++++++++++++++++ nageru/audio_mixer.h | 4 ++ nageru/mixer.cpp | 37 ++++++++++++++-- nageru/mixer.h | 5 ++- nageru/mjpeg_encoder.cpp | 56 ++++++++++++++++++++++-- nageru/mjpeg_encoder.h | 4 +- 6 files changed, 190 insertions(+), 10 deletions(-) diff --git a/nageru/audio_mixer.cpp b/nageru/audio_mixer.cpp index 9b588d7..360689b 100644 --- a/nageru/audio_mixer.cpp +++ b/nageru/audio_mixer.cpp @@ -52,6 +52,26 @@ void convert_fixed16_to_fp32(float *dst, size_t out_channel, size_t out_num_chan } } +void convert_fixed16_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels, + const uint8_t *src, size_t in_channel, size_t in_num_channels, + size_t num_samples) +{ + assert(in_channel < in_num_channels); + assert(out_channel < out_num_channels); + src += in_channel * 2; + dst += out_channel; + + for (size_t i = 0; i < num_samples; ++i) { + uint32_t s = uint32_t(uint16_t(le16toh(*(int16_t *)src))) << 16; + + // Keep the sign bit in place, repeat the other 15 bits as far as they go. 
+ *dst = s | ((s & 0x7fffffff) >> 15) | ((s & 0x7fffffff) >> 30); + + src += 2 * in_num_channels; + dst += out_num_channels; + } +} + void convert_fixed24_to_fp32(float *dst, size_t out_channel, size_t out_num_channels, const uint8_t *src, size_t in_channel, size_t in_num_channels, size_t num_samples) @@ -73,6 +93,29 @@ void convert_fixed24_to_fp32(float *dst, size_t out_channel, size_t out_num_chan } } +void convert_fixed24_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels, + const uint8_t *src, size_t in_channel, size_t in_num_channels, + size_t num_samples) +{ + assert(in_channel < in_num_channels); + assert(out_channel < out_num_channels); + src += in_channel * 3; + dst += out_channel; + + for (size_t i = 0; i < num_samples; ++i) { + uint32_t s1 = src[0]; + uint32_t s2 = src[1]; + uint32_t s3 = src[2]; + uint32_t s = (s1 << 8) | (s2 << 16) | (s3 << 24); + + // Keep the sign bit in place, repeat the other 23 bits as far as they go. + *dst = s | ((s & 0x7fffffff) >> 23); + + src += 3 * in_num_channels; + dst += out_num_channels; + } +} + void convert_fixed32_to_fp32(float *dst, size_t out_channel, size_t out_num_channels, const uint8_t *src, size_t in_channel, size_t in_num_channels, size_t num_samples) @@ -91,6 +134,25 @@ void convert_fixed32_to_fp32(float *dst, size_t out_channel, size_t out_num_chan } } +// Basically just a reinterleave. 
+void convert_fixed32_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels, + const uint8_t *src, size_t in_channel, size_t in_num_channels, + size_t num_samples) +{ + assert(in_channel < in_num_channels); + assert(out_channel < out_num_channels); + src += in_channel * 4; + dst += out_channel; + + for (size_t i = 0; i < num_samples; ++i) { + int32_t s = le32toh(*(int32_t *)src); + *dst = s; + + src += 4 * in_num_channels; + dst += out_num_channels; + } +} + float find_peak_plain(const float *samples, size_t num_samples) __attribute__((unused)); float find_peak_plain(const float *samples, size_t num_samples) @@ -294,6 +356,38 @@ bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned return true; } +vector<int32_t> convert_audio_to_fixed32(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, unsigned num_channels) +{ + vector<int32_t> audio; + + if (num_channels > audio_format.num_channels) { + audio.resize(num_samples * num_channels, 0); + } else { + audio.resize(num_samples * num_channels); + } + for (unsigned channel_index = 0; channel_index < num_channels && channel_index < audio_format.num_channels; ++channel_index) { + switch (audio_format.bits_per_sample) { + case 0: + assert(num_samples == 0); + break; + case 16: + convert_fixed16_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples); + break; + case 24: + convert_fixed24_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples); + break; + case 32: + convert_fixed32_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples); + break; + default: + fprintf(stderr, "Cannot handle audio with %u bits per sample\n", audio_format.bits_per_sample); + assert(false); + } + } + + return audio; +} + bool AudioMixer::add_silence(DeviceSpec device_spec, unsigned samples_per_frame, unsigned num_frames) {
AudioDevice *device = find_audio_device(device_spec); diff --git a/nageru/audio_mixer.h b/nageru/audio_mixer.h index 1cf4da3..14e7e85 100644 --- a/nageru/audio_mixer.h +++ b/nageru/audio_mixer.h @@ -37,6 +37,10 @@ namespace bmusb { struct AudioFormat; } // namespace bmusb +// Convert the given audio from {16,24,32}-bit M-channel to 32-bit N-channel PCM. +// Assumes little-endian and chunky, signed PCM throughout. +std::vector<int32_t> convert_audio_to_fixed32(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, unsigned num_destination_channels); + enum EQBand { EQ_BAND_BASS = 0, EQ_BAND_MID, diff --git a/nageru/mixer.cpp b/nageru/mixer.cpp index c91857e..5f38a67 100644 --- a/nageru/mixer.cpp +++ b/nageru/mixer.cpp @@ -785,6 +785,34 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, if (num_samples > 0) { audio_mixer->add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, audio_frame.received_timestamp); + + // Audio for the MJPEG stream. We don't resample; audio that's not in 48 kHz + // just gets dropped for now. + // + // Only bother doing MJPEG encoding if there are any connected clients + // that want the stream. + if (httpd.get_num_connected_multicam_clients() > 0) { + vector<int32_t> converted_samples = convert_audio_to_fixed32(audio_frame.data + audio_offset, num_samples, audio_format, 2); + lock_guard<mutex> lock(card_mutex); + if (card->new_raw_audio.empty()) { + card->new_raw_audio = move(converted_samples); + } else { + // For raw audio, we don't really synchronize audio and video; + // we just put the audio in frame by frame, and if a video frame is + // dropped, we still keep the audio, which means it will be added + // to the beginning of the next frame. It would probably be better + // to move the audio pts earlier to show this, but most players can + // live with some jitter, and in a lot of ways, it's much nicer for + // Futatabi to have all audio locked to a video frame. 
+ card->new_raw_audio.insert(card->new_raw_audio.end(), converted_samples.begin(), converted_samples.end()); + + // Truncate to one second, just to be sure we don't have infinite buildup in case of weirdness. + if (card->new_raw_audio.size() > OUTPUT_FREQUENCY * 2) { + size_t excess_samples = card->new_raw_audio.size() - OUTPUT_FREQUENCY * 2; + card->new_raw_audio.erase(card->new_raw_audio.begin(), card->new_raw_audio.begin() + excess_samples); + } + } + } } // Done with the audio, so release it. @@ -1038,7 +1066,8 @@ void Mixer::thread_func() assert(master_card_index < num_cards + num_video_inputs); } - OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame); + vector<int32_t> raw_audio[MAX_VIDEO_CARDS]; // For MJPEG encoding. + OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame, raw_audio); schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll, output_frame_info.frame_timestamp); stats_dropped_frames += output_frame_info.dropped_frames; @@ -1084,7 +1113,7 @@ void Mixer::thread_func() if (new_frame->frame->data_copy != nullptr) { int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index); if (mjpeg_card_index != -1) { - mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset); + mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset, move(raw_audio[card_index])); } } } @@ -1198,7 +1227,7 @@ pair<string, string> Mixer::get_channel_color_http(unsigned channel_idx) return make_pair(theme->get_channel_color(channel_idx), "text/plain"); } -Mixer::OutputFrameInfo Mixer::get_one_frame_from_each_card(unsigned master_card_index, bool 
master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]) +Mixer::OutputFrameInfo Mixer::get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS], vector<int32_t> raw_audio[MAX_VIDEO_CARDS]) { OutputFrameInfo output_frame_info; start: @@ -1246,6 +1275,8 @@ start: card->new_frames.pop_front(); card->new_frames_changed.notify_all(); } + + raw_audio[card_index] = move(card->new_raw_audio); } if (!master_card_is_output) { diff --git a/nageru/mixer.h b/nageru/mixer.h index b4ed76f..de8b457 100644 --- a/nageru/mixer.h +++ b/nageru/mixer.h @@ -541,9 +541,10 @@ private: }; std::deque<NewFrame> new_frames; std::condition_variable new_frames_changed; // Set whenever new_frames is changed. - QueueLengthPolicy queue_length_policy; // Refers to the "new_frames" queue. + std::vector<int32_t> new_raw_audio; + int last_timecode = -1; // Unwrapped. JitterHistory jitter_history; @@ -578,7 +579,7 @@ private: bool is_preroll; std::chrono::steady_clock::time_point frame_timestamp; }; - OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]); + OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS], std::vector<int32_t> raw_audio[MAX_VIDEO_CARDS]); InputState input_state; diff --git a/nageru/mjpeg_encoder.cpp b/nageru/mjpeg_encoder.cpp index 46bb94c..033f67a 100644 --- a/nageru/mjpeg_encoder.cpp +++ b/nageru/mjpeg_encoder.cpp @@ -120,7 +120,7 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display) // a situation with only one video stream (and possibly one audio stream) // with known width/height, and we don't need the extra functionality it provides. 
avctx.reset(avformat_alloc_context()); - avctx->oformat = av_guess_format("mp4", nullptr, nullptr); + avctx->oformat = av_guess_format("nut", nullptr, nullptr); uint8_t *buf = (uint8_t *)av_malloc(MUX_BUFFER_SIZE); avctx->pb = avio_alloc_context(buf, MUX_BUFFER_SIZE, 1, this, nullptr, nullptr, nullptr); @@ -133,7 +133,11 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display) fprintf(stderr, "avformat_new_stream() failed\n"); abort(); } - stream->time_base = AVRational{ 1, TIMEBASE }; + + // FFmpeg is very picky about having audio at 1/48000 timebase, + // no matter what we write. Even though we'd prefer our usual 1/120000, + // put the video on the same one, so that we can have locked audio. + stream->time_base = AVRational{ 1, OUTPUT_FREQUENCY }; stream->codecpar->codec_type = AVMEDIA_TYPE_VIDEO; stream->codecpar->codec_id = AV_CODEC_ID_MJPEG; @@ -151,6 +155,19 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display) stream->codecpar->chroma_location = AVCHROMA_LOC_LEFT; stream->codecpar->field_order = AV_FIELD_PROGRESSIVE; } + for (unsigned card_idx = 0; card_idx < global_flags.card_to_mjpeg_stream_export.size(); ++card_idx) { + AVStream *stream = avformat_new_stream(avctx.get(), nullptr); + if (stream == nullptr) { + fprintf(stderr, "avformat_new_stream() failed\n"); + abort(); + } + stream->time_base = AVRational{ 1, OUTPUT_FREQUENCY }; + stream->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + stream->codecpar->codec_id = AV_CODEC_ID_PCM_S32LE; + stream->codecpar->channel_layout = AV_CH_LAYOUT_STEREO; + stream->codecpar->channels = 2; + stream->codecpar->sample_rate = OUTPUT_FREQUENCY; + } AVDictionary *options = NULL; vector<pair<string, string>> opts = MUX_OPTS; @@ -269,7 +286,7 @@ unique_ptr<VADisplayWithCleanup> MJPEGEncoder::try_open_va(const string &va_disp return va_dpy; } -void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset) +void 
MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset, vector<int32_t> audio) { PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata; if (video_format.width == 0 || video_format.height == 0) { @@ -300,7 +317,7 @@ void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFram return; } ++metric_mjpeg_overrun_submitted; - frames_to_be_encoded.push(QueuedFrame{ pts, card_index, frame, video_format, y_offset, cbcr_offset }); + frames_to_be_encoded.push(QueuedFrame{ pts, card_index, frame, video_format, y_offset, cbcr_offset, move(audio) }); any_frames_to_be_encoded.notify_all(); } @@ -341,6 +358,11 @@ void MJPEGEncoder::encoder_thread_func() // Will call back in the receiver thread. encode_jpeg_va(move(qf)); } else { + // Write audio before video, since Futatabi expects it. + if (qf.audio.size() > 0) { + write_audio_packet(qf.pts, qf.card_index, qf.audio); + } + // Encode synchronously, in the same thread. 
vector<uint8_t> jpeg = encode_jpeg_libjpeg(qf); write_mjpeg_packet(qf.pts, qf.card_index, jpeg.data(), jpeg.size()); @@ -364,6 +386,27 @@ void MJPEGEncoder::write_mjpeg_packet(int64_t pts, unsigned card_index, const ui pkt.flags = AV_PKT_FLAG_KEY; AVRational time_base = avctx->streams[pkt.stream_index]->time_base; pkt.pts = pkt.dts = av_rescale_q(pts, AVRational{ 1, TIMEBASE }, time_base); + pkt.duration = 0; + + if (av_write_frame(avctx.get(), &pkt) < 0) { + fprintf(stderr, "av_write_frame() failed\n"); + abort(); + } +} + +void MJPEGEncoder::write_audio_packet(int64_t pts, unsigned card_index, const vector<int32_t> &audio) +{ + AVPacket pkt; + memset(&pkt, 0, sizeof(pkt)); + pkt.buf = nullptr; + pkt.data = reinterpret_cast<uint8_t *>(const_cast<int32_t *>(&audio[0])); + pkt.size = audio.size() * sizeof(audio[0]); + pkt.stream_index = card_index + global_flags.card_to_mjpeg_stream_export.size(); + pkt.flags = AV_PKT_FLAG_KEY; + AVRational time_base = avctx->streams[pkt.stream_index]->time_base; + pkt.pts = pkt.dts = av_rescale_q(pts, AVRational{ 1, TIMEBASE }, time_base); + size_t num_stereo_samples = audio.size() / 2; + pkt.duration = av_rescale_q(num_stereo_samples, AVRational{ 1, OUTPUT_FREQUENCY }, time_base); if (av_write_frame(avctx.get(), &pkt) < 0) { fprintf(stderr, "av_write_frame() failed\n"); abort(); @@ -752,6 +795,11 @@ void MJPEGEncoder::va_receiver_thread_func() frames_encoding.pop(); } + // Write audio before video, since Futatabi expects it. 
+ if (qf.audio.size() > 0) { + write_audio_packet(qf.pts, qf.card_index, qf.audio); + } + VAStatus va_status = vaSyncSurface(va_dpy->va_dpy, qf.resources.surface); CHECK_VASTATUS(va_status, "vaSyncSurface"); diff --git a/nageru/mjpeg_encoder.h b/nageru/mjpeg_encoder.h index 6e0357f..bb783d8 100644 --- a/nageru/mjpeg_encoder.h +++ b/nageru/mjpeg_encoder.h @@ -38,7 +38,7 @@ public: MJPEGEncoder(HTTPD *httpd, const std::string &va_display); ~MJPEGEncoder(); void stop(); - void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset); + void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset, std::vector<int32_t> audio); bool using_vaapi() const { return va_dpy != nullptr; } // Returns -1 for inactive (ie., don't encode frames for this card right now). @@ -104,6 +104,7 @@ private: RefCountedFrame frame; bmusb::VideoFormat video_format; size_t y_offset, cbcr_offset; + std::vector<int32_t> audio; // Only for frames in the process of being encoded by VA-API. VAResources resources; @@ -115,6 +116,7 @@ private: void encode_jpeg_va(QueuedFrame &&qf); std::vector<uint8_t> encode_jpeg_libjpeg(const QueuedFrame &qf); void write_mjpeg_packet(int64_t pts, unsigned card_index, const uint8_t *jpeg, size_t jpeg_size); + void write_audio_packet(int64_t pts, unsigned card_index, const std::vector<int32_t> &audio); void init_jpeg_422(unsigned width, unsigned height, VectorDestinationManager *dest, jpeg_compress_struct *cinfo); std::vector<uint8_t> get_jpeg_header(unsigned width, unsigned height, jpeg_compress_struct *cinfo); -- 2.39.2