From 3ed9b0fbb73071284aa7fa221ce0373d2dadbc85 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sun, 11 Oct 2015 16:29:00 +0200
Subject: [PATCH] Send audio all the way through to the encoder and muxer.

---
 h264encode.cpp | 103 +++++++++++++++++++++++++++++++++++++------------
 h264encode.h   |   7 +++-
 mixer.cpp      |  45 ++++++++++-----------
 mixer.h        |   3 +-
 resampler.h    |   2 +-
 5 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/h264encode.cpp b/h264encode.cpp
index 99fdc11..ddbcb0a 100644
--- a/h264encode.cpp
+++ b/h264encode.cpp
@@ -1587,21 +1587,55 @@ int H264Encoder::save_codeddata(storage_task task)
     }
     vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf);
 
-    AVPacket pkt;
-    memset(&pkt, 0, sizeof(pkt));
-    pkt.buf = nullptr;
-    pkt.pts = av_rescale_q(task.display_order, AVRational{1, frame_rate}, avstream->time_base);
-    pkt.dts = av_rescale_q(task.encode_order, AVRational{1, frame_rate}, avstream->time_base);
-    pkt.data = reinterpret_cast<uint8_t *>(&data[0]);
-    pkt.size = data.size();
-    pkt.stream_index = 0;
-    if (task.frame_type == FRAME_IDR || task.frame_type == FRAME_I) {
-        pkt.flags = AV_PKT_FLAG_KEY;
-    } else {
-        pkt.flags = 0;
+    {
+        // Add video: wrap the coded bytes in `data` in an AVPacket and hand it to the muxer as stream 0.
+        AVPacket pkt;
+        memset(&pkt, 0, sizeof(pkt));
+        pkt.buf = nullptr;
+        pkt.pts = av_rescale_q(task.display_order + 2, AVRational{1, frame_rate}, avstream_video->time_base);  // FIXME: delay (hard-coded offset of two frame periods)
+        pkt.dts = av_rescale_q(task.encode_order + 2, AVRational{1, frame_rate}, avstream_video->time_base);  // FIXME: delay (same hard-coded offset as pts)
+        pkt.data = reinterpret_cast<uint8_t *>(&data[0]);
+        pkt.size = data.size();
+        pkt.stream_index = 0;
+        if (task.frame_type == FRAME_IDR || task.frame_type == FRAME_I) {
+            pkt.flags = AV_PKT_FLAG_KEY;  // IDR and I frames are flagged as keyframes for the muxer.
+        } else {
+            pkt.flags = 0;
+        }
+        //pkt.duration = 1;
+        av_interleaved_write_frame(avctx, &pkt);
+    }
+    {
+        // Add audio: de-interleave the stereo fp32 samples in task.audio, encode them, and mux any resulting packet as stream 1.
+        AVFrame *frame = avcodec_alloc_frame();
+        frame->nb_samples = task.audio.size() / 2;  // task.audio is interleaved stereo, so two floats per sample.
+        frame->format = AV_SAMPLE_FMT_FLTP;  // Must match the planar layout filled in below (and the codec's sample_fmt).
+        frame->channel_layout = AV_CH_LAYOUT_STEREO;
+
+        unique_ptr<float[]> planar_samples(new float[task.audio.size()]);  // Backs the frame's data planes; must outlive the encode call.
+        avcodec_fill_audio_frame(frame, 2, AV_SAMPLE_FMT_FLTP, reinterpret_cast<const uint8_t *>(planar_samples.get()), task.audio.size() * sizeof(float), 0);
+        for (int i = 0; i < frame->nb_samples; ++i) {
+            planar_samples[i] = task.audio[i * 2 + 0];  // First channel plane.
+            planar_samples[i + frame->nb_samples] = task.audio[i * 2 + 1];  // Second channel plane.
+        }
+
+        AVPacket pkt;
+        av_init_packet(&pkt);
+        pkt.data = nullptr;  // Let the encoder allocate the output buffer.
+        pkt.size = 0;
+        int got_output = 0;
+        const int err = avcodec_encode_audio2(avstream_audio->codec, &pkt, frame, &got_output);
+        if (err >= 0 && got_output) {  // Never mux a packet from a failed encode call.
+            pkt.pts = av_rescale_q(task.display_order, AVRational{1, frame_rate}, avstream_audio->time_base);  // FIXME
+            pkt.stream_index = 1;
+            av_interleaved_write_frame(avctx, &pkt);
+        }
+        // TODO: Delayed frames (got_output == 0 means the encoder held the input back; nothing here flushes it).
+        avcodec_free_frame(&frame);
     }
-    pkt.duration = 1;
-    av_interleaved_write_frame(avctx, &pkt);
+
+    static FILE *audiofp = fopen("audio.raw", "wb");
+    fwrite(&task.audio[0], 4 * task.audio.size(), 1, audiofp);
 
 #if 0
     printf("\r      "); /* return back to startpoint */
@@ -1725,17 +1759,37 @@ H264Encoder::H264Encoder(QSurface *surface, int width, int height, const char *o
 		fprintf(stderr, "%s: avio_open2() failed\n", output_filename);
 		exit(1);
 	}
-	AVCodec *codec = avcodec_find_encoder(AV_CODEC_ID_H264);
-	avstream = avformat_new_stream(avctx, codec);
-	if (avstream == nullptr) {
+	AVCodec *codec_video = avcodec_find_encoder(AV_CODEC_ID_H264);
+	avstream_video = avformat_new_stream(avctx, codec_video);
+	if (avstream_video == nullptr) {
+		fprintf(stderr, "%s: avformat_new_stream() failed\n", output_filename);
+		exit(1);
+	}
+	avstream_video->time_base = AVRational{1, frame_rate};
+	avstream_video->codec->width = width;
+	avstream_video->codec->height = height;
+	avstream_video->codec->time_base = AVRational{1, frame_rate};
+	avstream_video->codec->ticks_per_frame = 1;  // or 2?
+
+	AVCodec *codec_audio = avcodec_find_encoder(AV_CODEC_ID_MP3);
+	avstream_audio = avformat_new_stream(avctx, codec_audio);
+	if (avstream_audio == nullptr) {
 		fprintf(stderr, "%s: avformat_new_stream() failed\n", output_filename);
 		exit(1);
 	}
-	avstream->time_base = AVRational{1, frame_rate};
-	avstream->codec->width = width;
-	avstream->codec->height = height;
-	avstream->codec->time_base = AVRational{1, frame_rate};
-	avstream->codec->ticks_per_frame = 1;  // or 2?
+	avstream_audio->time_base = AVRational{1, frame_rate};
+	avstream_audio->codec->bit_rate = 256000;
+	avstream_audio->codec->sample_rate = 48000;
+	avstream_audio->codec->sample_fmt = AV_SAMPLE_FMT_FLTP;
+	avstream_audio->codec->channels = 2;
+	avstream_audio->codec->channel_layout = AV_CH_LAYOUT_STEREO;
+	avstream_audio->codec->time_base = AVRational{1, frame_rate};
+
+	/* Open the audio encoder. */
+	if (avcodec_open2(avstream_audio->codec, codec_audio, NULL) < 0) {
+		fprintf(stderr, "Could not open codec\n");
+		exit(1);
+	}
 
 	if (avformat_write_header(avctx, NULL) < 0) {
 		fprintf(stderr, "%s: avformat_write_header() failed\n", output_filename);
@@ -1862,11 +1916,11 @@ bool H264Encoder::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
 	return true;
 }
 
-void H264Encoder::end_frame(RefCountedGLsync fence, const std::vector<RefCountedFrame> &input_frames)
+void H264Encoder::end_frame(RefCountedGLsync fence, std::vector<float> audio, const std::vector<RefCountedFrame> &input_frames)
 {
 	{
 		unique_lock<mutex> lock(frame_queue_mutex);
-		pending_frames[current_storage_frame++] = PendingFrame{ fence, input_frames };
+		pending_frames[current_storage_frame++] = PendingFrame{ fence, input_frames, move(audio) };
 	}
 	frame_queue_nonempty.notify_one();
 }
@@ -1934,6 +1988,7 @@ void H264Encoder::copy_thread_func()
 		tmp.display_order = current_frame_display;
 		tmp.encode_order = current_frame_encoding;
 		tmp.frame_type = current_frame_type;
+		tmp.audio = move(frame.audio);
 		storage_task_enqueue(move(tmp));
 		
 		update_ReferenceFrames();
diff --git a/h264encode.h b/h264encode.h
index 636d6a8..5811319 100644
--- a/h264encode.h
+++ b/h264encode.h
@@ -67,13 +67,14 @@ public:
 	void 
 #endif
 	bool begin_frame(GLuint *y_tex, GLuint *cbcr_tex);
-	void end_frame(RefCountedGLsync fence, const std::vector<RefCountedFrame> &input_frames);
+	void end_frame(RefCountedGLsync fence, std::vector<float> audio, const std::vector<RefCountedFrame> &input_frames);
 
 private:
 	struct storage_task {
 		unsigned long long display_order;
 		unsigned long long encode_order;
 		int frame_type;
+		std::vector<float> audio;
 	};
 
 	void copy_thread_func();
@@ -100,12 +101,14 @@ private:
 	struct PendingFrame {
 		RefCountedGLsync fence;
 		std::vector<RefCountedFrame> input_frames;
+		std::vector<float> audio;
 	};
 	std::map<int, PendingFrame> pending_frames;
 	QSurface *surface;
 
 	AVFormatContext *avctx;
-	AVStream *avstream;
+	AVStream *avstream_video;
+	AVStream *avstream_audio;
 };
 
 #endif
diff --git a/mixer.cpp b/mixer.cpp
index c126c0b..03773b7 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -201,38 +201,22 @@ void Mixer::bm_frame(int card_index, uint16_t timecode,
 	GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);              
 	check_error();
 	assert(fence != nullptr);
+
+	// Convert the audio to stereo fp32 and store it next to the video.
+	size_t num_samples = (audio_frame.len - audio_offset) / 8 / 3;
+	vector<float> audio;
+	audio.resize(num_samples * 2);
+	convert_fixed24_to_fp32(&audio[0], 2, audio_frame.data + audio_offset, 8, num_samples);
+
 	{
 		std::unique_lock<std::mutex> lock(bmusb_mutex);
 		card->new_data_ready = true;
 		card->new_frame = RefCountedFrame(video_frame);
 		card->new_data_ready_fence = fence;
+		card->new_frame_audio = move(audio);
 		card->new_data_ready_changed.notify_all();
 	}
 
-	// As a test of the resampler, send the data from card 0 through it and onto disk.
-	// TODO: Send the audio on, and encode it through ffmpeg.
-	if (card_index == 0) {
-		size_t num_samples = (audio_frame.len - audio_offset) / 8 / 3;
-		double pts = timecode / 60.0;  // FIXME: Unwrap. And rebase.
-		unique_ptr<float[]> samplesf(new float[num_samples * 2]);
-		convert_fixed24_to_fp32(samplesf.get(), 2, audio_frame.data + audio_offset, 8, num_samples);
-		card->resampler->add_input_samples(pts, samplesf.get(), num_samples);
-
-		float samples_out[(48000 / 60) * 2];
-		card->resampler->get_output_samples(pts, samples_out, 48000 / 60);
-
-		static FILE *audiofp = nullptr;
-		if (audiofp == nullptr) {
-			audiofp = fopen("audio.raw", "wb");
-		}
-		fwrite(samples_out, sizeof(samples_out), 1, audiofp);
-		//fwrite(samplesf.get(), num_samples * sizeof(float) * 2, 1, audiofp);
-
-		if (audio_frame.len - audio_offset != 19200) {
-			printf("%d: %d samples (%d bytes)\n", card_index, int(num_samples), int(audio_frame.len - audio_offset));
-		}
-	}
-
 	// Video frame will be released when last user of card->new_frame goes out of scope.
         card->usb->get_audio_frame_allocator()->release_frame(audio_frame);
 }
@@ -267,6 +251,7 @@ void Mixer::thread_func()
 				card_copy[card_index].new_data_ready = card->new_data_ready;
 				card_copy[card_index].new_frame = card->new_frame;
 				card_copy[card_index].new_data_ready_fence = card->new_data_ready_fence;
+				card_copy[card_index].new_frame_audio = move(card->new_frame_audio);
 				card->new_data_ready = false;
 				card->new_data_ready_changed.notify_all();
 			}
@@ -321,6 +306,14 @@ void Mixer::thread_func()
 		RefCountedGLsync fence(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
 		check_error();
 
+		// Resample the audio as needed.
+		// TODO: Allow using audio from the other card(s) as well.
+		double pts = frame / 60.0;
+		cards[0].resampler->add_input_samples(pts, card_copy[0].new_frame_audio.data(), card_copy[0].new_frame_audio.size() / 2);
+		vector<float> samples_out;
+		samples_out.resize((48000 / 60) * 2);
+		cards[0].resampler->get_output_samples(pts, &samples_out[0], 48000 / 60);
+
 		// Make sure the H.264 gets a reference to all the
 		// input frames needed, so that they are not released back
 		// until the rendering is done.
@@ -328,7 +321,7 @@ void Mixer::thread_func()
 		for (int card_index = 0; card_index < NUM_CARDS; ++card_index) {
 			input_frames.push_back(bmusb_current_rendering_frame[card_index]);
 		}
-		h264_encoder->end_frame(fence, input_frames);
+		h264_encoder->end_frame(fence, move(samples_out), input_frames);
 
 		// The live frame just shows the RGBA texture we just rendered.
 		// It owns rgba_tex now.
@@ -364,6 +357,7 @@ void Mixer::thread_func()
 		//	chain->print_phase_timing();
 		}
 
+#if 0
 		// Reset every 100 frames, so that local variations in frame times
 		// (especially for the first few frames, when the shaders are
 		// compiled etc.) don't make it hard to measure for the entire
@@ -372,6 +366,7 @@ void Mixer::thread_func()
 			frame = 0;
 			start = now;
 		}
+#endif
 		check_error();
 	}
 
diff --git a/mixer.h b/mixer.h
index f4ea5c9..9b631e4 100644
--- a/mixer.h
+++ b/mixer.h
@@ -113,9 +113,10 @@ private:
 		QSurface *surface;
 		QOpenGLContext *context;
 
-		bool new_data_ready = false;  // Whether new_frame contains anything.
+		bool new_data_ready = false;  // Whether new_frame and new_frame_audio contain anything.
 		RefCountedFrame new_frame;
 		GLsync new_data_ready_fence;  // Whether new_frame is ready for rendering.
+		std::vector<float> new_frame_audio;
 		std::condition_variable new_data_ready_changed;  // Set whenever new_data_ready is changed.
 		Resampler *resampler = nullptr;
 	};
diff --git a/resampler.h b/resampler.h
index 80ed1af..aa7af3d 100644
--- a/resampler.h
+++ b/resampler.h
@@ -80,7 +80,7 @@ private:
 	// How much delay we are expected to have, in input samples.
 	// If actual delay drifts too much away from this, we will start
 	// changing the resampling ratio to compensate.
-	double expected_delay = 4800.0;
+	double expected_delay = 1600.0;
 
 	// Input samples not yet fed into the resampler.
 	// TODO: Use a circular buffer instead, for efficiency.
-- 
2.39.2