From 3be00c8dd8b841cecc44f57234b9fc2d3a94cb45 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Mon, 25 Apr 2016 23:04:59 +0200
Subject: [PATCH] Set x264 global headers (Quick Sync global headers are still
 not there).

Should fix the H.264 stream in newer Firefox with some luck.
---
 audio_encoder.cpp     |  6 ++++--
 audio_encoder.h       |  2 +-
 mux.cpp               |  9 ++++++---
 mux.h                 |  2 +-
 quicksync_encoder.cpp | 30 ++++++++++++++++++++++--------
 quicksync_encoder.h   |  7 ++++++-
 video_encoder.cpp     | 31 +++++++++++++++++++------------
 video_encoder.h       |  1 +
 x264_encoder.cpp      | 29 ++++++++++++++++++++++++++---
 x264_encoder.h        | 12 +++++++++++-
 10 files changed, 97 insertions(+), 32 deletions(-)

diff --git a/audio_encoder.cpp b/audio_encoder.cpp
index 2b735e4..ac1c8f5 100644
--- a/audio_encoder.cpp
+++ b/audio_encoder.cpp
@@ -21,7 +21,7 @@ extern "C" {
 
 using namespace std;
 
-AudioEncoder::AudioEncoder(const string &codec_name, int bit_rate)
+AudioEncoder::AudioEncoder(const string &codec_name, int bit_rate, const AVOutputFormat *oformat)
 {
 	AVCodec *codec = avcodec_find_encoder_by_name(codec_name.c_str());
 	if (codec == nullptr) {
@@ -36,7 +36,9 @@ AudioEncoder::AudioEncoder(const string &codec_name, int bit_rate)
 	ctx->channels = 2;
 	ctx->channel_layout = AV_CH_LAYOUT_STEREO;
 	ctx->time_base = AVRational{1, TIMEBASE};
-	ctx->flags |= CODEC_FLAG_GLOBAL_HEADER;
+	if (oformat->flags & AVFMT_GLOBALHEADER) {
+		ctx->flags |= CODEC_FLAG_GLOBAL_HEADER;
+	}
 	if (avcodec_open2(ctx, codec, NULL) < 0) {
 		fprintf(stderr, "Could not open codec '%s'\n", codec_name.c_str());
 		exit(1);
diff --git a/audio_encoder.h b/audio_encoder.h
index d627a9c..786d364 100644
--- a/audio_encoder.h
+++ b/audio_encoder.h
@@ -16,7 +16,7 @@ extern "C" {
 
 class AudioEncoder {
 public:
-	AudioEncoder(const std::string &codec_name, int bit_rate);
+	AudioEncoder(const std::string &codec_name, int bit_rate, const AVOutputFormat *oformat);
 	~AudioEncoder();
 
 	void add_mux(Mux *mux) {  // Does not take ownership.
diff --git a/mux.cpp b/mux.cpp
index ece11e1..e169438 100644
--- a/mux.cpp
+++ b/mux.cpp
@@ -10,7 +10,7 @@
 
 using namespace std;
 
-Mux::Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const AVCodecContext *audio_ctx, int time_base, KeyFrameSignalReceiver *keyframe_signal_receiver)
+Mux::Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const string &video_extradata, const AVCodecContext *audio_ctx, int time_base, KeyFrameSignalReceiver *keyframe_signal_receiver)
 	: avctx(avctx), keyframe_signal_receiver(keyframe_signal_receiver)
 {
 	AVCodec *codec_video = avcodec_find_encoder((video_codec == CODEC_H264) ? AV_CODEC_ID_H264 : AV_CODEC_ID_RAWVIDEO);
@@ -43,8 +43,11 @@ Mux::Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const
 	avstream_video->codec->color_range = AVCOL_RANGE_MPEG;  // Full vs. limited range (output_ycbcr_format.full_range).
 	avstream_video->codec->chroma_sample_location = AVCHROMA_LOC_LEFT;  // Chroma sample location. See chroma_offset_0[] in Mixer::subsample_chroma().
 	avstream_video->codec->field_order = AV_FIELD_PROGRESSIVE;
-	if (avctx->oformat->flags & AVFMT_GLOBALHEADER) {
-		avstream_video->codec->flags = AV_CODEC_FLAG_GLOBAL_HEADER;
+
+	if (!video_extradata.empty()) {  // Global headers (if any) go into the stream's codec extradata.
+		avstream_video->codec->extradata = (uint8_t *)av_mallocz(video_extradata.size() + AV_INPUT_BUFFER_PADDING_SIZE);  // libavcodec requires zeroed padding after extradata.
+		avstream_video->codec->extradata_size = video_extradata.size();  // Size excludes the padding.
+		memcpy(avstream_video->codec->extradata, video_extradata.data(), video_extradata.size());
 	}
 
 	avstream_audio = avformat_new_stream(avctx, nullptr);
diff --git a/mux.h b/mux.h
index 1dd967c..c161b29 100644
--- a/mux.h
+++ b/mux.h
@@ -25,7 +25,7 @@ public:
 	};
 
 	// Takes ownership of avctx. <keyframe_signal_receiver> can be nullptr.
-	Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const AVCodecContext *audio_ctx, int time_base, KeyFrameSignalReceiver *keyframe_signal_receiver);
+	Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const std::string &video_extradata, const AVCodecContext *audio_ctx, int time_base, KeyFrameSignalReceiver *keyframe_signal_receiver);
 	~Mux();
 	void add_packet(const AVPacket &pkt, int64_t pts, int64_t dts);
 
diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp
index fbbde94..b81cb53 100644
--- a/quicksync_encoder.cpp
+++ b/quicksync_encoder.cpp
@@ -194,12 +194,16 @@ FrameReorderer::Frame FrameReorderer::get_first_frame()
 
 class QuickSyncEncoderImpl {
 public:
-	QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder);
+	QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder);
 	~QuickSyncEncoderImpl();
 	void add_audio(int64_t pts, vector<float> audio);
 	bool begin_frame(GLuint *y_tex, GLuint *cbcr_tex);
 	RefCountedGLsync end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames);
 	void shutdown();
+	void set_stream_mux(Mux *mux)
+	{
+		stream_mux = mux;
+	}
 
 private:
 	struct storage_task {
@@ -280,7 +284,7 @@ private:
 	unique_ptr<FrameReorderer> reorderer;
 	X264Encoder *x264_encoder;  // nullptr if not using x264.
 
-	Mux* stream_mux;  // To HTTP.
+	Mux* stream_mux = nullptr;  // To HTTP.
 	unique_ptr<Mux> file_mux;  // To local disk.
 
 	Display *x11_display = nullptr;
@@ -1727,10 +1731,10 @@ namespace {
 
 }  // namespace
 
-QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder)
-	: current_storage_frame(0), resource_pool(resource_pool), surface(surface), stream_audio_encoder(stream_audio_encoder), x264_encoder(x264_encoder), stream_mux(stream_mux), frame_width(width), frame_height(height)
+QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder)
+	: current_storage_frame(0), resource_pool(resource_pool), surface(surface), stream_audio_encoder(stream_audio_encoder), x264_encoder(x264_encoder), frame_width(width), frame_height(height)
 {
-	file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE));
+	file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
 	open_output_file(filename);
 	file_audio_encoder->add_mux(file_mux.get());
 
@@ -1949,7 +1953,8 @@ void QuickSyncEncoderImpl::open_output_file(const std::string &filename)
 		exit(1);
 	}
 
-	file_mux.reset(new Mux(avctx, frame_width, frame_height, Mux::CODEC_H264, file_audio_encoder->get_ctx(), TIMEBASE, nullptr));
+	string video_extradata = "";  // FIXME: See other comment about global headers.
+	file_mux.reset(new Mux(avctx, frame_width, frame_height, Mux::CODEC_H264, video_extradata, file_audio_encoder->get_ctx(), TIMEBASE, nullptr));
 }
 
 void QuickSyncEncoderImpl::encode_thread_func()
@@ -2141,6 +2146,9 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 	CHECK_VASTATUS(va_status, "vaBeginPicture");
 
 	if (frame_type == FRAME_IDR) {
+		// FIXME: If the mux wants global headers, we should not put the
+		// SPS/PPS before each IDR frame, but rather put it into the
+		// codec extradata (formatted differently?).
 		render_sequence();
 		render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
 		if (h264_packedheader) {
@@ -2170,8 +2178,8 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 }
 
 // Proxy object.
-QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder)
-	: impl(new QuickSyncEncoderImpl(filename, resource_pool, surface, va_display, width, height, stream_mux, stream_audio_encoder, x264_encoder)) {}
+QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder)
+	: impl(new QuickSyncEncoderImpl(filename, resource_pool, surface, va_display, width, height, oformat, stream_audio_encoder, x264_encoder)) {}
 
 // Must be defined here because unique_ptr<> destructor needs to know the impl.
 QuickSyncEncoder::~QuickSyncEncoder() {}
@@ -2195,3 +2203,9 @@ void QuickSyncEncoder::shutdown()
 {
 	impl->shutdown();
 }
+
+void QuickSyncEncoder::set_stream_mux(Mux *mux)
+{
+	impl->set_stream_mux(mux);
+}
+
diff --git a/quicksync_encoder.h b/quicksync_encoder.h
index 4f2bca5..52aaf77 100644
--- a/quicksync_encoder.h
+++ b/quicksync_encoder.h
@@ -33,6 +33,10 @@
 #include <string>
 #include <vector>
 
+extern "C" {
+#include <libavformat/avformat.h>
+}
+
 #include "ref_counted_frame.h"
 #include "ref_counted_gl_sync.h"
 
@@ -51,9 +55,10 @@ class ResourcePool;
 // .cpp file.
 class QuickSyncEncoder {
 public:
-        QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder);
+        QuickSyncEncoder(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, AVOutputFormat *oformat, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder);
         ~QuickSyncEncoder();
 
+	void set_stream_mux(Mux *mux);  // Does not take ownership. Must be called unless x264 is used for the stream.
 	void add_audio(int64_t pts, std::vector<float> audio);
 	bool begin_frame(GLuint *y_tex, GLuint *cbcr_tex);
 	RefCountedGLsync end_frame(int64_t pts, int64_t duration, const std::vector<RefCountedFrame> &input_frames);
diff --git a/video_encoder.cpp b/video_encoder.cpp
index cae4328..96d4932 100644
--- a/video_encoder.cpp
+++ b/video_encoder.cpp
@@ -38,21 +38,24 @@ string generate_local_dump_filename(int frame)
 VideoEncoder::VideoEncoder(ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, HTTPD *httpd)
 	: resource_pool(resource_pool), surface(surface), va_display(va_display), width(width), height(height), httpd(httpd)
 {
-	open_output_stream();
-
+	oformat = av_guess_format(global_flags.stream_mux_name.c_str(), nullptr, nullptr);
+	assert(oformat != nullptr);
 	if (global_flags.stream_audio_codec_name.empty()) {
-		stream_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE));
+		stream_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat));
 	} else {
-		stream_audio_encoder.reset(new AudioEncoder(global_flags.stream_audio_codec_name, global_flags.stream_audio_codec_bitrate));
+		stream_audio_encoder.reset(new AudioEncoder(global_flags.stream_audio_codec_name, global_flags.stream_audio_codec_bitrate, oformat));
 	}
-	stream_audio_encoder->add_mux(stream_mux.get());
-
 	if (global_flags.x264_video_to_http) {
-		x264_encoder.reset(new X264Encoder(stream_mux.get()));
+		x264_encoder.reset(new X264Encoder(oformat));
 	}
 
 	string filename = generate_local_dump_filename(/*frame=*/0);
-	quicksync_encoder.reset(new QuickSyncEncoder(filename, resource_pool, surface, va_display, width, height, stream_mux.get(), stream_audio_encoder.get(), x264_encoder.get()));
+	quicksync_encoder.reset(new QuickSyncEncoder(filename, resource_pool, surface, va_display, width, height, oformat, stream_audio_encoder.get(), x264_encoder.get()));
+
+	open_output_stream();  // Must come after the encoders exist; it reads the x264 global headers and the audio codec context.
+	stream_audio_encoder->add_mux(stream_mux.get());
+	quicksync_encoder->set_stream_mux(stream_mux.get());
+	if (x264_encoder != nullptr) { x264_encoder->set_mux(stream_mux.get()); }  // x264_encoder only exists when x264_video_to_http is set.
 }
 
 VideoEncoder::~VideoEncoder()
@@ -66,7 +69,8 @@ void VideoEncoder::do_cut(int frame)
 	string filename = generate_local_dump_filename(frame);
 	printf("Starting new recording: %s\n", filename.c_str());
 	quicksync_encoder->shutdown();
-	quicksync_encoder.reset(new QuickSyncEncoder(filename, resource_pool, surface, va_display, width, height, stream_mux.get(), stream_audio_encoder.get(), x264_encoder.get()));
+	quicksync_encoder.reset(new QuickSyncEncoder(filename, resource_pool, surface, va_display, width, height, oformat, stream_audio_encoder.get(), x264_encoder.get()));
+	quicksync_encoder->set_stream_mux(stream_mux.get());
 }
 
 void VideoEncoder::add_audio(int64_t pts, std::vector<float> audio)
@@ -87,8 +91,6 @@ RefCountedGLsync VideoEncoder::end_frame(int64_t pts, int64_t duration, const st
 void VideoEncoder::open_output_stream()
 {
 	AVFormatContext *avctx = avformat_alloc_context();
-	AVOutputFormat *oformat = av_guess_format(global_flags.stream_mux_name.c_str(), nullptr, nullptr);
-	assert(oformat != nullptr);
 	avctx->oformat = oformat;
 
 	uint8_t *buf = (uint8_t *)av_malloc(MUX_BUFFER_SIZE);
@@ -103,9 +105,14 @@ void VideoEncoder::open_output_stream()
 
 	avctx->flags = AVFMT_FLAG_CUSTOM_IO;
 
+	string video_extradata;
+	if (global_flags.x264_video_to_http) {
+		video_extradata = x264_encoder->get_global_headers();
+	}
+
 	int time_base = global_flags.stream_coarse_timebase ? COARSE_TIMEBASE : TIMEBASE;
 	stream_mux_writing_header = true;
-	stream_mux.reset(new Mux(avctx, width, height, video_codec, stream_audio_encoder->get_ctx(), time_base, this));
+	stream_mux.reset(new Mux(avctx, width, height, video_codec, video_extradata, stream_audio_encoder->get_ctx(), time_base, this));
 	stream_mux_writing_header = false;
 	httpd->set_header(stream_mux_header);
 	stream_mux_header.clear();
diff --git a/video_encoder.h b/video_encoder.h
index bb1be55..78162e9 100644
--- a/video_encoder.h
+++ b/video_encoder.h
@@ -46,6 +46,7 @@ private:
 	static int write_packet_thunk(void *opaque, uint8_t *buf, int buf_size);
 	int write_packet(uint8_t *buf, int buf_size);
 
+	AVOutputFormat *oformat;
 	std::unique_ptr<QuickSyncEncoder> quicksync_encoder;
 	movit::ResourcePool *resource_pool;
 	QSurface *surface;
diff --git a/x264_encoder.cpp b/x264_encoder.cpp
index 189da20..bbbb1ba 100644
--- a/x264_encoder.cpp
+++ b/x264_encoder.cpp
@@ -13,8 +13,8 @@ extern "C" {
 
 using namespace std;
 
-X264Encoder::X264Encoder(Mux *mux)
-	: mux(mux)
+X264Encoder::X264Encoder(AVOutputFormat *oformat)
+	: wants_global_headers(oformat->flags & AVFMT_GLOBALHEADER)
 {
 	frame_pool.reset(new uint8_t[WIDTH * HEIGHT * 2 * X264_QUEUE_LENGTH]);
 	for (unsigned i = 0; i < X264_QUEUE_LENGTH; ++i) {
@@ -88,11 +88,29 @@ void X264Encoder::init_x264()
 
 	x264_param_apply_profile(&param, "high");
 
+	param.b_repeat_headers = !wants_global_headers;
+
 	x264 = x264_encoder_open(&param);
 	if (x264 == nullptr) {
 		fprintf(stderr, "ERROR: x264 initialization failed.\n");
 		exit(1);
 	}
+
+	if (wants_global_headers) {
+		x264_nal_t *nal;
+		int num_nal;
+
+		x264_encoder_headers(x264, &nal, &num_nal);
+
+		for (int i = 0; i < num_nal; ++i) {
+			if (nal[i].i_type == NAL_SEI) {
+				// Don't put the SEI in extradata; make it part of the first frame instead.
+				buffered_sei += string((const char *)nal[i].p_payload, nal[i].i_payload);
+			} else {
+				global_headers += string((const char *)nal[i].p_payload, nal[i].i_payload);
+			}
+		}
+	}
 }
 
 void X264Encoder::encoder_thread_func()
@@ -160,7 +178,7 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 
 	// We really need one AVPacket for the entire frame, it seems,
 	// so combine it all.
-	size_t num_bytes = 0;
+	size_t num_bytes = buffered_sei.size();
 	for (int i = 0; i < num_nal; ++i) {
 		num_bytes += nal[i].i_payload;
 	}
@@ -168,6 +186,11 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 	unique_ptr<uint8_t[]> data(new uint8_t[num_bytes]);
 	uint8_t *ptr = data.get();
 
+	if (!buffered_sei.empty()) {
+		memcpy(ptr, buffered_sei.data(), buffered_sei.size());
+		ptr += buffered_sei.size();
+		buffered_sei.clear();
+	}
 	for (int i = 0; i < num_nal; ++i) {
 		memcpy(ptr, nal[i].p_payload, nal[i].i_payload);
 		ptr += nal[i].i_payload;
diff --git a/x264_encoder.h b/x264_encoder.h
index e146cd2..729cb7f 100644
--- a/x264_encoder.h
+++ b/x264_encoder.h
@@ -27,22 +27,28 @@
 
 extern "C" {
 #include "x264.h"
+#include <libavformat/avformat.h>
 }
 
 class Mux;
 
 class X264Encoder {
 public:
-	X264Encoder(Mux *httpd);  // Does not take ownership.
+	X264Encoder(AVOutputFormat *oformat);  // Does not take ownership.
 
 	// Called after the last frame. Will block; once this returns,
 	// the last data is flushed.
 	~X264Encoder();
 
+	// Must be called before first frame. Does not take ownership.
+	void set_mux(Mux *mux) { this->mux = mux; }
+
 	// <data> is taken to be raw NV12 data of WIDTHxHEIGHT resolution.
 	// Does not block.
 	void add_frame(int64_t pts, int64_t duration, const uint8_t *data);
 
+	std::string get_global_headers() const { return global_headers; }
+
 private:
 	struct QueuedFrame {
 		int64_t pts, duration;
@@ -58,6 +64,10 @@ private:
 	std::unique_ptr<uint8_t[]> frame_pool;
 
 	Mux *mux = nullptr;
+	bool wants_global_headers;
+
+	std::string global_headers;
+	std::string buffered_sei;  // Will be output before first frame, if any.
 
 	std::thread encoder_thread;
 	std::atomic<bool> should_quit{false};
-- 
2.39.2