From 5963b81ab39d0509f42ed503cf5322147ebb1a7e Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sat, 2 Mar 2024 18:43:35 +0100
Subject: [PATCH] Add CEF support to Kaeru.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This allows you to have a fully headless encoder that just shows a web page
(perhaps as a “stream is down” view or similar), although with no sound.
Request by Marcus Nilsen.
---
 meson.build                    |   4 +-
 nageru/cef_encoder_adapter.cpp | 134 +++++++++++++++++++++++++++++++++
 nageru/cef_encoder_adapter.h   |  60 +++++++++++++++
 nageru/flags.cpp               |  17 +++++
 nageru/flags.h                 |   3 +
 nageru/kaeru.cpp               | 102 ++++++++++++++++++-------
 6 files changed, 291 insertions(+), 29 deletions(-)
 create mode 100644 nageru/cef_encoder_adapter.cpp
 create mode 100644 nageru/cef_encoder_adapter.h
diff --git a/meson.build b/meson.build
index b190b4a..cf7301c 100644
--- a/meson.build
+++ b/meson.build
@@ -85,6 +85,7 @@ nageru_link_with = []
 nageru_build_rpath = ''
 nageru_install_rpath = ''
 
+kaeru_srcs = ['nageru/kaeru.cpp']
 kaeru_link_with = []
 kaeru_extra_deps = []
 
@@ -236,6 +237,7 @@ nageru_include_dirs += decklink_dir
 # CEF input.
 if have_cef
 	nageru_srcs += ['nageru/nageru_cef_app.cpp', 'nageru/cef_capture.cpp']
+	kaeru_srcs += ['nageru/cef_encoder_adapter.cpp', 'nageru/nageru_cef_app.cpp', 'nageru/cef_capture.cpp']
 endif
 
 nageru_srcs += qt_files
@@ -270,7 +272,7 @@ executable('nageru', 'nageru/main.cpp',
 meson.add_install_script('nageru/scripts/setup_nageru_symlink.sh')
 
 # Kaeru executable.
-executable('kaeru', 'nageru/kaeru.cpp',
+executable('kaeru', kaeru_srcs,
 	dependencies: [nageru_deps, kaeru_extra_deps],
 	include_directories: nageru_include_dirs,
 	link_with: [stream, aux, kaeru_link_with],
diff --git a/nageru/cef_encoder_adapter.cpp b/nageru/cef_encoder_adapter.cpp
new file mode 100644
index 0000000..a6c7ea8
--- /dev/null
+++ b/nageru/cef_encoder_adapter.cpp
@@ -0,0 +1,134 @@
+#include "cef_encoder_adapter.h"
+
+#include <bmusb/bmusb.h>
+#include <chrono>
+#include <mutex>
+#include <optional>
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+#include <libavutil/rational.h>
+#include <libswscale/swscale.h>
+}
+
+#include "flags.h"
+#include "quittable_sleeper.h"
+
+using namespace bmusb;
+using namespace std;
+using namespace std::chrono;
+
+class FFmpegCapture;
+
+// Defined in kaeru.cpp and redeclared here. (This is a bit of a hack, but in the interest of a cleaner split.)
+void video_frame_callback(FFmpegCapture *video, X264Encoder *x264_encoder, AudioEncoder *audio_encoder,
+                          int64_t video_pts, AVRational video_timebase,
+                          int64_t audio_pts, AVRational audio_timebase,
+                          uint16_t timecode,
+                          FrameAllocator::Frame video_frame, size_t video_offset, VideoFormat video_format,
+                          FrameAllocator::Frame audio_frame, size_t audio_offset, AudioFormat audio_format);
+
+// Frame callback for CEF capture: timestamps the frame, converts BGRA to NV12 and forwards it to ::video_frame_callback().
+void CEFEncoderAdapter::video_frame_callback(uint16_t timecode,
+                                             FrameAllocator::Frame video_frame, size_t video_offset, VideoFormat video_format,
+                                             FrameAllocator::Frame audio_frame, size_t audio_offset, AudioFormat audio_format)
+{
+	lock_guard lock(mu);
+	if (first_frame) {
+		start = steady_clock::now();  // All PTS values are measured from the first callback.
+		first_frame = false;
+	}
+
+	last_video_format = video_format;  // Remembered so that duplicate_frame_if_needed() can reuse them.
+	last_audio_format = audio_format;
+
+	steady_clock::time_point now = steady_clock::now();
+	int64_t video_pts = duration_cast<microseconds>(now - start).count();  // Microseconds, i.e. in units of the class-wide video_timebase.
+
+	FrameAllocator::Frame nv12_video_frame;
+	if (video_frame.len > 0) {
+		last_frame = now;
+
+		// Convert BGRA to NV12 into our own buffer (nv12_data), replacing the frame.
+		const uint8_t *src_pic_data[4] = { video_frame.data + video_offset, nullptr, nullptr, nullptr };
+		int src_linesizes[4] = { global_flags.width * 4, 0, 0, 0 };
+
+		uint8_t *pic_data[4] = { nv12_data.get(), nv12_data.get() + global_flags.width * global_flags.height, nullptr, nullptr };
+		int linesizes[4] = { global_flags.width, global_flags.width, 0, 0 };
+
+		sws_scale(sws_ctx.get(), src_pic_data, src_linesizes, 0, video_format.height, pic_data, linesizes);
+
+		nv12_video_frame.data = nv12_data.get();
+		nv12_video_frame.len = nv12_video_frame.size = global_flags.width * global_flags.height * 2;
+		if (video_frame.owner) {
+			video_frame.owner->release_frame(video_frame);  // The source frame is no longer needed after conversion.
+		}
+	} else {
+		nv12_video_frame = video_frame;
+		// Will be released by video_frame_callback(), so don't do that here.
+	}
+
+	// NOTE: We don't have CEF audio yet.
+
+	::video_frame_callback(nullptr, x264_encoder, audio_encoder,
+	                       video_pts, video_timebase,
+	                       /*audio_pts=*/0, AVRational{ 1, 1 },
+	                       timecode,
+	                       nv12_video_frame, /*video_offset=*/0, video_format,
+	                       audio_frame, audio_offset, audio_format);
+}
+
+// Enforce at least 15 fps by duplicating frames. This feels suboptimal
+// on the face of it, but so many things in x264 (in particular lookahead
+// and keyframe interval) work on the number of frames, so not having
+// frames for a while will mess up the latency pretty badly, especially
+// when clients do the initial join on the stream.
+//
+// Returns early (without any duplication) if should_quit becomes active.
+void CEFEncoderAdapter::duplicate_frame_if_needed(QuittableSleeper *should_quit)
+{
+	constexpr duration max_frame_interval = milliseconds(1000 / 15);  // Integer division, so 66 ms.
+
+	optional<steady_clock::time_point> last_frame_now = get_last_frame();  // Takes mu itself, so mu must not be held here.
+	if (!last_frame_now) {
+		// No initial frame yet, so nothing to duplicate. Just wait a bit.
+		should_quit->sleep_for(max_frame_interval);
+		return;
+	}
+	steady_clock::time_point next_inserted_frame = *last_frame_now + max_frame_interval;
+	if (!should_quit->sleep_until(next_inserted_frame)) {  // Sleeps without holding mu, so frames can arrive meanwhile.
+		// Asked to quit.
+		return;
+	}
+	lock_guard lock(mu);  // Held through the encoder call below; video_frame_callback() takes the same mutex.
+	if (*last_frame_now != last_frame) {
+		// A new frame came while we were waiting, so we don't need one just yet.
+		return;
+	}
+
+	// No new frame came before the deadline, so add a duplicate. The data from last
+	// conversion is still in the buffer, so we can just use that.
+	steady_clock::time_point now = steady_clock::now();
+	int64_t video_pts = duration_cast<microseconds>(now - start).count();
+
+	last_frame = now;  // The duplicate counts as a frame, so the next deadline is measured from it.
+	FrameAllocator::Frame nv12_video_frame;
+	nv12_video_frame.data = nv12_data.get();
+	nv12_video_frame.len = nv12_video_frame.size = global_flags.width * global_flags.height * 2;
+	::video_frame_callback(nullptr, x264_encoder, audio_encoder,
+			       video_pts, video_timebase,
+			       /*audio_pts=*/0, AVRational{ 1, 1 },
+			       /*timecode=*/0,
+			       nv12_video_frame, /*video_offset=*/0, last_video_format,
+			       /*audio_frame=*/{}, /*audio_offset=*/0, last_audio_format);
+}
+
+// Returns the time of the last frame sent on (real or duplicate), or nullopt before the first callback.
+optional<steady_clock::time_point> CEFEncoderAdapter::get_last_frame() const {
+	lock_guard lock(mu);
+	if (!first_frame) {
+		return last_frame;
+	}
+	return nullopt;
+}
diff --git a/nageru/cef_encoder_adapter.h b/nageru/cef_encoder_adapter.h
new file mode 100644
index 0000000..989520c
--- /dev/null
+++ b/nageru/cef_encoder_adapter.h
@@ -0,0 +1,60 @@
+#ifndef _CEF_ENCODER_ADAPTER
+#define _CEF_ENCODER_ADAPTER 1
+
+#include <stddef.h>
+#include <stdint.h>
+#include <bmusb/bmusb.h>
+#include <chrono>
+#include <optional>
+#include <mutex>
+#include <memory>
+
+#include "nageru/quittable_sleeper.h"
+#include "shared/ffmpeg_raii.h"
+
+extern "C" {
+#include <libavutil/rational.h>
+#include <libswscale/swscale.h>
+#include <libavutil/pixfmt.h>
+}
+
+class X264Encoder;
+class AudioEncoder;
+
+// For use in Kaeru, where we don't have a full mixer; converts the video data
+// from BGRA to NV12 (as CEF cannot produce NV12), and also deals with the fact
+// that CEF doesn't produce a steady stream of frames (see comments
+// on duplicate_frame_if_needed()).
+class CEFEncoderAdapter {
+public:
+	// Does not take ownership of the encoders. The same width/height is used for the BGRA input and the NV12 output.
+	CEFEncoderAdapter(unsigned width, unsigned height, X264Encoder *x264_encoder, AudioEncoder *audio_encoder)
+		: nv12_data(new uint8_t[width * height * 2]),
+		  sws_ctx(sws_getContext(width, height, AV_PIX_FMT_BGRA,
+			width, height, AV_PIX_FMT_NV12,
+			SWS_BICUBIC, nullptr, nullptr, nullptr)),
+		  x264_encoder(x264_encoder),
+		  audio_encoder(audio_encoder) {}
+
+	void video_frame_callback(uint16_t timecode,
+	                          bmusb::FrameAllocator::Frame video_frame, size_t video_offset, bmusb::VideoFormat video_format,
+	                          bmusb::FrameAllocator::Frame audio_frame, size_t audio_offset, bmusb::AudioFormat audio_format);
+	void duplicate_frame_if_needed(QuittableSleeper *should_quit);
+
+private:
+	std::optional<std::chrono::steady_clock::time_point> get_last_frame() const;  // Locks mu; do not call with mu held.
+
+	mutable std::mutex mu;  // Protects all data members.
+	std::unique_ptr<uint8_t[]> nv12_data;  // Output of the most recent conversion; reused when duplicating a frame.
+	SwsContextWithDeleter sws_ctx;  // BGRA -> NV12 converter.
+	std::chrono::steady_clock::time_point start;  // Time of the first callback; origin for all PTS values.
+	std::chrono::steady_clock::time_point last_frame;  // When the last frame (real or duplicate) was sent on.
+	bmusb::VideoFormat last_video_format;  // Formats from the most recent callback, used for duplicates.
+	bmusb::AudioFormat last_audio_format;
+	bool first_frame = true;  // True until video_frame_callback() has run once.
+	X264Encoder *x264_encoder;  // Not owned.
+	AudioEncoder *audio_encoder;  // Not owned.
+	static constexpr AVRational video_timebase{ 1, 1000000 };  // PTS values are in microseconds.
+};
+
+#endif
diff --git a/nageru/flags.cpp b/nageru/flags.cpp
index 3231318..afd0306 100644
--- a/nageru/flags.cpp
+++ b/nageru/flags.cpp
@@ -64,6 +64,7 @@ enum LongOption {
 	OPTION_SRT_PASSPHRASE,
 	OPTION_SRT_YOUTUBE_STREAM_KEY,
 	OPTION_SRT_LATENCY,
+	OPTION_CEF,
 	OPTION_NO_TRANSCODE_VIDEO,
 	OPTION_NO_TRANSCODE_AUDIO,
 	OPTION_DISABLE_AUDIO,
@@ -222,6 +223,9 @@ void usage(Program program)
 	fprintf(stderr, "      --no-srt                    disable receiving SRT streams\n");
 #endif
 	if (program == PROGRAM_KAERU) {
+#ifdef HAVE_CEF
+		fprintf(stderr, "      --cef                       render input as a web page (implies --disable-audio)\n");
+#endif
 		fprintf(stderr, "      --no-transcode-video        copy encoded video raw from the source stream\n");
 		fprintf(stderr, "                                    (experimental, must be H.264)\n");
 		fprintf(stderr, "      --no-transcode-audio        copy encoded audio raw from the source stream\n");
@@ -333,6 +337,9 @@ void parse_flags(Program program, int argc, char * const argv[])
 		{ "srt-passphrase", required_argument, 0, OPTION_SRT_PASSPHRASE },
 		{ "srt-youtube-stream-key", required_argument, 0, OPTION_SRT_YOUTUBE_STREAM_KEY },
 		{ "srt-latency", required_argument, 0, OPTION_SRT_LATENCY },
+#endif
+#ifdef HAVE_CEF
+		{ "cef", no_argument, 0, OPTION_CEF },
 #endif
 		{ "no-transcode-video", no_argument, 0, OPTION_NO_TRANSCODE_VIDEO },
 		{ "no-transcode-audio", no_argument, 0, OPTION_NO_TRANSCODE_AUDIO },
@@ -505,6 +512,13 @@ void parse_flags(Program program, int argc, char * const argv[])
 			global_flags.srt_output_latency_ms = atoi(optarg);
 			break;
 #endif
+#ifdef HAVE_CEF
+		// Flags::use_cef only exists in CEF builds, and --cef is only registered there.
+		case OPTION_CEF:
+			global_flags.use_cef = true;
+			global_flags.enable_audio = false;
+			break;
+#endif
 		case OPTION_NO_TRANSCODE_VIDEO:
 			global_flags.transcode_video = false;
 			break;
@@ -784,6 +798,12 @@ void parse_flags(Program program, int argc, char * const argv[])
 		fprintf(stderr, "       (using --http-audio-codec).\n");
 		exit(1);
 	}
+#ifdef HAVE_CEF
+	if (global_flags.enable_audio && global_flags.use_cef) {
+		fprintf(stderr, "ERROR: --cef does not currently support audio.\n");
+		exit(1);
+	}
+#endif
 	if (global_flags.x264_speedcontrol) {
 		if (!global_flags.x264_preset.empty() && global_flags.x264_preset != "faster") {
 			fprintf(stderr, "WARNING: --x264-preset is overridden by --x264-speedcontrol (implicitly uses \"faster\" as base preset)\n");
diff --git a/nageru/flags.h b/nageru/flags.h
index 7704410..fdc6265 100644
--- a/nageru/flags.h
+++ b/nageru/flags.h
@@ -80,6 +80,9 @@ struct Flags {
 	YCbCrInterpretation ycbcr_interpretation[MAX_VIDEO_CARDS];
 	bool transcode_video = true;  // Kaeru only.
 	bool transcode_audio = true;  // Kaeru only.
+#ifdef HAVE_CEF
+	bool use_cef = false;  // Kaeru only. If true, then enable_audio must be false.
+#endif
 	bool enable_audio = true;  // Kaeru only. If false, then transcode_audio is also false.
 	bool use_zerocopy = false;  // Not user-settable.
 	bool fullscreen = false;
diff --git a/nageru/kaeru.cpp b/nageru/kaeru.cpp
index 2bdc686..16ad079 100644
--- a/nageru/kaeru.cpp
+++ b/nageru/kaeru.cpp
@@ -2,6 +2,9 @@
 
 #include "audio_encoder.h"
 #include "basic_stats.h"
+#ifdef HAVE_CEF
+#include "cef_capture.h"
+#endif
 #include "defs.h"
 #include "flags.h"
 #include "ffmpeg_capture.h"
@@ -22,6 +25,7 @@
 #include <errno.h>
 #include <functional>
 #include <memory>
+#include <movit/image_format.h>
 #include <signal.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -47,6 +51,12 @@ extern "C" {
 #include <libavutil/version.h>
 }
 
+#ifdef HAVE_CEF
+#include "cef_encoder_adapter.h"
+#include "nageru_cef_app.h"
+CefRefPtr<NageruCefApp> cef_app;
+#endif
+
 using namespace bmusb;
 using namespace movit;
 using namespace std;
@@ -114,6 +124,7 @@ unique_ptr<Mux> create_mux(HTTPD *httpd, const AVOutputFormat *oformat, X264Enco
 	return mux;
 }
 
+// NOTE: If we start using the timecode for anything, CEFEncoderAdapter will need adjustment.
 void video_frame_callback(FFmpegCapture *video, X264Encoder *x264_encoder, AudioEncoder *audio_encoder,
                           int64_t video_pts, AVRational video_timebase,
                           int64_t audio_pts, AVRational audio_timebase,
@@ -127,7 +138,8 @@ void video_frame_callback(FFmpegCapture *video, X264Encoder *x264_encoder, Audio
 
 		video_pts = av_rescale_q(video_pts, video_timebase, AVRational{ 1, TIMEBASE });
 		int64_t frame_duration = int64_t(TIMEBASE) * video_format.frame_rate_den / video_format.frame_rate_nom;
-		x264_encoder->add_frame(video_pts, frame_duration, video->get_current_frame_ycbcr_format().luma_coefficients, video_frame.data + video_offset, ts);
+		YCbCrLumaCoefficients luma_coefficients = video ? video->get_current_frame_ycbcr_format().luma_coefficients : YCBCR_REC_709;
+		x264_encoder->add_frame(video_pts, frame_duration, luma_coefficients, video_frame.data + video_offset, ts);
 		global_basic_stats->update(frame_num++, /*dropped_frames=*/0);
 	}
 	if (audio_frame.len > 0) {
@@ -233,6 +245,17 @@ void request_quit(int signal)
 
 int main(int argc, char *argv[])
 {
+#ifdef HAVE_CEF
+	// Let CEF have first priority on parsing the command line, because we might be
+	// launched as a CEF sub-process.
+	CefMainArgs main_args(argc, argv);
+	cef_app = CefRefPtr<NageruCefApp>(new NageruCefApp());
+	int err = CefExecuteProcess(main_args, cef_app.get(), nullptr);
+	if (err >= 0) {
+		return err;
+	}
+#endif
+
 	parse_flags(PROGRAM_KAERU, argc, argv);
 	if (optind + 1 != argc) {
 		usage(PROGRAM_KAERU);
@@ -267,33 +290,50 @@ int main(int argc, char *argv[])
 	}
 	global_x264_encoder = x264_encoder.get();
 
-	FFmpegCapture video(argv[optind], global_flags.width, global_flags.height);
-	video.set_pixel_format(FFmpegCapture::PixelFormat_NV12);
-	if (global_flags.transcode_video) {
-		video.set_frame_callback(bind(video_frame_callback, &video, x264_encoder.get(), audio_encoder.get(), _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11));
-	} else {
-		video.set_video_callback(bind(raw_packet_callback, http_mux.get(), /*stream_index=*/0, _1, _2));
-	}
-	if (!global_flags.transcode_audio && global_flags.enable_audio) {
-		AVBSFContext *bsfctx = nullptr;
-		if (strcmp(oformat->name, "mp4") == 0 && strcmp(audio_encoder->get_codec()->name, "aac") == 0) {
-			// We need to insert the aac_adtstoasc filter, seemingly (or we will get warnings to do so).
-			const AVBitStreamFilter *filter = av_bsf_get_by_name("aac_adtstoasc");
-			int err = av_bsf_alloc(filter, &bsfctx);
-			if (err < 0) {
-				fprintf(stderr, "av_bsf_alloc() failed with %d\n", err);
-				exit(1);
-			}
-		}
-		if (bsfctx == nullptr) {
-			video.set_audio_callback(bind(raw_packet_callback, http_mux.get(), /*stream_index=*/1, _1, _2));
-		} else {
-			video.set_audio_callback(bind(filter_packet_callback, http_mux.get(), /*stream_index=*/1, bsfctx, _1, _2));
-		}
+	CaptureInterface *video;
+	unique_ptr<FFmpegCapture> ffmpeg_video;
+#ifdef HAVE_CEF
+	unique_ptr<CEFCapture> cef_video;
+	unique_ptr<CEFEncoderAdapter> cef_encoder_adapter;
+	if (global_flags.use_cef) {
+		cef_encoder_adapter.reset(new CEFEncoderAdapter(global_flags.width, global_flags.height, x264_encoder.get(), audio_encoder.get()));
+		cef_video.reset(new CEFCapture(argv[optind], global_flags.width, global_flags.height));
+		cef_video->set_pixel_format(bmusb::PixelFormat_8BitBGRA);
+		cef_video->set_frame_callback(bind(&CEFEncoderAdapter::video_frame_callback, cef_encoder_adapter.get(), _1, _2, _3, _4, _5, _6, _7));
+		// NOTE: No CEF audio support yet.
+		video = cef_video.get();
+	} else
+#endif
+	{
+	       ffmpeg_video.reset(new FFmpegCapture(argv[optind], global_flags.width, global_flags.height));
+	       ffmpeg_video->set_pixel_format(FFmpegCapture::PixelFormat_NV12);
+	       if (global_flags.transcode_video) {
+		       ffmpeg_video->set_frame_callback(bind(video_frame_callback, ffmpeg_video.get(), x264_encoder.get(), audio_encoder.get(), _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11));
+	       } else {
+		       ffmpeg_video->set_video_callback(bind(raw_packet_callback, http_mux.get(), /*stream_index=*/0, _1, _2));
+	       }
+	       if (!global_flags.transcode_audio && global_flags.enable_audio) {
+		       AVBSFContext *bsfctx = nullptr;
+		       if (strcmp(oformat->name, "mp4") == 0 && strcmp(audio_encoder->get_codec()->name, "aac") == 0) {
+			       // We need to insert the aac_adtstoasc filter, seemingly (or we will get warnings to do so).
+			       const AVBitStreamFilter *filter = av_bsf_get_by_name("aac_adtstoasc");
+			       int err = av_bsf_alloc(filter, &bsfctx);
+			       if (err < 0) {
+				       fprintf(stderr, "av_bsf_alloc() failed with %d\n", err);
+				       exit(1);
+			       }
+		       }
+		       if (bsfctx == nullptr) {
+			       ffmpeg_video->set_audio_callback(bind(raw_packet_callback, http_mux.get(), /*stream_index=*/1, _1, _2));
+		       } else {
+			       ffmpeg_video->set_audio_callback(bind(filter_packet_callback, http_mux.get(), /*stream_index=*/1, bsfctx, _1, _2));
+		       }
+	       }
+	       ffmpeg_video->change_rate(10.0);  // Play as fast as possible.
+	       video = ffmpeg_video.get();
 	}
-	video.configure_card();
-	video.start_bm_capture();
-	video.change_rate(10.0);  // Play as fast as possible.
+	video->configure_card();
+	video->start_bm_capture();
 
 	BasicStats basic_stats(/*verbose=*/false, /*use_opengl=*/false);
 	global_basic_stats = &basic_stats;
@@ -304,10 +344,16 @@ int main(int argc, char *argv[])
 	signal(SIGINT, request_quit);
 
 	while (!should_quit.should_quit()) {
+#ifdef HAVE_CEF
+		if (global_flags.use_cef) {
+			cef_encoder_adapter->duplicate_frame_if_needed(&should_quit);
+			continue;
+		}
+#endif
 		should_quit.sleep_for(hours(1000));
 	}
 
-	video.stop_dequeue_thread();
+	video->stop_dequeue_thread();
 	// Stop the x264 encoder before killing the mux it's writing to.
 	global_x264_encoder = nullptr;
 	x264_encoder.reset();
-- 
2.39.2