From 16c0e5da7fa7b4eeea79470c24697a1ba193f071 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Fri, 29 Apr 2016 02:10:00 +0200
Subject: [PATCH] Implement x264 speedcontrol.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Based on the speedcontrol patch series from x264-obe, but bugfixed,
reworked and pulled out so that it can run with vanilla x264.

Note that vanilla x264 right now has a bug that this patch exposes;
you want the patch called “Fix corruption with reconfig which changed
SPS (like change of --ref)” (currently in x264-sandbox).
---
 Makefile               |   2 +-
 flags.cpp              |  19 +++
 flags.h                |   4 +-
 x264_encoder.cpp       |  20 +++
 x264_encoder.h         |   2 +
 x264_speed_control.cpp | 313 +++++++++++++++++++++++++++++++++++++++++
 x264_speed_control.h   | 116 +++++++++++++++
 7 files changed, 474 insertions(+), 2 deletions(-)
 create mode 100644 x264_speed_control.cpp
 create mode 100644 x264_speed_control.h

diff --git a/Makefile b/Makefile
index 4c2df49..cfcdedd 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ OBJS += glwidget.moc.o mainwindow.moc.o vumeter.moc.o lrameter.moc.o correlation
 OBJS += mixer.o bmusb/bmusb.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o resampling_queue.o httpd.o ebu_r128_proc.o flags.o image_input.o stereocompressor.o filter.o alsa_output.o correlation_measurer.o
 
 # Streaming and encoding objects
-OBJS += quicksync_encoder.o x264_encoder.o video_encoder.o metacube2.o mux.o audio_encoder.o
+OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o
 
 # DeckLink
 OBJS += decklink_capture.o decklink/DeckLinkAPIDispatch.o
diff --git a/flags.cpp b/flags.cpp
index 7e8beaf..25f11b4 100644
--- a/flags.cpp
+++ b/flags.cpp
@@ -25,6 +25,8 @@ void usage()
 	fprintf(stderr, "      --http-x264-video           send x264-compressed video to HTTP clients\n");
 	fprintf(stderr, "      --x264-preset               x264 quality preset (default " X264_DEFAULT_PRESET ")\n");
 	fprintf(stderr, "      --x264-tune                 x264 tuning (default " X264_DEFAULT_TUNE ", can be blank)\n");
+	fprintf(stderr, "      --x264-speedcontrol         try to match x264 preset to available CPU speed\n");
+	fprintf(stderr, "      --x264-speedcontrol-verbose  output speedcontrol debugging statistics\n");
 	fprintf(stderr, "      --x264-bitrate              x264 bitrate (in kilobit/sec, default %d)\n",
 		DEFAULT_X264_OUTPUT_BIT_RATE);
 	fprintf(stderr, "      --x264-vbv-bufsize          x264 VBV size (in kilobits, 0 = one-frame VBV,\n");
@@ -58,6 +60,8 @@ void parse_flags(int argc, char * const argv[])
 		{ "http-x264-video", no_argument, 0, 1008 },
 		{ "x264-preset", required_argument, 0, 1009 },
 		{ "x264-tune", required_argument, 0, 1010 },
+		{ "x264-speedcontrol", no_argument, 0, 1015 },
+		{ "x264-speedcontrol-verbose", no_argument, 0, 1016 },
 		{ "x264-bitrate", required_argument, 0, 1011 },
 		{ "x264-vbv-bufsize", required_argument, 0, 1012 },
 		{ "x264-vbv-max-bitrate", required_argument, 0, 1013 },
@@ -128,6 +132,12 @@ void parse_flags(int argc, char * const argv[])
 		case 1010:
 			global_flags.x264_tune = optarg;
 			break;
+		case 1015:
+			global_flags.x264_speedcontrol = true;
+			break;
+		case 1016:
+			global_flags.x264_speedcontrol_verbose = true;
+			break;
 		case 1011:
 			global_flags.x264_bitrate = atoi(optarg);
 			break;
@@ -162,6 +172,15 @@ void parse_flags(int argc, char * const argv[])
 		fprintf(stderr, "ERROR: --http-uncompressed-video and --http-x264-video are mutually incompatible\n");
 		exit(1);
 	}
+	if (global_flags.x264_speedcontrol) {
+		if (!global_flags.x264_preset.empty() && global_flags.x264_preset != "faster") {
+			fprintf(stderr, "WARNING: --x264-preset is overridden by --x264-speedcontrol (implicitly uses \"faster\" as base preset)\n");
+		}
+		global_flags.x264_preset = "faster";
+	} else if (global_flags.x264_preset.empty()) {
+		global_flags.x264_preset = X264_DEFAULT_PRESET;
+	}
+
 	for (pair<int, int> mapping : global_flags.default_stream_mapping) {
 		if (mapping.second >= global_flags.num_cards) {
 			fprintf(stderr, "ERROR: Signal %d mapped to card %d, which doesn't exist (try adjusting --num-cards)\n",
diff --git a/flags.h b/flags.h
index c3ea0fc..edc47dd 100644
--- a/flags.h
+++ b/flags.h
@@ -18,8 +18,10 @@ struct Flags {
 	bool stream_coarse_timebase = false;
 	std::string stream_audio_codec_name;  // Blank = use the same as for the recording.
 	int stream_audio_codec_bitrate = DEFAULT_AUDIO_OUTPUT_BIT_RATE;  // Ignored if stream_audio_codec_name is blank.
-	std::string x264_preset = X264_DEFAULT_PRESET;
+	std::string x264_preset;  // Empty will be overridden by X264_DEFAULT_PRESET, unless speedcontrol is set.
 	std::string x264_tune = X264_DEFAULT_TUNE;
+	bool x264_speedcontrol = false;
+	bool x264_speedcontrol_verbose = false;
 	int x264_bitrate = DEFAULT_X264_OUTPUT_BIT_RATE;  // In kilobit/sec.
 	int x264_vbv_max_bitrate = -1;  // In kilobits. 0 = no limit, -1 = same as <x264_bitrate> (CBR).
 	int x264_vbv_buffer_size = -1;  // In kilobits. 0 = one-frame VBV, -1 = same as <x264_bitrate> (one-second VBV).
diff --git a/x264_encoder.cpp b/x264_encoder.cpp
index df9e994..e19a869 100644
--- a/x264_encoder.cpp
+++ b/x264_encoder.cpp
@@ -6,6 +6,7 @@
 #include "mux.h"
 #include "timebase.h"
 #include "x264_encoder.h"
+#include "x264_speed_control.h"
 
 extern "C" {
 #include <libavformat/avformat.h>
@@ -68,6 +69,9 @@ void X264Encoder::init_x264()
 	param.i_timebase_num = 1;
 	param.i_timebase_den = TIMEBASE;
 	param.i_keyint_max = 50; // About one second.
+	if (global_flags.x264_speedcontrol) {
+		param.i_frame_reference = 16;  // Because speedcontrol is never allowed to change this above what we set at start.
+	}
 
 	// NOTE: These should be in sync with the ones in h264encode.cpp (sbs_rbsp()).
 	param.vui.i_vidformat = 5;  // Unspecified.
@@ -106,6 +110,10 @@ void X264Encoder::init_x264()
 		exit(1);
 	}
 
+	if (global_flags.x264_speedcontrol) {
+		speed_control.reset(new X264SpeedControl(x264, /*f_speed=*/1.0f, X264_QUEUE_LENGTH, /*f_buffer_init=*/1.0f));
+	}
+
 	if (wants_global_headers) {
 		x264_nal_t *nal;
 		int num_nal;
@@ -181,9 +189,21 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf)
 		pic.img.i_stride[1] = WIDTH / 2 * sizeof(uint16_t);
 		pic.opaque = reinterpret_cast<void *>(intptr_t(qf.duration));
 
+		if (speed_control) {
+			speed_control->before_frame(float(free_frames.size()) / X264_QUEUE_LENGTH, X264_QUEUE_LENGTH, 1e6 * qf.duration / TIMEBASE);
+		}
 		x264_encoder_encode(x264, &nal, &num_nal, &pic, &pic);
+		if (speed_control) {
+			speed_control->after_frame();
+		}
 	} else {
+		if (speed_control) {
+			speed_control->before_frame(float(free_frames.size()) / X264_QUEUE_LENGTH, X264_QUEUE_LENGTH, 1e6 * qf.duration / TIMEBASE);
+		}
 		x264_encoder_encode(x264, &nal, &num_nal, nullptr, &pic);
+		if (speed_control) {
+			speed_control->after_frame();
+		}
 	}
 
 	// We really need one AVPacket for the entire frame, it seems,
diff --git a/x264_encoder.h b/x264_encoder.h
index 729cb7f..ef118c6 100644
--- a/x264_encoder.h
+++ b/x264_encoder.h
@@ -31,6 +31,7 @@ extern "C" {
 }
 
 class Mux;
+class X264SpeedControl;
 
 class X264Encoder {
 public:
@@ -72,6 +73,7 @@ private:
 	std::thread encoder_thread;
 	std::atomic<bool> should_quit{false};
 	x264_t *x264;
+	std::unique_ptr<X264SpeedControl> speed_control;
 
 	// Protects everything below it.
 	std::mutex mu;
diff --git a/x264_speed_control.cpp b/x264_speed_control.cpp
new file mode 100644
index 0000000..3ed3ece
--- /dev/null
+++ b/x264_speed_control.cpp
@@ -0,0 +1,313 @@
+#include "x264_speed_control.h"
+
+#include "flags.h"
+
+#include <time.h>
+
+#include <algorithm>
+
+using namespace std;
+
+X264SpeedControl::X264SpeedControl(x264_t *x264, float f_speed, int i_buffer_size, float f_buffer_init)
+	: x264(x264), f_speed(f_speed)
+{
+	x264_param_t param;
+	x264_encoder_parameters(x264, &param);
+	float fps = (float)param.i_fps_num / param.i_fps_den;
+	uspf = 1e6 / fps;  // Nominal microseconds per frame, from the encoder's configured frame rate.
+	set_buffer_size(i_buffer_size);  // Reads uspf, so it must be called after uspf is set.
+	buffer_fill = buffer_size * f_buffer_init;
+	buffer_fill = min(max<int64_t>(buffer_fill, uspf), buffer_size);  // Clamp to [uspf, buffer_size].
+	timestamp = mdate();
+	preset = -1;  // No preset has been applied yet.
+	cplx_num = 3e3; //FIXME estimate initial complexity
+	cplx_den = .1;
+	stat.min_buffer = buffer_size;
+	stat.max_buffer = 0;
+	stat.avg_preset = 0.0;  // Accumulated with += in before_frame() and read in the destructor,
+	stat.den = 0;           // so these must not be left uninitialized.
+}
+
+X264SpeedControl::~X264SpeedControl()  // Prints summary statistics to stderr on destruction.
+{
+	fprintf(stderr, "speedcontrol: avg preset=%.3f  buffer min=%.3f max=%.3f\n",
+		stat.avg_preset / stat.den,  // Mean preset index over the frames counted in stat.den.
+		(float)stat.min_buffer / buffer_size,  // Buffer extremes as a fraction of buffer_size.
+		(float)stat.max_buffer / buffer_size );
+	//  x264_log( x264, X264_LOG_INFO, "speedcontrol: avg cplx=%.5f\n", cplx_num / cplx_den );
+}
+
+typedef struct  // One row of the speed-control preset table; apply_preset() copies it into x264_param_t.
+{
+	float time; // relative encoding time, compared to the other presets
+	int subme;  // -> analyse.i_subpel_refine
+	int me;  // -> analyse.i_me_method (an X264_ME_* constant)
+	int refs;  // -> i_frame_reference
+	int mix;  // -> analyse.b_mixed_references
+	int trellis;  // -> analyse.i_trellis
+	int partitions;  // -> analyse.inter (bitmask of X264_ANALYSE_* flags)
+	int badapt;  // -> i_bframe_adaptive
+	int bframes;  // -> i_bframe
+	int direct;  // -> analyse.i_direct_mv_pred
+	int merange;  // -> analyse.i_me_range
+} sc_preset_t;
+
+// The actual presets, including the equivalent commandline options. Note that
+// all presets are benchmarked with --weightp 1 --mbtree --rc-lookahead 20
+// on top of the given settings (equivalent settings to the "faster" preset).
+// Timings and SSIM measurements were done on a quadcore Haswell i5 3.2 GHz
+// on the first 1000 frames of "Tears of Steel" in 1080p.
+//
+// Note that the first two and the last two entries are also used for extrapolation
+// should the desired time be outside the range. Thus, it is disadvantageous if
+// they are chosen so that the timings are too close to each other.
+#define SC_PRESETS 26  // Number of entries in presets[] below.
+static const sc_preset_t presets[SC_PRESETS] = {
+#define I4 X264_ANALYSE_I4x4  // Short aliases for the x264 partition-analysis flags; #undef'd after the table.
+#define I8 X264_ANALYSE_I8x8
+#define P4 X264_ANALYSE_PSUB8x8
+#define P8 X264_ANALYSE_PSUB16x16
+#define B8 X264_ANALYSE_BSUB16x16
+	// Preset 0: 14.179db, --preset superfast --b-adapt 0 --bframes 0
+	{ .time= 1.000, .subme=1, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=0, .bframes=0, .direct=0, .merange=16 },
+
+	// Preset 1: 14.459db, --preset superfast
+	{ .time= 1.283, .subme=1, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 2: 14.761db, --preset superfast --subme 2
+	{ .time= 1.603, .subme=2, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 3: 15.543db, --preset veryfast
+	{ .time= 1.843, .subme=2, .me=X264_ME_HEX, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 4: 15.716db, --preset veryfast --subme 3
+	{ .time= 2.452, .subme=3, .me=X264_ME_HEX, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 5: 15.786db, --preset veryfast --subme 3 --ref 2
+	{ .time= 2.733, .subme=3, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 6: 15.813db, --preset veryfast --subme 4 --ref 2
+	{ .time= 3.085, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 7: 15.849db, --preset faster
+	{ .time= 3.101, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 8: 15.857db, --preset faster --mixed-refs
+	{ .time= 3.284, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 9: 15.869db, --preset faster --mixed-refs --subme 5
+	{ .time= 3.587, .subme=5, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 10: 16.051db, --preset fast
+	{ .time= 3.947, .subme=6, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 11: 16.356db, --preset fast --subme 7
+	{ .time= 4.041, .subme=7, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 12: 16.418db, --preset fast --subme 7 --ref 3
+	{ .time= 4.406, .subme=7, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 13: 16.460db, --preset medium
+	{ .time= 4.707, .subme=7, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 14: 16.517db, --preset medium --subme 8
+	{ .time= 5.133, .subme=8, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 15: 16.523db, --preset medium --subme 8 --me umh
+	{ .time= 6.050, .subme=8, .me=X264_ME_UMH, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 16: 16.543db, --preset medium --subme 8 --me umh --direct auto --b-adapt 2
+	{ .time= 6.849, .subme=8, .me=X264_ME_UMH, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 17: 16.613db, --preset slow
+	{ .time= 8.042, .subme=8, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 18: 16.641db, --preset slow --subme 9
+	{ .time= 8.972, .subme=9, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 19: 16.895db, --preset slow --subme 9 --trellis 2
+	{ .time=10.073, .subme=9, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 20: 16.918db, --preset slow --subme 9 --trellis 2 --ref 6
+	{ .time=11.147, .subme=9, .me=X264_ME_UMH, .refs=6, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 21: 16.934db, --preset slow --subme 9 --trellis 2 --ref 7
+	{ .time=12.267, .subme=9, .me=X264_ME_UMH, .refs=7, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 22: 16.948db, --preset slower
+	{ .time=13.829, .subme=9, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 23: 17.058db, --preset slower --subme 10
+	{ .time=14.831, .subme=10, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 24: 17.268db, --preset slower --subme 10 --bframes 8
+	{ .time=18.705, .subme=10, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=8, .direct=3, .merange=16 },
+
+	// Preset 25: 17.297db, --preset veryslow
+	{ .time=31.419, .subme=10, .me=X264_ME_UMH, .refs=16, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=8, .direct=3, .merange=24 },
+#undef I4
+#undef I8
+#undef P4
+#undef P8
+#undef B8
+};
+
+// Called right before each x264_encoder_encode(): folds the previous frame's measured encode time
+// into the complexity estimate, then chooses and applies the preset for the upcoming frame.
+void X264SpeedControl::before_frame(float new_buffer_fill, int new_buffer_size, float new_uspf)
+{
+	if (new_uspf > 0.0) {
+		uspf = new_uspf;
+	}
+	if (new_buffer_size) {
+		set_buffer_size(new_buffer_size);
+	}
+	buffer_fill = buffer_size * new_buffer_fill;  // new_buffer_fill is a fraction of the buffer size.
+	int64_t t, delta_t;
+
+	// update buffer state after encoding and outputting the previous frame(s)
+	if (first) {
+		t = timestamp = mdate();
+		first = false;
+	} else {
+		t = mdate();
+	}
+
+	delta_t = t - timestamp;
+	timestamp = t;
+
+	// update the time predictor (skipped while preset is still -1: nothing has been encoded yet, and presets[-1] is out of bounds)
+	if (preset >= 0) {
+		cplx_num *= cplx_decay;
+		cplx_den *= cplx_decay;
+		cplx_num += cpu_time_last_frame / presets[preset].time;
+		++cplx_den;
+		stat.avg_preset += preset;
+		++stat.den;
+	}
+
+	stat.min_buffer = min(buffer_fill, stat.min_buffer);
+	stat.max_buffer = max(buffer_fill, stat.max_buffer);
+
+	if (buffer_fill >= buffer_size) { // oops, cpu was idle
+		// not really an error, but we'll warn for debugging purposes
+		static int64_t idle_t = 0, print_interval = 0;
+		idle_t += buffer_fill - buffer_size;
+		if (t - print_interval > 1e6) {
+			//fprintf(stderr, "speedcontrol idle (%.6f sec)\n", idle_t/1e6);
+			print_interval = t;
+			idle_t = 0;
+		}
+		buffer_fill = buffer_size;
+	} else if (buffer_fill <= 0) {  // oops, we're late
+		// fprintf(stderr, "speedcontrol underflow (%.6f sec)\n", buffer_fill/1e6);
+	}
+
+	{
+		// Pick the preset that should return the buffer to 3/4-full within a time
+		// specified by compensation_period.
+		//
+		// NOTE: This doesn't actually do that, at least assuming the same target is
+		// chosen for every frame; exactly what it does is unclear to me. It seems
+		// to consistently undershoot a bit, so it needs to be saved by the second
+		// predictor below. However, fixing the formula seems to yield somewhat less
+		// stable results in practice; in particular, once the buffer is half-full
+		// or so, it would give us a negative target. Perhaps increasing
+		// compensation_period would be a good idea, but initial (very brief) tests
+		// did not yield good results.
+		float target = uspf / f_speed
+			* (buffer_fill + compensation_period)
+			/ (buffer_size*3/4 + compensation_period);
+		float cplx = cplx_num / cplx_den;
+		float set, t0, t1;
+		float filled = (float) buffer_fill / buffer_size;
+		int i;
+		t0 = presets[0].time * cplx;
+		for (i = 1; ; i++) {
+			t1 = presets[i].time * cplx;
+			if (t1 >= target || i == SC_PRESETS - 1)
+				break;
+			t0 = t1;
+		}
+		// exponential interpolation between states
+		set = i-1 + (log(target) - log(t0)) / (log(t1) - log(t0));
+		set = max<float>(set, -5);
+		set = min<float>(set, (SC_PRESETS-1) + 5);
+		// Even if our time estimations in the SC_PRESETS array are off
+		// this will push us towards our target fullness
+		float s1 = set;
+		set += (40 * (filled-0.75));
+		float s2 = (40 * (filled-0.75));
+		set = min<float>(max<float>(set, 0), SC_PRESETS - 1);
+		apply_preset(dither_preset(set));
+
+		if (global_flags.x264_speedcontrol_verbose) {
+			static float cpu, wall, tgt, den;
+			const float decay = 1-1/100.;
+			cpu = cpu*decay + cpu_time_last_frame;
+			wall = wall*decay + delta_t;
+			tgt = tgt*decay + target;
+			den = den*decay + 1;
+			fprintf(stderr, "speed: %.2f+%.2f %d[%.5f] (t/c/w: %6.0f/%6.0f/%6.0f = %.4f) fps=%.2f\r",
+					s1, s2, preset, (float)buffer_fill / buffer_size,
+					tgt/den, cpu/den, wall/den, cpu/wall, 1e6*den/wall );
+		}
+	}
+}
+
+void X264SpeedControl::after_frame()  // Call immediately after x264_encoder_encode().
+{
+	cpu_time_last_frame = mdate() - timestamp;  // Elapsed microseconds since before_frame() set timestamp.
+}
+
+void X264SpeedControl::set_buffer_size(int new_buffer_size)
+{
+	const int frames = max(3, new_buffer_size);  // Never work with fewer than three frames.
+	buffer_size = frames * uspf;  // Frames -> microseconds.
+	cplx_decay = 1 - 1./frames;  // Per-frame decay of the rolling complexity average.
+	compensation_period = buffer_size/4;
+}
+
+int X264SpeedControl::dither_preset(float f)
+{
+	int whole = f;  // Truncates toward zero...
+	if (f < 0) {
+		--whole;  // ...so step down once more for negative input.
+	}
+	dither += f - whole;  // Carry the fractional remainder over to later calls.
+	if (dither >= 1.0) {
+		dither--;
+		return whole + 1;
+	}
+	return whole;
+}
+
+void X264SpeedControl::apply_preset(int new_preset)
+{
+	// Clamp into the table, then overlay that entry's settings on the encoder's current parameters.
+	new_preset = min(max(new_preset, 0), SC_PRESETS - 1);
+	const sc_preset_t &chosen = presets[new_preset];
+
+	x264_param_t params;
+	x264_encoder_parameters(x264, &params);
+	params.i_frame_reference = chosen.refs;
+	params.i_bframe_adaptive = chosen.badapt;
+	params.i_bframe = chosen.bframes;
+	params.analyse.inter = chosen.partitions;
+	params.analyse.i_subpel_refine = chosen.subme;
+	params.analyse.i_me_method = chosen.me;
+	params.analyse.i_trellis = chosen.trellis;
+	params.analyse.b_mixed_references = chosen.mix;
+	params.analyse.i_direct_mv_pred = chosen.direct;
+	params.analyse.i_me_range = chosen.merange;
+	x264_encoder_reconfig(x264, &params);
+
+	preset = new_preset;  // Remembered so before_frame() can scale the next timing by this entry.
+}
+
+int64_t X264SpeedControl::mdate()
+{
+	timespec now;
+	clock_gettime(CLOCK_MONOTONIC, &now);  // Current CLOCK_MONOTONIC reading.
+	return int64_t(now.tv_sec) * 1000000 + now.tv_nsec / 1000;  // Widen first: time_t may be only 32 bits.
+}
diff --git a/x264_speed_control.h b/x264_speed_control.h
new file mode 100644
index 0000000..b498826
--- /dev/null
+++ b/x264_speed_control.h
@@ -0,0 +1,116 @@
+// The x264 speed control tries to encode video at maximum possible quality
+// without skipping frames (at the expense of higher encoding latency and
+// less even output rates, although VBV is still respected). It does this
+// by continuously (every frame) changing the x264 quality settings such that
+// it uses maximum amount of CPU, but no more.
+//
+// Speed control works by maintaining a queue of frames, with the confusing
+// nomenclature “full” meaning that there are no frames in the queue.
+// (Conversely, if the queue is “empty” and a new frame comes in, we need to
+// drop that frame.) It tries to keep the buffer 3/4 “full” by using a table
+// of measured relative speeds for the different presets, and choosing one that it
+// thinks will return the buffer to that state over time. However, since
+// different frames take different times to encode regardless of preset, it
+// also tries to maintain a running average of how long the typical frame will
+// take to encode at the fastest preset (the so-called “complexity”), by dividing
+// the actual time by the relative time for the preset used.
+//
+// Frame timing is a complex topic in its own right, since usually, multiple
+// frames are encoded in parallel. X264SpeedControl only supports the timing
+// method that the original patch calls “alternate timing”; one simply measures
+// the time the last x264_encoder_encode() call took. (The other alternative given
+// is to measure the time between successive x264_encoder_encode() calls.)
+// Unless using the zerocopy presets (which activate slice threading), the function
+// actually returns not when the given frame is done encoding, but when one a few
+// frames back is done encoding. So it doesn't actually measure the time of any
+// given one frame, but it measures something correlated to it, at least as long as
+// you are near 100% CPU utilization (ie., the encoded frame doesn't linger in the
+// buffers already when x264_encoder_encode() is called).
+//
+// The code has a long history; it was originally part of Avail Media's x264
+// branch, used in their encoder appliances, and then a snapshot of that was
+// released. (Given that x264 is licensed under GPLv2 or newer, this means that
+// we can also treat the patch as GPLv2 or newer if we want, which we do.
+// As far as I know, it is copyright Avail Media, although no specific copyright
+// notice was posted on the patch.)
+//
+// From there, it was incorporated in OBE's x264 tree (x264-obe) and some bugs
+// were fixed. I started working on it for the purposes of Nageru, fixing various
+// issues, adding VFR support and redoing the timings entirely based on more
+// modern presets (the patch was made before several important x264 features,
+// such as weighted P-frames). Finally, I took it out of x264 and put it into
+// Nageru (it does not actually use any hooks into the codec itself), so that
+// one does not need to patch x264 to use it in Nageru. It still could do with
+// some cleanup, but it's much, much better than just using a static preset.
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+extern "C" {
+#include "x264.h"
+}
+
+class X264SpeedControl {
+public:
+	// x264: Encoding object we are using; must be opened. Assumed to be
+	//    set to the "faster" preset, and with 16 reference frames.
+	// f_speed: Relative encoding speed, usually 1.0.
+	// i_buffer_size: Number of frames in the buffer.
+	// f_buffer_init: Relative fullness of buffer at start
+	//    (0.0 = assumed to be <i_buffer_size> frames in buffer,
+	//     1.0 = no frames in buffer)
+	X264SpeedControl(x264_t *x264, float f_speed, int i_buffer_size, float f_buffer_init);
+	~X264SpeedControl();
+
+	// You need to call before_frame() immediately before each call to
+	// x264_encoder_encode(), and after_frame() immediately after.
+	//
+	// new_buffer_fill: Buffer fullness as a relative number, just like
+	//   f_buffer_init in the constructor (it is scaled by the buffer size).
+	// new_buffer_size: If > 0, new number of frames in the buffer,
+	//   ie. the buffer size has changed. (It is harmless to set this
+	//   even if the buffer hasn't actually changed.)
+	// f_uspf: If > 0, new microseconds per frame, ie. the frame rate has
+	//   changed. (Of course, with VFR, it can be impossible to truly know
+	//   the frame rate of the coming frames, but it is a reasonable
+	//   assumption that the next second or so is likely to be the same
+	//   frame rate as the last frame.)
+	void before_frame(float new_buffer_fill, int new_buffer_size, float f_uspf);
+	void after_frame();
+
+private:
+	void set_buffer_size(int new_buffer_size);  // Takes a size in frames; updates buffer_size, cplx_decay, compensation_period.
+	int dither_preset(float f);  // Turns a fractional preset into an integer one, carrying the remainder in <dither>.
+	void apply_preset(int new_preset);  // Clamps to the preset table and reconfigures the encoder.
+	int64_t mdate();  // Current time in microseconds.
+
+	// Not owned by us.
+	x264_t *x264;
+
+	float f_speed;
+
+	// all times are in usec
+	int64_t timestamp;   // when was speedcontrol last invoked
+	int64_t cpu_time_last_frame = 0;    // time spent encoding the previous frame
+	int64_t buffer_size; // assumed application-side buffer of frames to be streamed (measured in microseconds),
+	int64_t buffer_fill; //   where full = we don't have to hurry
+	int64_t compensation_period; // how quickly we try to return to the target buffer fullness
+	float uspf;          // microseconds per frame
+	int preset = -1;     // which setting was used in the previous frame
+	float cplx_num = 3e3;  // rolling average of estimated spf for preset #0. FIXME estimate initial complexity
+	float cplx_den = .1;  // denominator (weight) of that rolling average; the estimate is cplx_num / cplx_den
+	float cplx_decay;  // per-frame decay applied to cplx_num and cplx_den; set by set_buffer_size()
+	float dither = 0.0f;  // fractional preset remainder carried between dither_preset() calls
+
+	bool first = true;  // true until the first before_frame() call
+	bool buffer_complete = false;  // NOTE(review): not referenced anywhere else in this patch
+
+	struct
+	{
+		int64_t min_buffer, max_buffer;  // extremes of buffer_fill seen so far (usec)
+		double avg_preset;  // sum of the presets used; divided by <den> when printed
+		int den;  // number of samples added to avg_preset
+	} stat;
+};
-- 
2.39.2