From 16c0e5da7fa7b4eeea79470c24697a1ba193f071 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 29 Apr 2016 02:10:00 +0200 Subject: [PATCH] Implement x264 speedcontrol. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Based on the speedcontrol patch series from x264-obe, but bugfixed, reworked and pulled out so that it can run with vanilla x264. Note that vanilla x264 right now has a bug that this patch exposes; you want the patch called “Fix corruption with reconfig which changed SPS (like change of --ref)” (currently in x264-sandbox). --- Makefile | 2 +- flags.cpp | 19 +++ flags.h | 4 +- x264_encoder.cpp | 20 +++ x264_encoder.h | 2 + x264_speed_control.cpp | 313 +++++++++++++++++++++++++++++++++++++++++ x264_speed_control.h | 116 +++++++++++++++ 7 files changed, 474 insertions(+), 2 deletions(-) create mode 100644 x264_speed_control.cpp create mode 100644 x264_speed_control.h diff --git a/Makefile b/Makefile index 4c2df49..cfcdedd 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ OBJS += glwidget.moc.o mainwindow.moc.o vumeter.moc.o lrameter.moc.o correlation OBJS += mixer.o bmusb/bmusb.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o resampling_queue.o httpd.o ebu_r128_proc.o flags.o image_input.o stereocompressor.o filter.o alsa_output.o correlation_measurer.o # Streaming and encoding objects -OBJS += quicksync_encoder.o x264_encoder.o video_encoder.o metacube2.o mux.o audio_encoder.o +OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o # DeckLink OBJS += decklink_capture.o decklink/DeckLinkAPIDispatch.o diff --git a/flags.cpp b/flags.cpp index 7e8beaf..25f11b4 100644 --- a/flags.cpp +++ b/flags.cpp @@ -25,6 +25,8 @@ void usage() fprintf(stderr, " --http-x264-video send x264-compressed video to HTTP clients\n"); fprintf(stderr, " --x264-preset x264 quality preset (default " X264_DEFAULT_PRESET ")\n"); fprintf(stderr, " 
--x264-tune x264 tuning (default " X264_DEFAULT_TUNE ", can be blank)\n"); + fprintf(stderr, " --x264-speedcontrol try to match x264 preset to available CPU speed\n"); + fprintf(stderr, " --x264-speedcontrol-verbose output speedcontrol debugging statistics\n"); fprintf(stderr, " --x264-bitrate x264 bitrate (in kilobit/sec, default %d)\n", DEFAULT_X264_OUTPUT_BIT_RATE); fprintf(stderr, " --x264-vbv-bufsize x264 VBV size (in kilobits, 0 = one-frame VBV,\n"); @@ -58,6 +60,8 @@ void parse_flags(int argc, char * const argv[]) { "http-x264-video", no_argument, 0, 1008 }, { "x264-preset", required_argument, 0, 1009 }, { "x264-tune", required_argument, 0, 1010 }, + { "x264-speedcontrol", no_argument, 0, 1015 }, + { "x264-speedcontrol-verbose", no_argument, 0, 1016 }, { "x264-bitrate", required_argument, 0, 1011 }, { "x264-vbv-bufsize", required_argument, 0, 1012 }, { "x264-vbv-max-bitrate", required_argument, 0, 1013 }, @@ -128,6 +132,12 @@ void parse_flags(int argc, char * const argv[]) case 1010: global_flags.x264_tune = optarg; break; + case 1015: + global_flags.x264_speedcontrol = true; + break; + case 1016: + global_flags.x264_speedcontrol_verbose = true; + break; case 1011: global_flags.x264_bitrate = atoi(optarg); break; @@ -162,6 +172,15 @@ void parse_flags(int argc, char * const argv[]) fprintf(stderr, "ERROR: --http-uncompressed-video and --http-x264-video are mutually incompatible\n"); exit(1); } + if (global_flags.x264_speedcontrol) { + if (!global_flags.x264_preset.empty() && global_flags.x264_preset != "faster") { + fprintf(stderr, "WARNING: --x264-preset is overridden by --x264-speedcontrol (implicitly uses \"faster\" as base preset)\n"); + } + global_flags.x264_preset = "faster"; + } else if (global_flags.x264_preset.empty()) { + global_flags.x264_preset = X264_DEFAULT_PRESET; + } + for (pair mapping : global_flags.default_stream_mapping) { if (mapping.second >= global_flags.num_cards) { fprintf(stderr, "ERROR: Signal %d mapped to card %d, which doesn't 
exist (try adjusting --num-cards)\n", diff --git a/flags.h b/flags.h index c3ea0fc..edc47dd 100644 --- a/flags.h +++ b/flags.h @@ -18,8 +18,10 @@ struct Flags { bool stream_coarse_timebase = false; std::string stream_audio_codec_name; // Blank = use the same as for the recording. int stream_audio_codec_bitrate = DEFAULT_AUDIO_OUTPUT_BIT_RATE; // Ignored if stream_audio_codec_name is blank. - std::string x264_preset = X264_DEFAULT_PRESET; + std::string x264_preset; // Empty will be overridden by X264_DEFAULT_PRESET, unless speedcontrol is set. std::string x264_tune = X264_DEFAULT_TUNE; + bool x264_speedcontrol = false; + bool x264_speedcontrol_verbose = false; int x264_bitrate = DEFAULT_X264_OUTPUT_BIT_RATE; // In kilobit/sec. int x264_vbv_max_bitrate = -1; // In kilobits. 0 = no limit, -1 = same as (CBR). int x264_vbv_buffer_size = -1; // In kilobits. 0 = one-frame VBV, -1 = same as (one-second VBV). diff --git a/x264_encoder.cpp b/x264_encoder.cpp index df9e994..e19a869 100644 --- a/x264_encoder.cpp +++ b/x264_encoder.cpp @@ -6,6 +6,7 @@ #include "mux.h" #include "timebase.h" #include "x264_encoder.h" +#include "x264_speed_control.h" extern "C" { #include @@ -68,6 +69,9 @@ void X264Encoder::init_x264() param.i_timebase_num = 1; param.i_timebase_den = TIMEBASE; param.i_keyint_max = 50; // About one second. + if (global_flags.x264_speedcontrol) { + param.i_frame_reference = 16; // Because speedcontrol is never allowed to change this above what we set at start. + } // NOTE: These should be in sync with the ones in h264encode.cpp (sbs_rbsp()). param.vui.i_vidformat = 5; // Unspecified. 
@@ -106,6 +110,10 @@ void X264Encoder::init_x264() exit(1); } + if (global_flags.x264_speedcontrol) { + speed_control.reset(new X264SpeedControl(x264, /*f_speed=*/1.0f, X264_QUEUE_LENGTH, /*f_buffer_init=*/1.0f)); + } + if (wants_global_headers) { x264_nal_t *nal; int num_nal; @@ -181,9 +189,21 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf) pic.img.i_stride[1] = WIDTH / 2 * sizeof(uint16_t); pic.opaque = reinterpret_cast(intptr_t(qf.duration)); + if (speed_control) { + speed_control->before_frame(float(free_frames.size()) / X264_QUEUE_LENGTH, X264_QUEUE_LENGTH, 1e6 * qf.duration / TIMEBASE); + } x264_encoder_encode(x264, &nal, &num_nal, &pic, &pic); + if (speed_control) { + speed_control->after_frame(); + } } else { + if (speed_control) { + speed_control->before_frame(float(free_frames.size()) / X264_QUEUE_LENGTH, X264_QUEUE_LENGTH, 1e6 * qf.duration / TIMEBASE); + } x264_encoder_encode(x264, &nal, &num_nal, nullptr, &pic); + if (speed_control) { + speed_control->after_frame(); + } } // We really need one AVPacket for the entire frame, it seems, diff --git a/x264_encoder.h b/x264_encoder.h index 729cb7f..ef118c6 100644 --- a/x264_encoder.h +++ b/x264_encoder.h @@ -31,6 +31,7 @@ extern "C" { } class Mux; +class X264SpeedControl; class X264Encoder { public: @@ -72,6 +73,7 @@ private: std::thread encoder_thread; std::atomic should_quit{false}; x264_t *x264; + std::unique_ptr speed_control; // Protects everything below it. 
std::mutex mu;
diff --git a/x264_speed_control.cpp b/x264_speed_control.cpp
new file mode 100644
index 0000000..3ed3ece
--- /dev/null
+++ b/x264_speed_control.cpp
@@ -0,0 +1,313 @@
+#include "x264_speed_control.h"
+
+#include "flags.h"
+
+#include <math.h>
+
+#include <algorithm>
+
+using namespace std;
+
+X264SpeedControl::X264SpeedControl(x264_t *x264, float f_speed, int i_buffer_size, float f_buffer_init)
+	: x264(x264), f_speed(f_speed)
+{
+	x264_param_t param;
+	x264_encoder_parameters(x264, &param);
+
+	float fps = (float)param.i_fps_num / param.i_fps_den;
+	uspf = 1e6 / fps;
+	set_buffer_size(i_buffer_size);
+	buffer_fill = buffer_size * f_buffer_init;
+	buffer_fill = max<int64_t>(buffer_fill, uspf);
+	buffer_fill = min(buffer_fill, buffer_size);
+	timestamp = mdate();
+	// preset, cplx_num and cplx_den keep their in-class defaults from the header.
+	stat.min_buffer = buffer_size;
+	stat.max_buffer = 0;
+	stat.avg_preset = 0.0;  // Accumulated in before_frame(); must start at zero.
+	stat.den = 0;
+}
+
+X264SpeedControl::~X264SpeedControl()
+{
+	fprintf(stderr, "speedcontrol: avg preset=%.3f buffer min=%.3f max=%.3f\n",
+		stat.den > 0 ? stat.avg_preset / stat.den : 0.0,  // No frames accounted for => no average.
+		(float)stat.min_buffer / buffer_size,
+		(float)stat.max_buffer / buffer_size );
+	// x264_log( x264, X264_LOG_INFO, "speedcontrol: avg cplx=%.5f\n", cplx_num / cplx_den );
+}
+
+typedef struct
+{
+	float time; // relative encoding time, compared to the other presets
+	int subme;
+	int me;
+	int refs;
+	int mix;
+	int trellis;
+	int partitions;
+	int badapt;
+	int bframes;
+	int direct;
+	int merange;
+} sc_preset_t;
+
+// The actual presets, including the equivalent commandline options. Note that
+// all presets are benchmarked with --weightp 1 --mbtree --rc-lookahead 20
+// on top of the given settings (equivalent settings to the "faster" preset).
+// Timings and SSIM measurements were done on a quadcore Haswell i5 3.2 GHz
+// on the first 1000 frames of "Tears of Steel" in 1080p.
+//
+// Note that the two first and the two last are also used for extrapolation
+// should the desired time be outside the range. Thus, it is disadvantageous if
+// they are chosen so that the timings are too close to each other.
+#define SC_PRESETS 26
+static const sc_preset_t presets[SC_PRESETS] = {
+#define I4 X264_ANALYSE_I4x4
+#define I8 X264_ANALYSE_I8x8
+#define P4 X264_ANALYSE_PSUB8x8
+#define P8 X264_ANALYSE_PSUB16x16
+#define B8 X264_ANALYSE_BSUB16x16
+	// Preset 0: 14.179db, --preset superfast --b-adapt 0 --bframes 0
+	{ .time= 1.000, .subme=1, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=0, .bframes=0, .direct=0, .merange=16 },
+
+	// Preset 1: 14.459db, --preset superfast
+	{ .time= 1.283, .subme=1, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 2: 14.761db, --preset superfast --subme 2
+	{ .time= 1.603, .subme=2, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 3: 15.543db, --preset veryfast
+	{ .time= 1.843, .subme=2, .me=X264_ME_HEX, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 4: 15.716db, --preset veryfast --subme 3
+	{ .time= 2.452, .subme=3, .me=X264_ME_HEX, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 5: 15.786db, --preset veryfast --subme 3 --ref 2
+	{ .time= 2.733, .subme=3, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 6: 15.813db, --preset veryfast --subme 4 --ref 2
+	{ .time= 3.085, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 7: 15.849db, --preset faster
+	{ .time= 3.101, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 8: 15.857db, --preset faster --mixed-refs
+	{ .time= 3.284, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 9: 15.869db, --preset faster --mixed-refs --subme 5
+	{ .time= 3.587, .subme=5, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 10: 16.051db, --preset fast
+	{ .time= 3.947, .subme=6, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 11: 16.356db, --preset fast --subme 7
+	{ .time= 4.041, .subme=7, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 12: 16.418db, --preset fast --subme 7 --ref 3
+	{ .time= 4.406, .subme=7, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 13: 16.460db, --preset medium
+	{ .time= 4.707, .subme=7, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 14: 16.517db, --preset medium --subme 8
+	{ .time= 5.133, .subme=8, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 15: 16.523db, --preset medium --subme 8 --me umh
+	{ .time= 6.050, .subme=8, .me=X264_ME_UMH, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
+
+	// Preset 16: 16.543db, --preset medium --subme 8 --me umh --direct auto --b-adapt 2
+	{ .time= 6.849, .subme=8, .me=X264_ME_UMH, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 17: 16.613db, --preset slow
+	{ .time= 8.042, .subme=8, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 18: 16.641db, --preset slow --subme 9
+	{ .time= 8.972, .subme=9, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 19: 16.895db, --preset slow --subme 9 --trellis 2
+	{ .time=10.073, .subme=9, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 20: 16.918db, --preset slow --subme 9 --trellis 2 --ref 6
+	{ .time=11.147, .subme=9, .me=X264_ME_UMH, .refs=6, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 21: 16.934db, --preset slow --subme 9 --trellis 2 --ref 7
+	{ .time=12.267, .subme=9, .me=X264_ME_UMH, .refs=7, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 22: 16.948db, --preset slower
+	{ .time=13.829, .subme=9, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 23: 17.058db, --preset slower --subme 10
+	{ .time=14.831, .subme=10, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=3, .direct=3, .merange=16 },
+
+	// Preset 24: 17.268db, --preset slower --subme 10 --bframes 8
+	{ .time=18.705, .subme=10, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=8, .direct=3, .merange=16 },
+
+	// Preset 25: 17.297db, --preset veryslow
+	{ .time=31.419, .subme=10, .me=X264_ME_UMH, .refs=16, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=8, .direct=3, .merange=24 },
+#undef I4
+#undef I8
+#undef P4
+#undef P8
+#undef B8
+};
+
+void X264SpeedControl::before_frame(float new_buffer_fill, int new_buffer_size, float new_uspf)
+{
+	if (new_uspf > 0.0) {
+		uspf = new_uspf;
+	}
+	if (new_buffer_size) {
+		set_buffer_size(new_buffer_size);
+	}
+	buffer_fill = buffer_size * new_buffer_fill;
+
+	int64_t t, delta_t;
+
+	// update buffer state after encoding and outputting the previous frame(s)
+	if (first) {
+		t = timestamp = mdate();
+		first = false;
+	} else {
+		t = mdate();
+	}
+
+	delta_t = t - timestamp;
+	timestamp = t;
+
+	// update the time predictor; preset stays -1 until the first apply_preset()
+	if (preset >= 0) {  // guards presets[-1] (out of bounds) on the very first call
+		cplx_num *= cplx_decay;
+		cplx_den *= cplx_decay;
+		cplx_num += cpu_time_last_frame / presets[preset].time;
+		++cplx_den;
+		stat.avg_preset += preset;
+		++stat.den;
+	}
+
+	stat.min_buffer = min(buffer_fill, stat.min_buffer);
+	stat.max_buffer = max(buffer_fill, stat.max_buffer);
+
+	if (buffer_fill >= buffer_size) { // oops, cpu was idle
+		// not really an error, but we'll warn for debugging purposes
+		static int64_t idle_t = 0, print_interval = 0;
+		idle_t += buffer_fill - buffer_size;
+		if (t - print_interval > 1e6) {
+			//fprintf(stderr, "speedcontrol idle (%.6f sec)\n", idle_t/1e6);
+			print_interval = t;
+			idle_t = 0;
+		}
+		buffer_fill = buffer_size;
+	} else if (buffer_fill <= 0) { // oops, we're late
+		// fprintf(stderr, "speedcontrol underflow (%.6f sec)\n", buffer_fill/1e6);
+	}
+
+	{
+		// Pick the preset that should return the buffer to 3/4-full within a time
+		// specified by compensation_period.
+		//
+		// NOTE: This doesn't actually do that, at least assuming the same target is
+		// chosen for every frame; exactly what it does is unclear to me. It seems
+		// to consistently undershoot a bit, so it needs to be saved by the second
+		// predictor below. However, fixing the formula seems to yield somewhat less
+		// stable results in practice; in particular, once the buffer is half-full
+		// or so, it would give us a negative target. Perhaps increasing
+		// compensation_period would be a good idea, but initial (very brief) tests
+		// did not yield good results.
+		float target = uspf / f_speed
+			* (buffer_fill + compensation_period)
+			/ (buffer_size*3/4 + compensation_period);
+		float cplx = cplx_num / cplx_den;
+		float set, t0, t1;
+		float filled = (float) buffer_fill / buffer_size;
+		int i;
+		t0 = presets[0].time * cplx;
+		for (i = 1; ; i++) {
+			t1 = presets[i].time * cplx;
+			if (t1 >= target || i == SC_PRESETS - 1)
+				break;
+			t0 = t1;
+		}
+		// exponential interpolation between states
+		set = i-1 + (log(target) - log(t0)) / (log(t1) - log(t0));
+		set = max<float>(set, -5);
+		set = min<float>(set, (SC_PRESETS-1) + 5);
+		// Even if our time estimations in the SC_PRESETS array are off
+		// this will push us towards our target fullness
+		float s1 = set;
+		set += (40 * (filled-0.75));
+		float s2 = (40 * (filled-0.75));
+		set = min<float>(max<float>(set, 0), SC_PRESETS - 1);
+		apply_preset(dither_preset(set));
+
+		if (global_flags.x264_speedcontrol_verbose) {
+			static float cpu, wall, tgt, den;
+			const float decay = 1-1/100.;
+			cpu = cpu*decay + cpu_time_last_frame;
+			wall = wall*decay + delta_t;
+			tgt = tgt*decay + target;
+			den = den*decay + 1;
+			fprintf(stderr, "speed: %.2f+%.2f %d[%.5f] (t/c/w: %6.0f/%6.0f/%6.0f = %.4f) fps=%.2f\r",
+					s1, s2, preset, (float)buffer_fill / buffer_size,
+					tgt/den, cpu/den, wall/den, cpu/wall, 1e6*den/wall );
+		}
+	}
+
+}
+
+void X264SpeedControl::after_frame()
+{
+	cpu_time_last_frame = mdate() - timestamp;
+}
+
+void X264SpeedControl::set_buffer_size(int new_buffer_size)
+{
+	new_buffer_size = max(3, new_buffer_size);
+	buffer_size = new_buffer_size * uspf;
+	cplx_decay = 1 - 1./new_buffer_size;
+	compensation_period = buffer_size/4;
+}
+
+int X264SpeedControl::dither_preset(float f)
+{
+	int i = f;
+	if (f < 0) {
+		i--;
+	}
+	dither += f - i;
+	if (dither >= 1.0) {
+		dither--;
+		i++;
+	}
+	return i;
+}
+
+void X264SpeedControl::apply_preset(int new_preset)
+{
+	new_preset = max(new_preset, 0);
+	new_preset = min(new_preset, SC_PRESETS - 1);
+
+	const sc_preset_t *s = &presets[new_preset];
+	x264_param_t p;
+	x264_encoder_parameters(x264, &p);
+
+	p.i_frame_reference = s->refs;
+	p.i_bframe_adaptive = s->badapt;
+	p.i_bframe = s->bframes;
+	p.analyse.inter = s->partitions;
+	p.analyse.i_subpel_refine = s->subme;
+	p.analyse.i_me_method = s->me;
+	p.analyse.i_trellis = s->trellis;
+	p.analyse.b_mixed_references = s->mix;
+	p.analyse.i_direct_mv_pred = s->direct;
+	p.analyse.i_me_range = s->merange;
+	x264_encoder_reconfig(x264, &p);
+	preset = new_preset;
+}
+
+int64_t X264SpeedControl::mdate()
+{
+	timespec now;
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	return int64_t(now.tv_sec) * 1000000 + now.tv_nsec / 1000;  // widen first; time_t may be 32-bit
+}
diff --git a/x264_speed_control.h b/x264_speed_control.h
new file mode 100644
index 0000000..b498826
--- /dev/null
+++ b/x264_speed_control.h
@@ -0,0 +1,116 @@
+// The x264 speed control tries to encode video at maximum possible quality
+// without skipping frames (at the expense of higher encoding latency and
+// less even output rates, although VBV is still respected). It does this
+// by continuously (every frame) changing the x264 quality settings such that
+// it uses maximum amount of CPU, but no more.
+//
+// Speed control works by maintaining a queue of frames, with the confusing
+// nomenclature “full” meaning that there are no frames in the queue.
+// (Conversely, if the queue is “empty” and a new frame comes in, we need to
+// drop that frame.) It tries to keep the buffer 3/4 “full” by using a table
+// of measured relative speeds for the different presets, and choosing one that it
+// thinks will return the buffer to that state over time. However, since
+// different frames take different times to encode regardless of preset, it
+// also tries to maintain a running average of how long the typical frame will
+// take to encode at the fastest preset (the so-called “complexity”), by dividing
+// the actual time by the relative time for the preset used.
+//
+// Frame timing is a complex topic in its own right, since usually, multiple
+// frames are encoded in parallel. X264SpeedControl only supports the timing
+// method that the original patch calls “alternate timing”; one simply measures
+// the time the last x264_encoder_encode() call took. (The other alternative given
+// is to measure the time between successive x264_encoder_encode() calls.)
+// Unless using the zerolatency tune (which activates slice threading), the function
+// actually returns not when the given frame is done encoding, but when one a few
+// frames back is done encoding. So it doesn't actually measure the time of any
+// given one frame, but it measures something correlated to it, at least as long as
+// you are near 100% CPU utilization (i.e., the encoded frame doesn't linger in the
+// buffers already when x264_encoder_encode() is called).
+//
+// The code has a long history; it was originally part of Avail Media's x264
+// branch, used in their encoder appliances, and then a snapshot of that was
+// released. (Given that x264 is licensed under GPLv2 or newer, this means that
+// we can also treat the patch as GPLv2 or newer if we want, which we do.
+// As far as I know, it is copyright Avail Media, although no specific copyright
+// notice was posted on the patch.)
+//
+// From there, it was incorporated in OBE's x264 tree (x264-obe) and some bugs
+// were fixed. I started working on it for the purposes of Nageru, fixing various
+// issues, adding VFR support and redoing the timings entirely based on more
+// modern presets (the patch was made before several important x264 features,
+// such as weighted P-frames). Finally, I took it out of x264 and put it into
+// Nageru (it does not actually use any hooks into the codec itself), so that
+// one does not need to patch x264 to use it in Nageru. It still could do with
+// some cleanup, but it's much, much better than just using a static preset.
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+
+extern "C" {
+#include "x264.h"
+}
+
+class X264SpeedControl {
+public:
+	// x264: Encoding object we are using; must be opened. Assumed to be
+	//    set to the "faster" preset, and with 16 reference frames.
+	// f_speed: Relative encoding speed, usually 1.0.
+	// i_buffer_size: Number of frames in the buffer.
+	// f_buffer_init: Relative fullness of buffer at start
+	//    (0.0 = assumed to be i_buffer_size frames in buffer,
+	//    1.0 = no frames in buffer)
+	X264SpeedControl(x264_t *x264, float f_speed, int i_buffer_size, float f_buffer_init);
+	~X264SpeedControl();
+
+	// You need to call before_frame() immediately before each call to
+	// x264_encoder_encode(), and after_frame() immediately after.
+	//
+	// new_buffer_fill: Relative buffer fullness (1.0 = no frames queued), on the
+	//    same scale as f_buffer_init; it is multiplied by the buffer size internally.
+	// new_buffer_size: If > 0, new number of frames in the buffer,
+	//    ie. the buffer size has changed. (It is harmless to set this
+	//    even if the buffer hasn't actually changed.)
+	// f_uspf: If > 0, new microseconds per frame, ie. the frame rate has
+	//    changed. (Of course, with VFR, it can be impossible to truly know
+	//    the frame rate of the coming frames, but it is a reasonable
+	//    assumption that the next second or so is likely to be the same
+	//    frame rate as the last frame.)
+	void before_frame(float new_buffer_fill, int new_buffer_size, float f_uspf);
+	void after_frame();
+
+private:
+	void set_buffer_size(int new_buffer_size);
+	int dither_preset(float f);
+	void apply_preset(int new_preset);
+	int64_t mdate();  // Current time in microseconds.
+
+	// Not owned by us.
+	x264_t *x264;
+
+	float f_speed;
+
+	// all times are in usec
+	int64_t timestamp;  // when was speedcontrol last invoked
+	int64_t cpu_time_last_frame = 0;  // time spent encoding the previous frame
+	int64_t buffer_size;  // assumed application-side buffer of frames to be streamed (measured in microseconds),
+	int64_t buffer_fill;  // where full = we don't have to hurry
+	int64_t compensation_period;  // how quickly we try to return to the target buffer fullness
+	float uspf;  // microseconds per frame
+	int preset = -1;  // which setting was used in the previous frame
+	float cplx_num = 3e3;  // rolling average of estimated spf for preset #0. FIXME estimate initial complexity
+	float cplx_den = .1;
+	float cplx_decay;
+	float dither = 0.0f;
+
+	bool first = true;
+	bool buffer_complete = false;
+
+	struct
+	{
+		int64_t min_buffer = 0, max_buffer = 0;  // extremes of buffer_fill seen (usec)
+		double avg_preset = 0.0;  // running sum of presets; divided by den when reported
+		int den = 0;  // number of samples added to avg_preset
+	} stat;
+};
-- 
2.39.2