1 #include "x264_speed_control.h"
11 #include <type_traits>
17 using namespace std::chrono;
// Constructs the speed controller for the given x264 encoder handle.
//
//   x264           encoder handle; stored in the x264 member and used for
//                  parameter queries (and, in apply_preset(), reconfiguration).
//   f_speed        stored in the f_speed member; before_frame() divides the
//                  per-frame time target by it.
//   i_buffer_size  forwarded to set_buffer_size().
//   f_buffer_init  initial buffer fill, as a fraction of buffer_size.
19 X264SpeedControl::X264SpeedControl(x264_t *x264, float f_speed, int i_buffer_size, float f_buffer_init)
20 : dyn(load_x264_for_bit_depth(global_flags.x264_bit_depth)),
21 x264(x264), f_speed(f_speed)
// Fetch the encoder's current parameters; the frame rate is derived from them.
24 dyn.x264_encoder_parameters(x264, &param);
26 float fps = (float)param.i_fps_num / param.i_fps_den;
// Start with the requested fraction of the buffer filled, then clamp the fill
// level to the range [uspf, buffer_size].
28 set_buffer_size(i_buffer_size);
29 buffer_fill = buffer_size * f_buffer_init;
30 buffer_fill = max<int64_t>(buffer_fill, uspf);
31 buffer_fill = min(buffer_fill, buffer_size);
32 timestamp = steady_clock::now();
34 cplx_num = 3e3; //FIXME estimate initial complexity
// Seed the running minimum with the largest value the fill level can take.
36 stat.min_buffer = buffer_size;
38 stat.avg_preset = 0.0;
// The *_seconds gauges are the buffer values scaled by 1e-6 (so buffer_fill and
// buffer_size are presumably kept in microseconds).
41 metric_x264_speedcontrol_buffer_available_seconds = buffer_fill * 1e-6;
42 metric_x264_speedcontrol_buffer_size_seconds = buffer_size * 1e-6;
// Register the histogram, gauges and counters with the global metrics registry.
43 global_metrics.add_histogram("x264_speedcontrol_preset_used_frames", {}, metric_x264_speedcontrol_preset_used_frames, &metric_x264_speedcontrol_preset_used_frames_sum, SC_PRESETS);
44 global_metrics.add("x264_speedcontrol_buffer_available_seconds", &metric_x264_speedcontrol_buffer_available_seconds, Metrics::TYPE_GAUGE);
45 global_metrics.add("x264_speedcontrol_buffer_size_seconds", &metric_x264_speedcontrol_buffer_size_seconds, Metrics::TYPE_GAUGE);
46 global_metrics.add("x264_speedcontrol_idle_frames", &metric_x264_speedcontrol_idle_frames);
47 global_metrics.add("x264_speedcontrol_late_frames", &metric_x264_speedcontrol_late_frames);
// Prints a one-line summary to stderr: the average preset
// (stat.avg_preset / stat.den) and the lowest and highest recorded buffer fill,
// each expressed as a fraction of buffer_size.
// NOTE(review): if stat.den can still be zero here (no frame processed), the
// average is not a meaningful number — confirm against where stat.den is updated.
50 X264SpeedControl::~X264SpeedControl()
52 fprintf(stderr, "speedcontrol: avg preset=%.3f buffer min=%.3f max=%.3f\n",
53 stat.avg_preset / stat.den,
54 (float)stat.min_buffer / buffer_size,
55 (float)stat.max_buffer / buffer_size );
56 // x264_log( x264, X264_LOG_INFO, "speedcontrol: avg cplx=%.5f\n", cplx_num / cplx_den );
// Preset 0 carries the baseline value 1.000 in the table below; before_frame()
// multiplies this factor by its complexity estimate to predict a preset's
// per-frame encoding time.
64 float time; // relative encoding time, compared to the other presets
77 // The actual presets, including the equivalent commandline options. Note that
78 // all presets are benchmarked with --weightp 1 --mbtree --rc-lookahead 20
79 // on top of the given settings (equivalent settings to the "faster" preset).
80 // Timings and SSIM measurements were done on a quadcore Haswell i5 3.2 GHz
81 // on the first 1000 frames of "Tears of Steel" in 1080p.
83 // Note that the first two and the last two presets are also used for
84 // extrapolation should the desired time be outside the range. Thus, it is
85 // disadvantageous if their timings are chosen too close to each other.
// Table of speed-control presets, ordered by increasing relative encoding time.
// How each column is applied to the encoder parameters in apply_preset():
//   .refs       -> i_frame_reference
//   .badapt     -> i_bframe_adaptive
//   .bframes    -> i_bframe
//   .partitions -> analyse.inter
//   .subme      -> analyse.i_subpel_refine
//   .me         -> analyse.i_me_method
//   .trellis    -> analyse.i_trellis
//   .mix        -> analyse.b_mixed_references
//   .direct     -> analyse.i_direct_mv_pred
//   .merange    -> analyse.i_me_range
// .time is not sent to the encoder; before_frame() uses it to predict the
// per-frame encoding time of each preset.
86 static const sc_preset_t presets[SC_PRESETS] = {
// Short aliases for x264's partition-analysis flags, used in the .partitions column.
87 #define I4 X264_ANALYSE_I4x4
88 #define I8 X264_ANALYSE_I8x8
89 #define P4 X264_ANALYSE_PSUB8x8
90 #define P8 X264_ANALYSE_PSUB16x16
91 #define B8 X264_ANALYSE_BSUB16x16
92 // Preset 0: 14.179db, --preset superfast --b-adapt 0 --bframes 0
93 { .time= 1.000, .subme=1, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=0, .bframes=0, .direct=0, .merange=16 },
95 // Preset 1: 14.459db, --preset superfast
96 { .time= 1.283, .subme=1, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=1, .bframes=3, .direct=1, .merange=16 },
98 // Preset 2: 14.761db, --preset superfast --subme 2
99 { .time= 1.603, .subme=2, .me=X264_ME_DIA, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4, .badapt=1, .bframes=3, .direct=1, .merange=16 },
101 // Preset 3: 15.543db, --preset veryfast
102 { .time= 1.843, .subme=2, .me=X264_ME_HEX, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
104 // Preset 4: 15.716db, --preset veryfast --subme 3
105 { .time= 2.452, .subme=3, .me=X264_ME_HEX, .refs=1, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
107 // Preset 5: 15.786db, --preset veryfast --subme 3 --ref 2
108 { .time= 2.733, .subme=3, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
110 // Preset 6: 15.813db, --preset veryfast --subme 4 --ref 2
111 { .time= 3.085, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=0, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
113 // Preset 7: 15.849db, --preset faster
114 { .time= 3.101, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=0, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
116 // Preset 8: 15.857db, --preset faster --mixed-refs
117 { .time= 3.284, .subme=4, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
119 // Preset 9: 15.869db, --preset faster --mixed-refs --subme 5
120 { .time= 3.587, .subme=5, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
122 // Preset 10: 16.051db, --preset fast
123 { .time= 3.947, .subme=6, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
125 // Preset 11: 16.356db, --preset fast --subme 7
126 { .time= 4.041, .subme=7, .me=X264_ME_HEX, .refs=2, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
128 // Preset 12: 16.418db, --preset fast --subme 7 --ref 3
129 { .time= 4.406, .subme=7, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
131 // Preset 13: 16.460db, --preset medium
132 { .time= 4.707, .subme=7, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
134 // Preset 14: 16.517db, --preset medium --subme 8
135 { .time= 5.133, .subme=8, .me=X264_ME_HEX, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
137 // Preset 15: 16.523db, --preset medium --subme 8 --me umh
138 { .time= 6.050, .subme=8, .me=X264_ME_UMH, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=1, .bframes=3, .direct=1, .merange=16 },
140 // Preset 16: 16.543db, --preset medium --subme 8 --me umh --direct auto --b-adapt 2
141 { .time= 6.849, .subme=8, .me=X264_ME_UMH, .refs=3, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
143 // Preset 17: 16.613db, --preset slow
144 { .time= 8.042, .subme=8, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
146 // Preset 18: 16.641db, --preset slow --subme 9
147 { .time= 8.972, .subme=9, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=1, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
149 // Preset 19: 16.895db, --preset slow --subme 9 --trellis 2
150 { .time=10.073, .subme=9, .me=X264_ME_UMH, .refs=5, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
152 // Preset 20: 16.918db, --preset slow --subme 9 --trellis 2 --ref 6
153 { .time=11.147, .subme=9, .me=X264_ME_UMH, .refs=6, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
155 // Preset 21: 16.934db, --preset slow --subme 9 --trellis 2 --ref 7
156 { .time=12.267, .subme=9, .me=X264_ME_UMH, .refs=7, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8, .badapt=2, .bframes=3, .direct=3, .merange=16 },
158 // Preset 22: 16.948db, --preset slower
159 { .time=13.829, .subme=9, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=3, .direct=3, .merange=16 },
161 // Preset 23: 17.058db, --preset slower --subme 10
162 { .time=14.831, .subme=10, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=3, .direct=3, .merange=16 },
164 // Preset 24: 17.268db, --preset slower --subme 10 --bframes 8
165 { .time=18.705, .subme=10, .me=X264_ME_UMH, .refs=8, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=8, .direct=3, .merange=16 },
167 // Preset 25: 17.297db, --preset veryslow
168 { .time=31.419, .subme=10, .me=X264_ME_UMH, .refs=16, .mix=1, .trellis=2, .partitions=I8|I4|P8|B8|P4, .badapt=2, .bframes=8, .direct=3, .merange=24 },
// Takes in the caller's view of the buffer, updates the encoding-time predictor
// and the run statistics, and then chooses a preset and applies it through
// apply_preset(dither_preset(...)).
//
//   new_buffer_fill  fill level as a fraction of buffer_size; buffer_fill is
//                    set to buffer_size * new_buffer_fill.
//   new_buffer_size  if nonzero, forwarded to set_buffer_size().
//   new_uspf         only acted upon when > 0.0.
176 void X264SpeedControl::before_frame(float new_buffer_fill, int new_buffer_size, float new_uspf)
178 if (new_uspf > 0.0) {
181 if (new_buffer_size) {
182 set_buffer_size(new_buffer_size);
184 buffer_fill = buffer_size * new_buffer_fill;
185 metric_x264_speedcontrol_buffer_available_seconds = buffer_fill * 1e-6;
187 steady_clock::time_point t;
189 // update buffer state after encoding and outputting the previous frame(s)
191 t = timestamp = steady_clock::now();
194 t = steady_clock::now();
197 auto delta_t = t - timestamp;
200 // update the time predictor
// cpu_time_last_frame is what after_frame() measured for the previous frame.
202 int cpu_time = duration_cast<microseconds>(cpu_time_last_frame).count();
203 cplx_num *= cplx_decay;
204 cplx_den *= cplx_decay;
// Divide the measured time by the relative cost of the preset that was in use,
// so the running estimate is expressed in units of preset 0 (time = 1.000).
205 cplx_num += cpu_time / presets[preset].time;
208 stat.avg_preset += preset;
// Track the extremes of the fill level; the destructor prints them.
212 stat.min_buffer = min(buffer_fill, stat.min_buffer);
213 stat.max_buffer = max(buffer_fill, stat.max_buffer);
215 if (buffer_fill >= buffer_size) { // oops, cpu was idle
216 // not really an error, but we'll warn for debugging purposes
217 static int64_t idle_t = 0;
218 static steady_clock::time_point print_interval;
// NOTE(review): `first` starts out false, so only the elapsed-time test can make
// the condition below true — confirm whether it was meant to start out true.
219 static bool first = false;
220 idle_t += buffer_fill - buffer_size;
221 if (first || duration<double>(t - print_interval).count() > 0.1) {
222 //fprintf(stderr, "speedcontrol idle (%.6f sec)\n", idle_t/1e6);
// The fill level is capped at buffer_size and the frame is counted as idle.
227 buffer_fill = buffer_size;
228 metric_x264_speedcontrol_buffer_available_seconds = buffer_fill * 1e-6;
229 ++metric_x264_speedcontrol_idle_frames;
230 } else if (buffer_fill <= 0) { // oops, we're late
231 // fprintf(stderr, "speedcontrol underflow (%.6f sec)\n", buffer_fill/1e6);
232 ++metric_x264_speedcontrol_late_frames;
236 // Pick the preset that should return the buffer to 3/4-full within a time
237 // specified by compensation_period.
239 // NOTE: This doesn't actually do that, at least assuming the same target is
240 // chosen for every frame; exactly what it does is unclear to me. It seems
241 // to consistently undershoot a bit, so it needs to be saved by the second
242 // predictor below. However, fixing the formula seems to yield somewhat less
243 // stable results in practice; in particular, once the buffer is half-full
244 // or so, it would give us a negative target. Perhaps increasing
245 // compensation_period would be a good idea, but initial (very brief) tests
246 // did not yield good results.
247 float target = uspf / f_speed
248 * (buffer_fill + compensation_period)
249 / (buffer_size*3/4 + compensation_period);
// Estimated encoding time of one frame at relative cost 1.0.
250 float cplx = cplx_num / cplx_den;
252 float filled = (float) buffer_fill / buffer_size;
// t0 and t1 are predicted per-frame times for presets; the check below fires
// once t1 reaches the target or i is the last preset in the table.
254 t0 = presets[0].time * cplx;
256 t1 = presets[i].time * cplx;
257 if (t1 >= target || i == SC_PRESETS - 1)
261 // exponential interpolation between states
262 set = i-1 + (log(target) - log(t0)) / (log(t1) - log(t0));
// Limit how far the result may land outside the table: at most 5 steps below
// preset 0 or 5 steps above the last preset.
263 set = max<float>(set, -5);
264 set = min<float>(set, (SC_PRESETS-1) + 5);
265 // Even if our time estimations in the SC_PRESETS array are off
266 // this will push us towards our target fullness
// Each 1% that the fill level deviates from 75% shifts the choice by 0.4 presets.
268 set += (40 * (filled-0.75));
269 float s2 = (40 * (filled-0.75));
// Clamp to valid table indices before choosing the integer preset.
270 set = min<float>(max<float>(set, 0), SC_PRESETS - 1);
271 apply_preset(dither_preset(set));
// Optional status line on stderr, built from exponentially decayed sums
// (decay factor 0.99 per call); it ends in '\r' rather than '\n'.
273 if (global_flags.x264_speedcontrol_verbose) {
274 static float cpu, wall, tgt, den;
275 const float decay = 1-1/100.;
276 cpu = cpu*decay + duration_cast<microseconds>(cpu_time_last_frame).count();
277 wall = wall*decay + duration_cast<microseconds>(delta_t).count();
278 tgt = tgt*decay + target;
280 fprintf(stderr, "speed: %.2f+%.2f %d[%.5f] (t/c/w: %6.0f/%6.0f/%6.0f = %.4f) fps=%.2f\r",
281 s1, s2, preset, (float)buffer_fill / buffer_size,
282 tgt/den, cpu/den, wall/den, cpu/wall, 1e6*den/wall );
// Stores the steady_clock time elapsed since `timestamp` (taken in
// before_frame()) as cpu_time_last_frame; the next before_frame() call feeds
// that value into the time predictor.
288 void X264SpeedControl::after_frame()
290 cpu_time_last_frame = steady_clock::now() - timestamp;
// Recomputes the buffer-related state from a buffer size given as a count of
// uspf-sized units (presumably frames — confirm against callers).
//
//   new_buffer_size  requested size; values below 3 are raised to 3.
293 void X264SpeedControl::set_buffer_size(int new_buffer_size)
295 new_buffer_size = max(3, new_buffer_size);
// buffer_size is kept in the same unit as uspf.
296 buffer_size = new_buffer_size * uspf;
// Decay factor for the running complexity sums: a larger buffer gives a slower decay.
297 cplx_decay = 1 - 1./new_buffer_size;
// The compensation period used by before_frame() is a quarter of the buffer.
298 compensation_period = buffer_size/4;
// Converts the fractional preset value computed in before_frame() into the
// integer preset index that is handed to apply_preset().
// NOTE(review): the rounding/dithering scheme itself is not visible here —
// confirm against the function body.
301 int X264SpeedControl::dither_preset(float f)
// Applies the settings of one entry of the presets table to the encoder.
//
//   new_preset  index into presets[]; clamped to [0, SC_PRESETS - 1].
//
// The encoder's current parameters are fetched, the preset-controlled fields
// are overwritten, and the result is pushed back with x264_encoder_reconfig().
// The per-preset usage histogram metric is updated as well.
315 void X264SpeedControl::apply_preset(int new_preset)
317 new_preset = max(new_preset, 0);
318 new_preset = min(new_preset, SC_PRESETS - 1);
320 const sc_preset_t *s = &presets[new_preset];
// Start from the live parameters so fields not listed below keep their values.
322 dyn.x264_encoder_parameters(x264, &p);
324 p.i_frame_reference = s->refs;
325 p.i_bframe_adaptive = s->badapt;
326 p.i_bframe = s->bframes;
327 p.analyse.inter = s->partitions;
328 p.analyse.i_subpel_refine = s->subme;
329 p.analyse.i_me_method = s->me;
330 p.analyse.i_trellis = s->trellis;
331 p.analyse.b_mixed_references = s->mix;
332 p.analyse.i_direct_mv_pred = s->direct;
333 p.analyse.i_me_range = s->merange;
337 dyn.x264_encoder_reconfig(x264, &p);
// Histogram bookkeeping: one more frame in this preset's bucket, and the preset
// index added to the histogram's running sum.
340 ++metric_x264_speedcontrol_preset_used_frames[new_preset];
341 // Non-atomic add, but that's fine, since there are no concurrent writers.
342 metric_x264_speedcontrol_preset_used_frames_sum = metric_x264_speedcontrol_preset_used_frames_sum + new_preset;