git.sesse.net Git - x264/commitdiff
VFR/framerate-aware ratecontrol, part 2
author: Fiona Glaser <fiona@x264.com>
Fri, 24 Dec 2010 00:33:01 +0000 (19:33 -0500)
committer: Fiona Glaser <fiona@x264.com>
Mon, 10 Jan 2011 20:28:08 +0000 (12:28 -0800)
MB-tree and qcomp complexity estimation now consider the duration of a frame in their calculations.
This is very important for visual optimizations, as frames that last longer are inherently more important quality-wise.
Improves VFR-aware PSNR by as much as 1-2 dB on extreme test cases, and by ~0.5 dB on more ordinary VFR clips (e.g. deduped anime episodes).

WARNING: This change redefines x264's internal quality measurement.
x264 will now scale its quality based on the framerate of the video due to the aforementioned frame duration logic.
That is, --crf X will give lower quality per frame for a 60fps video than for a 30fps one.
This will make --crf closer to constant perceptual quality than previously.
The "center" for this change is 25fps: that is, at the same CRF, videos below 25fps will go up in quality and videos above 25fps will go down.
This choice is completely arbitrary.

Note that to take full advantage of this, x264 must encode your video at the correct framerate, with the correct timestamps.

common/mc.c
common/mc.h
common/x86/const-a.asm
common/x86/mc-a2.asm
common/x86/mc-c.c
encoder/ratecontrol.c
encoder/ratecontrol.h
encoder/slicetype.c
tools/checkasm.c

index 5f8c260bdcaa8b4d5520aa8d4eec52a20f5f4604..b0b38e92e651c26edb30a0e3de5f71e9a27d57b4 100644 (file)
@@ -431,30 +431,19 @@ static void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel
     }
 }
 
-#if defined(__GNUC__) && (ARCH_X86 || ARCH_X86_64)
-// gcc isn't smart enough to use the "idiv" instruction
-static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y)
-{
-    int32_t quotient, remainder;
-    asm("idiv %4"
-        :"=a"(quotient), "=d"(remainder)
-        :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
-    );
-    return quotient;
-}
-#else
-#define div_64_32(x,y) ((x)/(y))
-#endif
-
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given macroblock. */
 static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 {
+    float fps = *fps_factor / 256.f;
     for( int i = 0; i < len; i++ )
     {
-        int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
-        dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
+        float intra_cost       = intra_costs[i] * inv_qscales[i];
+        float propagate_amount = propagate_in[i] + intra_cost*fps;
+        float propagate_num    = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
+        float propagate_denom  = intra_costs[i];
+        dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
     }
 }
 
index 92d0ded53b454cc07c95b4ce7fe992ac88825499..2a96fa6af56e74fe7e74adc174088f1a20ce47d4 100644 (file)
@@ -123,7 +123,7 @@ typedef struct
     void (*weight_cache)( x264_t *, x264_weight_t * );
 
     void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                   uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+                                   uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf );
index d2c7fe69025a105ec80d4b2c503aeae89dd46b3f..f01856d192c0657be7462e7ef11f6df537c12749 100644 (file)
@@ -51,7 +51,6 @@ const pw_pixel_max,times 8 dw ((1 << BIT_DEPTH)-1)
 
 const pd_1,        times 4 dd 1
 const pd_32,       times 4 dd 32
-const pd_128,      times 4 dd 128
 const pd_ffff,     times 4 dd 0xffff
 const pw_00ff,     times 8 dw 0x00ff
 const pw_ff00,     times 8 dw 0xff00
index 3a1ea14f2db20e5d856b0cc14958cb020d1fbe5d..bb639fe2f764b1f1cde29a5da784d571c0c27946 100644 (file)
@@ -40,6 +40,7 @@ deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
 
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
+pf_inv256: times 4 dd 0.00390625
 
 pad10: times 8 dw    10*PIXEL_MAX
 pad20: times 8 dw    20*PIXEL_MAX
@@ -59,7 +60,6 @@ cextern pw_32
 cextern pw_00ff
 cextern pw_3fff
 cextern pw_pixel_max
-cextern pd_128
 cextern pd_ffff
 
 %macro LOAD_ADD 4
@@ -1649,47 +1649,49 @@ FRAME_INIT_LOWRES ssse3
 
 ;-----------------------------------------------------------------------------
 ; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-;                             uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 ;-----------------------------------------------------------------------------
-cglobal mbtree_propagate_cost_sse2, 6,6,7
-    shl r5d, 1
-    lea r0, [r0+r5*2]
-    add r1, r5
-    add r2, r5
-    add r3, r5
-    add r4, r5
-    neg r5
-    pxor      xmm5, xmm5
-    movdqa    xmm6, [pw_3fff]
-    movdqa    xmm4, [pd_128]
+cglobal mbtree_propagate_cost_sse2, 7,7,7
+    shl        r6d, 1
+    lea         r0, [r0+r6*2]
+    add         r1, r6
+    add         r2, r6
+    add         r3, r6
+    add         r4, r6
+    neg         r6
+    pxor      xmm4, xmm4
+    movss     xmm6, [r5]
+    shufps    xmm6, xmm6, 0
+    mulps     xmm6, [pf_inv256]
+    movdqa    xmm5, [pw_3fff]
 .loop:
-    movq      xmm2, [r2+r5] ; intra
-    movq      xmm0, [r4+r5] ; invq
-    movq      xmm3, [r3+r5] ; inter
-    movq      xmm1, [r1+r5] ; prop
-    punpcklwd xmm2, xmm5
-    punpcklwd xmm0, xmm5
+    movq      xmm2, [r2+r6] ; intra
+    movq      xmm0, [r4+r6] ; invq
+    movq      xmm3, [r3+r6] ; inter
+    movq      xmm1, [r1+r6] ; prop
+    punpcklwd xmm2, xmm4
+    punpcklwd xmm0, xmm4
     pmaddwd   xmm0, xmm2
-    pand      xmm3, xmm6
-    punpcklwd xmm1, xmm5
-    punpcklwd xmm3, xmm5
-    paddd     xmm0, xmm4
-    psrld     xmm0, 8       ; intra*invq>>8
-    paddd     xmm0, xmm1    ; prop + (intra*invq>>8)
+    pand      xmm3, xmm5
+    punpcklwd xmm1, xmm4
+    punpcklwd xmm3, xmm4
+    cvtdq2ps  xmm0, xmm0
+    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
+    cvtdq2ps  xmm1, xmm1    ; prop
+    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
     cvtdq2ps  xmm1, xmm2    ; intra
     psubd     xmm2, xmm3    ; intra - inter
+    cvtdq2ps  xmm2, xmm2    ; intra - inter
     rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
-    cvtdq2ps  xmm0, xmm0
     mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
-    cvtdq2ps  xmm2, xmm2
     mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
-    mulps     xmm0, xmm2    ; (prop + (intra*invq>>8)) * (intra - inter)
+    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
     addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
     subps     xmm3, xmm1    ; 2nd approximation for 1/intra
     mulps     xmm0, xmm3    ; / intra
-    cvttps2dq xmm0, xmm0    ; truncation isn't really desired, but matches the integer implementation
-    movdqa [r0+r5*2], xmm0
-    add r5, 8
+    cvtps2dq  xmm0, xmm0
+    movdqa [r0+r6*2], xmm0
+    add         r6, 8
     jl .loop
     REP_RET
 
index 881f2d7770d8aa94862d9d4c57315fa82afd54f4..cdd9d572279a38794342641b8ed59aa5f50e39bc 100644 (file)
@@ -124,7 +124,7 @@ void x264_integral_init8v_mmx( uint16_t *sum8, int stride );
 void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
 void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
 void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-                                      uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+                                      uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
 void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
index 4de6ab513aeb1f15dae0e3aa22afdd10a108a3b1..2d5871504668886bb39f47ad113f6abc6aa24a07 100644 (file)
@@ -1603,9 +1603,7 @@ int x264_ratecontrol_end( x264_t *h, int bits, int *filler )
             rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor ));
         }
         rc->cplxr_sum *= rc->cbr_decay;
-        double frame_duration = (double)h->fenc->i_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-
-        rc->wanted_bits_window += frame_duration * rc->bitrate;
+        rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate;
         rc->wanted_bits_window *= rc->cbr_decay;
     }
 
@@ -2184,7 +2182,7 @@ static float rate_estimate_qscale( x264_t *h )
             rcc->last_satd = x264_rc_analyse_slice( h );
             rcc->short_term_cplxsum *= 0.5;
             rcc->short_term_cplxcount *= 0.5;
-            rcc->short_term_cplxsum += rcc->last_satd;
+            rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION);
             rcc->short_term_cplxcount ++;
 
             rce.tex_bits = rcc->last_satd;
@@ -2541,10 +2539,11 @@ static int init_pass2( x264_t *h )
 {
     x264_ratecontrol_t *rcc = h->rc;
     uint64_t all_const_bits = 0;
+    double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
     double duration = 0;
     for( int i = 0; i < rcc->num_entries; i++ )
         duration += rcc->entry[i].i_duration;
-    duration *= (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
+    duration *= timescale;
     uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration;
     double rate_factor, step_mult;
     double qblur = h->param.rc.f_qblur;
@@ -2583,21 +2582,23 @@ static int init_pass2( x264_t *h )
         for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ )
         {
             ratecontrol_entry_t *rcj = &rcc->entry[i+j];
+            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
             weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
             if( weight < .0001 )
                 break;
             gaussian_weight = weight * exp( -j*j/200.0 );
             weight_sum += gaussian_weight;
-            cplx_sum += gaussian_weight * (qscale2bits(rcj, 1) - rcj->misc_bits);
+            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
         }
         /* weighted average of cplx of past frames */
         weight = 1.0;
         for( int j = 0; j <= cplxblur*2 && j <= i; j++ )
         {
             ratecontrol_entry_t *rcj = &rcc->entry[i-j];
+            double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
             gaussian_weight = weight * exp( -j*j/200.0 );
             weight_sum += gaussian_weight;
-            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits);
+            cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
             weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
             if( weight < .0001 )
                 break;
index 03c82cb0dc58f1786abf4190a6890188d242a31f..28e6a3d7384588dbcd4424503762d7e19bc25bbb 100644 (file)
 #ifndef X264_RATECONTROL_H
 #define X264_RATECONTROL_H
 
+/* Completely arbitrary.  Ratecontrol lowers relative quality at higher framerates
+ * and the reverse at lower framerates; this serves as the center of the curve. */
+#define BASE_FRAME_DURATION (0.04f)
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION 1.00f
+#define MIN_FRAME_DURATION 0.01f
+
+#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
+
 int  x264_ratecontrol_new   ( x264_t * );
 void x264_ratecontrol_delete( x264_t * );
 
index cc0193bb684e02111a334461a9305724366bff3e..97cf61e3e7a3b6677087e64deeda72cf760d3874 100644 (file)
@@ -748,9 +748,10 @@ static int x264_slicetype_frame_cost_recalculate( x264_t *h, x264_frame_t **fram
     return i_score;
 }
 
-static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
+static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
 {
-    x264_emms();
+    int fps_factor_intra     = round( CLIP_DURATION(frame->f_duration) / BASE_FRAME_DURATION * 256 );
+    int fps_factor_propagate = round( CLIP_DURATION( average_duration) / BASE_FRAME_DURATION * 256 );
     float weightdelta = 0.0;
     if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
         weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
@@ -760,17 +761,18 @@ static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref
     float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
     for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
     {
-        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8;
+        int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8;
+        int intra_cost_scaled = (intra_cost * fps_factor_intra + 128) >> 8;
         if( intra_cost )
         {
-            int propagate_cost = frame->i_propagate_cost[mb_index];
-            float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
+            int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor_propagate + 128) >> 8;
+            float log2_ratio = x264_log2(intra_cost_scaled + propagate_cost) - x264_log2(intra_cost) + weightdelta;
             frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
         }
     }
 }
 
-static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced )
+static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced )
 {
     uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
     int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
@@ -780,6 +782,9 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
     int *buf = h->scratch_buffer;
     uint16_t *propagate_cost = frames[b]->i_propagate_cost;
 
+    x264_emms();
+    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);
+
     /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
     if( !referenced )
         memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) );
@@ -789,7 +794,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
         int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
         h->mc.mbtree_propagate_cost( buf, propagate_cost,
             frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
-            frames[b]->i_inv_qscale_factor+mb_index, h->mb.i_mb_width );
+            frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
         if( referenced )
             propagate_cost += h->mb.i_mb_width;
         for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
@@ -858,7 +863,7 @@ static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, in
     }
 
     if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced )
-        x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 );
+        x264_macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 );
 }
 
 static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
@@ -866,6 +871,13 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
     int idx = !b_intra;
     int last_nonb, cur_nonb = 1;
     int bframes = 0;
+
+    x264_emms();
+    float total_duration = 0.0;
+    for( int j = 0; j <= num_frames; j++ )
+        total_duration += frames[j]->f_duration;
+    float average_duration = total_duration / (num_frames + 1);
+
     int i = num_frames;
 
     if( b_intra )
@@ -918,34 +930,34 @@ static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t
                 if( i != middle )
                 {
                     x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
-                    x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 );
+                    x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 );
                 }
                 i--;
             }
-            x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 );
+            x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 );
         }
         else
         {
             while( i > cur_nonb )
             {
                 x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
-                x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 );
+                x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 );
                 i--;
             }
         }
-        x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 );
+        x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 );
         last_nonb = cur_nonb;
     }
 
     if( !h->param.rc.i_lookahead )
     {
-        x264_macroblock_tree_propagate( h, frames, 0, last_nonb, last_nonb, 1 );
+        x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
         XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
     }
 
-    x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
+    x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb );
     if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
-        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 );
+        x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 );
 }
 
 static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
index c552ab9b595eb6e247928cbecfc2526a1f7d196c..7c7faa72305ca60711c64df742bccb2f6549242b 100644 (file)
@@ -1236,29 +1236,34 @@ static int check_mc( int cpu_ref, int cpu_new )
 
     if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
     {
-        ok = 1; used_asm = 1;
-        set_func_name( "mbtree_propagate" );
-        int *dsta = (int*)buf3;
-        int *dstc = dsta+400;
-        uint16_t *prop = (uint16_t*)buf1;
-        uint16_t *intra = (uint16_t*)buf4;
-        uint16_t *inter = intra+400;
-        uint16_t *qscale = inter+400;
-        uint16_t *rnd = (uint16_t*)buf2;
         x264_emms();
-        for( int i = 0; i < 400; i++ )
+        for( int i = 0; i < 10; i++ )
         {
-            intra[i]  = *rnd++ & 0x7fff;
-            intra[i] += !intra[i];
-            inter[i]  = *rnd++ & 0x7fff;
-            qscale[i] = *rnd++ & 0x7fff;
+            float fps_factor = (rand()&65535) / 256.;
+            ok = 1; used_asm = 1;
+            set_func_name( "mbtree_propagate" );
+            int *dsta = (int*)buf3;
+            int *dstc = dsta+400;
+            uint16_t *prop = (uint16_t*)buf1;
+            uint16_t *intra = (uint16_t*)buf4;
+            uint16_t *inter = intra+100;
+            uint16_t *qscale = inter+100;
+            uint16_t *rnd = (uint16_t*)buf2;
+            x264_emms();
+            for( int j = 0; j < 100; j++ )
+            {
+                intra[j]  = *rnd++ & 0x7fff;
+                intra[j] += !intra[j];
+                inter[j]  = *rnd++ & 0x7fff;
+                qscale[j] = *rnd++ & 0x7fff;
+            }
+            call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
+            call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
+            // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+            x264_emms();
+            for( int j = 0; j < 100; j++ )
+                ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
         }
-        call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
-        call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
-        // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
-        x264_emms();
-        for( int i = 0; i < 400; i++ )
-            ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
         report( "mbtree propagate :" );
     }