MB-tree and qcomp complexity estimation now consider the duration of a frame in their calculations.
This is very important for visual optimizations, as frames that last longer are inherently more important quality-wise.
Improves VFR-aware PSNR by as much as 1-2 dB on extreme test cases, and by ~0.5 dB on more ordinary VFR clips (e.g. deduped anime episodes).
WARNING: This change redefines x264's internal quality measurement.
x264 will now scale its quality based on the framerate of the video due to the aforementioned frame duration logic.
That is, --crf X will give lower quality per frame for a 60fps video than for a 30fps one.
This will make --crf closer to constant perceptual quality than previously.
The "center" for this change is 25 fps: at the same CRF, videos below 25 fps will go up in quality and videos above 25 fps will go down.
This choice is completely arbitrary.
Note that to take full advantage of this, x264 must encode your video at the correct framerate, with the correct timestamps.
}
}
-#if defined(__GNUC__) && (ARCH_X86 || ARCH_X86_64)
-// gcc isn't smart enough to use the "idiv" instruction
-static ALWAYS_INLINE int32_t div_64_32(int64_t x, int32_t y)
-{
- int32_t quotient, remainder;
- asm("idiv %4"
- :"=a"(quotient), "=d"(remainder)
- :"a"((uint32_t)x), "d"((int32_t)(x>>32)), "r"(y)
- );
- return quotient;
-}
-#else
-#define div_64_32(x,y) ((x)/(y))
-#endif
-
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given macroblock. */
static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
+ float fps = *fps_factor / 256.f; /* fold in the /256 that the removed integer code applied as (x+128)>>8 */
for( int i = 0; i < len; i++ )
{
- int propagate_amount = propagate_in[i] + ((intra_costs[i] * inv_qscales[i] + 128)>>8);
- dst[i] = div_64_32((int64_t)propagate_amount * (intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK)), intra_costs[i]);
+ float intra_cost = intra_costs[i] * inv_qscales[i]; /* integer product converted to float; not yet divided by 256 */
+ float propagate_amount = propagate_in[i] + intra_cost*fps; /* incoming propagated cost plus this block's own cost scaled by fps_factor/256 */
+ float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK); /* intra cost minus the masked inter cost */
+ float propagate_denom = intra_costs[i]; /* NOTE(review): assumes intra_costs[i] != 0; the checkasm test forces it non-zero - confirm for real callers */
+ dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f); /* +0.5f before the truncating cast rounds non-negative results to nearest */
}
}
void (*weight_cache)( x264_t *, x264_weight_t * );
void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf );
const pd_1, times 4 dd 1
const pd_32, times 4 dd 32
-const pd_128, times 4 dd 128
const pd_ffff, times 4 dd 0xffff
const pw_00ff, times 8 dw 0x00ff
const pw_ff00, times 8 dw 0xff00
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
+pf_inv256: times 4 dd 0.00390625
pad10: times 8 dw 10*PIXEL_MAX
pad20: times 8 dw 20*PIXEL_MAX
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
-cextern pd_128
cextern pd_ffff
%macro LOAD_ADD 4
;-----------------------------------------------------------------------------
; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
-; uint16_t *inter_costs, uint16_t *inv_qscales, int len )
+; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
;-----------------------------------------------------------------------------
-cglobal mbtree_propagate_cost_sse2, 6,6,7
- shl r5d, 1
- lea r0, [r0+r5*2]
- add r1, r5
- add r2, r5
- add r3, r5
- add r4, r5
- neg r5
- pxor xmm5, xmm5
- movdqa xmm6, [pw_3fff]
- movdqa xmm4, [pd_128]
+cglobal mbtree_propagate_cost_sse2, 7,7,7 ; seven arguments now that fps_factor precedes len
+ shl r6d, 1 ; r6 = len (7th argument per the prototype above), doubled: byte length of the uint16_t inputs
+ lea r0, [r0+r6*2] ; dst holds 32-bit ints, so its end is 4 bytes per element away
+ add r1, r6 ; point every input at its end ...
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6 ; ... and index with a negative offset that counts up to zero
+ pxor xmm4, xmm4 ; zero register used to widen words to dwords
+ movss xmm6, [r5] ; load the scalar *fps_factor
+ shufps xmm6, xmm6, 0 ; broadcast it to all four lanes
+ mulps xmm6, [pf_inv256] ; pre-multiply by 1/256 so the loop needs no shift
+ movdqa xmm5, [pw_3fff] ; mask for inter costs (presumably equals LOWRES_COST_MASK used by the C version - confirm)
.loop:
- movq xmm2, [r2+r5] ; intra
- movq xmm0, [r4+r5] ; invq
- movq xmm3, [r3+r5] ; inter
- movq xmm1, [r1+r5] ; prop
- punpcklwd xmm2, xmm5
- punpcklwd xmm0, xmm5
+ movq xmm2, [r2+r6] ; intra
+ movq xmm0, [r4+r6] ; invq
+ movq xmm3, [r3+r6] ; inter
+ movq xmm1, [r1+r6] ; prop
+ punpcklwd xmm2, xmm4 ; widen intra to dwords
+ punpcklwd xmm0, xmm4 ; widen invq to dwords
pmaddwd xmm0, xmm2
- pand xmm3, xmm6
- punpcklwd xmm1, xmm5
- punpcklwd xmm3, xmm5
- paddd xmm0, xmm4
- psrld xmm0, 8 ; intra*invq>>8
- paddd xmm0, xmm1 ; prop + (intra*invq>>8)
+ pand xmm3, xmm5 ; apply the mask to inter
+ punpcklwd xmm1, xmm4 ; widen prop to dwords
+ punpcklwd xmm3, xmm4 ; widen inter to dwords
+ cvtdq2ps xmm0, xmm0 ; intra*invq as float
+ mulps xmm0, xmm6 ; intra*invq*fps_factor/256 (float multiply, no rounding shift)
+ cvtdq2ps xmm1, xmm1 ; prop
+ addps xmm0, xmm1 ; prop + intra*invq*fps_factor/256
cvtdq2ps xmm1, xmm2 ; intra
psubd xmm2, xmm3 ; intra - inter
+ cvtdq2ps xmm2, xmm2 ; intra - inter
rcpps xmm3, xmm1 ; 1 / intra 1st approximation
- cvtdq2ps xmm0, xmm0
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
- cvtdq2ps xmm2, xmm2
mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
- mulps xmm0, xmm2 ; (prop + (intra*invq>>8)) * (intra - inter)
+ mulps xmm0, xmm2 ; (prop + intra*invq*fps_factor/256) * (intra - inter)
addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
subps xmm3, xmm1 ; 2nd approximation for 1/intra
mulps xmm0, xmm3 ; / intra
- cvttps2dq xmm0, xmm0 ; truncation isn't really desired, but matches the integer implementation
- movdqa [r0+r5*2], xmm0
- add r5, 8
+ cvtps2dq xmm0, xmm0 ; rounds per the MXCSR mode instead of truncating like the cvttps2dq it replaces
+ movdqa [r0+r6*2], xmm0 ; store four 32-bit results
+ add r6, 8 ; four uint16_t elements consumed per iteration
jl .loop
REP_RET
void x264_integral_init8v_sse2( uint16_t *sum8, int stride );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
- uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+ uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, int i_dst,\
rc->cplxr_sum += bits * qp2qscale( rc->qpa_rc ) / (rc->last_rceq * fabs( h->param.rc.f_pb_factor ));
}
rc->cplxr_sum *= rc->cbr_decay;
- double frame_duration = (double)h->fenc->i_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
-
- rc->wanted_bits_window += frame_duration * rc->bitrate;
+ rc->wanted_bits_window += h->fenc->f_duration * rc->bitrate;
rc->wanted_bits_window *= rc->cbr_decay;
}
rcc->last_satd = x264_rc_analyse_slice( h );
rcc->short_term_cplxsum *= 0.5;
rcc->short_term_cplxcount *= 0.5;
- rcc->short_term_cplxsum += rcc->last_satd;
+ rcc->short_term_cplxsum += rcc->last_satd / (CLIP_DURATION(h->fenc->f_duration) / BASE_FRAME_DURATION);
rcc->short_term_cplxcount ++;
rce.tex_bits = rcc->last_satd;
{
x264_ratecontrol_t *rcc = h->rc;
uint64_t all_const_bits = 0;
+ double timescale = (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
double duration = 0;
for( int i = 0; i < rcc->num_entries; i++ )
duration += rcc->entry[i].i_duration;
- duration *= (double)h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale;
+ duration *= timescale;
uint64_t all_available_bits = h->param.rc.i_bitrate * 1000. * duration;
double rate_factor, step_mult;
double qblur = h->param.rc.f_qblur;
for( int j = 1; j < cplxblur*2 && j < rcc->num_entries-i; j++ )
{
ratecontrol_entry_t *rcj = &rcc->entry[i+j];
+ double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
if( weight < .0001 )
break;
gaussian_weight = weight * exp( -j*j/200.0 );
weight_sum += gaussian_weight;
- cplx_sum += gaussian_weight * (qscale2bits(rcj, 1) - rcj->misc_bits);
+ cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
}
/* weighted average of cplx of past frames */
weight = 1.0;
for( int j = 0; j <= cplxblur*2 && j <= i; j++ )
{
ratecontrol_entry_t *rcj = &rcc->entry[i-j];
+ double frame_duration = CLIP_DURATION(rcj->i_duration * timescale) / BASE_FRAME_DURATION;
gaussian_weight = weight * exp( -j*j/200.0 );
weight_sum += gaussian_weight;
- cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits);
+ cplx_sum += gaussian_weight * (qscale2bits( rcj, 1 ) - rcj->misc_bits) / frame_duration;
weight *= 1 - pow( (float)rcj->i_count / rcc->nmb, 2 );
if( weight < .0001 )
break;
#ifndef X264_RATECONTROL_H
#define X264_RATECONTROL_H
+/* Completely arbitrary. Ratecontrol lowers relative quality at higher framerates
+ * and the reverse at lower framerates; this serves as the center of the curve.
+ * 0.04 is 1/25, i.e. the duration in seconds of one frame at 25 fps. */
+#define BASE_FRAME_DURATION (0.04f)
+
+/* Arbitrary limitations as a sanity check. With durations in seconds these
+ * bounds correspond to 1 fps (MAX) and 100 fps (MIN). CLIP_DURATION() hands
+ * both limits to x264_clip3f(), which presumably clamps f into that range. */
+#define MAX_FRAME_DURATION 1.00f
+#define MIN_FRAME_DURATION 0.01f
+
+#define CLIP_DURATION(f) x264_clip3f(f,MIN_FRAME_DURATION,MAX_FRAME_DURATION)
+
int x264_ratecontrol_new ( x264_t * );
void x264_ratecontrol_delete( x264_t * );
return i_score;
}
-static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, int ref0_distance )
+static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
{
- x264_emms();
+ int fps_factor_intra = round( CLIP_DURATION(frame->f_duration) / BASE_FRAME_DURATION * 256 ); /* this frame's clipped duration relative to BASE_FRAME_DURATION, scaled by 256 (8.8 fixed point) */
+ int fps_factor_propagate = round( CLIP_DURATION( average_duration) / BASE_FRAME_DURATION * 256 ); /* same ratio for the caller-supplied average duration */
float weightdelta = 0.0;
if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
float strength = 5.0f * (1.0f - h->param.rc.f_qcompress);
for( int mb_index = 0; mb_index < h->mb.i_mb_count; mb_index++ )
{
- int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index]+128)>>8;
+ int intra_cost = (frame->i_intra_cost[mb_index] * frame->i_inv_qscale_factor[mb_index] + 128) >> 8; /* unchanged value; only the spacing differs from the removed line */
+ int intra_cost_scaled = (intra_cost * fps_factor_intra + 128) >> 8; /* NOTE(review): rounds to 0 when intra_cost*fps_factor_intra < 128 (e.g. intra_cost == 1, factor 64); confirm x264_log2() below tolerates a zero argument if propagate_cost is also 0 */
if( intra_cost )
{
- int propagate_cost = frame->i_propagate_cost[mb_index];
- float log2_ratio = x264_log2(intra_cost + propagate_cost) - x264_log2(intra_cost) + weightdelta;
+ int propagate_cost = (frame->i_propagate_cost[mb_index] * fps_factor_propagate + 128) >> 8; /* propagated cost scaled by the average-duration factor, rounded to nearest */
+ float log2_ratio = x264_log2(intra_cost_scaled + propagate_cost) - x264_log2(intra_cost) + weightdelta; /* the subtracted term deliberately uses the unscaled intra_cost */
frame->f_qp_offset[mb_index] = frame->f_qp_offset_aq[mb_index] - strength * log2_ratio;
}
}
}
-static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, int p0, int p1, int b, int referenced )
+static void x264_macroblock_tree_propagate( x264_t *h, x264_frame_t **frames, float average_duration, int p0, int p1, int b, int referenced )
{
uint16_t *ref_costs[2] = {frames[p0]->i_propagate_cost,frames[p1]->i_propagate_cost};
int dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
int *buf = h->scratch_buffer;
uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+ x264_emms();
+ float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);
+
/* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
if( !referenced )
memset( frames[b]->i_propagate_cost, 0, h->mb.i_mb_width * sizeof(uint16_t) );
int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
h->mc.mbtree_propagate_cost( buf, propagate_cost,
frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
- frames[b]->i_inv_qscale_factor+mb_index, h->mb.i_mb_width );
+ frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
if( referenced )
propagate_cost += h->mb.i_mb_width;
for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
}
if( h->param.rc.i_vbv_buffer_size && h->param.rc.i_lookahead && referenced )
- x264_macroblock_tree_finish( h, frames[b], b == p1 ? b - p0 : 0 );
+ x264_macroblock_tree_finish( h, frames[b], average_duration, b == p1 ? b - p0 : 0 );
}
static void x264_macroblock_tree( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int num_frames, int b_intra )
int idx = !b_intra;
int last_nonb, cur_nonb = 1;
int bframes = 0;
+
+ x264_emms();
+ float total_duration = 0.0;
+ for( int j = 0; j <= num_frames; j++ )
+ total_duration += frames[j]->f_duration;
+ float average_duration = total_duration / (num_frames + 1);
+
int i = num_frames;
if( b_intra )
if( i != middle )
{
x264_slicetype_frame_cost( h, a, frames, p0, p1, i, 0 );
- x264_macroblock_tree_propagate( h, frames, p0, p1, i, 0 );
+ x264_macroblock_tree_propagate( h, frames, average_duration, p0, p1, i, 0 );
}
i--;
}
- x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, middle, 1 );
+ x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, middle, 1 );
}
else
{
while( i > cur_nonb )
{
x264_slicetype_frame_cost( h, a, frames, cur_nonb, last_nonb, i, 0 );
- x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, i, 0 );
+ x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, i, 0 );
i--;
}
}
- x264_macroblock_tree_propagate( h, frames, cur_nonb, last_nonb, last_nonb, 1 );
+ x264_macroblock_tree_propagate( h, frames, average_duration, cur_nonb, last_nonb, last_nonb, 1 );
last_nonb = cur_nonb;
}
if( !h->param.rc.i_lookahead )
{
- x264_macroblock_tree_propagate( h, frames, 0, last_nonb, last_nonb, 1 );
+ x264_macroblock_tree_propagate( h, frames, average_duration, 0, last_nonb, last_nonb, 1 );
XCHG( uint16_t*, frames[last_nonb]->i_propagate_cost, frames[0]->i_propagate_cost );
}
- x264_macroblock_tree_finish( h, frames[last_nonb], last_nonb );
+ x264_macroblock_tree_finish( h, frames[last_nonb], average_duration, last_nonb );
if( h->param.i_bframe_pyramid && bframes > 1 && !h->param.rc.i_vbv_buffer_size )
- x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], 0 );
+ x264_macroblock_tree_finish( h, frames[last_nonb+(bframes+1)/2], average_duration, 0 );
}
static int x264_vbv_frame_cost( x264_t *h, x264_mb_analysis_t *a, x264_frame_t **frames, int p0, int p1, int b )
if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
{
+ /* Initialise the verdict once, before the fps_factor loop. Resetting ok
+ * inside the loop would let a later passing iteration overwrite a mismatch
+ * found in an earlier one, so only the last iteration would be checked. */
ok = 1; used_asm = 1;
- set_func_name( "mbtree_propagate" );
- int *dsta = (int*)buf3;
- int *dstc = dsta+400;
- uint16_t *prop = (uint16_t*)buf1;
- uint16_t *intra = (uint16_t*)buf4;
- uint16_t *inter = intra+400;
- uint16_t *qscale = inter+400;
- uint16_t *rnd = (uint16_t*)buf2;
x264_emms();
- for( int i = 0; i < 400; i++ )
+ for( int i = 0; i < 10; i++ ) /* ten random fps_factor values */
{
- intra[i] = *rnd++ & 0x7fff;
- intra[i] += !intra[i];
- inter[i] = *rnd++ & 0x7fff;
- qscale[i] = *rnd++ & 0x7fff;
+ float fps_factor = (rand()&65535) / 256.; /* random factor in [0, 256) */
+ set_func_name( "mbtree_propagate" );
+ int *dsta = (int*)buf3;
+ int *dstc = dsta+400;
+ uint16_t *prop = (uint16_t*)buf1;
+ uint16_t *intra = (uint16_t*)buf4;
+ uint16_t *inter = intra+100;
+ uint16_t *qscale = inter+100;
+ uint16_t *rnd = (uint16_t*)buf2;
+ x264_emms();
+ for( int j = 0; j < 100; j++ )
+ {
+ intra[j] = *rnd++ & 0x7fff;
+ intra[j] += !intra[j]; /* intra cost is used as a divisor, so never let it be zero */
+ inter[j] = *rnd++ & 0x7fff;
+ qscale[j] = *rnd++ & 0x7fff;
+ }
+ call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, &fps_factor, 100 );
+ call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, &fps_factor, 100 );
+ // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
+ x264_emms();
+ for( int j = 0; j < 100; j++ )
+ ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; /* accept an absolute error of 1 or a relative error below 1e-4 */
}
- call_c( mc_c.mbtree_propagate_cost, dstc, prop, intra, inter, qscale, 400 );
- call_a( mc_a.mbtree_propagate_cost, dsta, prop, intra, inter, qscale, 400 );
- // I don't care about exact rounding, this is just how close the floating-point implementation happens to be
- x264_emms();
- for( int i = 0; i < 400; i++ )
- ok &= abs( dstc[i]-dsta[i] ) <= 1 || fabs( (double)dstc[i]/dsta[i]-1 ) < 1e-6;
report( "mbtree propagate :" );
}