((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
- int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
+ int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
h->scratch_buffer = NULL;
int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
- CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+ int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
+ scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
+ CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
return 0;
fail:
/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given macroblock. */
-static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
{
float fps = *fps_factor;
for( int i = 0; i < len; i++ )
{
- float intra_cost = intra_costs[i] * inv_qscales[i];
- float propagate_amount = propagate_in[i] + intra_cost*fps;
- float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
- float propagate_denom = intra_costs[i];
- dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
+ int intra_cost = intra_costs[i];
+ /* Clamp the inter cost to the intra cost so propagate_num can never go negative. */
+ int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK);
+ float propagate_intra = intra_cost * inv_qscales[i];
+ float propagate_amount = propagate_in[i] + propagate_intra*fps;
+ float propagate_num = intra_cost - inter_cost;
+ float propagate_denom = intra_cost;
+ /* dst is int16_t now, so saturate the rounded result at INT16_MAX on store. */
+ dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767);
}
}
+/* Propagate each macroblock's propagate_amount back into the reference frame's
+ * per-MB cost array along its motion vector, splitting the amount bilinearly
+ * between the four reference MBs the motion-compensated block overlaps.
+ * ref_costs:        accumulated propagate cost of the reference frame (one list).
+ * mvs:              lowres MVs for this row; the low 5 bits after >>5 give the
+ *                   sub-MB fraction (1/32-MB units -- TODO confirm scaling vs caller).
+ * propagate_amount: per-MB output of mbtree_propagate_cost for this row.
+ * lowres_costs:     packed cost | (lists_used << LOWRES_COST_SHIFT).
+ * bipred_weight:    weight applied when both lists were used (lists_used == 3).
+ * mb_y:             MB row of the current frame; len: number of MBs in the row;
+ * list:             which reference list (0/1) this pass accumulates into. */
+static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+ int16_t *propagate_amount, uint16_t *lowres_costs,
+ int bipred_weight, int mb_y, int len, int list )
+{
+ unsigned stride = h->mb.i_mb_stride;
+ unsigned width = h->mb.i_mb_width;
+ unsigned height = h->mb.i_mb_height;
+
+ for( unsigned i = 0; i < len; i++ )
+ {
+/* Saturating accumulate: keeps ref_costs values <= INT16_MAX. */
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+ int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT;
+
+ if( !(lists_used & (1 << list)) )
+ continue;
+
+ int listamount = propagate_amount[i];
+ /* Apply bipred weighting. */
+ if( lists_used == 3 )
+ listamount = (listamount * bipred_weight + 32) >> 6;
+
+ /* Early termination for simple case of mv0. */
+ if( !M32( mvs[i] ) )
+ {
+ CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
+ continue;
+ }
+
+ int x = mvs[i][0];
+ int y = mvs[i][1];
+ unsigned mbx = (x>>5)+i;
+ unsigned mby = (y>>5)+mb_y;
+ unsigned idx0 = mbx + mby * stride;
+ unsigned idx2 = idx0 + stride;
+ x &= 31;
+ y &= 31;
+ /* Bilinear weights of the four overlapped MBs; the four sum to 32*32 = 1024,
+ * hence the +512 >> 10 rounding below. */
+ int idx0weight = (32-y)*(32-x);
+ int idx1weight = (32-y)*x;
+ int idx2weight = y*(32-x);
+ int idx3weight = y*x;
+ idx0weight = (idx0weight * listamount + 512) >> 10;
+ idx1weight = (idx1weight * listamount + 512) >> 10;
+ idx2weight = (idx2weight * listamount + 512) >> 10;
+ idx3weight = (idx3weight * listamount + 512) >> 10;
+
+ /* Fast path: all four target MBs are strictly inside the frame. */
+ if( mbx < width-1 && mby < height-1 )
+ {
+ CLIP_ADD( ref_costs[idx0+0], idx0weight );
+ CLIP_ADD( ref_costs[idx0+1], idx1weight );
+ CLIP_ADD( ref_costs[idx2+0], idx2weight );
+ CLIP_ADD( ref_costs[idx2+1], idx3weight );
+ }
+ else
+ {
+ /* Note: this takes advantage of unsigned representation to
+ * catch negative mbx/mby. */
+ if( mby < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx0+0], idx0weight );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx0+1], idx1weight );
+ }
+ if( mby+1 < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx2+0], idx2weight );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx2+1], idx3weight );
+ }
+ }
+ }
+#undef CLIP_ADD
+}
+
/* Fill the motion-compensation function table with the plain C implementations,
 * then let the per-arch init routines override entries with optimized versions. */
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
{
pf->mc_luma = mc_luma;
pf->integral_init8v = integral_init8v;
pf->mbtree_propagate_cost = mbtree_propagate_cost;
+ pf->mbtree_propagate_list = mbtree_propagate_list;
#if HAVE_MMX
x264_mc_init_mmx( cpu, pf );
#endif
/* cpu_independent forces the C mbtree functions back in so lookahead results
 * are bit-identical regardless of which SIMD extensions are present. */
if( cpu_independent )
+ {
pf->mbtree_propagate_cost = mbtree_propagate_cost;
+ pf->mbtree_propagate_list = mbtree_propagate_list;
+ }
}
void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
weight_fn_t *offsetsub;
void (*weight_cache)( x264_t *, x264_weight_t * );
- void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+ void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+
+ void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+ int16_t *propagate_amount, uint16_t *lowres_costs,
+ int bipred_weight, int mb_y, int len, int list );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
const pw_512, times 16 dw 512
const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
+const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
const pd_1, times 8 dd 1
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
SECTION_RODATA 32
+pw_1024: times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
-pw_1024: times 16 dw 1024
-
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
+pw_0xc000: times 8 dw 0xc000
+pw_31: times 8 dw 31
+pd_4: times 4 dd 4
+
SECTION .text
cextern pb_0
cextern pw_1
+cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_512
cextern pw_00ff
cextern pw_3fff
cextern pw_pixel_max
+cextern pw_0to15
cextern pd_ffff
%macro LOAD_ADD 4
cglobal mbtree_propagate_cost, 6,6,7
movss m6, [r5]
mov r5d, r6m
- lea r0, [r0+r5*4]
+ lea r0, [r0+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
movq m0, [r4+r5] ; invq
movq m3, [r3+r5] ; inter
movq m1, [r1+r5] ; prop
+ pand m3, m5
+ pminsw m3, m2
punpcklwd m2, m4
punpcklwd m0, m4
pmaddwd m0, m2
- pand m3, m5
punpcklwd m1, m4
punpcklwd m3, m4
%if cpuflag(fma4)
mulps m0, m3 ; / intra
%endif
cvtps2dq m0, m0
- mova [r0+r5*2], m0
+ packssdw m0, m0
+ movh [r0+r5], m0
add r5, 8
jl .loop
RET
cglobal mbtree_propagate_cost, 6,6,%1
vbroadcastss m6, [r5]
mov r5d, r6m
- lea r0, [r0+r5*4]
+ lea r0, [r0+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm5, [r3+r5] ; inter
pmovzxwd m3, xm3
+ pminsd m3, m0
pmaddwd m1, m0
psubd m4, m0, m3
cvtdq2ps m0, m0
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm5, [r3+r5]
+ pminsw xm3, xm0
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
mulps m1, m3 ; / intra
%endif
vcvtps2dq m1, m1
- mova [r0+r5*2], m1
+ vextractf128 xm2, m1, 1
+ packssdw xm1, xm2
+ mova [r0+r5], xm1
add r5, 16
jl .loop
RET
MBTREE_AVX 8
INIT_YMM avx2,fma3
MBTREE_AVX 7
+
+%macro MBTREE_PROPAGATE_LIST 0
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs,
+; int16_t *output, int bipred_weight, int mb_y, int len )
+;-----------------------------------------------------------------------------
+; Precomputes, per MB, the target {mbx,mby} coordinates and the four bilinear
+; idx weights; the C wrapper (PROPAGATE_LIST in mc.h) then does the bounds
+; checks and the saturating accumulation into ref_costs.
+cglobal mbtree_propagate_list_internal, 4,6,8
+; r0=mvs, r1=propagate_amount, r2=lowres_costs, r3=output,
+; r4m=bipred_weight, r5m=mb_y, r6m=len
+ movh m6, [pw_0to15] ; mb_x
+ movd m7, r5m
+ pshuflw m7, m7, 0
+ punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y
+ movd m7, r4m
+ SPLATW m7, m7 ; bipred_weight
+ psllw m7, 9 ; bipred_weight << 9
+
+ mov r5d, r6m
+ xor r4d, r4d
+.loop:
+ mova m3, [r1+r4*2]
+ movu m4, [r2+r4*2]
+ mova m5, [pw_0xc000] ; mask of both lists_used bits in lowres_costs
+ pand m4, m5
+ pcmpeqw m4, m5 ; m4 = (lists_used == 3) ? -1 : 0 per word
+ pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%if cpuflag(avx)
+ pblendvb m5, m3, m5, m4
+%else
+ pand m5, m4
+ pandn m4, m3
+ por m5, m4 ; if( lists_used == 3 )
+ ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%endif
+
+ movu m0, [r0+r4*4] ; x,y
+ movu m1, [r0+r4*4+mmsize]
+
+ psraw m2, m0, 5
+ psraw m3, m1, 5
+ mova m4, [pd_4]
+ paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+ paddw m6, m4 ; {mbx, mby} += {4, 0}
+ paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+ paddw m6, m4 ; {mbx, mby} += {4, 0}
+
+ ; store packed {mbx,mby} pairs for the C wrapper's bounds checks
+ mova [r3+mmsize*0], m2
+ mova [r3+mmsize*1], m3
+
+ mova m3, [pw_31]
+ pand m0, m3 ; x &= 31
+ pand m1, m3 ; y &= 31
+ packuswb m0, m1
+ psrlw m1, m0, 3
+ pand m0, m3 ; x
+ SWAP 1, 3
+ pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw
+
+ mova m3, [pw_32]
+ psubw m3, m0 ; 32 - x
+ mova m4, [pw_1024]
+ psubw m4, m1 ; (32 - y) << 5
+
+ pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5
+ pmullw m4, m0 ; idx1weight = (32-y)*x << 5
+ pmullw m0, m1 ; idx3weight = y*x << 5
+ pmullw m1, m3 ; idx2weight = y*(32-x) << 5
+
+ ; avoid overflow in the input to pmulhrsw
+ psrlw m3, m2, 15
+ psubw m2, m3 ; idx0weight -= (idx0weight == 32768)
+
+ pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10
+ pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10
+ pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10
+ pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10
+
+ ; interleave so idx0/idx1 and idx2/idx3 weights land in adjacent words
+ SBUTTERFLY wd, 2, 4, 3
+ SBUTTERFLY wd, 1, 0, 3
+ mova [r3+mmsize*2], m2
+ mova [r3+mmsize*3], m4
+ mova [r3+mmsize*4], m1
+ mova [r3+mmsize*5], m0
+ add r4d, mmsize/2
+ add r3, mmsize*6 ; 8 MBs per iteration: coords + idx01 + idx23 blocks
+ cmp r4d, r5d
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM ssse3
+MBTREE_PROPAGATE_LIST
+INIT_XMM avx
+MBTREE_PROPAGATE_LIST
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
#define MC_CHROMA(cpu)\
PLANE_INTERLEAVE(avx)
#endif
+#if HAVE_X86_INLINE_ASM
+/* Saturating 16-bit add of x into s via paddsw; both operands are kept
+ * non-negative by the callers, so saturation at INT16_MAX matches the C
+ * fallback's X264_MIN clamp.
+ * NOTE(review): movd loads 32 bits at &s/&x even though only the low 16 are
+ * used -- assumes the two adjacent bytes are readable; confirm for callers. */
+#define CLIP_ADD(s,x)\
+do\
+{\
+ int temp;\
+ asm("movd %0, %%xmm0 \n"\
+ "movd %2, %%xmm1 \n"\
+ "paddsw %%xmm1, %%xmm0 \n"\
+ "movd %%xmm0, %1 \n"\
+ :"+m"(s), "=&r"(temp)\
+ :"m"(x)\
+ );\
+ s = temp;\
+} while(0)
+
+/* Same as CLIP_ADD but adds two adjacent int16 lanes at once: s[0..1] += x[0..1]. */
+#define CLIP_ADD2(s,x)\
+do\
+{\
+ asm("movd %0, %%xmm0 \n"\
+ "movd %1, %%xmm1 \n"\
+ "paddsw %%xmm1, %%xmm0 \n"\
+ "movd %%xmm0, %0 \n"\
+ :"+m"(M32(s))\
+ :"m"(M32(x))\
+ );\
+} while(0)
+#else
+/* Portable fallback: clamp the accumulated cost at INT16_MAX. */
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+ CLIP_ADD((s)[0], (x)[0]);\
+ CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+#endif
+
+/* Wrapper around the asm kernel: mbtree_propagate_list_internal precomputes,
+ * for every MB of the row, the target MB coordinates and the four bilinear
+ * weights into h->scratch_buffer2, in batches of 8 MBs (48 int16 per batch:
+ * [0..15] packed {mbx,mby} pairs, [16..31] interleaved idx0/idx1 weights,
+ * [32..47] interleaved idx2/idx3 weights). This C loop then performs the
+ * frame-boundary checks and the saturating accumulation into ref_costs. */
+#define PROPAGATE_LIST(cpu)\
+void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
+ uint16_t *lowres_costs, int16_t *output,\
+ int bipred_weight, int mb_y, int len );\
+\
+static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
+ int16_t *propagate_amount, uint16_t *lowres_costs,\
+ int bipred_weight, int mb_y, int len, int list )\
+{\
+ int16_t *current = h->scratch_buffer2;\
+\
+ x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
+ current, bipred_weight, mb_y, len );\
+\
+ unsigned stride = h->mb.i_mb_stride;\
+ unsigned width = h->mb.i_mb_width;\
+ unsigned height = h->mb.i_mb_height;\
+\
+ for( unsigned i = 0; i < len; current += 32 )\
+ {\
+ /* One 8-MB batch of asm output per outer iteration; the +32 above plus\
+ * the inner +2 per MB step past the coord and weight blocks. */\
+ int end = X264_MIN( i+8, len );\
+ for( ; i < end; i++, current += 2 )\
+ {\
+ if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
+ continue;\
+\
+ unsigned mbx = current[0];\
+ unsigned mby = current[1];\
+ unsigned idx0 = mbx + mby * stride;\
+ unsigned idx2 = idx0 + stride;\
+\
+ /* Shortcut for the simple/common case of zero MV */\
+ if( !M32( mvs[i] ) )\
+ {\
+ CLIP_ADD( ref_costs[idx0], current[16] );\
+ continue;\
+ }\
+\
+ /* Fast path: all four target MBs strictly inside the frame. */\
+ if( mbx < width-1 && mby < height-1 )\
+ {\
+ CLIP_ADD2( ref_costs+idx0, current+16 );\
+ CLIP_ADD2( ref_costs+idx2, current+32 );\
+ }\
+ else\
+ {\
+ /* Note: this takes advantage of unsigned representation to\
+ * catch negative mbx/mby. */\
+ if( mby < height )\
+ {\
+ if( mbx < width )\
+ CLIP_ADD( ref_costs[idx0+0], current[16] );\
+ if( mbx+1 < width )\
+ CLIP_ADD( ref_costs[idx0+1], current[17] );\
+ }\
+ if( mby+1 < height )\
+ {\
+ if( mbx < width )\
+ CLIP_ADD( ref_costs[idx2+0], current[32] );\
+ if( mbx+1 < width )\
+ CLIP_ADD( ref_costs[idx2+1], current[33] );\
+ }\
+ }\
+ }\
+ }\
+}
+
+PROPAGATE_LIST(ssse3)
+PROPAGATE_LIST(avx)
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
{
return;
pf->memzero_aligned = x264_memzero_aligned_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
if( cpu&X264_CPU_FMA4 )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
SECTION_RODATA 32
-pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 16 dw -3
pw_m7: times 16 dw -7
cextern pw_16
cextern pw_00ff
cextern pw_pixel_max
+cextern pw_0to15
%macro STORE8 1
mova [r0+0*FDEC_STRIDEB], %1
return i_score;
}
+/* Trade off precision in mbtree for increased range */
+#define MBTREE_PRECISION 0.5f
+
static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
{
- int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
+ int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION );
float weightdelta = 0.0;
if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
- int *buf = h->scratch_buffer;
+ int16_t *buf = h->scratch_buffer;
uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+ uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b];
x264_emms();
- float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f);
+ float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION;
/* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
if( !referenced )
{
int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
h->mc.mbtree_propagate_cost( buf, propagate_cost,
- frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+ frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index,
frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
if( referenced )
propagate_cost += h->mb.i_mb_width;
- for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
+
+ h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index],
+ bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 );
+ if( b != p1 )
{
- int propagate_amount = buf[h->mb.i_mb_x];
- /* Don't propagate for an intra block. */
- if( propagate_amount > 0 )
- {
- /* Access width-2 bitfield. */
- int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
- /* Follow the MVs to the previous frame(s). */
- for( int list = 0; list < 2; list++ )
- if( (lists_used >> list)&1 )
- {
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
- int listamount = propagate_amount;
- /* Apply bipred weighting. */
- if( lists_used == 3 )
- listamount = (listamount * bipred_weights[list] + 32) >> 6;
-
- /* Early termination for simple case of mv0. */
- if( !M32( mvs[list][mb_index] ) )
- {
- CLIP_ADD( ref_costs[list][mb_index], listamount );
- continue;
- }
-
- int x = mvs[list][mb_index][0];
- int y = mvs[list][mb_index][1];
- int mbx = (x>>5)+h->mb.i_mb_x;
- int mby = (y>>5)+h->mb.i_mb_y;
- int idx0 = mbx + mby * h->mb.i_mb_stride;
- int idx1 = idx0 + 1;
- int idx2 = idx0 + h->mb.i_mb_stride;
- int idx3 = idx0 + h->mb.i_mb_stride + 1;
- x &= 31;
- y &= 31;
- int idx0weight = (32-y)*(32-x);
- int idx1weight = (32-y)*x;
- int idx2weight = y*(32-x);
- int idx3weight = y*x;
-
- /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
- * be counted. */
- if( mbx < h->mb.i_mb_width-1 && mby < h->mb.i_mb_height-1 && mbx >= 0 && mby >= 0 )
- {
- CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
- CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
- CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
- CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
- }
- else /* Check offsets individually */
- {
- if( mbx < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx >= 0 && mby >= 0 )
- CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
- if( mbx+1 < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx+1 >= 0 && mby >= 0 )
- CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
- if( mbx < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx >= 0 && mby+1 >= 0 )
- CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
- if( mbx+1 < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
- CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
- }
- }
- }
+ h->mc.mbtree_propagate_list( h, ref_costs[1], &mvs[1][mb_index], buf, &lowres_costs[mb_index],
+ bipred_weights[1], h->mb.i_mb_y, h->mb.i_mb_width, 1 );
}
}
INTEGRAL_INIT( integral_init8v, 9, sum, stride );
report( "integral init :" );
+ ok = 1; used_asm = 0;
if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
{
- ok = 1; used_asm = 1;
+ used_asm = 1;
x264_emms();
for( int i = 0; i < 10; i++ )
{
float fps_factor = (rand()&65535) / 65535.0f;
- set_func_name( "mbtree_propagate" );
- int *dsta = (int*)buf3;
- int *dstc = dsta+400;
+ set_func_name( "mbtree_propagate_cost" );
+ int16_t *dsta = (int16_t*)buf3;
+ int16_t *dstc = dsta+400;
uint16_t *prop = (uint16_t*)buf1;
uint16_t *intra = (uint16_t*)buf4;
uint16_t *inter = intra+128;
{
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
if( !ok )
- fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+ fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
}
}
- report( "mbtree propagate :" );
}
+ if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
+ {
+ used_asm = 1;
+ for( int i = 0; i < 8; i++ )
+ {
+ set_func_name( "mbtree_propagate_list" );
+ x264_t h;
+ int height = 4;
+ int width = 128;
+ int size = width*height;
+ h.mb.i_mb_stride = width;
+ h.mb.i_mb_width = width;
+ h.mb.i_mb_height = height;
+
+ uint16_t *ref_costsc = (uint16_t*)buf3;
+ uint16_t *ref_costsa = (uint16_t*)buf4;
+ int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
+ int16_t *propagate_amount = (int16_t*)(mvs + width);
+ uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
+ h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
+ int bipred_weight = (rand()%63)+1;
+ int list = i&1;
+ for( int j = 0; j < size; j++ )
+ ref_costsc[j] = ref_costsa[j] = rand()&32767;
+ for( int j = 0; j < width; j++ )
+ {
+ static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
+ for( int k = 0; k < 2; k++ )
+ mvs[j][k] = (rand()&127) - 64;
+ propagate_amount[j] = rand()&32767;
+ lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
+ }
+
+ call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+ call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+
+ for( int j = 0; j < size && ok; j++ )
+ {
+ ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
+ if( !ok )
+ fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
+ }
+
+ call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+ call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+ }
+ }
+ report( "mbtree :" );
+
if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
{
set_func_name( "memcpy_aligned" );