}
}
+/* Conversion between float and Q8.8 fixed point (big-endian) for storage */
+static void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
+{
+    /* Scale each float to Q8.8 and byteswap to big-endian. */
+    const float *end = src + count;
+    while( src < end )
+        *dst++ = endian_fix16( (int16_t)(*src++ * 256.0f) );
+}
+
+static void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
+{
+    /* Decode each big-endian Q8.8 value back to a float. */
+    for( int i = 0; i < count; i++ )
+    {
+        int16_t fix8 = endian_fix16( src[i] );
+        dst[i] = fix8 * (1.0f/256.0f);
+    }
+}
+
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
{
pf->mc_luma = mc_luma;
pf->mbtree_propagate_cost = mbtree_propagate_cost;
pf->mbtree_propagate_list = mbtree_propagate_list;
+ pf->mbtree_fix8_pack = mbtree_fix8_pack;
+ pf->mbtree_fix8_unpack = mbtree_fix8_unpack;
#if HAVE_MMX
x264_mc_init_mmx( cpu, pf );
void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-
void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
int16_t *propagate_amount, uint16_t *lowres_costs,
int bipred_weight, int mb_y, int len, int list );
+ void (*mbtree_fix8_pack)( uint16_t *dst, float *src, int count );
+ void (*mbtree_fix8_unpack)( float *dst, uint16_t *src, int count );
} x264_mc_functions_t;
void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
+; Unpack shuffle: places each byteswapped 16-bit word in the high half of a
+; dword (low bytes zeroed via -1) so a subsequent psrad can sign-extend it.
+mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
+ db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
+; Pack shuffle: swaps the two bytes of every 16-bit word (host -> big-endian).
+mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
+
+; Scale factors for float <-> Q8.8: 256.0f and 1/256 = 0.00390625f.
+pf_256: times 4 dd 256.0
+pf_inv256: times 4 dd 0.00390625
+
pd_16: times 4 dd 16
pd_0f: times 4 dd 0xffff
MBTREE_PROPAGATE_LIST
INIT_XMM avx
MBTREE_PROPAGATE_LIST
+
+%macro MBTREE_FIX8 0
+;-----------------------------------------------------------------------------
+; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
+;-----------------------------------------------------------------------------
+; dst[i] = big-endian (int16_t)(src[i] * 256.0f), matching the C reference.
+cglobal mbtree_fix8_pack, 3,4
+; Load the 256.0 scale factor (m2) and the byteswap shuffle mask (m3).
+%if mmsize == 32
+ vbroadcastf128 m2, [pf_256]
+ vbroadcasti128 m3, [mbtree_fix8_pack_shuf]
+%else
+ movaps m2, [pf_256]
+ mova m3, [mbtree_fix8_pack_shuf]
+%endif
+; Negative-index loop setup: bias both pointers forward by (count - mmsize/2)
+; elements and negate the counter, so the vector loop runs r2 from
+; -(count - mmsize/2) up through 0.
+ sub r2d, mmsize/2
+ movsxdifnidn r2, r2d
+ lea r1, [r1+4*r2]
+ lea r0, [r0+2*r2]
+ neg r2
+; r2 > 0 here means count < mmsize/2: handle everything in the scalar tail.
+ jg .skip_loop
+.loop:
+; Each iteration converts mmsize/2 floats (two vector loads) into one
+; mmsize-byte store of big-endian words.
+ mulps m0, m2, [r1+4*r2]
+ mulps m1, m2, [r1+4*r2+mmsize]
+ cvttps2dq m0, m0
+ cvttps2dq m1, m1
+ packssdw m0, m1
+; Byteswap each word to big-endian.
+ pshufb m0, m3
+%if mmsize == 32
+; Fix the lane interleaving produced by 256-bit packssdw.
+ vpermq m0, m0, q3120
+%endif
+ mova [r0+2*r2], m0
+ add r2, mmsize/2
+ jle .loop
+.skip_loop:
+; After this sub, r2 is -(number of remaining tail elements); zero means done.
+ sub r2, mmsize/2
+ jz .end
+ ; Do the remaining values in scalar in order to avoid overreading src.
+.scalar:
+ mulss xm0, xm2, [r1+4*r2+2*mmsize]
+ cvttss2si r3d, xm0
+; rol by 8 byteswaps the 16-bit result to big-endian.
+ rol r3w, 8
+ mov [r0+2*r2+mmsize], r3w
+ inc r2
+ jl .scalar
+.end:
+ RET
+
+;-----------------------------------------------------------------------------
+; void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
+;-----------------------------------------------------------------------------
+; dst[i] = (int16_t)big-endian-swap(src[i]) * (1.0f/256.0f).
+cglobal mbtree_fix8_unpack, 3,4
+; m2 = 1/256 scale; m3 (and m4 in the SSE path) shuffle each byteswapped word
+; into the high half of a dword so psrad can sign-extend it.
+%if mmsize == 32
+ vbroadcastf128 m2, [pf_inv256]
+%else
+ movaps m2, [pf_inv256]
+ mova m4, [mbtree_fix8_unpack_shuf+16]
+%endif
+ mova m3, [mbtree_fix8_unpack_shuf]
+; Same negative-index loop setup as mbtree_fix8_pack above.
+ sub r2d, mmsize/2
+ movsxdifnidn r2, r2d
+ lea r1, [r1+2*r2]
+ lea r0, [r0+4*r2]
+ neg r2
+ jg .skip_loop
+.loop:
+%if mmsize == 32
+; Broadcast each 16-byte source half to both lanes, then shuffle per-lane.
+ vbroadcasti128 m0, [r1+2*r2]
+ vbroadcasti128 m1, [r1+2*r2+16]
+ pshufb m0, m3
+ pshufb m1, m3
+%else
+ mova m1, [r1+2*r2]
+ pshufb m0, m1, m3
+ pshufb m1, m4
+%endif
+; Arithmetic shift pulls the word down from the high half with sign extension.
+ psrad m0, 16 ; sign-extend
+ psrad m1, 16
+ cvtdq2ps m0, m0
+ cvtdq2ps m1, m1
+ mulps m0, m2
+ mulps m1, m2
+ movaps [r0+4*r2], m0
+ movaps [r0+4*r2+mmsize], m1
+ add r2, mmsize/2
+ jle .loop
+.skip_loop:
+ sub r2, mmsize/2
+ jz .end
+; Scalar tail to avoid overreading src.
+.scalar:
+ movzx r3d, word [r1+2*r2+mmsize]
+ rol r3w, 8
+ movsx r3d, r3w
+ ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on
+ ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design.
+ cvtsi2ss xm0, xm2, r3d
+ mulss xm0, xm2
+ movss [r0+4*r2+2*mmsize], xm0
+ inc r2
+ jl .scalar
+.end:
+ RET
+%endmacro
+
+INIT_XMM ssse3
+MBTREE_FIX8
+INIT_YMM avx2
+MBTREE_FIX8
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+/* Float <-> big-endian Q8.8 fixed-point conversion (mbtree qp storage). */
+void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count );
+void x264_mbtree_fix8_unpack_avx2 ( float *dst, uint16_t *src, int count );
#define MC_CHROMA(cpu)\
void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, intptr_t i_src,\
pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
pf->plane_copy_swap = x264_plane_copy_swap_ssse3;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_ssse3;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
{
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
}
}
float *dst = rc->mbtree.rescale_enabled ? rc->mbtree.scale_buffer[0] : frame->f_qp_offset;
- for( int i = 0; i < rc->mbtree.src_mb_count; i++ )
- {
- int16_t qp_fix8 = endian_fix16( rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos][i] );
- dst[i] = qp_fix8 * (1.f/256.f);
- }
+ h->mc.mbtree_fix8_unpack( dst, rc->mbtree.qp_buffer[rc->mbtree.qpbuf_pos], rc->mbtree.src_mb_count );
if( rc->mbtree.rescale_enabled )
x264_macroblock_tree_rescale( h, rc, frame->f_qp_offset );
if( h->frames.b_have_lowres )
if( h->param.rc.b_mb_tree && h->fenc->b_kept_as_ref && !h->param.rc.b_stat_read )
{
uint8_t i_type = h->sh.i_type;
- /* Values are stored as big-endian FIX8.8 */
- for( int i = 0; i < h->mb.i_mb_count; i++ )
- rc->mbtree.qp_buffer[0][i] = endian_fix16( (int16_t)(h->fenc->f_qp_offset[i]*256.0) );
+ h->mc.mbtree_fix8_pack( rc->mbtree.qp_buffer[0], h->fenc->f_qp_offset, h->mb.i_mb_count );
if( fwrite( &i_type, 1, 1, rc->p_mbtree_stat_file_out ) < 1 )
goto fail;
if( fwrite( rc->mbtree.qp_buffer[0], sizeof(uint16_t), h->mb.i_mb_count, rc->p_mbtree_stat_file_out ) < h->mb.i_mb_count )
call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
}
}
+
+ if( mc_a.mbtree_fix8_pack != mc_ref.mbtree_fix8_pack )
+ {
+ set_func_name( "mbtree_fix8_pack" );
+ used_asm = 1;
+ float *fix8_src = (float*)(buf3 + 0x800);
+ uint16_t *dstc = (uint16_t*)buf3;
+ uint16_t *dsta = (uint16_t*)buf4;
+ for( int i = 0; i < 5; i++ )
+ {
+ /* Vary count so both the vector loop and the scalar tail are exercised. */
+ int count = 256 + i;
+
+ for( int j = 0; j < count; j++ )
+ fix8_src[j] = (int16_t)(rand()) / 256.0f;
+ /* Canary word just past the destination to detect overwrites. */
+ dsta[count] = 0xAAAA;
+
+ call_c( mc_c.mbtree_fix8_pack, dstc, fix8_src, count );
+ call_a( mc_a.mbtree_fix8_pack, dsta, fix8_src, count );
+
+ /* asm output must match the C reference bit-exactly; canary must survive. */
+ if( memcmp( dsta, dstc, count * sizeof(uint16_t) ) || dsta[count] != 0xAAAA )
+ {
+ ok = 0;
+ fprintf( stderr, "mbtree_fix8_pack FAILED\n" );
+ break;
+ }
+ }
+ }
+
+ if( mc_a.mbtree_fix8_unpack != mc_ref.mbtree_fix8_unpack )
+ {
+ set_func_name( "mbtree_fix8_unpack" );
+ used_asm = 1;
+ uint16_t *fix8_src = (uint16_t*)(buf3 + 0x800);
+ float *dstc = (float*)buf3;
+ float *dsta = (float*)buf4;
+ for( int i = 0; i < 5; i++ )
+ {
+ /* Vary count so both the vector loop and the scalar tail are exercised. */
+ int count = 256 + i;
+
+ for( int j = 0; j < count; j++ )
+ fix8_src[j] = rand();
+ /* 32-bit canary past the destination to detect overwrites. */
+ M32( &dsta[count] ) = 0xAAAAAAAA;
+
+ call_c( mc_c.mbtree_fix8_unpack, dstc, fix8_src, count );
+ call_a( mc_a.mbtree_fix8_unpack, dsta, fix8_src, count );
+
+ /* asm output must match the C reference bit-exactly; canary must survive. */
+ if( memcmp( dsta, dstc, count * sizeof(float) ) || M32( &dsta[count] ) != 0xAAAAAAAA )
+ {
+ ok = 0;
+ fprintf( stderr, "mbtree_fix8_unpack FAILED\n" );
+ break;
+ }
+ }
+ }
report( "mbtree :" );
if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )