From 3e25eab0b7172e3c0b067b8b6d641ce148d03db9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Martin=20Storsj=C3=B6?= Date: Thu, 3 Sep 2015 09:30:43 +0300 Subject: [PATCH] x86: Share the mbtree_propagate_list macro with aarch64 This avoids having to duplicate the same code for all architectures that implement only the internal part of this function in assembler. --- common/aarch64/mc-c.c | 83 +------------------------------------------ common/mc.c | 20 +++++------ common/mc.h | 74 ++++++++++++++++++++++++++++++++++++++ common/x86/mc-c.c | 82 +++--------------------------------------- 4 files changed, 88 insertions(+), 171 deletions(-) diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c index b94e3d3d..8d480d7f 100644 --- a/common/aarch64/mc-c.c +++ b/common/aarch64/mc-c.c @@ -205,88 +205,7 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, int height, int16_t *buf ); #endif // !HIGH_BIT_DEPTH -#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) -#define CLIP_ADD2(s,x)\ -do\ -{\ - CLIP_ADD((s)[0], (x)[0]);\ - CLIP_ADD((s)[1], (x)[1]);\ -} while(0) - -void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2], - int16_t *propagate_amount, - uint16_t *lowres_costs, - int16_t *output, - int bipred_weight, int mb_y, - int len ); - -static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs, - int16_t (*mvs)[2], - int16_t *propagate_amount, - uint16_t *lowres_costs, - int bipred_weight, int mb_y, - int len, int list ) -{ - int16_t *current = h->scratch_buffer2; - - x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount, - lowres_costs, current, - bipred_weight, mb_y, len ); - - unsigned stride = h->mb.i_mb_stride; - unsigned width = h->mb.i_mb_width; - unsigned height = h->mb.i_mb_height; - - for( unsigned i = 0; i < len; current += 32 ) - { - int end = X264_MIN( i+8, len ); - for( ; i < end; i++, current += 2 ) - { - if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) ) - continue; - - unsigned mbx = current[0]; - unsigned mby = current[1]; - unsigned idx0 = mbx + mby * stride; - unsigned idx2 = idx0 + stride; - - /* Shortcut for the simple/common case of zero MV */ - if( !M32( mvs[i] ) ) - { - CLIP_ADD( ref_costs[idx0], current[16] ); - continue; - } - - if( mbx < width-1 && mby < height-1 ) - { - CLIP_ADD2( ref_costs+idx0, current+16 ); - CLIP_ADD2( ref_costs+idx2, current+32 ); - } - else - { - /* Note: this takes advantage of unsigned representation to - * catch negative mbx/mby. */ - if( mby < height ) - { - if( mbx < width ) - CLIP_ADD( ref_costs[idx0+0], current[16] ); - if( mbx+1 < width ) - CLIP_ADD( ref_costs[idx0+1], current[17] ); - } - if( mby+1 < height ) - { - if( mbx < width ) - CLIP_ADD( ref_costs[idx2+0], current[32] ); - if( mbx+1 < width ) - CLIP_ADD( ref_costs[idx2+1], current[33] ); - } - } - } - } -} - -#undef CLIP_ADD -#undef CLIP_ADD2 +PROPAGATE_LIST(neon) void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf ) { diff --git a/common/mc.c b/common/mc.c index 8c63e1b5..57c1f23a 100644 --- a/common/mc.c +++ b/common/mc.c @@ -526,7 +526,6 @@ static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs for( unsigned i = 0; i < len; i++ ) { -#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT; if( !(lists_used & (1 << list)) ) @@ -540,7 +539,7 @@ static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs /* Early termination for simple case of mv0. */ if( !M32( mvs[i] ) ) { - CLIP_ADD( ref_costs[mb_y*stride + i], listamount ); + MC_CLIP_ADD( ref_costs[mb_y*stride + i], listamount ); continue; } @@ -563,10 +562,10 @@ static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs if( mbx < width-1 && mby < height-1 ) { - CLIP_ADD( ref_costs[idx0+0], idx0weight ); - CLIP_ADD( ref_costs[idx0+1], idx1weight ); - CLIP_ADD( ref_costs[idx2+0], idx2weight ); - CLIP_ADD( ref_costs[idx2+1], idx3weight ); + MC_CLIP_ADD( ref_costs[idx0+0], idx0weight ); + MC_CLIP_ADD( ref_costs[idx0+1], idx1weight ); + MC_CLIP_ADD( ref_costs[idx2+0], idx2weight ); + MC_CLIP_ADD( ref_costs[idx2+1], idx3weight ); } else { @@ -575,20 +574,19 @@ static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs if( mby < height ) { if( mbx < width ) - CLIP_ADD( ref_costs[idx0+0], idx0weight ); + MC_CLIP_ADD( ref_costs[idx0+0], idx0weight ); if( mbx+1 < width ) - CLIP_ADD( ref_costs[idx0+1], idx1weight ); + MC_CLIP_ADD( ref_costs[idx0+1], idx1weight ); } if( mby+1 < height ) { if( mbx < width ) - CLIP_ADD( ref_costs[idx2+0], idx2weight ); + MC_CLIP_ADD( ref_costs[idx2+0], idx2weight ); if( mbx+1 < width ) - CLIP_ADD( ref_costs[idx2+1], idx3weight ); + MC_CLIP_ADD( ref_costs[idx2+1], idx3weight ); } } } -#undef CLIP_ADD } void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent ) diff --git a/common/mc.h b/common/mc.h index 53aab379..47184ea4 100644 --- a/common/mc.h +++ b/common/mc.h @@ -26,6 +26,80 @@ #ifndef X264_MC_H #define X264_MC_H +#define MC_CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) +#define MC_CLIP_ADD2(s,x)\ +do\ +{\ + MC_CLIP_ADD((s)[0], (x)[0]);\ + MC_CLIP_ADD((s)[1], (x)[1]);\ +} while(0) + +#define PROPAGATE_LIST(cpu)\ +void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\ + uint16_t *lowres_costs, int16_t *output,\ + int bipred_weight, int mb_y, int len );\ +\ +static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\ + int16_t *propagate_amount, uint16_t *lowres_costs,\ + int bipred_weight, int mb_y, int len, int list )\ +{\ + int16_t *current = h->scratch_buffer2;\ +\ + x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\ + current, bipred_weight, mb_y, len );\ +\ + unsigned stride = h->mb.i_mb_stride;\ + unsigned width = h->mb.i_mb_width;\ + unsigned height = h->mb.i_mb_height;\ +\ + for( unsigned i = 0; i < len; current += 32 )\ + {\ + int end = X264_MIN( i+8, len );\ + for( ; i < end; i++, current += 2 )\ + {\ + if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\ + continue;\ +\ + unsigned mbx = current[0];\ + unsigned mby = current[1];\ + unsigned idx0 = mbx + mby * stride;\ + unsigned idx2 = idx0 + stride;\ +\ + /* Shortcut for the simple/common case of zero MV */\ + if( !M32( mvs[i] ) )\ + {\ + MC_CLIP_ADD( ref_costs[idx0], current[16] );\ + continue;\ + }\ +\ + if( mbx < width-1 && mby < height-1 )\ + {\ + MC_CLIP_ADD2( ref_costs+idx0, current+16 );\ + MC_CLIP_ADD2( ref_costs+idx2, current+32 );\ + }\ + else\ + {\ + /* Note: this takes advantage of unsigned representation to\ + * catch negative mbx/mby. */\ + if( mby < height )\ + {\ + if( mbx < width )\ + MC_CLIP_ADD( ref_costs[idx0+0], current[16] );\ + if( mbx+1 < width )\ + MC_CLIP_ADD( ref_costs[idx0+1], current[17] );\ + }\ + if( mby+1 < height )\ + {\ + if( mbx < width )\ + MC_CLIP_ADD( ref_costs[idx2+0], current[32] );\ + if( mbx+1 < width )\ + MC_CLIP_ADD( ref_costs[idx2+1], current[33] );\ + }\ + }\ + }\ + }\ +} + struct x264_weight_t; typedef void (* weight_fn_t)( pixel *, intptr_t, pixel *,intptr_t, const struct x264_weight_t *, int ); typedef struct x264_weight_t diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c index d868706c..b437ca4e 100644 --- a/common/x86/mc-c.c +++ b/common/x86/mc-c.c @@ -590,7 +590,8 @@ PLANE_INTERLEAVE(avx) #endif #if HAVE_X86_INLINE_ASM -#define CLIP_ADD(s,x)\ +#undef MC_CLIP_ADD +#define MC_CLIP_ADD(s,x)\ do\ {\ int temp;\ @@ -604,7 +605,8 @@ do\ s = temp;\ } while(0) -#define CLIP_ADD2(s,x)\ +#undef MC_CLIP_ADD2 +#define MC_CLIP_ADD2(s,x)\ do\ {\ asm("movd %0, %%xmm0 \n"\ @@ -615,86 +617,10 @@ do\ :"m"(M32(x))\ );\ } while(0) -#else -#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) -#define CLIP_ADD2(s,x)\ -do\ -{\ - CLIP_ADD((s)[0], (x)[0]);\ - CLIP_ADD((s)[1], (x)[1]);\ -} while(0) #endif -#define PROPAGATE_LIST(cpu)\ -void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\ - uint16_t *lowres_costs, int16_t *output,\ - int bipred_weight, int mb_y, int len );\ -\ -static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\ - int16_t *propagate_amount, uint16_t *lowres_costs,\ - int bipred_weight, int mb_y, int len, int list )\ -{\ - int16_t *current = h->scratch_buffer2;\ -\ - x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\ - current, bipred_weight, mb_y, len );\ -\ - unsigned stride = h->mb.i_mb_stride;\ - unsigned width = h->mb.i_mb_width;\ - unsigned height = h->mb.i_mb_height;\ -\ - for( unsigned i = 0; i < len; current += 32 )\ - {\ - int end = X264_MIN( i+8, len );\ - for( ; i < end; i++, current += 2 )\ - {\ - if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\ - continue;\ -\ - unsigned mbx = current[0];\ - unsigned mby = current[1];\ - unsigned idx0 = mbx + mby * stride;\ - unsigned idx2 = idx0 + stride;\ -\ - /* Shortcut for the simple/common case of zero MV */\ - if( !M32( mvs[i] ) )\ - {\ - CLIP_ADD( ref_costs[idx0], current[16] );\ - continue;\ - }\ -\ - if( mbx < width-1 && mby < height-1 )\ - {\ - CLIP_ADD2( ref_costs+idx0, current+16 );\ - CLIP_ADD2( ref_costs+idx2, current+32 );\ - }\ - else\ - {\ - /* Note: this takes advantage of unsigned representation to\ - * catch negative mbx/mby. */\ - if( mby < height )\ - {\ - if( mbx < width )\ - CLIP_ADD( ref_costs[idx0+0], current[16] );\ - if( mbx+1 < width )\ - CLIP_ADD( ref_costs[idx0+1], current[17] );\ - }\ - if( mby+1 < height )\ - {\ - if( mbx < width )\ - CLIP_ADD( ref_costs[idx2+0], current[32] );\ - if( mbx+1 < width )\ - CLIP_ADD( ref_costs[idx2+1], current[33] );\ - }\ - }\ - }\ - }\ -} - PROPAGATE_LIST(ssse3) PROPAGATE_LIST(avx) -#undef CLIP_ADD -#undef CLIP_ADD2 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { -- 2.39.2