From: Guillaume Poirier
Date: Mon, 12 Nov 2007 20:28:30 +0000 (+0000)
Subject: add AltiVec implementation of dequant_4x4 and dequant_8x8, 2.8x faster than C,
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=3b6b4c412037f072d0511cf48524987f3b927428;p=x264

add AltiVec implementation of dequant_4x4 and dequant_8x8, 2.8x faster than C,
1.01x faster than previous revision with default encoding options
Patch by Noboru Asai % noboru DD asai AA gmail DD com %

git-svn-id: svn://svn.videolan.org/x264/trunk@684 df754926-b1dd-0310-bc7b-ec298dee348c
---

diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index aa1990bd..3036d810 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -237,3 +237,136 @@ void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[6
     }
 }
 
+#define DEQUANT_SHL() \
+{ \
+    dctv = vec_ld(0, dct[y]); \
+    mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
+    mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
+    mfv = vec_packs(mf1v, mf2v); \
+ \
+    multEvenvA = vec_mule(dctv, mfv); \
+    multOddvA = vec_mulo(dctv, mfv); \
+    dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), \
+                                 vec_mergel(multEvenvA, multOddvA)); \
+    dctv = vec_sl(dctv, i_qbitsv); \
+    vec_st(dctv, 0, dct[y]); \
+}
+
+#define DEQUANT_SHR() \
+{ \
+    dctv = vec_ld(0, dct[y]); \
+    dct1v = vec_mergeh(dctv, dctv); \
+    dct2v = vec_mergel(dctv, dctv); \
+    mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
+    mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
+ \
+    multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \
+    multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \
+    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
+    temp1v = vec_add(temp1v, fv); \
+    temp1v = vec_sra(temp1v, i_qbitsv); \
+ \
+    multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \
+    multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \
+    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
+    temp2v = vec_add(temp2v, fv); \
+    temp2v = vec_sra(temp2v, i_qbitsv); \
+ \
+    dctv = (vec_s16_t)vec_packs(temp1v, temp2v); \
+    vec_st(dctv, 0, dct[y]); \
+}
+
+void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
+{
+    const int i_mf = i_qp%6;
+    const int i_qbits = i_qp/6 - 4;
+    int y;
+
+    vec_s16_t dctv;
+    vec_s16_t dct1v, dct2v;
+    vec_s32_t mf1v, mf2v;
+    vec_s16_t mfv;
+    vec_s32_t multEvenvA, multOddvA;
+    vec_s32_t temp1v, temp2v;
+
+    if( i_qbits >= 0 )
+    {
+        vec_u16_t i_qbitsv;
+        vect_ushort_u qbits_u;
+        qbits_u.s[0]=i_qbits;
+        i_qbitsv = vec_splat(qbits_u.v, 0);
+
+        for( y = 0; y < 4; y+=2 )
+            DEQUANT_SHL();
+    }
+    else
+    {
+        const int f = 1 << (-i_qbits-1);
+
+        vec_s32_t fv;
+        vect_int_u f_u;
+        f_u.s[0]=f;
+        fv = (vec_s32_t)vec_splat(f_u.v, 0);
+
+        vec_u32_t i_qbitsv;
+        vect_int_u qbits_u;
+        qbits_u.s[0]=-i_qbits;
+        i_qbitsv = vec_splat(qbits_u.v, 0);
+
+        vec_u32_t sixteenv;
+        vect_int_u sixteen_u;
+        sixteen_u.s[0]=16;
+        sixteenv = vec_splat(sixteen_u.v, 0);
+
+        for( y = 0; y < 4; y+=2 )
+            DEQUANT_SHR();
+    }
+}
+
+void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
+{
+    const int i_mf = i_qp%6;
+    const int i_qbits = i_qp/6 - 6;
+    int y;
+
+    vec_s16_t dctv;
+    vec_s16_t dct1v, dct2v;
+    vec_s32_t mf1v, mf2v;
+    vec_s16_t mfv;
+    vec_s32_t multEvenvA, multOddvA;
+    vec_s32_t temp1v, temp2v;
+
+    if( i_qbits >= 0 )
+    {
+        vec_u16_t i_qbitsv;
+        vect_ushort_u qbits_u;
+        qbits_u.s[0]=i_qbits;
+        i_qbitsv = vec_splat(qbits_u.v, 0);
+
+        for( y = 0; y < 8; y++ )
+            DEQUANT_SHL();
+    }
+    else
+    {
+        const int f = 1 << (-i_qbits-1);
+
+        vec_s32_t fv;
+        vect_int_u f_u;
+        f_u.s[0]=f;
+        fv = (vec_s32_t)vec_splat(f_u.v, 0);
+
+        vec_u32_t i_qbitsv;
+        vect_int_u qbits_u;
+        qbits_u.s[0]=-i_qbits;
+        i_qbitsv = vec_splat(qbits_u.v, 0);
+
+        vec_u32_t sixteenv;
+        vect_int_u sixteen_u;
+        sixteen_u.s[0]=16;
+        sixteenv = vec_splat(sixteen_u.v, 0);
+
+        for( y = 0; y < 8; y++ )
+            DEQUANT_SHR();
+    }
+}
+
diff --git a/common/ppc/quant.h b/common/ppc/quant.h
index 84d39436..f10955a8 100644
--- a/common/ppc/quant.h
+++ b/common/ppc/quant.h
@@ -26,4 +26,7 @@ void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[6
 
 void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias );
 void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias );
+
+void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp );
+void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp );
 #endif
diff --git a/common/quant.c b/common/quant.c
index 48663e67..a4d853e5 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -245,6 +245,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
         pf->quant_4x4 = x264_quant_4x4_altivec;
         pf->quant_8x8 = x264_quant_8x8_altivec;
+
+        pf->dequant_4x4 = x264_dequant_4x4_altivec;
+        pf->dequant_8x8 = x264_dequant_8x8_altivec;
     }
 #endif
 }