#include "quant.h"
// Quant of 16 coefficients (two 8x16-bit vectors) of a 4x4 block at byte
// offsets idx0/idx1 into dct, unrolled 2x and "pre-scheduled".
// Per element: out = sign(coef) * (((abs(coef) + bias) * mf) >> qbits),
// computed in 32-bit lanes via vec_mule/vec_mulo and repacked with
// saturation; non-zero results are OR-accumulated into `nz`.
// Uses caller-scope variables: temp1v/temp2v, mfvA/mfvB, biasvA/biasvB,
// mskA/mskB, coefvA/coefvB, multEven*/multOdd*, i_qbitsv, zero_s16v,
// one, nz, dct, mf, bias.
// Wrapped in do { } while(0) so the macro expands to exactly one
// statement: a bare { ... } followed by the call-site `;` leaves an
// empty statement behind and breaks `if( x ) MACRO(); else ...`.
#define QUANT_16_U( idx0, idx1 )                                \
do                                                              \
{                                                               \
    /* load coefficients and per-element quant tables */        \
    temp1v = vec_ld((idx0), *dct);                              \
    temp2v = vec_ld((idx1), *dct);                              \
    mfvA = vec_ld((idx0), mf);                                  \
    mfvB = vec_ld((idx1), mf);                                  \
    biasvA = vec_ld((idx0), bias);                              \
    biasvB = vec_ld((idx1), bias);                              \
    /* remember signs, then work on absolute values */          \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    mskB = vec_cmplt(temp2v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \
    coefvA = vec_adds(coefvA, biasvA);                          \
    coefvB = vec_adds(coefvB, biasvB);                          \
    /* widen to 32-bit lanes: (abs + bias) * mf >> qbits */     \
    multEvenvA = vec_mule(coefvA, mfvA);                        \
    multOddvA = vec_mulo(coefvA, mfvA);                         \
    multEvenvB = vec_mule(coefvB, mfvB);                        \
    multOddvB = vec_mulo(coefvB, mfvB);                         \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    /* narrow back to 16 bits, restore signs (xor mask, +1 via mask) */ \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));              \
    vec_st(temp1v, (idx0), (int16_t*)dct);                      \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));              \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                    \
    vec_st(temp2v, (idx1), (int16_t*)dct);                      \
} while( 0 )
// Quantize a full 4x4 residual block in place using AltiVec, with
// per-coefficient scale (mf) and rounding bias tables.
// NOTE(review): the body is empty in this view — the implementation
// (vector setup plus QUANT_16_U passes and a return derived from the
// accumulated `nz` vector, judging by the sibling functions below)
// appears to have been elided by the diff/chunking. Confirm against
// the complete source before relying on this; as written, a non-void
// function with no return is UB if the caller uses the result.
int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
{
}
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled".
// Same flow as QUANT_16_U, but DC uses a single splatted scale (mfv)
// and bias (biasv) for every coefficient, and plain (non-saturating)
// adds where QUANT_16_U uses vec_adds.
// Wrapped in do { } while(0) so the macro expands to exactly one
// statement and is safe after an unbraced `if`/`for` followed by `;`.
#define QUANT_16_U_DC( idx0, idx1 )                             \
do                                                              \
{                                                               \
    temp1v = vec_ld((idx0), *dct);                              \
    temp2v = vec_ld((idx1), *dct);                              \
    /* sign masks, then absolute values */                      \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    mskB = vec_cmplt(temp2v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v); \
    coefvA = vec_add(coefvA, biasv);                            \
    coefvB = vec_add(coefvB, biasv);                            \
    /* widen to 32-bit lanes: (abs + bias) * mf >> qbits */     \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvB = vec_mule(coefvB, mfv);                         \
    multOddvB = vec_mulo(coefvB, mfv);                          \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                  \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                    \
    /* narrow back to 16 bits, restore signs (xor mask, +1 via mask) */ \
    temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                             \
    temp2v = vec_xor(temp2v, mskB);                             \
    temp1v = vec_add(temp1v, vec_and(mskA, one));               \
    vec_st(temp1v, (idx0), (int16_t*)dct);                      \
    temp2v = vec_add(temp2v, vec_and(mskB, one));               \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                    \
    vec_st(temp2v, (idx1), (int16_t*)dct);                      \
} while( 0 )
// DC-quantize a 4x4 block in place with a single scalar scale/bias
// (splatted to vectors by the implementation).
// NOTE(review): the body is empty in this view — the implementation
// (splatting mf/bias and running QUANT_16_U_DC, presumably) has been
// elided by the diff/chunking. Confirm against the complete source;
// as written, a non-void function with no return is UB if the caller
// uses the result.
int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
{
}
// DC quant of a whole 2x2 block (4 coefficients).
// Only the low 4 halfword lanes of the vector at byte offset idx0 hold
// real coefficients, so `sel` keeps the untouched upper lanes when the
// result is stored back. Same abs/bias/mul/shift/sign-restore flow as
// QUANT_16_U_DC, single vector, splatted mfv/biasv.
// do { } while(0) both scopes the local `sel` declaration and makes the
// macro a single statement (safe after unbraced `if`/`for` plus `;`).
#define QUANT_4_U_DC( idx0 )                                    \
do                                                              \
{                                                               \
    /* keep upper 4 lanes of the destination vector unchanged */ \
    const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);  \
    temp1v = vec_ld((idx0), *dct);                              \
    mskA = vec_cmplt(temp1v, zero_s16v);                        \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v); \
    coefvA = vec_add(coefvA, biasv);                            \
    multEvenvA = vec_mule(coefvA, mfv);                         \
    multOddvA = vec_mulo(coefvA, mfv);                          \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                  \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                    \
    temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                             \
    temp2v = vec_add(temp2v, vec_and(mskA, one));               \
    temp1v = vec_sel(temp1v, temp2v, sel);                      \
    nz = vec_or(nz, temp1v);                                    \
    vec_st(temp1v, (idx0), (int16_t*)dct);                      \
} while( 0 )
// DC-quantize a 2x2 chroma-DC block in place; returns non-zero iff any
// quantized coefficient is non-zero (vec_any_ne against the zero vector).
// NOTE(review): this view is truncated/diff-mangled — the declarations
// of i_qbitsv, nz, zero_s16v and the mf/bias vector setup that must
// precede the loop are not visible here, and raw diff lines remain in
// the body. Confirm against the complete source before editing.
// NOTE(review): the loop iterating 4 x 32 bytes over a 2x2 (8-byte)
// dct looks inconsistent with the parameter type — presumably an
// artifact of the elided context; verify.
int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
{
    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);
-
-    int i;
-    for ( i=0; i<4; i++ ) {
-        QUANT_16_U( i*2*16, i*2*16+16 );
-    }
+    for( int i = 0; i < 4; i++ )
+        QUANT_16_U( i*2*16, i*2*16+16 );
    return vec_any_ne(nz, zero_s16v);
}
// Dequantize a 4x4 block in place: scale each coefficient by
// dequant_mf[i_qp%6] and shift by i_qp/6 - 4 (left shift when the
// exponent is non-negative — DEQUANT_SHL — otherwise a rounded right
// shift — DEQUANT_SHR).
// NOTE(review): this view is internally truncated — the declarations of
// qbits_u, i_qbitsv, sixteen_u, sixteenv, the `if( i_qbits >= 0 )` that
// pairs with the orphan `else` below, and the bodies around the stray
// braces are missing, and raw diff lines remain. Do not treat the brace
// structure here as authoritative; confirm against the complete source.
void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
{
-    const int i_mf = i_qp%6;
-    const int i_qbits = i_qp/6 - 4;
-    int y;
+    int i_mf = i_qp%6;
+    int i_qbits = i_qp/6 - 4;
    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    qbits_u.s[0]=i_qbits;
    i_qbitsv = vec_splat(qbits_u.v, 0);
-    for( y = 0; y < 4; y+=2 )
+    for( int y = 0; y < 4; y+=2 )
        DEQUANT_SHL();
    }
    else
    sixteen_u.s[0]=16;
    sixteenv = vec_splat(sixteen_u.v, 0);
-    for( y = 0; y < 4; y+=2 )
+    for( int y = 0; y < 4; y+=2 )
        DEQUANT_SHR();
    }
}
// Dequantize an 8x8 block in place; same scheme as the 4x4 variant but
// with exponent i_qp/6 - 6 and eight rows per pass.
// NOTE(review): this view is internally truncated — the declarations of
// qbits_u, i_qbitsv, sixteen_u, sixteenv, the `if( i_qbits >= 0 )` that
// pairs with the orphan `else` below, and the code around the stray
// braces are missing, and raw diff lines remain. Do not treat the brace
// structure here as authoritative; confirm against the complete source.
void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
{
-    const int i_mf = i_qp%6;
-    const int i_qbits = i_qp/6 - 6;
-    int y;
+    int i_mf = i_qp%6;
+    int i_qbits = i_qp/6 - 6;
    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    qbits_u.s[0]=i_qbits;
    i_qbitsv = vec_splat(qbits_u.v, 0);
-    for( y = 0; y < 8; y++ )
+    for( int y = 0; y < 8; y++ )
        DEQUANT_SHL();
    }
    else
    sixteen_u.s[0]=16;
    sixteenv = vec_splat(sixteen_u.v, 0);
-    for( y = 0; y < 8; y++ )
+    for( int y = 0; y < 8; y++ )
        DEQUANT_SHR();
    }
}