* $Id$
*
* Authors: Eric Petit <titer@m0k.org>
+ * Guillaume Poirier <gpoirier@mplayerhq.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
void x264_sub4x4_dct_altivec( int16_t dct[4][4],
uint8_t *pix1, uint8_t *pix2 )
{
- PREP_DIFF;
- PREP_STORE8;
+ PREP_DIFF_8BYTEALIGNED;
vec_s16_t dct0v, dct1v, dct2v, dct3v;
vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v;
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );
+ vec_u8_t permHighv;
+
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct0v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct1v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct2v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 4, dct3v );
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
VEC_TRANSPOSE_4( tmp0v, tmp1v, tmp2v, tmp3v,
dct0v, dct1v, dct2v, dct3v );
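+    /* permHighv picks the first 8 bytes (four int16 coefficients) of each
+     * operand, so two 4-coefficient rows fit into a single 16-byte store. */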
+ permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
- VEC_STORE8( tmp0v, dct[0] );
- VEC_STORE8( tmp1v, dct[1] );
- VEC_STORE8( tmp2v, dct[2] );
- VEC_STORE8( tmp3v, dct[3] );
+
+    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,  (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, (int16_t*)dct);
}
void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
uint8_t *pix1, uint8_t *pix2 )
{
- PREP_DIFF;
- PREP_STORE8_HL;
+ PREP_DIFF_8BYTEALIGNED;
vec_s16_t dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v;
vec_s16_t tmp0v, tmp1v, tmp2v, tmp3v, tmp4v, tmp5v, tmp6v, tmp7v;
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
- VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
+ vec_u8_t permHighv, permLowv;
+
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
VEC_TRANSPOSE_8( tmp0v, tmp1v, tmp2v, tmp3v,
tmp4v, tmp5v, tmp6v, tmp7v,
dct0v, dct1v, dct2v, dct3v,
dct4v, dct5v, dct6v, dct7v );
+
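+    /* permHighv merges the first 8 bytes of each operand, permLowv the last 8,
+     * so each vec_perm packs half of two registers into one aligned store. */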
+ permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
+ permLowv = (vec_u8_t) CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
- VEC_STORE8_H( tmp0v, dct[0][0] );
- VEC_STORE8_H( tmp1v, dct[0][1] );
- VEC_STORE8_H( tmp2v, dct[0][2] );
- VEC_STORE8_H( tmp3v, dct[0][3] );
- VEC_STORE8_L( tmp0v, dct[2][0] );
- VEC_STORE8_L( tmp1v, dct[2][1] );
- VEC_STORE8_L( tmp2v, dct[2][2] );
- VEC_STORE8_L( tmp3v, dct[2][3] );
VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
- VEC_STORE8_H( tmp4v, dct[1][0] );
- VEC_STORE8_H( tmp5v, dct[1][1] );
- VEC_STORE8_H( tmp6v, dct[1][2] );
- VEC_STORE8_H( tmp7v, dct[1][3] );
- VEC_STORE8_L( tmp4v, dct[3][0] );
- VEC_STORE8_L( tmp5v, dct[3][1] );
- VEC_STORE8_L( tmp6v, dct[3][2] );
- VEC_STORE8_L( tmp7v, dct[3][3] );
+
+    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,   (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16,  (int16_t*)dct);
+    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32,  (int16_t*)dct);
+    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48,  (int16_t*)dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64,  (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permLowv),  80,  (int16_t*)dct);
+    vec_st(vec_perm(tmp4v, tmp5v, permLowv),  96,  (int16_t*)dct);
+    vec_st(vec_perm(tmp6v, tmp7v, permLowv),  112, (int16_t*)dct);
}
-
+
void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
uint8_t *pix1, uint8_t *pix2 )
{
- PREP_DIFF;
- PREP_STORE8_HL;
- vec_s16_t dcth0v, dcth1v, dcth2v, dcth3v,
- dcth4v, dcth5v, dcth6v, dcth7v,
- dctl0v, dctl1v, dctl2v, dctl3v,
- dctl4v, dctl5v, dctl6v, dctl7v;
- vec_s16_t temp0v, temp1v, temp2v, temp3v,
- temp4v, temp5v, temp6v, temp7v;
-
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth0v, dctl0v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth1v, dctl1v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth2v, dctl2v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth3v, dctl3v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth4v, dctl4v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth5v, dctl5v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth6v, dctl6v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth7v, dctl7v );
-
- VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
- temp4v, temp5v, temp6v, temp7v,
- dcth0v, dcth1v, dcth2v, dcth3v,
- dcth4v, dcth5v, dcth6v, dcth7v );
- VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_STORE8_H( temp0v, dct[0][0] );
- VEC_STORE8_H( temp1v, dct[0][1] );
- VEC_STORE8_H( temp2v, dct[0][2] );
- VEC_STORE8_H( temp3v, dct[0][3] );
- VEC_STORE8_L( temp0v, dct[2][0] );
- VEC_STORE8_L( temp1v, dct[2][1] );
- VEC_STORE8_L( temp2v, dct[2][2] );
- VEC_STORE8_L( temp3v, dct[2][3] );
- VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_STORE8_H( temp4v, dct[1][0] );
- VEC_STORE8_H( temp5v, dct[1][1] );
- VEC_STORE8_H( temp6v, dct[1][2] );
- VEC_STORE8_H( temp7v, dct[1][3] );
- VEC_STORE8_L( temp4v, dct[3][0] );
- VEC_STORE8_L( temp5v, dct[3][1] );
- VEC_STORE8_L( temp6v, dct[3][2] );
- VEC_STORE8_L( temp7v, dct[3][3] );
-
- VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
- temp4v, temp5v, temp6v, temp7v,
- dctl0v, dctl1v, dctl2v, dctl3v,
- dctl4v, dctl5v, dctl6v, dctl7v );
- VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_STORE8_H( temp0v, dct[4][0] );
- VEC_STORE8_H( temp1v, dct[4][1] );
- VEC_STORE8_H( temp2v, dct[4][2] );
- VEC_STORE8_H( temp3v, dct[4][3] );
- VEC_STORE8_L( temp0v, dct[6][0] );
- VEC_STORE8_L( temp1v, dct[6][1] );
- VEC_STORE8_L( temp2v, dct[6][2] );
- VEC_STORE8_L( temp3v, dct[6][3] );
- VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_STORE8_H( temp4v, dct[5][0] );
- VEC_STORE8_H( temp5v, dct[5][1] );
- VEC_STORE8_H( temp6v, dct[5][2] );
- VEC_STORE8_H( temp7v, dct[5][3] );
- VEC_STORE8_L( temp4v, dct[7][0] );
- VEC_STORE8_L( temp5v, dct[7][1] );
- VEC_STORE8_L( temp6v, dct[7][2] );
- VEC_STORE8_L( temp7v, dct[7][3] );
-
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth0v, dctl0v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth1v, dctl1v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth2v, dctl2v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth3v, dctl3v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth4v, dctl4v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth5v, dctl5v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth6v, dctl6v );
- VEC_DIFF_HL( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, dcth7v, dctl7v );
-
- VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
- temp4v, temp5v, temp6v, temp7v,
- dcth0v, dcth1v, dcth2v, dcth3v,
- dcth4v, dcth5v, dcth6v, dcth7v );
- VEC_DCT( dcth0v, dcth1v, dcth2v, dcth3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_STORE8_H( temp0v, dct[8][0] );
- VEC_STORE8_H( temp1v, dct[8][1] );
- VEC_STORE8_H( temp2v, dct[8][2] );
- VEC_STORE8_H( temp3v, dct[8][3] );
- VEC_STORE8_L( temp0v, dct[10][0] );
- VEC_STORE8_L( temp1v, dct[10][1] );
- VEC_STORE8_L( temp2v, dct[10][2] );
- VEC_STORE8_L( temp3v, dct[10][3] );
- VEC_DCT( dcth4v, dcth5v, dcth6v, dcth7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_STORE8_H( temp4v, dct[9][0] );
- VEC_STORE8_H( temp5v, dct[9][1] );
- VEC_STORE8_H( temp6v, dct[9][2] );
- VEC_STORE8_H( temp7v, dct[9][3] );
- VEC_STORE8_L( temp4v, dct[11][0] );
- VEC_STORE8_L( temp5v, dct[11][1] );
- VEC_STORE8_L( temp6v, dct[11][2] );
- VEC_STORE8_L( temp7v, dct[11][3] );
-
- VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
- temp4v, temp5v, temp6v, temp7v,
- dctl0v, dctl1v, dctl2v, dctl3v,
- dctl4v, dctl5v, dctl6v, dctl7v );
- VEC_DCT( dctl0v, dctl1v, dctl2v, dctl3v,
- temp0v, temp1v, temp2v, temp3v );
- VEC_STORE8_H( temp0v, dct[12][0] );
- VEC_STORE8_H( temp1v, dct[12][1] );
- VEC_STORE8_H( temp2v, dct[12][2] );
- VEC_STORE8_H( temp3v, dct[12][3] );
- VEC_STORE8_L( temp0v, dct[14][0] );
- VEC_STORE8_L( temp1v, dct[14][1] );
- VEC_STORE8_L( temp2v, dct[14][2] );
- VEC_STORE8_L( temp3v, dct[14][3] );
- VEC_DCT( dctl4v, dctl5v, dctl6v, dctl7v,
- temp4v, temp5v, temp6v, temp7v );
- VEC_STORE8_H( temp4v, dct[13][0] );
- VEC_STORE8_H( temp5v, dct[13][1] );
- VEC_STORE8_H( temp6v, dct[13][2] );
- VEC_STORE8_H( temp7v, dct[13][3] );
- VEC_STORE8_L( temp4v, dct[15][0] );
- VEC_STORE8_L( temp5v, dct[15][1] );
- VEC_STORE8_L( temp6v, dct[15][2] );
- VEC_STORE8_L( temp7v, dct[15][3] );
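+    /* Handle the four 8x8 sub-blocks: top-left, top-right, bottom-left, bottom-right. */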
+ x264_sub8x8_dct_altivec( &dct[ 0], &pix1[0], &pix2[0] );
+ x264_sub8x8_dct_altivec( &dct[ 4], &pix1[8], &pix2[8] );
+ x264_sub8x8_dct_altivec( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+ x264_sub8x8_dct_altivec( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
+}
+
+/***************************************************************************
+ * 8x8 transform:
+ ***************************************************************************/
+
+/* DCT8_1D unrolled by 8 in AltiVec */
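+/* Note: callers must define the shift amounts onev (= 1) and twov (= 2),
+ * as x264_sub8x8_dct8_altivec does below. */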
+#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
+{ \
+ /* int s07 = SRC(0) + SRC(7); */ \
+ vec_s16_t s07v = vec_add( dct0v, dct7v); \
+ /* int s16 = SRC(1) + SRC(6); */ \
+ vec_s16_t s16v = vec_add( dct1v, dct6v); \
+ /* int s25 = SRC(2) + SRC(5); */ \
+ vec_s16_t s25v = vec_add( dct2v, dct5v); \
+ /* int s34 = SRC(3) + SRC(4); */ \
+ vec_s16_t s34v = vec_add( dct3v, dct4v); \
+\
+ /* int a0 = s07 + s34; */ \
+ vec_s16_t a0v = vec_add(s07v, s34v); \
+ /* int a1 = s16 + s25; */ \
+ vec_s16_t a1v = vec_add(s16v, s25v); \
+ /* int a2 = s07 - s34; */ \
+ vec_s16_t a2v = vec_sub(s07v, s34v); \
+ /* int a3 = s16 - s25; */ \
+ vec_s16_t a3v = vec_sub(s16v, s25v); \
+\
+ /* int d07 = SRC(0) - SRC(7); */ \
+ vec_s16_t d07v = vec_sub( dct0v, dct7v); \
+ /* int d16 = SRC(1) - SRC(6); */ \
+ vec_s16_t d16v = vec_sub( dct1v, dct6v); \
+ /* int d25 = SRC(2) - SRC(5); */ \
+ vec_s16_t d25v = vec_sub( dct2v, dct5v); \
+ /* int d34 = SRC(3) - SRC(4); */ \
+ vec_s16_t d34v = vec_sub( dct3v, dct4v); \
+\
+ /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
+ vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\
+ /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
+ vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\
+ /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
+ vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\
+ /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
+ vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\
+\
+ /* DST(0) = a0 + a1; */ \
+ dct0v = vec_add( a0v, a1v ); \
+ /* DST(1) = a4 + (a7>>2); */ \
+ dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \
+ /* DST(2) = a2 + (a3>>1); */ \
+ dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \
+ /* DST(3) = a5 + (a6>>2); */ \
+ dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \
+ /* DST(4) = a0 - a1; */ \
+ dct4v = vec_sub( a0v, a1v ); \
+ /* DST(5) = a6 - (a5>>2); */ \
+ dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \
+ /* DST(6) = (a2>>1) - a3 ; */ \
+ dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \
+ /* DST(7) = (a4>>2) - a7 ; */ \
+ dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \
+}
+
+
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+{
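+    /* shift amounts used by DCT8_1D_ALTIVEC for its >>1 and >>2 steps */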
+ vec_u16_t onev = vec_splat_u16(1);
+ vec_u16_t twov = vec_add( onev, onev );
+
+ PREP_DIFF_8BYTEALIGNED;
+
+ vec_s16_t dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v;
+
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
+
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
+ VEC_DIFF_H_8BYTE_ALIGNED( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
+
+ DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v );
+
+ vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;
+
+ VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
+ dct4v, dct5v, dct6v, dct7v,
+ dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
+
+ DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+ dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
+
+ vec_st( dct_tr0v, 0, (signed short *)dct );
+ vec_st( dct_tr1v, 16, (signed short *)dct );
+ vec_st( dct_tr2v, 32, (signed short *)dct );
+ vec_st( dct_tr3v, 48, (signed short *)dct );
+
+ vec_st( dct_tr4v, 64, (signed short *)dct );
+ vec_st( dct_tr5v, 80, (signed short *)dct );
+ vec_st( dct_tr6v, 96, (signed short *)dct );
+ vec_st( dct_tr7v, 112, (signed short *)dct );
}
+
+void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
+{
+ x264_sub8x8_dct8_altivec( dct[0], &pix1[0], &pix2[0] );
+ x264_sub8x8_dct8_altivec( dct[1], &pix1[8], &pix2[8] );
+ x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+ x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
+}
+
--- /dev/null
+/*****************************************************************************
+* quant.c: h264 encoder
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg@gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
+*****************************************************************************/
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "common/common.h"
+#include "ppccommon.h"
+#include "quant.h"
+
+// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
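+// Each coefficient becomes sign(coef) * ((abs(coef) * mf + f) >> i_qbits);
+// the sign is restored at the end via the xor-mask/add-one two's-complement
+// negation trick, keyed off the vec_cmplt mask.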
+#define QUANT_16_U( dct0, dct1, quant_mf0, quant_mf1, quant_mf2, quant_mf3 ) \
+temp1v = vec_ld((dct0), *dct); \
+temp2v = vec_ld((dct1), *dct); \
+mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf0), *quant_mf), (vec_u32_t)vec_ld((quant_mf1), *quant_mf)); \
+mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf2), *quant_mf), (vec_u32_t)vec_ld((quant_mf3), *quant_mf)); \
+mskA = vec_cmplt(temp1v, zerov); \
+mskB = vec_cmplt(temp2v, zerov); \
+coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); \
+coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
+multEvenvA = vec_mule(coefvA, mfvA); \
+multOddvA = vec_mulo(coefvA, mfvA); \
+multEvenvB = vec_mule(coefvB, mfvB); \
+multOddvB = vec_mulo(coefvB, mfvB); \
+multEvenvA = vec_adds(multEvenvA, fV); \
+multOddvA = vec_adds(multOddvA, fV); \
+multEvenvB = vec_adds(multEvenvB, fV); \
+multOddvB = vec_adds(multOddvB, fV); \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
+multOddvA = vec_sr(multOddvA, i_qbitsv); \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
+multOddvB = vec_sr(multOddvB, i_qbitsv); \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA); \
+temp2v = vec_xor(temp2v, mskB); \
+temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
+vec_st(temp1v, (dct0), *dct); \
+temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
+vec_st(temp2v, (dct1), *dct);
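+
+// Scalar sketch of one QUANT_16_U invocation over 16 coefficients
+// (reference only; SIGN and ABS are hypothetical helpers):
+//   for( i = 0; i < 16; i++ )
+//       dct[i] = SIGN(dct[i]) * ( (ABS(dct[i]) * quant_mf[i] + f) >> i_qbits );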
+
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f )
+{
+    vector bool short mskA;
+    vec_u32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_u16_t mfvA;
+    vec_s16_t zerov, one;
+    vec_u32_t fV;
+
+ vector bool short mskB;
+ vec_u16_t coefvB;
+ vec_u32_t multEvenvB, multOddvB;
+    vec_u16_t mfvB;
+
+ vec_s16_t temp1v, temp2v;
+
+    vect_int_u qbits_u;
+    qbits_u.s[0] = i_qbits;
+    i_qbitsv = (vec_u32_t) vec_splat(qbits_u.v, 0);
+
+    vect_int_u f_u;
+    f_u.s[0] = f;
+
+    fV = (vec_u32_t) vec_splat(f_u.v, 0);
+
+ zerov = vec_splat_s16(0);
+ one = vec_splat_s16(1);
+
+ QUANT_16_U( 0, 16, 0, 16, 32, 48 );
+}
+
+// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
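+// Same scheme as QUANT_16_U, but with a single splatted mf value instead of
+// a per-coefficient table.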
+#define QUANT_16_U_DC( dct0, dct1 ) \
+temp1v = vec_ld((dct0), *dct); \
+temp2v = vec_ld((dct1), *dct); \
+mskA = vec_cmplt(temp1v, zerov); \
+mskB = vec_cmplt(temp2v, zerov); \
+coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); \
+coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
+multEvenvA = vec_mule(coefvA, mfv); \
+multOddvA = vec_mulo(coefvA, mfv); \
+multEvenvB = vec_mule(coefvB, mfv); \
+multOddvB = vec_mulo(coefvB, mfv); \
+multEvenvA = vec_add(multEvenvA, fV); \
+multOddvA = vec_add(multOddvA, fV); \
+multEvenvB = vec_add(multEvenvB, fV); \
+multOddvB = vec_add(multOddvB, fV); \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
+multOddvA = vec_sr(multOddvA, i_qbitsv); \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
+multOddvB = vec_sr(multOddvB, i_qbitsv); \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA); \
+temp2v = vec_xor(temp2v, mskB); \
+temp1v = vec_add(temp1v, vec_and(mskA, one)); \
+vec_st(temp1v, (dct0), *dct); \
+temp2v = vec_add(temp2v, vec_and(mskB, one)); \
+vec_st(temp2v, (dct1), *dct);
+
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f )
+{
+    vector bool short mskA;
+    vec_u32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_s16_t zerov, one;
+    vec_u32_t fV;
+
+ vector bool short mskB;
+ vec_u16_t coefvB;
+ vec_u32_t multEvenvB, multOddvB;
+
+ vec_s16_t temp1v, temp2v;
+
+    vec_u16_t mfv;
+    vect_int_u mf_u;
+    mf_u.s[0] = i_quant_mf;
+    mfv = (vec_u16_t) vec_packs( vec_splat( mf_u.v, 0 ), vec_splat( mf_u.v, 0 ) );
+
+    vect_int_u qbits_u;
+    qbits_u.s[0] = i_qbits;
+    i_qbitsv = (vec_u32_t) vec_splat(qbits_u.v, 0);
+
+    vect_int_u f_u;
+    f_u.s[0] = f;
+    fV = (vec_u32_t) vec_splat(f_u.v, 0);
+
+ zerov = vec_splat_s16(0);
+ one = vec_splat_s16(1);
+
+ QUANT_16_U_DC( 0, 16 );
+}
+
+
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f )
+{
+    vector bool short mskA;
+    vec_u32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_u16_t mfvA;
+    vec_s16_t zerov, one;
+    vec_u32_t fV;
+
+ vector bool short mskB;
+ vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB;
+    vec_u16_t mfvB;
+
+ vec_s16_t temp1v, temp2v;
+
+    vect_int_u qbits_u;
+    qbits_u.s[0] = i_qbits;
+    i_qbitsv = (vec_u32_t) vec_splat(qbits_u.v, 0);
+
+    vect_int_u f_u;
+    f_u.s[0] = f;
+    fV = (vec_u32_t) vec_splat(f_u.v, 0);
+
+ zerov = vec_splat_s16(0);
+ one = vec_splat_s16(1);
+
+ int i;
+
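+    // each iteration quantizes 16 coefficients: two rows of the 8x8 block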
+ for ( i=0; i<4; i++ ) {
+ QUANT_16_U( i*2*16, i*2*16+16, i*4*16, i*4*16+16, i*4*16+32, i*4*16+48 );
+ }
+}
+