1 /*****************************************************************************
2 * quant.c: h264 encoder
3 *****************************************************************************
4 * Authors: Guillaume Poirier <poirierg@gmail.com>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
19 *****************************************************************************/
27 vector unsigned int v;
32 vector unsigned short v;
35 #include "common/common.h"
36 #include "ppccommon.h"
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
//
// For 16 coefficients (two vec_s16_t loads at byte offsets idx0 and idx1
// into *dct) this computes, per 16-bit lane:
//     dct[i] = sign(dct[i]) * ((abs(dct[i]) + bias[i]) * mf[i] >> i_qbits)
// abs() is done as max(0 - x, x); the sign is restored at the end with the
// two's-complement trick (x ^ mask) + (mask & 1), where mskA/mskB are
// all-ones in lanes whose input was negative.
// NOTE: the A/B statement interleaving is deliberate manual instruction
// scheduling -- do not reorder.
// Relies on temp1v/temp2v, mfvA/B, biasvA/B, mskA/B, coefvA/B,
// multEvenvA/B, multOddvA/B, i_qbitsv, zerov and one being declared in the
// enclosing function, and on dct/mf/bias being 16-byte aligned (vec_ld).
#define QUANT_16_U( idx0, idx1 ) \
temp1v = vec_ld((idx0), *dct); \
temp2v = vec_ld((idx1), *dct); \
mfvA = vec_ld((idx0), mf); \
mfvB = vec_ld((idx1), mf); \
biasvA = vec_ld((idx0), bias); \
biasvB = vec_ld((idx1), bias); \
mskA = vec_cmplt(temp1v, zerov); /* all-ones in negative lanes */ \
mskB = vec_cmplt(temp2v, zerov); \
coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); /* abs() */ \
coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
coefvA = vec_adds(coefvA, biasvA); /* saturating add of rounding bias */ \
coefvB = vec_adds(coefvB, biasvB); \
multEvenvA = vec_mule(coefvA, mfvA); /* 16x16 -> 32-bit products */ \
multOddvA = vec_mulo(coefvA, mfvA); \
multEvenvB = vec_mule(coefvB, mfvB); \
multOddvB = vec_mulo(coefvB, mfvB); \
multEvenvA = vec_sr(multEvenvA, i_qbitsv); /* >> i_qbits */ \
multOddvA = vec_sr(multOddvA, i_qbitsv); \
multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
multOddvB = vec_sr(multOddvB, i_qbitsv); \
/* re-interleave even/odd lanes and pack back to 16 bits with saturation */ \
temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
temp1v = vec_xor(temp1v, mskA); /* restore sign: (x ^ m) + (m & 1) */ \
temp2v = vec_xor(temp2v, mskB); \
temp1v = vec_adds(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (idx0), (int16_t*)dct); \
temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
vec_st(temp2v, (idx1), (int16_t*)dct);
// Quantize a 4x4 residual block in place using per-coefficient multiplier
// (mf) and rounding-bias (bias) tables; the per-lane math is done by the
// QUANT_16_U macro above.
// NOTE(review): this excerpt is incomplete -- the function braces, several
// declarations (coefvA/B, mfvA/B, biasvA/B, i_qbitsv, the qbits_u union)
// and the QUANT_16_U invocations are missing from the visible text; the
// comments below cover only what is shown.
void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
vector bool short mskA;           // "coefficient < 0" mask, first vector
vec_u32_t multEvenvA, multOddvA;  // 32-bit even/odd products, first vector
vector bool short mskB;           // same masks/products for second vector
vec_u32_t multEvenvB, multOddvB;
vec_s16_t temp1v, temp2v;         // working copies of the dct data
i_qbitsv = vec_splat(qbits_u.v, 0); // broadcast shift count to all lanes
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
//
// Same per-lane computation as QUANT_16_U, but with a single splatted
// multiplier (mfv) and bias (biasv) shared by all coefficients, as DC
// blocks use one quantizer value:
//     dct[i] = sign(dct[i]) * ((abs(dct[i]) + bias) * mf >> i_qbits)
// Unlike QUANT_16_U this uses non-saturating vec_add for the bias and the
// final +1; statement order is deliberate scheduling -- do not reorder.
// Relies on temp1v/temp2v, mfv, biasv, mskA/B, coefvA/B, multEvenvA/B,
// multOddvA/B, i_qbitsv, zerov and one in the enclosing scope.
#define QUANT_16_U_DC( idx0, idx1 ) \
temp1v = vec_ld((idx0), *dct); \
temp2v = vec_ld((idx1), *dct); \
mskA = vec_cmplt(temp1v, zerov); /* all-ones in negative lanes */ \
mskB = vec_cmplt(temp2v, zerov); \
coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); /* abs() */ \
coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
coefvA = vec_add(coefvA, biasv); \
coefvB = vec_add(coefvB, biasv); \
multEvenvA = vec_mule(coefvA, mfv); /* 16x16 -> 32-bit products */ \
multOddvA = vec_mulo(coefvA, mfv); \
multEvenvB = vec_mule(coefvB, mfv); \
multOddvB = vec_mulo(coefvB, mfv); \
multEvenvA = vec_sr(multEvenvA, i_qbitsv); /* >> i_qbits */ \
multOddvA = vec_sr(multOddvA, i_qbitsv); \
multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
multOddvB = vec_sr(multOddvB, i_qbitsv); \
/* re-interleave even/odd lanes and pack back to 16 bits with saturation */ \
temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
temp1v = vec_xor(temp1v, mskA); /* restore sign: (x ^ m) + (m & 1) */ \
temp2v = vec_xor(temp2v, mskB); \
temp1v = vec_add(temp1v, vec_and(mskA, one)); \
vec_st(temp1v, (idx0), (int16_t*)dct); \
temp2v = vec_add(temp2v, vec_and(mskB, one)); \
vec_st(temp2v, (idx1), (int16_t*)dct);
// Quantize a 4x4 DC block in place. mf and bias are scalars (one quantizer
// for the whole block); they are splatted across a vector and applied to
// all 16 coefficients by QUANT_16_U_DC (offsets 0 and 16 cover the full
// 32 bytes of the block).
// NOTE(review): this excerpt is incomplete -- the function braces and
// several declarations (coefvA/B, mfv, biasv, i_qbitsv, the mf_u/qbits_u
// unions and their initialization) are missing from the visible text.
void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
vector bool short mskA;           // "coefficient < 0" mask, first vector
vec_u32_t multEvenvA, multOddvA;  // 32-bit even/odd products, first vector
vec_s16_t zerov, one;
vector bool short mskB;           // same masks/products for second vector
vec_u32_t multEvenvB, multOddvB;
vec_s16_t temp1v, temp2v;         // working copies of the dct data
mfv = vec_splat( mf_u.v, 0 );     // broadcast scalar multiplier
i_qbitsv = vec_splat(qbits_u.v, 0); // broadcast shift count
vect_ushort_u bias_u;
biasv = vec_splat(bias_u.v, 0);   // broadcast scalar rounding bias
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
QUANT_16_U_DC( 0, 16 );
// DC quant of a whole 2x2 block
//
// Same per-lane computation as QUANT_16_U_DC, but only one vector is
// processed and only the first four 16-bit lanes (the four 2x2 DC
// coefficients) are stored back: the `sel` mask makes vec_sel keep the
// quantized result in lanes 0-3 and the original loaded data in lanes 4-7,
// so the bytes past the 2x2 block are written back unmodified.
// Relies on temp1v/temp2v, mfv, biasv, mskA, coefvA, multEvenvA,
// multOddvA, i_qbitsv, zerov and one in the enclosing scope; declares a
// local const, so it must be expanded where a declaration is legal.
#define QUANT_4_U_DC( idx0 ) \
const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
temp1v = vec_ld((idx0), *dct); \
mskA = vec_cmplt(temp1v, zerov); /* all-ones in negative lanes */ \
coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); /* abs() */ \
coefvA = vec_add(coefvA, biasv); \
multEvenvA = vec_mule(coefvA, mfv); /* 16x16 -> 32-bit products */ \
multOddvA = vec_mulo(coefvA, mfv); \
multEvenvA = vec_sr(multEvenvA, i_qbitsv); /* >> i_qbits */ \
multOddvA = vec_sr(multOddvA, i_qbitsv); \
/* re-interleave even/odd lanes and pack back to 16 bits with saturation */ \
temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
temp2v = vec_xor(temp2v, mskA); /* restore sign: (x ^ m) + (m & 1) */ \
temp2v = vec_add(temp2v, vec_and(mskA, one)); \
temp1v = vec_sel(temp1v, temp2v, sel); /* keep only the 4 DC lanes */ \
vec_st(temp1v, (idx0), (int16_t*)dct);
// Quantize a 2x2 chroma-DC block in place. mf and bias are scalars
// splatted across a vector; the actual work is done by QUANT_4_U_DC,
// which touches only the first four 16-bit lanes.
// NOTE(review): this excerpt is incomplete -- the function braces, several
// declarations (coefvA, mfv, biasv, i_qbitsv, the mf_u/qbits_u unions and
// their initialization) and the QUANT_4_U_DC invocation are missing from
// the visible text.
void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
vector bool short mskA;           // "coefficient < 0" mask
vec_u32_t multEvenvA, multOddvA;  // 32-bit even/odd products
vec_s16_t zerov, one;
vec_s16_t temp1v, temp2v;         // working copies of the dct data
mfv = vec_splat( mf_u.v, 0 );     // broadcast scalar multiplier
i_qbitsv = vec_splat(qbits_u.v, 0); // broadcast shift count
vect_ushort_u bias_u;
biasv = vec_splat(bias_u.v, 0);   // broadcast scalar rounding bias
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
// Quantize an 8x8 residual block in place using per-coefficient multiplier
// (mf) and rounding-bias (bias) tables. The 64 coefficients (128 bytes)
// are processed as four iterations of QUANT_16_U, each covering two
// 16-byte vectors at byte offsets i*32 and i*32+16.
// NOTE(review): this excerpt is incomplete -- the function braces, several
// declarations (coefvA/B, mfvA/B, biasvA/B, i_qbitsv, the qbits_u union,
// the loop variable i) and the end of the loop/function are missing from
// the visible text.
void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
vector bool short mskA;           // "coefficient < 0" mask, first vector
vec_u32_t multEvenvA, multOddvA;  // 32-bit even/odd products, first vector
vec_s16_t zerov, one;
vector bool short mskB;           // same masks/products for second vector
vec_u32_t multEvenvB, multOddvB;
vec_s16_t temp1v, temp2v;         // working copies of the dct data
i_qbitsv = vec_splat(qbits_u.v, 0); // broadcast shift count to all lanes
zerov = vec_splat_s16(0);
one = vec_splat_s16(1);
for ( i=0; i<4; i++ ) {
QUANT_16_U( i*2*16, i*2*16+16 );