1 /*****************************************************************************
2 * quant.c: h264 encoder
3 *****************************************************************************
4 * Copyright (C) 2007 Guillaume Poirier <gpoirier@mplayerhq.hu>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *****************************************************************************/
25 #include "common/common.h"
26 #include "ppccommon.h"
29 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
// Per coefficient: sign(coef) * (((abs(coef) + bias) * mf) >> qbits).
// Sign restore is branchless two's complement: (x ^ mask) + (mask & 1),
// where mask is all-ones for lanes whose input was negative.
// Relies on temp1v/temp2v, coefvA/B, mfvA/B, biasvA/B, multEvenvA/B,
// multOddvA/B, mskA/B, zerov, one and i_qbitsv declared by the caller.
30 #define QUANT_16_U( idx0, idx1 ) \
31 temp1v = vec_ld((idx0), *dct); /* 8 x s16 coefs at byte offset idx0 */ \
32 temp2v = vec_ld((idx1), *dct); \
33 mfvA = vec_ld((idx0), mf); \
34 mfvB = vec_ld((idx1), mf); \
35 biasvA = vec_ld((idx0), bias); \
36 biasvB = vec_ld((idx1), bias); \
37 mskA = vec_cmplt(temp1v, zerov); /* all-ones where coef < 0 */ \
38 mskB = vec_cmplt(temp2v, zerov); \
39 coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v); /* abs() */ \
40 coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v); \
41 coefvA = vec_adds(coefvA, biasvA); /* saturating add of rounding bias */ \
42 coefvB = vec_adds(coefvB, biasvB); \
43 multEvenvA = vec_mule(coefvA, mfvA); /* 16x16->32 products, even lanes */ \
44 multOddvA = vec_mulo(coefvA, mfvA); /* ... and odd lanes */ \
45 multEvenvB = vec_mule(coefvB, mfvB); \
46 multOddvB = vec_mulo(coefvB, mfvB); \
47 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
48 multOddvA = vec_sr(multOddvA, i_qbitsv); \
49 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
50 multOddvB = vec_sr(multOddvB, i_qbitsv); \
51 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); /* re-interleave even/odd, saturate to s16 */ \
52 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
53 temp1v = vec_xor(temp1v, mskA); /* conditional negate: ~x ... */ \
54 temp2v = vec_xor(temp2v, mskB); \
55 temp1v = vec_adds(temp1v, vec_and(mskA, one)); /* ... + 1 on negative lanes */ \
56 vec_st(temp1v, (idx0), (int16_t*)dct); \
57 temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
58 vec_st(temp2v, (idx1), (int16_t*)dct);
60 void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
// Quantizes a 4x4 residual block in place using QUANT_16_U.
// NOTE(review): this excerpt is elided -- the opening brace, several vector
// declarations (coefvA/B, mfvA/B, biasvA/B, the qbits union setup), the
// QUANT_16_U invocation(s) and the closing brace are not visible here;
// confirm against the full file.
62 vector bool short mskA;
65 vec_u32_t multEvenvA, multOddvA;
70 vector bool short mskB;
72 vec_u32_t multEvenvB, multOddvB;
76 vec_s16_t temp1v, temp2v;
80 i_qbitsv = vec_splat(qbits_u.v, 0); /* replicate shift count to all lanes */
82 zerov = vec_splat_s16(0);
83 one = vec_splat_s16(1);
88 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
// Same pipeline as QUANT_16_U, but with a single splatted mf/bias pair
// (every DC coefficient shares one quantizer). Uses non-saturating vec_add
// for the bias and sign-restore steps, unlike the AC variant's vec_adds.
89 #define QUANT_16_U_DC( idx0, idx1 ) \
90 temp1v = vec_ld((idx0), *dct); \
91 temp2v = vec_ld((idx1), *dct); \
92 mskA = vec_cmplt(temp1v, zerov); /* all-ones where coef < 0 */ \
93 mskB = vec_cmplt(temp2v, zerov); \
94 coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); /* abs() */ \
95 coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v); \
96 coefvA = vec_add(coefvA, biasv); \
97 coefvB = vec_add(coefvB, biasv); \
98 multEvenvA = vec_mule(coefvA, mfv); /* 16x16->32 products, even lanes */ \
99 multOddvA = vec_mulo(coefvA, mfv); \
100 multEvenvB = vec_mule(coefvB, mfv); \
101 multOddvB = vec_mulo(coefvB, mfv); \
102 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
103 multOddvA = vec_sr(multOddvA, i_qbitsv); \
104 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
105 multOddvB = vec_sr(multOddvB, i_qbitsv); \
106 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
107 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
108 temp1v = vec_xor(temp1v, mskA); /* two's-complement negate of ... */ \
109 temp2v = vec_xor(temp2v, mskB); \
110 temp1v = vec_add(temp1v, vec_and(mskA, one)); /* ... the negative lanes */ \
111 vec_st(temp1v, (idx0), (int16_t*)dct); \
112 temp2v = vec_add(temp2v, vec_and(mskB, one)); \
113 vec_st(temp2v, (idx1), (int16_t*)dct);
115 void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
// DC quantization of a 4x4 block: scalar mf/bias are broadcast into vectors,
// then QUANT_16_U_DC covers all 16 coefficients in one invocation.
// NOTE(review): elided excerpt -- the opening brace, the coefvA/B, mfv,
// biasv and mf_u/qbits_u union declarations, and the closing brace are not
// visible here; confirm against the full file.
117 vector bool short mskA;
120 vec_u32_t multEvenvA, multOddvA;
121 vec_s16_t zerov, one;
123 vector bool short mskB;
125 vec_u32_t multEvenvB, multOddvB;
127 vec_s16_t temp1v, temp2v;
134 mfv = vec_splat( mf_u.v, 0 ); /* broadcast scalar mf to all lanes */
138 i_qbitsv = vec_splat(qbits_u.v, 0);
140 vect_ushort_u bias_u;
142 biasv = vec_splat(bias_u.v, 0); /* broadcast scalar bias */
144 zerov = vec_splat_s16(0);
145 one = vec_splat_s16(1);
147 QUANT_16_U_DC( 0, 16 ); /* two 16-byte vectors = whole 4x4 block */
150 // DC quant of a whole 2x2 block
// Quantizes only the first 4 s16 lanes; `sel` makes vec_sel keep the upper
// 4 lanes of the destination untouched (a 2x2 DC block is 8 bytes, half a
// vector register, so the store must not clobber the neighbouring data).
151 #define QUANT_4_U_DC( idx0 ) \
152 const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
153 temp1v = vec_ld((idx0), *dct); \
154 mskA = vec_cmplt(temp1v, zerov); /* all-ones where coef < 0 */ \
155 coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v); /* abs() */ \
156 coefvA = vec_add(coefvA, biasv); \
157 multEvenvA = vec_mule(coefvA, mfv); \
158 multOddvA = vec_mulo(coefvA, mfv); \
159 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
160 multOddvA = vec_sr(multOddvA, i_qbitsv); \
161 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
162 temp2v = vec_xor(temp2v, mskA); /* restore sign of negative inputs */ \
163 temp2v = vec_add(temp2v, vec_and(mskA, one)); \
164 temp1v = vec_sel(temp1v, temp2v, sel); /* new low 4 lanes, old high 4 */ \
165 vec_st(temp1v, (idx0), (int16_t*)dct);
167 void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
// DC quantization of a 2x2 (chroma DC) block via QUANT_4_U_DC.
// NOTE(review): elided excerpt -- the opening brace, coefvA/mfv/biasv and
// union declarations, the QUANT_4_U_DC(0) call and the closing brace are
// not visible here; confirm against the full file.
169 vector bool short mskA;
172 vec_u32_t multEvenvA, multOddvA;
173 vec_s16_t zerov, one;
175 vec_s16_t temp1v, temp2v;
182 mfv = vec_splat( mf_u.v, 0 ); /* broadcast scalar mf to all lanes */
186 i_qbitsv = vec_splat(qbits_u.v, 0);
188 vect_ushort_u bias_u;
190 biasv = vec_splat(bias_u.v, 0); /* broadcast scalar bias */
192 zerov = vec_splat_s16(0);
193 one = vec_splat_s16(1);
198 void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
// Quantizes an 8x8 block in place: 4 loop iterations x QUANT_16_U (two
// 16-byte vectors each) cover all 64 coefficients.
// NOTE(review): elided excerpt -- the opening brace, several declarations
// (coefvA/B, mfvA/B, biasvA/B, qbits union, loop index i) and the closing
// braces are not visible here; confirm against the full file.
200 vector bool short mskA;
203 vec_u32_t multEvenvA, multOddvA;
206 vec_s16_t zerov, one;
208 vector bool short mskB;
210 vec_u32_t multEvenvB, multOddvB;
214 vec_s16_t temp1v, temp2v;
218 i_qbitsv = vec_splat(qbits_u.v, 0); /* replicate shift count */
220 zerov = vec_splat_s16(0);
221 one = vec_splat_s16(1);
225 for ( i=0; i<4; i++ ) {
226 QUANT_16_U( i*2*16, i*2*16+16 ); /* 32 bytes = 16 coefs per iteration */
// Dequant row `y` for i_qbits >= 0: dct[y][i] = (dct[y][i] * mf) << i_qbits.
// The two 32-bit mf vectors are saturating-packed to 16 bits before the
// 16x16->32 multiply, then results are repacked to s16 and shifted left.
// NOTE(review): the brace lines of the original macro body are elided in
// this excerpt; confirm against the full file.
230 #define DEQUANT_SHL() \
232 dctv = vec_ld(0, dct[y]); \
233 mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
234 mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
235 mfv = vec_packs(mf1v, mf2v); /* 8 x s32 -> 8 x s16, saturating */ \
237 multEvenvA = vec_mule(dctv, mfv); \
238 multOddvA = vec_mulo(dctv, mfv); \
239 dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), \
240 vec_mergel(multEvenvA, multOddvA)); \
241 dctv = vec_sl(dctv, i_qbitsv); /* apply the left shift */ \
242 vec_st(dctv, 0, dct[y]); \
// Dequant row `y` for i_qbits < 0:
//   dct[y][i] = (dct[y][i] * mf + f) >> (-i_qbits), f = 1 << (-i_qbits-1).
// Each s16 coef is duplicated into adjacent lanes (mergeh/mergel) so that
// mule/mulo against the halves of the full 32-bit mf yield partial products
// that recombine as (even << 16) + odd, i.e. the full coef*mf product.
// NOTE(review): the brace/blank lines of the original macro body are elided
// in this excerpt; confirm against the full file.
245 #define DEQUANT_SHR() \
247 dctv = vec_ld(0, dct[y]); \
248 dct1v = vec_mergeh(dctv, dctv); /* duplicate low 4 coefs pairwise */ \
249 dct2v = vec_mergel(dctv, dctv); /* duplicate high 4 coefs pairwise */ \
250 mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
251 mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
253 multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \
254 multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \
255 temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); /* (hi<<16)+lo */ \
256 temp1v = vec_add(temp1v, fv); /* rounding offset f */ \
257 temp1v = vec_sra(temp1v, i_qbitsv); /* arithmetic >> by -i_qbits */ \
259 multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \
260 multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \
261 temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
262 temp2v = vec_add(temp2v, fv); \
263 temp2v = vec_sra(temp2v, i_qbitsv); \
265 dctv = (vec_s16_t)vec_packs(temp1v, temp2v); /* saturate back to s16 */ \
266 vec_st(dctv, 0, dct[y]); \
269 void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
// Dequantizes a 4x4 block in place. Presumably takes the DEQUANT_SHL path
// when i_qbits >= 0 and the DEQUANT_SHR (rounded right-shift) path
// otherwise -- TODO confirm: the if/else is elided from this excerpt.
// NOTE(review): elided excerpt -- the opening brace, some declarations
// (dctv, mfv, fv, f_u, sixteenv, y), the loop bodies with the
// DEQUANT_SHL()/DEQUANT_SHR() calls and the closing braces are not visible.
271 const int i_mf = i_qp%6; /* row of the dequant table: qp mod 6 */
272 const int i_qbits = i_qp/6 - 4; /* net shift; negative for small qp */
276 vec_s16_t dct1v, dct2v;
277 vec_s32_t mf1v, mf2v;
279 vec_s32_t multEvenvA, multOddvA;
280 vec_s32_t temp1v, temp2v;
285 vect_ushort_u qbits_u;
286 qbits_u.s[0]=i_qbits;
287 i_qbitsv = vec_splat(qbits_u.v, 0);
289 for( y = 0; y < 4; y+=2 )
294 const int f = 1 << (-i_qbits-1); /* rounding constant for the >> path */
299 fv = (vec_s32_t)vec_splat(f_u.v, 0);
303 qbits_u.s[0]=-i_qbits; /* shift amount must be non-negative */
304 i_qbitsv = vec_splat(qbits_u.v, 0);
307 vect_int_u sixteen_u;
309 sixteenv = vec_splat(sixteen_u.v, 0);
311 for( y = 0; y < 4; y+=2 )
316 void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
// 8x8 counterpart of the 4x4 dequant: same SHL/SHR split, one 16-byte row
// per loop iteration over all 8 rows.
// NOTE(review): elided excerpt -- the opening brace, the if/else around the
// two loops, some declarations (dctv, mfv, fv, f_u, sixteenv, y), the
// DEQUANT_SHL()/DEQUANT_SHR() calls and the closing braces are not visible.
318 const int i_mf = i_qp%6; /* row of the dequant table: qp mod 6 */
319 const int i_qbits = i_qp/6 - 6; /* 8x8 uses -6 where 4x4 uses -4 */
323 vec_s16_t dct1v, dct2v;
324 vec_s32_t mf1v, mf2v;
326 vec_s32_t multEvenvA, multOddvA;
327 vec_s32_t temp1v, temp2v;
332 vect_ushort_u qbits_u;
333 qbits_u.s[0]=i_qbits;
334 i_qbitsv = vec_splat(qbits_u.v, 0);
336 for( y = 0; y < 8; y++ )
341 const int f = 1 << (-i_qbits-1); /* rounding constant for the >> path */
346 fv = (vec_s32_t)vec_splat(f_u.v, 0);
350 qbits_u.s[0]=-i_qbits; /* shift amount must be non-negative */
351 i_qbitsv = vec_splat(qbits_u.v, 0);
354 vect_int_u sixteen_u;
356 sixteenv = vec_splat(sixteen_u.v, 0);
358 for( y = 0; y < 8; y++ )