1 /*****************************************************************************
2 * quant.c: h264 encoder
3 *****************************************************************************
4 * Copyright (C) 2007 Guillaume Poirier <gpoirier@mplayerhq.hu>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *****************************************************************************/
21 #include "common/common.h"
22 #include "ppccommon.h"
25 #if !X264_HIGH_BIT_DEPTH
26 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
// Quantize 16 coefficients per invocation (two 8-lane s16 vectors loaded at
// byte offsets idx0 and idx1). Computes, per lane:
//     sign(dct) * (( |dct| + bias ) * mf >> qbits)
// where |x| is formed branchlessly as max(-x, x), and the sign is restored
// with the two's-complement trick: (x ^ mask) + (mask & 1), mask being the
// all-ones "was negative" comparison result. Results are stored back to dct
// and OR-ed into `nz` so the caller can test for any non-zero coefficient.
// NOTE(review): the braces that normally wrap this macro body, plus possibly
// other lines, appear to be missing from this extracted listing (embedded
// line numbers jump) — confirm against the upstream file before building.
27 #define QUANT_16_U( idx0, idx1 ) \
29 temp1v = vec_ld((idx0), *dct); \
30 temp2v = vec_ld((idx1), *dct); \
31 mfvA = vec_ld((idx0), mf); \
32 mfvB = vec_ld((idx1), mf); \
33 biasvA = vec_ld((idx0), bias); \
34 biasvB = vec_ld((idx1), bias); \
35 mskA = vec_cmplt(temp1v, zero_s16v); /* all-ones lanes where coef < 0 */ \
36 mskB = vec_cmplt(temp2v, zero_s16v); \
37 coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
38 coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
39 coefvA = vec_adds(coefvA, biasvA); /* saturating add of the bias */ \
40 coefvB = vec_adds(coefvB, biasvB); \
41 multEvenvA = vec_mule(coefvA, mfvA); /* 32-bit products of even lanes */ \
42 multOddvA = vec_mulo(coefvA, mfvA); /* 32-bit products of odd lanes */ \
43 multEvenvB = vec_mule(coefvB, mfvB); \
44 multOddvB = vec_mulo(coefvB, mfvB); \
45 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
46 multOddvA = vec_sr(multOddvA, i_qbitsv); \
47 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
48 multOddvB = vec_sr(multOddvB, i_qbitsv); \
49 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); /* re-interleave even/odd, saturate back to s16 */ \
50 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
51 temp1v = vec_xor(temp1v, mskA); /* complement negative lanes... */ \
52 temp2v = vec_xor(temp2v, mskB); \
53 temp1v = vec_adds(temp1v, vec_and(mskA, one)); /* ...then +1 to negate */ \
54 vec_st(temp1v, (idx0), (int16_t*)dct); \
55 temp2v = vec_adds(temp2v, vec_and(mskB, one)); \
56 nz = vec_or(nz, vec_or(temp1v, temp2v)); \
57 vec_st(temp2v, (idx1), (int16_t*)dct); \
// Quantize a full 4x4 block of DCT coefficients in place.
// Returns nonzero iff any quantized coefficient is non-zero.
// Fix: removed a stray second semicolon after vec_splat_s16(1) (harmless
// empty declaration-scope statement, but sloppy and inconsistent with the
// sibling functions).
// NOTE(review): this extracted listing appears to be missing several lines
// (the function braces, coefv/mfv/biasv declarations, the qbits union setup
// and the QUANT_16_U invocations — embedded line numbers jump from 60 to 85).
// Restore the dropped lines from the upstream file before building.
60 int x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
63 vector bool short mskA;
66 vec_u32_t multEvenvA, multOddvA;
69 vec_s16_t one = vec_splat_s16(1);
70 vec_s16_t nz = zero_s16v; // accumulates OR of all quantized outputs
72 vector bool short mskB;
74 vec_u32_t multEvenvB, multOddvB;
78 vec_s16_t temp1v, temp2v;
82 i_qbitsv = vec_splat(qbits_u.v, 0);
85 return vec_any_ne(nz, zero_s16v);
88 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
// DC variant of QUANT_16_U: same per-lane computation
//     sign(dct) * (( |dct| + bias ) * mf >> qbits)
// but every lane shares one splatted multiplier `mfv` and one splatted bias
// `biasv` instead of per-coefficient tables. Note this path uses the
// non-saturating vec_add for the bias and sign-restore steps, where the AC
// path above uses vec_adds.
// NOTE(review): the wrapping braces of this macro body appear to be missing
// from this extracted listing — confirm against the upstream file.
89 #define QUANT_16_U_DC( idx0, idx1 ) \
91 temp1v = vec_ld((idx0), *dct); \
92 temp2v = vec_ld((idx1), *dct); \
93 mskA = vec_cmplt(temp1v, zero_s16v); /* all-ones lanes where coef < 0 */ \
94 mskB = vec_cmplt(temp2v, zero_s16v); \
95 coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
96 coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
97 coefvA = vec_add(coefvA, biasv); \
98 coefvB = vec_add(coefvB, biasv); \
99 multEvenvA = vec_mule(coefvA, mfv); /* 32-bit even-lane products */ \
100 multOddvA = vec_mulo(coefvA, mfv); /* 32-bit odd-lane products */ \
101 multEvenvB = vec_mule(coefvB, mfv); \
102 multOddvB = vec_mulo(coefvB, mfv); \
103 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
104 multOddvA = vec_sr(multOddvA, i_qbitsv); \
105 multEvenvB = vec_sr(multEvenvB, i_qbitsv); \
106 multOddvB = vec_sr(multOddvB, i_qbitsv); \
107 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); /* re-interleave, saturate to s16 */ \
108 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
109 temp1v = vec_xor(temp1v, mskA); /* restore sign of negative lanes */ \
110 temp2v = vec_xor(temp2v, mskB); \
111 temp1v = vec_add(temp1v, vec_and(mskA, one)); \
112 vec_st(temp1v, (idx0), (int16_t*)dct); \
113 temp2v = vec_add(temp2v, vec_and(mskB, one)); \
114 nz = vec_or(nz, vec_or(temp1v, temp2v)); \
115 vec_st(temp2v, (idx1), (int16_t*)dct); \
// Quantize a 4x4 block of DC coefficients in place, using a single scalar
// multiplier `mf` and bias `bias` splatted across all lanes.
// Returns nonzero iff any quantized coefficient is non-zero.
// NOTE(review): this extracted listing appears to be missing lines (the
// function braces, coefv declarations and the mf_u/qbits_u/bias_u union
// setup — embedded line numbers jump). Restore from the upstream file.
118 int x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
121 vector bool short mskA;
124 vec_u32_t multEvenvA, multOddvA;
125 vec_s16_t one = vec_splat_s16(1);
126 vec_s16_t nz = zero_s16v; // accumulates OR of all quantized outputs
128 vector bool short mskB;
130 vec_u32_t multEvenvB, multOddvB;
132 vec_s16_t temp1v, temp2v;
139 mfv = vec_splat( mf_u.v, 0 ); // broadcast scalar mf to every lane
143 i_qbitsv = vec_splat(qbits_u.v, 0);
147 biasv = vec_splat(bias_u.v, 0); // broadcast scalar bias to every lane
149 QUANT_16_U_DC( 0, 16 ); // both vector halves: bytes 0..15 and 16..31
150 return vec_any_ne(nz, zero_s16v);
153 // DC quant of a whole 2x2 block
// DC quant of only 4 coefficients (a 2x2 DC block). Same math as
// QUANT_16_U_DC on one vector, but only the first four s16 lanes of the
// result are written back: `sel` keeps lanes 0..3 from the quantized value
// and lanes 4..7 from the original load, so the bytes past the 2x2 block
// are stored back unchanged.
// NOTE(review): the wrapping braces of this macro body appear to be missing
// from this extracted listing — confirm against the upstream file.
154 #define QUANT_4_U_DC( idx0 ) \
156 const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0); \
157 temp1v = vec_ld((idx0), *dct); \
158 mskA = vec_cmplt(temp1v, zero_s16v); /* all-ones lanes where coef < 0 */ \
159 coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
160 coefvA = vec_add(coefvA, biasv); \
161 multEvenvA = vec_mule(coefvA, mfv); \
162 multOddvA = vec_mulo(coefvA, mfv); \
163 multEvenvA = vec_sr(multEvenvA, i_qbitsv); \
164 multOddvA = vec_sr(multOddvA, i_qbitsv); \
165 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); /* re-interleave, saturate to s16 */ \
166 temp2v = vec_xor(temp2v, mskA); /* restore sign of negative lanes */ \
167 temp2v = vec_add(temp2v, vec_and(mskA, one)); \
168 temp1v = vec_sel(temp1v, temp2v, sel); /* keep only lanes 0..3 quantized */ \
169 nz = vec_or(nz, temp1v); \
170 vec_st(temp1v, (idx0), (int16_t*)dct); \
// Quantize a 2x2 block of DC coefficients in place (chroma DC), scalar
// mf/bias splatted across lanes. Only the first four s16 lanes are
// meaningful, so the non-zero test masks `nz` with `mask2` before testing.
// Returns nonzero iff any of the four quantized coefficients is non-zero.
// NOTE(review): this extracted listing appears to be missing lines (function
// braces, coefvA/mfv/biasv declarations, the union setup and presumably the
// QUANT_4_U_DC invocation). Restore from the upstream file before building.
173 int x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
176 vector bool short mskA;
179 vec_u32_t multEvenvA, multOddvA;
180 vec_s16_t one = vec_splat_s16(1);
181 vec_s16_t nz = zero_s16v; // accumulates OR of quantized outputs
183 vec_s16_t temp1v, temp2v;
190 mfv = vec_splat( mf_u.v, 0 ); // broadcast scalar mf to every lane
194 i_qbitsv = vec_splat(qbits_u.v, 0);
198 biasv = vec_splat(bias_u.v, 0); // broadcast scalar bias to every lane
200 static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);
202 return vec_any_ne(vec_and(nz, mask2), zero_s16v);
// Quantize a full 8x8 block of DCT coefficients in place: four iterations of
// QUANT_16_U covering 32 bytes (16 coefficients) each.
// Returns nonzero iff any quantized coefficient is non-zero.
// Fix: removed a stray second semicolon after vec_splat_s16(1) (harmless
// empty declaration-scope statement, but sloppy and inconsistent with the
// sibling functions).
// NOTE(review): this extracted listing appears to be missing lines (function
// braces, coefv/mfv/biasv declarations and the qbits union setup — embedded
// line numbers jump). Restore the dropped lines from the upstream file.
205 int x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
208 vector bool short mskA;
211 vec_u32_t multEvenvA, multOddvA;
214 vec_s16_t one = vec_splat_s16(1);
215 vec_s16_t nz = zero_s16v; // accumulates OR of all quantized outputs
217 vector bool short mskB;
219 vec_u32_t multEvenvB, multOddvB;
223 vec_s16_t temp1v, temp2v;
227 i_qbitsv = vec_splat(qbits_u.v, 0);
229 for( int i = 0; i < 4; i++ )
230 QUANT_16_U( i*2*16, i*2*16+16 ); // byte offsets: 0/16, 32/48, 64/80, 96/112
231 return vec_any_ne(nz, zero_s16v);
// Dequant step for the left-shift case (non-negative qbits):
//     dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits
// The s32 multiplier table rows are loaded as two vectors and saturate-packed
// down to s16 so a single 16-bit multiply per lane suffices; even/odd 32-bit
// products are then merged back into lane order and saturate-packed to s16.
// Expects `y`, `dct`, `dequant_mf`, `i_mf` and `i_qbitsv` in scope at the
// expansion site.
// NOTE(review): the wrapping braces of this macro body appear to be missing
// from this extracted listing — confirm against the upstream file.
234 #define DEQUANT_SHL() \
236 dctv = vec_ld(0, dct[y]); \
237 mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
238 mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
239 mfv = vec_packs(mf1v, mf2v); /* s32 table -> s16 multipliers */ \
241 multEvenvA = vec_mule(dctv, mfv); \
242 multOddvA = vec_mulo(dctv, mfv); \
243 dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), \
244 vec_mergel(multEvenvA, multOddvA)); /* restore lane order, saturate */ \
245 dctv = vec_sl(dctv, i_qbitsv); \
246 vec_st(dctv, 0, dct[y]); \
// Dequant step for the right-shift case (negative qbits):
//     dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits)
// Each s16 coefficient is duplicated (mergeh/mergel) so its even/odd 16-bit
// halves line up with one s32 multiplier; the full 32-bit product is then
// reassembled as (even_product << 16) + odd_product before adding the
// rounding offset `fv` and arithmetic-shifting by `i_qbitsv`.
// Expects `y`, `dct`, `dequant_mf`, `i_mf`, `fv`, `sixteenv` and `i_qbitsv`
// in scope at the expansion site.
// NOTE(review): the wrapping braces of this macro body appear to be missing
// from this extracted listing — confirm against the upstream file.
249 #define DEQUANT_SHR() \
251 dctv = vec_ld(0, dct[y]); \
252 dct1v = vec_mergeh(dctv, dctv); /* duplicate low-half lanes */ \
253 dct2v = vec_mergel(dctv, dctv); /* duplicate high-half lanes */ \
254 mf1v = vec_ld(0, dequant_mf[i_mf][y]); \
255 mf2v = vec_ld(16, dequant_mf[i_mf][y]); \
257 multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v); \
258 multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v); \
259 temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); /* 32-bit product */ \
260 temp1v = vec_add(temp1v, fv); /* + rounding offset */ \
261 temp1v = vec_sra(temp1v, i_qbitsv); \
263 multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v); \
264 multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v); \
265 temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
266 temp2v = vec_add(temp2v, fv); \
267 temp2v = vec_sra(temp2v, i_qbitsv); \
269 dctv = (vec_s16_t)vec_packs(temp1v, temp2v); /* saturate back to s16 */ \
270 vec_st(dctv, 0, dct[y]); \
// Dequantize a 4x4 block in place. i_qbits = i_qp/6 - 4 selects the path:
// the listing shows both a DEQUANT_SHL loop (shift left by i_qbits) and a
// DEQUANT_SHR setup (rounding offset f = 1 << (-i_qbits-1), shift right by
// -i_qbits); presumably these sit in the two arms of an if on the sign of
// i_qbits — the branch itself is not visible here.
// NOTE(review): this extracted listing is missing lines (function braces,
// the dctv/mfv/fv/sixteenv declarations, the qbits/f/sixteen union setup,
// the if/else, and the loop bodies' DEQUANT_* invocations — embedded line
// numbers jump). Restore the dropped lines from the upstream file.
273 void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
276 int i_qbits = i_qp/6 - 4;
279 vec_s16_t dct1v, dct2v;
280 vec_s32_t mf1v, mf2v;
282 vec_s32_t multEvenvA, multOddvA;
283 vec_s32_t temp1v, temp2v;
289 qbits_u.s[0]=i_qbits;
290 i_qbitsv = vec_splat(qbits_u.v, 0);
292 for( int y = 0; y < 4; y+=2 )
297 const int f = 1 << (-i_qbits-1); // rounding offset for the >> path
302 fv = (vec_s32_t)vec_splat(f_u.v, 0);
306 qbits_u.s[0]=-i_qbits; // shift amount must be positive for vec_sra
307 i_qbitsv = vec_splat(qbits_u.v, 0);
312 sixteenv = vec_splat(sixteen_u.v, 0);
314 for( int y = 0; y < 4; y+=2 )
// Dequantize an 8x8 block in place. Identical structure to the 4x4 version
// but with i_qbits = i_qp/6 - 6 and loops over all 8 rows; presumably the
// DEQUANT_SHL / DEQUANT_SHR paths are selected by the sign of i_qbits — the
// branch itself is not visible here.
// NOTE(review): this extracted listing is missing lines (function braces,
// the dctv/mfv/fv/sixteenv declarations, the union setup, the if/else, and
// the loop bodies' DEQUANT_* invocations — embedded line numbers jump).
// Restore the dropped lines from the upstream file.
319 void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
322 int i_qbits = i_qp/6 - 6;
325 vec_s16_t dct1v, dct2v;
326 vec_s32_t mf1v, mf2v;
328 vec_s32_t multEvenvA, multOddvA;
329 vec_s32_t temp1v, temp2v;
335 qbits_u.s[0]=i_qbits;
336 i_qbitsv = vec_splat(qbits_u.v, 0);
338 for( int y = 0; y < 8; y++ )
343 const int f = 1 << (-i_qbits-1); // rounding offset for the >> path
348 fv = (vec_s32_t)vec_splat(f_u.v, 0);
352 qbits_u.s[0]=-i_qbits; // shift amount must be positive for vec_sra
353 i_qbitsv = vec_splat(qbits_u.v, 0);
358 sixteenv = vec_splat(sixteen_u.v, 0);
360 for( int y = 0; y < 8; y++ )
364 #endif // !X264_HIGH_BIT_DEPTH