git.sesse.net Git - x264/blob - common/ppc/quant.c
Update file headers throughout x264
[x264] / common / ppc / quant.c
1 /*****************************************************************************
2 * quant.c: h264 encoder
3 *****************************************************************************
4 * Copyright (C) 2007 Guillaume Poirier <gpoirier@mplayerhq.hu>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
19 *****************************************************************************/
20
21 #if defined SYS_LINUX
22 #include <altivec.h>
23 #endif
24
25 #include "common/common.h"
26 #include "ppccommon.h"
27 #include "quant.h"            
28
29 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
30 #define QUANT_16_U( idx0, idx1 )                                             \
31 temp1v = vec_ld((idx0), *dct);                                               \
32 temp2v = vec_ld((idx1), *dct);                                               \
33 mfvA = vec_ld((idx0), mf);                                                   \
34 mfvB = vec_ld((idx1), mf);                                                   \
35 biasvA = vec_ld((idx0), bias);                                               \
36 biasvB = vec_ld((idx1), bias);                                               \
37 mskA = vec_cmplt(temp1v, zerov);                                             \
38 mskB = vec_cmplt(temp2v, zerov);                                             \
39 coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v);                 \
40 coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v);                 \
41 coefvA = vec_adds(coefvA, biasvA);                                           \
42 coefvB = vec_adds(coefvB, biasvB);                                           \
43 multEvenvA = vec_mule(coefvA, mfvA);                                         \
44 multOddvA = vec_mulo(coefvA, mfvA);                                          \
45 multEvenvB = vec_mule(coefvB, mfvB);                                         \
46 multOddvB = vec_mulo(coefvB, mfvB);                                          \
47 multEvenvA = vec_sr(multEvenvA, i_qbitsv);                                   \
48 multOddvA = vec_sr(multOddvA, i_qbitsv);                                     \
49 multEvenvB = vec_sr(multEvenvB, i_qbitsv);                                   \
50 multOddvB = vec_sr(multOddvB, i_qbitsv);                                     \
51 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
52 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
53 temp1v = vec_xor(temp1v, mskA);                                              \
54 temp2v = vec_xor(temp2v, mskB);                                              \
55 temp1v = vec_adds(temp1v, vec_and(mskA, one));                               \
56 vec_st(temp1v, (idx0), (int16_t*)dct);                                       \
57 temp2v = vec_adds(temp2v, vec_and(mskB, one));                               \
58 vec_st(temp2v, (idx1), (int16_t*)dct);
59                 
60 void x264_quant_4x4_altivec( int16_t dct[4][4], uint16_t mf[16], uint16_t bias[16] )
61 {
62     vector bool short mskA;
63     vec_u32_t i_qbitsv;
64     vec_u16_t coefvA;
65     vec_u32_t multEvenvA, multOddvA;
66     vec_u16_t mfvA;
67     vec_u16_t biasvA;
68     vec_s16_t zerov, one;
69
70     vector bool short mskB;
71     vec_u16_t coefvB;
72     vec_u32_t multEvenvB, multOddvB;
73     vec_u16_t mfvB;
74     vec_u16_t biasvB;
75
76     vec_s16_t temp1v, temp2v;
77
78     vect_int_u qbits_u;
79     qbits_u.s[0]=16;
80     i_qbitsv = vec_splat(qbits_u.v, 0);
81
82     zerov = vec_splat_s16(0);
83     one = vec_splat_s16(1);
84
85     QUANT_16_U( 0, 16 );
86 }
87
88 // DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
89 #define QUANT_16_U_DC( idx0, idx1 )                             \
90 temp1v = vec_ld((idx0), *dct);                                  \
91 temp2v = vec_ld((idx1), *dct);                                  \
92 mskA = vec_cmplt(temp1v, zerov);                                \
93 mskB = vec_cmplt(temp2v, zerov);                                \
94 coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);   \
95 coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v);   \
96 coefvA = vec_add(coefvA, biasv);                                \
97 coefvB = vec_add(coefvB, biasv);                                \
98 multEvenvA = vec_mule(coefvA, mfv);                             \
99 multOddvA = vec_mulo(coefvA, mfv);                              \
100 multEvenvB = vec_mule(coefvB, mfv);                             \
101 multOddvB = vec_mulo(coefvB, mfv);                              \
102 multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
103 multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
104 multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
105 multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
106 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
107 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
108 temp1v = vec_xor(temp1v, mskA);                                 \
109 temp2v = vec_xor(temp2v, mskB);                                 \
110 temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
111 vec_st(temp1v, (idx0), (int16_t*)dct);                          \
112 temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
113 vec_st(temp2v, (idx1), (int16_t*)dct);
114
115 void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int mf, int bias )
116 {
117     vector bool short mskA;
118     vec_u32_t i_qbitsv;
119     vec_u16_t coefvA;
120     vec_u32_t multEvenvA, multOddvA;
121     vec_s16_t zerov, one;
122
123     vector bool short mskB;
124     vec_u16_t coefvB;
125     vec_u32_t multEvenvB, multOddvB;
126
127     vec_s16_t temp1v, temp2v;
128
129     vec_u16_t mfv;
130     vec_u16_t biasv;
131
132     vect_ushort_u mf_u;
133     mf_u.s[0]=mf;
134     mfv = vec_splat( mf_u.v, 0 );
135
136     vect_int_u qbits_u;
137     qbits_u.s[0]=16;
138     i_qbitsv = vec_splat(qbits_u.v, 0);
139
140     vect_ushort_u bias_u;
141     bias_u.s[0]=bias;
142     biasv = vec_splat(bias_u.v, 0);
143
144     zerov = vec_splat_s16(0);
145     one = vec_splat_s16(1);
146
147     QUANT_16_U_DC( 0, 16 );
148 }
149
150 // DC quant of a whole 2x2 block
151 #define QUANT_4_U_DC( idx0 )                                    \
152 const vec_u16_t sel = (vec_u16_t) CV(-1,-1,-1,-1,0,0,0,0);      \
153 temp1v = vec_ld((idx0), *dct);                                  \
154 mskA = vec_cmplt(temp1v, zerov);                                \
155 coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);   \
156 coefvA = vec_add(coefvA, biasv);                                \
157 multEvenvA = vec_mule(coefvA, mfv);                             \
158 multOddvA = vec_mulo(coefvA, mfv);                              \
159 multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
160 multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
161 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
162 temp2v = vec_xor(temp2v, mskA);                                 \
163 temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
164 temp1v = vec_sel(temp1v, temp2v, sel);                          \
165 vec_st(temp1v, (idx0), (int16_t*)dct);
166
167 void x264_quant_2x2_dc_altivec( int16_t dct[2][2], int mf, int bias )
168 {
169     vector bool short mskA;
170     vec_u32_t i_qbitsv;
171     vec_u16_t coefvA;
172     vec_u32_t multEvenvA, multOddvA;
173     vec_s16_t zerov, one;
174
175     vec_s16_t temp1v, temp2v;
176
177     vec_u16_t mfv;
178     vec_u16_t biasv;
179
180     vect_ushort_u mf_u;
181     mf_u.s[0]=mf;
182     mfv = vec_splat( mf_u.v, 0 );
183
184     vect_int_u qbits_u;
185     qbits_u.s[0]=16;
186     i_qbitsv = vec_splat(qbits_u.v, 0);
187
188     vect_ushort_u bias_u;
189     bias_u.s[0]=bias;
190     biasv = vec_splat(bias_u.v, 0);
191
192     zerov = vec_splat_s16(0);
193     one = vec_splat_s16(1);
194
195     QUANT_4_U_DC(0);
196 }
197
198 void x264_quant_8x8_altivec( int16_t dct[8][8], uint16_t mf[64], uint16_t bias[64] )
199 {
200     vector bool short mskA;
201     vec_u32_t i_qbitsv;
202     vec_u16_t coefvA;
203     vec_u32_t multEvenvA, multOddvA;
204     vec_u16_t mfvA;
205     vec_u16_t biasvA;
206     vec_s16_t zerov, one;
207     
208     vector bool short mskB;
209     vec_u16_t coefvB;
210     vec_u32_t multEvenvB, multOddvB;
211     vec_u16_t mfvB;
212     vec_u16_t biasvB;
213     
214     vec_s16_t temp1v, temp2v;
215     
216     vect_int_u qbits_u;
217     qbits_u.s[0]=16;
218     i_qbitsv = vec_splat(qbits_u.v, 0);
219
220     zerov = vec_splat_s16(0);
221     one = vec_splat_s16(1);
222     
223     int i;
224
225     for ( i=0; i<4; i++ ) {
226       QUANT_16_U( i*2*16, i*2*16+16 );
227     }
228 }
229
230 #define DEQUANT_SHL()                                                \
231 {                                                                    \
232     dctv = vec_ld(0, dct[y]);                                        \
233     mf1v = vec_ld(0, dequant_mf[i_mf][y]);                           \
234     mf2v = vec_ld(16, dequant_mf[i_mf][y]);                          \
235     mfv  = vec_packs(mf1v, mf2v);                                    \
236                                                                      \
237     multEvenvA = vec_mule(dctv, mfv);                                \
238     multOddvA = vec_mulo(dctv, mfv);                                 \
239     dctv = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA),  \
240                                  vec_mergel(multEvenvA, multOddvA)); \
241     dctv = vec_sl(dctv, i_qbitsv);                                   \
242     vec_st(dctv, 0, dct[y]);                                         \
243 }
244
245 #define DEQUANT_SHR()                                          \
246 {                                                              \
247     dctv = vec_ld(0, dct[y]);                                  \
248     dct1v = vec_mergeh(dctv, dctv);                            \
249     dct2v = vec_mergel(dctv, dctv);                            \
250     mf1v = vec_ld(0, dequant_mf[i_mf][y]);                     \
251     mf2v = vec_ld(16, dequant_mf[i_mf][y]);                    \
252                                                                \
253     multEvenvA = vec_mule(dct1v, (vec_s16_t)mf1v);             \
254     multOddvA = vec_mulo(dct1v, (vec_s16_t)mf1v);              \
255     temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
256     temp1v = vec_add(temp1v, fv);                              \
257     temp1v = vec_sra(temp1v, i_qbitsv);                        \
258                                                                \
259     multEvenvA = vec_mule(dct2v, (vec_s16_t)mf2v);             \
260     multOddvA = vec_mulo(dct2v, (vec_s16_t)mf2v);              \
261     temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA); \
262     temp2v = vec_add(temp2v, fv);                              \
263     temp2v = vec_sra(temp2v, i_qbitsv);                        \
264                                                                \
265     dctv = (vec_s16_t)vec_packs(temp1v, temp2v);               \
266     vec_st(dctv, 0, dct[y]);                                   \
267 }
268
269 void x264_dequant_4x4_altivec( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
270 {
271     const int i_mf = i_qp%6;
272     const int i_qbits = i_qp/6 - 4;
273     int y;
274
275     vec_s16_t dctv;
276     vec_s16_t dct1v, dct2v;
277     vec_s32_t mf1v, mf2v;
278     vec_s16_t mfv;
279     vec_s32_t multEvenvA, multOddvA;
280     vec_s32_t temp1v, temp2v;
281
282     if( i_qbits >= 0 )
283     {
284         vec_u16_t i_qbitsv;
285         vect_ushort_u qbits_u;
286         qbits_u.s[0]=i_qbits;
287         i_qbitsv = vec_splat(qbits_u.v, 0);
288
289         for( y = 0; y < 4; y+=2 )
290             DEQUANT_SHL();
291     }
292     else
293     {
294         const int f = 1 << (-i_qbits-1);
295
296         vec_s32_t fv;
297         vect_int_u f_u;
298         f_u.s[0]=f;
299         fv = (vec_s32_t)vec_splat(f_u.v, 0);
300
301         vec_u32_t i_qbitsv;
302         vect_int_u qbits_u;
303         qbits_u.s[0]=-i_qbits;
304         i_qbitsv = vec_splat(qbits_u.v, 0);
305
306         vec_u32_t sixteenv;
307         vect_int_u sixteen_u;
308         sixteen_u.s[0]=16;
309         sixteenv = vec_splat(sixteen_u.v, 0);
310
311         for( y = 0; y < 4; y+=2 )
312             DEQUANT_SHR();
313     }
314 }
315
316 void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i_qp )
317 {
318     const int i_mf = i_qp%6;
319     const int i_qbits = i_qp/6 - 6;
320     int y;
321
322     vec_s16_t dctv;
323     vec_s16_t dct1v, dct2v;
324     vec_s32_t mf1v, mf2v;
325     vec_s16_t mfv;
326     vec_s32_t multEvenvA, multOddvA;
327     vec_s32_t temp1v, temp2v;
328
329     if( i_qbits >= 0 )
330     {
331         vec_u16_t i_qbitsv;
332         vect_ushort_u qbits_u;
333         qbits_u.s[0]=i_qbits;
334         i_qbitsv = vec_splat(qbits_u.v, 0);
335
336         for( y = 0; y < 8; y++ )
337             DEQUANT_SHL();
338     }
339     else
340     {
341         const int f = 1 << (-i_qbits-1);
342
343         vec_s32_t fv;
344         vect_int_u f_u;
345         f_u.s[0]=f;
346         fv = (vec_s32_t)vec_splat(f_u.v, 0);
347
348         vec_u32_t i_qbitsv;
349         vect_int_u qbits_u;
350         qbits_u.s[0]=-i_qbits;
351         i_qbitsv = vec_splat(qbits_u.v, 0);
352
353         vec_u32_t sixteenv;
354         vect_int_u sixteen_u;
355         sixteen_u.s[0]=16;
356         sixteenv = vec_splat(sixteen_u.v, 0);
357
358         for( y = 0; y < 8; y++ )
359             DEQUANT_SHR();
360     }
361 }
362