1 /*
2  * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20
21 #include "dsputil.h"
22
23 #include "gcc_fixes.h"
24
25 #include "dsputil_altivec.h"
26 #include "types_altivec.h"
27
28 #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
29 #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
30
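/* For reference, the two "op" macros above correspond to these scalar
 * per-pixel operations (an illustrative sketch, not code that is compiled):
 *
 *     dst[i] = src[i];                         // PUT_OP_U8_ALTIVEC
 *     dst[i] = (dst[i] + src[i] + 1) >> 1;     // AVG_OP_U8_ALTIVEC, vec_avg() rounds up
 *
 * h264_template_altivec.c is included twice below, once with each op, to
 * generate the put_* and avg_* variants of the chroma MC and qpel lowpass
 * functions without duplicating their bodies. */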
31 #define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC
32 #define PREFIX_h264_chroma_mc8_altivec         put_h264_chroma_mc8_altivec
33 #define PREFIX_h264_chroma_mc8_num             altivec_put_h264_chroma_mc8_num
34 #define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
35 #define PREFIX_h264_qpel16_h_lowpass_num       altivec_put_h264_qpel16_h_lowpass_num
36 #define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
37 #define PREFIX_h264_qpel16_v_lowpass_num       altivec_put_h264_qpel16_v_lowpass_num
38 #define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
39 #define PREFIX_h264_qpel16_hv_lowpass_num      altivec_put_h264_qpel16_hv_lowpass_num
40 #include "h264_template_altivec.c"
41 #undef OP_U8_ALTIVEC
42 #undef PREFIX_h264_chroma_mc8_altivec
43 #undef PREFIX_h264_chroma_mc8_num
44 #undef PREFIX_h264_qpel16_h_lowpass_altivec
45 #undef PREFIX_h264_qpel16_h_lowpass_num
46 #undef PREFIX_h264_qpel16_v_lowpass_altivec
47 #undef PREFIX_h264_qpel16_v_lowpass_num
48 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
49 #undef PREFIX_h264_qpel16_hv_lowpass_num
50
51 #define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC
52 #define PREFIX_h264_chroma_mc8_altivec         avg_h264_chroma_mc8_altivec
53 #define PREFIX_h264_chroma_mc8_num             altivec_avg_h264_chroma_mc8_num
54 #define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
55 #define PREFIX_h264_qpel16_h_lowpass_num       altivec_avg_h264_qpel16_h_lowpass_num
56 #define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
57 #define PREFIX_h264_qpel16_v_lowpass_num       altivec_avg_h264_qpel16_v_lowpass_num
58 #define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
59 #define PREFIX_h264_qpel16_hv_lowpass_num      altivec_avg_h264_qpel16_hv_lowpass_num
60 #include "h264_template_altivec.c"
61 #undef OP_U8_ALTIVEC
62 #undef PREFIX_h264_chroma_mc8_altivec
63 #undef PREFIX_h264_chroma_mc8_num
64 #undef PREFIX_h264_qpel16_h_lowpass_altivec
65 #undef PREFIX_h264_qpel16_h_lowpass_num
66 #undef PREFIX_h264_qpel16_v_lowpass_altivec
67 #undef PREFIX_h264_qpel16_v_lowpass_num
68 #undef PREFIX_h264_qpel16_hv_lowpass_altivec
69 #undef PREFIX_h264_qpel16_hv_lowpass_num
70
71 #define H264_MC(OPNAME, SIZE, CODETYPE) \
72 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
73     OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
74 }\
75 \
76 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
77     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
78     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
79     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
80 }\
81 \
82 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
83     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
84 }\
85 \
86 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
87     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
88     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
89     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
90 }\
91 \
92 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
93     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
94     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
95     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
96 }\
97 \
98 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
99     OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
100 }\
101 \
102 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
103     DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
104     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
105     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
106 }\
107 \
108 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
109     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
110     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
111     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
112     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
113     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
114 }\
115 \
116 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
117     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
118     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
119     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
120     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
121     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
122 }\
123 \
124 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
125     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
126     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
127     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
128     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
129     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
130 }\
131 \
132 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
133     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
134     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
135     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
136     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
137     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
138 }\
139 \
140 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
141     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
142     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
143 }\
144 \
145 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
146     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
147     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
148     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
149     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
150     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
151     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
152 }\
153 \
154 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
155     DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
156     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
157     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
158     put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
159     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
160     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
161 }\
162 \
163 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
164     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
165     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
166     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
167     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
168     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
169     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
170 }\
171 \
172 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
173     DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
174     DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
175     DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
176     put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
177     put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
178     OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
179 }\
180
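/* The mcXY suffix names the quarter-pel sample position: X is the horizontal
 * and Y the vertical quarter-pel offset (0..3), matching the
 * pixels_tab[IDX][4*Y+X] slot filled by dspfunc() in dsputil_h264_init_ppc()
 * below. For example, mc20 is the horizontal half-pel sample (6-tap lowpass
 * only), mc10 averages the full-pel input with that half-pel result via the
 * *_pixels*_l2 helpers, and mc22 is the centre position computed by the
 * combined hv lowpass. */
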
181 /* this code assumes that stride % 16 == 0 */
182 void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
183    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
184                         {((8 - x) * (8 - y)),
185                           ((x) * (8 - y)),
186                           ((8 - x) * (y)),
187                           ((x) * (y))};
188     register int i;
189     vector unsigned char fperm;
190     const vector signed int vABCD = vec_ld(0, ABCD);
191     const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
192     const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
193     const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
194     const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
195     const vector signed int vzero = vec_splat_s32(0);
196     const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
197     const vector unsigned short v6us = vec_splat_u16(6);
198     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
199     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
200
201     vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
202     vector unsigned char vsrc0uc, vsrc1uc;
203     vector signed short vsrc0ssH, vsrc1ssH;
204     vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
205     vector signed short vsrc2ssH, vsrc3ssH, psum;
206     vector unsigned char vdst, ppsum, fsum;
207
208     if (((unsigned long)dst) % 16 == 0) {
209       fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
210                                         0x14, 0x15, 0x16, 0x17,
211                                         0x08, 0x09, 0x0A, 0x0B,
212                                         0x0C, 0x0D, 0x0E, 0x0F);
213     } else {
214       fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
215                                         0x04, 0x05, 0x06, 0x07,
216                                         0x18, 0x19, 0x1A, 0x1B,
217                                         0x1C, 0x1D, 0x1E, 0x1F);
218     }
219
220     vsrcAuc = vec_ld(0, src);
221
222     if (loadSecond)
223       vsrcBuc = vec_ld(16, src);
224     vsrcperm0 = vec_lvsl(0, src);
225     vsrcperm1 = vec_lvsl(1, src);
226
227     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
228     if (reallyBadAlign)
229       vsrc1uc = vsrcBuc;
230     else
231       vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
232
233     vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
234                                                (vector unsigned char)vsrc0uc);
235     vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
236                                                (vector unsigned char)vsrc1uc);
237
238     if (!loadSecond) { // -> !reallyBadAlign
239       for (i = 0 ; i < h ; i++) {
240
241
242         vsrcCuc = vec_ld(stride + 0, src);
243
244         vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
245         vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
246
247         vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
248                                                 (vector unsigned char)vsrc2uc);
249         vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
250                                                 (vector unsigned char)vsrc3uc);
251
252         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
253         psum = vec_mladd(vB, vsrc1ssH, psum);
254         psum = vec_mladd(vC, vsrc2ssH, psum);
255         psum = vec_mladd(vD, vsrc3ssH, psum);
256         psum = vec_add(v28ss, psum);
257         psum = vec_sra(psum, v6us);
258
259         vdst = vec_ld(0, dst);
260         ppsum = (vector unsigned char)vec_packsu(psum, psum);
261         fsum = vec_perm(vdst, ppsum, fperm);
262
263         vec_st(fsum, 0, dst);
264
265         vsrc0ssH = vsrc2ssH;
266         vsrc1ssH = vsrc3ssH;
267
268         dst += stride;
269         src += stride;
270       }
271     } else {
272       vector unsigned char vsrcDuc;
273       for (i = 0 ; i < h ; i++) {
274         vsrcCuc = vec_ld(stride + 0, src);
275         vsrcDuc = vec_ld(stride + 16, src);
276
277         vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
278         if (reallyBadAlign)
279           vsrc3uc = vsrcDuc;
280         else
281           vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
282
283         vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
284                                                 (vector unsigned char)vsrc2uc);
285         vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
286                                                 (vector unsigned char)vsrc3uc);
287
288         psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
289         psum = vec_mladd(vB, vsrc1ssH, psum);
290         psum = vec_mladd(vC, vsrc2ssH, psum);
291         psum = vec_mladd(vD, vsrc3ssH, psum);
292         psum = vec_add(v28ss, psum);
293         psum = vec_sr(psum, v6us);
294
295         vdst = vec_ld(0, dst);
296         ppsum = (vector unsigned char)vec_pack(psum, psum);
297         fsum = vec_perm(vdst, ppsum, fperm);
298
299         vec_st(fsum, 0, dst);
300
301         vsrc0ssH = vsrc2ssH;
302         vsrc1ssH = vsrc3ssH;
303
304         dst += stride;
305         src += stride;
306       }
307     }
308 }
309
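/* Illustrative scalar equivalent of the AltiVec routine above (the _ref name
 * is hypothetical and the function is unused; it is not FFmpeg's C fallback):
 * a 2x2 bilinear chroma filter with the "no-round" bias of 28 (v28ss above)
 * instead of 32. */
static inline void put_no_rnd_h264_chroma_mc8_ref(uint8_t * dst, uint8_t * src,
                                                  int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}
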
310 static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
311                                     const uint8_t * src2, int dst_stride,
312                                     int src_stride1, int h)
313 {
314     int i;
315     vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
316
317     mask_ = vec_lvsl(0, src2);
318
319     for (i = 0; i < h; i++) {
320
321         tmp1 = vec_ld(i * src_stride1, src1);
322         mask = vec_lvsl(i * src_stride1, src1);
323         tmp2 = vec_ld(i * src_stride1 + 15, src1);
324
325         a = vec_perm(tmp1, tmp2, mask);
326
327         tmp1 = vec_ld(i * 16, src2);
328         tmp2 = vec_ld(i * 16 + 15, src2);
329
330         b = vec_perm(tmp1, tmp2, mask_);
331
332         tmp1 = vec_ld(0, dst);
333         mask = vec_lvsl(0, dst);
334         tmp2 = vec_ld(15, dst);
335
336         d = vec_avg(a, b);
337
338         edges = vec_perm(tmp2, tmp1, mask);
339
340         align = vec_lvsr(0, dst);
341
342         tmp2 = vec_perm(d, edges, align);
343         tmp1 = vec_perm(edges, d, align);
344
345         vec_st(tmp2, 15, dst);
346         vec_st(tmp1, 0 , dst);
347
348         dst += dst_stride;
349     }
350 }
351
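/* Scalar sketch of put_pixels16_l2_altivec() above (hypothetical _ref name,
 * not used anywhere): src1 is read with stride src_stride1, src2 is a packed
 * buffer with a fixed stride of 16, and dst receives their rounded average. */
static inline void put_pixels16_l2_ref(uint8_t * dst, const uint8_t * src1,
                                       const uint8_t * src2, int dst_stride,
                                       int src_stride1, int h)
{
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            dst[j] = (src1[i * src_stride1 + j] + src2[i * 16 + j] + 1) >> 1;
        dst += dst_stride;
    }
}
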
352 static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
353                                     const uint8_t * src2, int dst_stride,
354                                     int src_stride1, int h)
355 {
356     int i;
357     vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
358
359     mask_ = vec_lvsl(0, src2);
360
361     for (i = 0; i < h; i++) {
362
363         tmp1 = vec_ld(i * src_stride1, src1);
364         mask = vec_lvsl(i * src_stride1, src1);
365         tmp2 = vec_ld(i * src_stride1 + 15, src1);
366
367         a = vec_perm(tmp1, tmp2, mask);
368
369         tmp1 = vec_ld(i * 16, src2);
370         tmp2 = vec_ld(i * 16 + 15, src2);
371
372         b = vec_perm(tmp1, tmp2, mask_);
373
374         tmp1 = vec_ld(0, dst);
375         mask = vec_lvsl(0, dst);
376         tmp2 = vec_ld(15, dst);
377
378         d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));
379
380         edges = vec_perm(tmp2, tmp1, mask);
381
382         align = vec_lvsr(0, dst);
383
384         tmp2 = vec_perm(d, edges, align);
385         tmp1 = vec_perm(edges, d, align);
386
387         vec_st(tmp2, 15, dst);
388         vec_st(tmp1, 0 , dst);
389
390         dst += dst_stride;
391     }
392 }
393
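/* avg_pixels16_l2_altivec() above differs from the put variant only in the
 * final blend: d = avg(dst, avg(src1, src2)), i.e. it also takes the rounded
 * average with the pixels already in dst, matching AVG_OP_U8_ALTIVEC. */
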
394 /* Implemented but could be faster
395 #define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
396 #define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
397  */
398
399   H264_MC(put_, 16, altivec)
400   H264_MC(avg_, 16, altivec)
401
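/* The two instantiations above generate put_h264_qpel16_mcXY_altivec() and
 * avg_h264_qpel16_mcXY_altivec() for all 16 quarter-pel positions; they are
 * wired into the DSPContext by dspfunc() in dsputil_h264_init_ppc() below. */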
402
403 /****************************************************************************
404  * IDCT transform:
405  ****************************************************************************/
406
407 #define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)              \
408    /* 1st stage */                                               \
409    vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
410    vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
411    vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
412    vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
413    vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
414    vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
415    /* 2nd stage: output */                                       \
416    va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
417    va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
418    va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
419    va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */
420
421 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
422     b0 = vec_mergeh( a0, a0 ); \
423     b1 = vec_mergeh( a1, a0 ); \
424     b2 = vec_mergeh( a2, a0 ); \
425     b3 = vec_mergeh( a3, a0 ); \
426     a0 = vec_mergeh( b0, b2 ); \
427     a1 = vec_mergel( b0, b2 ); \
428     a2 = vec_mergeh( b1, b3 ); \
429     a3 = vec_mergel( b1, b3 ); \
430     b0 = vec_mergeh( a0, a2 ); \
431     b1 = vec_mergel( a0, a2 ); \
432     b2 = vec_mergeh( a1, a3 ); \
433     b3 = vec_mergel( a1, a3 )
434
435 #define VEC_LOAD_U8_ADD_S16_STORE_U8(va)                      \
436     vdst_orig = vec_ld(0, dst);                               \
437     vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask);          \
438     vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst);         \
439     va = vec_add(va, vdst_ss);                                \
440     va_u8 = vec_packsu(va, zero_s16v);                        \
441     va_u32 = vec_splat((vec_u32_t)va_u8, 0);                  \
442     vec_ste(va_u32, element, (uint32_t*)dst);
443
444 static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
445 {
446     vec_s16_t va0, va1, va2, va3;
447     vec_s16_t vz0, vz1, vz2, vz3;
448     vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
449     vec_u8_t va_u8;
450     vec_u32_t va_u32;
451     vec_s16_t vdst_ss;
452     const vec_u16_t v6us = vec_splat_u16(6);
453     vec_u8_t vdst, vdst_orig;
454     vec_u8_t vdst_mask = vec_lvsl(0, dst);
455     int element = ((unsigned long)dst & 0xf) >> 2;
456     LOAD_ZERO;
457
458     block[0] += 32;  /* add 32 as a DC-level for rounding */
459
460     vtmp0 = vec_ld(0,block);
461     vtmp1 = vec_sld(vtmp0, vtmp0, 8);
462     vtmp2 = vec_ld(16,block);
463     vtmp3 = vec_sld(vtmp2, vtmp2, 8);
464
465     VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
466     VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
467     VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
468
469     va0 = vec_sra(va0,v6us);
470     va1 = vec_sra(va1,v6us);
471     va2 = vec_sra(va2,v6us);
472     va3 = vec_sra(va3,v6us);
473
474     VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
475     dst += stride;
476     VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
477     dst += stride;
478     VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
479     dst += stride;
480     VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
481 }
482
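/* Illustrative scalar sketch of what ff_h264_idct_add_altivec() computes (the
 * _ref names are hypothetical and unused; the real C fallback lives in
 * h264idct.c): a 4x4 H.264 inverse transform followed by a clipped add to the
 * prediction in dst. */
static inline int clip_uint8_ref(int a)
{
    if (a < 0)   return 0;
    if (a > 255) return 255;
    return a;
}

static void h264_idct_add_ref(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    int tmp[16];

    block[0] += 32;                       /* rounding bias for the final >>6 */

    /* first 1D pass, down the columns */
    for (i = 0; i < 4; i++) {
        const int z0 =  block[i +  0]       +  block[i +  8];
        const int z1 =  block[i +  0]       -  block[i +  8];
        const int z2 = (block[i +  4] >> 1) -  block[i + 12];
        const int z3 =  block[i +  4]       + (block[i + 12] >> 1);
        tmp[i +  0] = z0 + z3;
        tmp[i +  4] = z1 + z2;
        tmp[i +  8] = z1 - z2;
        tmp[i + 12] = z0 - z3;
    }
    /* second 1D pass, along the rows, adding the >>6 result to dst */
    for (i = 0; i < 4; i++) {
        const int z0 =  tmp[4*i + 0]       +  tmp[4*i + 2];
        const int z1 =  tmp[4*i + 0]       -  tmp[4*i + 2];
        const int z2 = (tmp[4*i + 1] >> 1) -  tmp[4*i + 3];
        const int z3 =  tmp[4*i + 1]       + (tmp[4*i + 3] >> 1);
        dst[i + 0*stride] = clip_uint8_ref(dst[i + 0*stride] + ((z0 + z3) >> 6));
        dst[i + 1*stride] = clip_uint8_ref(dst[i + 1*stride] + ((z1 + z2) >> 6));
        dst[i + 2*stride] = clip_uint8_ref(dst[i + 2*stride] + ((z1 - z2) >> 6));
        dst[i + 3*stride] = clip_uint8_ref(dst[i + 3*stride] + ((z0 - z3) >> 6));
    }
}
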
483 #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,  d0, d1, d2, d3, d4, d5, d6, d7) {\
484     /*        a0  = SRC(0) + SRC(4); */ \
485     vec_s16_t a0v = vec_add(s0, s4);    \
486     /*        a2  = SRC(0) - SRC(4); */ \
487     vec_s16_t a2v = vec_sub(s0, s4);    \
488     /*        a4  =           (SRC(2)>>1) - SRC(6); */ \
489     vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6);    \
490     /*        a6  =           (SRC(6)>>1) + SRC(2); */ \
491     vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2);    \
492     /*        b0  =         a0 + a6; */ \
493     vec_s16_t b0v = vec_add(a0v, a6v);  \
494     /*        b2  =         a2 + a4; */ \
495     vec_s16_t b2v = vec_add(a2v, a4v);  \
496     /*        b4  =         a2 - a4; */ \
497     vec_s16_t b4v = vec_sub(a2v, a4v);  \
498     /*        b6  =         a0 - a6; */ \
499     vec_s16_t b6v = vec_sub(a0v, a6v);  \
500     /* a1 =  SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
501     /*        a1 =             (SRC(5)-SRC(3)) -  (SRC(7)  +  (SRC(7)>>1)); */ \
502     vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
503     /* a3 =  SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
504     /*        a3 =             (SRC(7)+SRC(1)) -  (SRC(3)  +  (SRC(3)>>1)); */ \
505     vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
506     /* a5 =  SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
507     /*        a5 =             (SRC(7)-SRC(1)) +   SRC(5) +   (SRC(5)>>1); */ \
508     vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
509     /*        a7 =                SRC(5)+SRC(3) +  SRC(1) +   (SRC(1)>>1); */ \
510     vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
511     /*        b1 =                  (a7>>2)  +  a1; */ \
512     vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
513     /*        b3 =          a3 +        (a5>>2); */ \
514     vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
515     /*        b5 =                  (a3>>2)  -   a5; */ \
516     vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
517     /*        b7 =           a7 -        (a1>>2); */ \
518     vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
519     /* DST(0,    b0 + b7); */ \
520     d0 = vec_add(b0v, b7v); \
521     /* DST(1,    b2 + b5); */ \
522     d1 = vec_add(b2v, b5v); \
523     /* DST(2,    b4 + b3); */ \
524     d2 = vec_add(b4v, b3v); \
525     /* DST(3,    b6 + b1); */ \
526     d3 = vec_add(b6v, b1v); \
527     /* DST(4,    b6 - b1); */ \
528     d4 = vec_sub(b6v, b1v); \
529     /* DST(5,    b4 - b3); */ \
530     d5 = vec_sub(b4v, b3v); \
531     /* DST(6,    b2 - b5); */ \
532     d6 = vec_sub(b2v, b5v); \
533     /* DST(7,    b0 - b7); */ \
534     d7 = vec_sub(b0v, b7v); \
535 }
536
537 #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
538     /* unaligned load */                                       \
539     vec_u8_t hv = vec_ld( 0, dest );                           \
540     vec_u8_t lv = vec_ld( 7, dest );                           \
541     vec_u8_t dstv   = vec_perm( hv, lv, (vec_u8_t)perm_ldv );  \
542     vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
543     vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);   \
544     vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
545     vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
546     vec_u8_t edgehv;                                           \
547     /* unaligned store */                                      \
548     vec_u8_t bodyv  = vec_perm( idstsum8, idstsum8, perm_stv );\
549     vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
550     lv    = vec_sel( lv, bodyv, edgelv );                      \
551     vec_st( lv, 7, dest );                                     \
552     hv    = vec_ld( 0, dest );                                 \
553     edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
554     hv    = vec_sel( hv, bodyv, edgehv );                      \
555     vec_st( hv, 0, dest );                                     \
556  }
557
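/* The store macro above is an unaligned read-modify-write of one 8-pixel row:
 * two aligned vec_ld() plus vec_perm() rebuild the misaligned destination,
 * the >>6 IDCT output is added with saturation (vec_adds + vec_packsu), and
 * vec_sel() with the 'sel' mask ensures only the 8 bytes belonging to the row
 * are written back, leaving the neighbouring bytes untouched. */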
558 void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
559     vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
560     vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
561     vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
562
563     vec_u8_t perm_ldv = vec_lvsl(0, dst);
564     vec_u8_t perm_stv = vec_lvsr(8, dst);
565
566     const vec_u16_t onev = vec_splat_u16(1);
567     const vec_u16_t twov = vec_splat_u16(2);
568     const vec_u16_t sixv = vec_splat_u16(6);
569
570     const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
571                                         -1,-1,-1,-1,-1,-1,-1,-1);
572     LOAD_ZERO;
573
574     dct[0] += 32; // rounding for the >>6 at the end
575
576     s0 = vec_ld(0x00, (int16_t*)dct);
577     s1 = vec_ld(0x10, (int16_t*)dct);
578     s2 = vec_ld(0x20, (int16_t*)dct);
579     s3 = vec_ld(0x30, (int16_t*)dct);
580     s4 = vec_ld(0x40, (int16_t*)dct);
581     s5 = vec_ld(0x50, (int16_t*)dct);
582     s6 = vec_ld(0x60, (int16_t*)dct);
583     s7 = vec_ld(0x70, (int16_t*)dct);
584
585     IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
586                      d0, d1, d2, d3, d4, d5, d6, d7);
587
588     TRANSPOSE8( d0,  d1,  d2,  d3,  d4,  d5,  d6, d7 );
589
590     IDCT8_1D_ALTIVEC(d0,  d1,  d2,  d3,  d4,  d5,  d6, d7,
591                      idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);
592
593     ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
594     ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
595     ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
596     ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
597     ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
598     ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
599     ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
600     ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
601 }
602
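/* Same two-pass structure as ff_h264_idct_add_altivec() above, but with the
 * 8-point 1D transform and a full TRANSPOSE8() between the passes; the final
 * ALTIVEC_STORE_SUM_CLIP() adds the >>6 result to each prediction row with
 * unsigned saturation. */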
603 #define transpose4x16(r0, r1, r2, r3) {      \
604     register vector unsigned char r4;        \
605     register vector unsigned char r5;        \
606     register vector unsigned char r6;        \
607     register vector unsigned char r7;        \
608                                              \
609     r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
610     r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
611     r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
612     r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
613                                              \
614     r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
615     r1 = vec_mergel(r4, r6);  /*all set 1*/  \
616     r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
617     r3 = vec_mergel(r5, r7);  /*all set 3*/  \
618 }
619
620 static inline void write16x4(uint8_t *dst, int dst_stride,
621                              register vector unsigned char r0, register vector unsigned char r1,
622                              register vector unsigned char r2, register vector unsigned char r3) {
623     DECLARE_ALIGNED_16(unsigned char, result[64]);
624     uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
625     int int_dst_stride = dst_stride/4;
626
627     vec_st(r0, 0, result);
628     vec_st(r1, 16, result);
629     vec_st(r2, 32, result);
630     vec_st(r3, 48, result);
631     /* FIXME: there has to be a better way!!!! */
632     *dst_int = *src_int;
633     *(dst_int+   int_dst_stride) = *(src_int + 1);
634     *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
635     *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
636     *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
637     *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
638     *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
639     *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
640     *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
641     *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
642     *(dst_int+10*int_dst_stride) = *(src_int + 10);
643     *(dst_int+11*int_dst_stride) = *(src_int + 11);
644     *(dst_int+12*int_dst_stride) = *(src_int + 12);
645     *(dst_int+13*int_dst_stride) = *(src_int + 13);
646     *(dst_int+14*int_dst_stride) = *(src_int + 14);
647     *(dst_int+15*int_dst_stride) = *(src_int + 15);
648 }
649
650 /** \brief reads a 16x6 block from src and transposes it, leaving the six columns in r8-r13
651     \todo FIXME: see if we can't spare some vec_lvsl() by factoring them
652     out of unaligned_load() */
653 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
654     register vector unsigned char r0  = unaligned_load(0,             src);\
655     register vector unsigned char r1  = unaligned_load(   src_stride, src);\
656     register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
657     register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
658     register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
659     register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
660     register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
661     register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
662     register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
663     register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
664                                                                            \
665     r8  = unaligned_load( 8*src_stride, src);                              \
666     r9  = unaligned_load( 9*src_stride, src);                              \
667     r10 = unaligned_load(10*src_stride, src);                              \
668     r11 = unaligned_load(11*src_stride, src);                              \
669     r12 = unaligned_load(12*src_stride, src);                              \
670     r13 = unaligned_load(13*src_stride, src);                              \
671                                                                            \
672     /*Merge first pairs*/                                                  \
673     r0 = vec_mergeh(r0, r8);    /*0, 8*/                                   \
674     r1 = vec_mergeh(r1, r9);    /*1, 9*/                                   \
675     r2 = vec_mergeh(r2, r10);   /*2,10*/                                   \
676     r3 = vec_mergeh(r3, r11);   /*3,11*/                                   \
677     r4 = vec_mergeh(r4, r12);   /*4,12*/                                   \
678     r5 = vec_mergeh(r5, r13);   /*5,13*/                                   \
679     r6 = vec_mergeh(r6, r14);   /*6,14*/                                   \
680     r7 = vec_mergeh(r7, r15);   /*7,15*/                                   \
681                                                                            \
682     /*Merge second pairs*/                                                 \
683     r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/                        \
684     r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/                        \
685     r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/                        \
686     r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/                        \
687     r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/                        \
688     r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/                        \
689     r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/                        \
690     r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/                        \
691                                                                            \
692     /*Third merge*/                                                        \
693     r0 = vec_mergeh(r8, r12);   /*0,2,4,6,8,10,12,14 set 0*/               \
694     r1 = vec_mergel(r8, r12);   /*0,2,4,6,8,10,12,14 set 1*/               \
695     r2 = vec_mergeh(r9, r13);   /*0,2,4,6,8,10,12,14 set 2*/               \
696     r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/               \
697     r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/               \
698     r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/               \
699     /* Don't need to compute 3 and 7*/                                     \
700                                                                            \
701     /*Final merge*/                                                        \
702     r8  = vec_mergeh(r0, r4);   /*all set 0*/                              \
703     r9  = vec_mergel(r0, r4);   /*all set 1*/                              \
704     r10 = vec_mergeh(r1, r5);   /*all set 2*/                              \
705     r11 = vec_mergel(r1, r5);   /*all set 3*/                              \
706     r12 = vec_mergeh(r2, r6);   /*all set 4*/                              \
707     r13 = vec_mergel(r2, r6);   /*all set 5*/                              \
708     /* Don't need to compute 14 and 15*/                                   \
709                                                                            \
710 }
711
712 // out: o = |x-y| < a
713 static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
714                                                      register vector unsigned char y,
715                                                      register vector unsigned char a) {
716
717     register vector unsigned char diff = vec_subs(x, y);
718     register vector unsigned char diffneg = vec_subs(y, x);
719     register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
720     o = (vector unsigned char)vec_cmplt(o, a);
721     return o;
722 }
723
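/* Scalar view of the helper above: with unsigned saturating subtraction,
 * subs(x,y) | subs(y,x) == |x - y|, so each byte of the result is 0xFF where
 * |x[i] - y[i]| < a[i] and 0x00 elsewhere. */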
724 static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
725                                                        register vector unsigned char p1,
726                                                        register vector unsigned char q0,
727                                                        register vector unsigned char q1,
728                                                        register vector unsigned char alpha,
729                                                        register vector unsigned char beta) {
730
731     register vector unsigned char mask;
732     register vector unsigned char tempmask;
733
734     mask = diff_lt_altivec(p0, q0, alpha);
735     tempmask = diff_lt_altivec(p1, p0, beta);
736     mask = vec_and(mask, tempmask);
737     tempmask = diff_lt_altivec(q1, q0, beta);
738     mask = vec_and(mask, tempmask);
739
740     return mask;
741 }
742
743 // out: p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
744 #define h264_deblock_q1(p0, p1, p2, q0, tc0) {                     \
745                                                                    \
746     register vector unsigned char average = vec_avg(p0, q0);       \
747     register vector unsigned char temp;                            \
748     register vector unsigned char unclipped;                       \
749     register vector unsigned char ones;                            \
750     register vector unsigned char max;                             \
751     register vector unsigned char min;                             \
752                                                                    \
753     temp = vec_xor(average, p2);                                   \
754     average = vec_avg(average, p2);     /*avg(p2, avg(p0, q0)) */  \
755     ones = vec_splat_u8(1);                                        \
756     temp = vec_and(temp, ones);         /*(p2^avg(p0, q0)) & 1 */  \
757     unclipped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */\
758     max = vec_adds(p1, tc0);                                       \
759     min = vec_subs(p1, tc0);                                       \
760     p1 = vec_max(min, unclipped);                                  \
761     p1 = vec_min(max, p1);                                         \
762 }
763
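/* Note on the macro above: vec_avg() rounds up, i.e. computes (a + b + 1) >> 1,
 * so subtracting (average ^ p2) & 1 turns vec_avg(average, p2) into the exact
 * truncating (average + p2) >> 1 that the p1 update formula requires. */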
764 #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
765                                                                                                   \
766     const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4));                   \
767                                                                                                   \
768     register vector unsigned char pq0bit = vec_xor(p0,q0);                                        \
769     register vector unsigned char temp;                                                           \
770     register vector unsigned char q1minus;                                                        \
771     register vector unsigned char p0minus;                                                        \
772     register vector unsigned char stage1;                                                         \
773     register vector unsigned char stage2;                                                         \
774     register vector unsigned char vec160;                                                         \
775     register vector unsigned char delta;                                                          \
776     register vector unsigned char deltaneg;                                                       \
777                                                                                                   \
778     temp = (vector unsigned char)vec_cmpeq(p0, p0);                                               \
779     q1minus = vec_xor(temp, q1);               /* 255 - q1 */                                     \
780     stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
781     stage2 = vec_sr(stage1, vec_splat_u8(1));  /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */     \
782     p0minus = vec_xor(temp, p0);               /* 255 - p0 */                                     \
783     stage1 = vec_avg(q0, p0minus);             /* (q0 - p0 + 256)>>1 */                           \
784     pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                    \
785     stage2 = vec_avg(stage2, pq0bit);          /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
786     stage2 = vec_adds(stage2, stage1);         /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */  \
787     vec160 = vec_ld(0, &A0v);                                                                     \
788     deltaneg = vec_subs(vec160, stage2);       /* -d */                                           \
789     delta = vec_subs(stage2, vec160);          /* d */                                            \
790     deltaneg = vec_min(tc0masked, deltaneg);                                                      \
791     delta = vec_min(tc0masked, delta);                                                            \
792     p0 = vec_subs(p0, deltaneg);                                                                  \
793     q0 = vec_subs(q0, delta);                                                                     \
794     p0 = vec_adds(p0, delta);                                                                     \
795     q0 = vec_adds(q0, deltaneg);                                                                  \
796 }
797
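/* The macro above implements the standard H.264 p0/q0 update,
 *     delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc0, tc0)
 *     p0'   = clip_uint8(p0 + delta)
 *     q0'   = clip_uint8(q0 - delta)
 * entirely in unsigned bytes: the intermediate is biased around 160 (A0v) and
 * delta is split into separately saturated positive and negative parts that
 * are clipped against tc0masked before being applied. */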
798 #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {            \
799     DECLARE_ALIGNED_16(unsigned char, temp[16]);                                             \
800     register vector unsigned char alphavec;                                                  \
801     register vector unsigned char betavec;                                                   \
802     register vector unsigned char mask;                                                      \
803     register vector unsigned char p1mask;                                                    \
804     register vector unsigned char q1mask;                                                    \
805     register vector unsigned char tc0vec;                                                    \
806     register vector unsigned char finaltc0;                                                  \
807     register vector unsigned char tc0masked;                                                 \
808                                                                                              \
809     temp[0] = alpha;                                                                         \
810     temp[1] = beta;                                                                          \
811     alphavec = vec_ld(0, temp);                                                              \
812     betavec = vec_splat(alphavec, 0x1);                                                      \
813     alphavec = vec_splat(alphavec, 0x0);                                                     \
814     mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */            \
815                                                                                              \
816     *((int *)temp) = *((int *)tc0);                                                          \
817     tc0vec = vec_ld(0, temp);                                                                \
818     tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
819     tc0vec = vec_mergeh(tc0vec, tc0vec);                                                     \
820     mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_u8(-1)));  /* if tc0[i] >= 0 */         \
821     finaltc0 = vec_and(tc0vec, mask);                           /*tc = tc0[i]*/              \
822                                                                                              \
823     p1mask = diff_lt_altivec(p2, p0, betavec);                                               \
824     p1mask = vec_and(p1mask, mask);                             /* if( |p2 - p0| < beta) */  \
825     tc0masked = vec_and(p1mask, tc0vec);                                                     \
826     finaltc0 = vec_sub(finaltc0, p1mask);                       /* tc++ */                   \
827     h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                              \
828     /*end if*/                                                                               \
829                                                                                              \
830     q1mask = diff_lt_altivec(q2, q0, betavec);                                               \
831     q1mask = vec_and(q1mask, mask);                             /* if ( |q2 - q0| < beta ) */\
832     tc0masked = vec_and(q1mask, tc0vec);                                                     \
833     finaltc0 = vec_sub(finaltc0, q1mask);                       /* tc++ */                   \
834     h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                              \
835     /*end if*/                                                                               \
836                                                                                              \
837     h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                            \
838 }
839
840 static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
841
842     if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
843         register vector unsigned char p2 = vec_ld(-3*stride, pix);
844         register vector unsigned char p1 = vec_ld(-2*stride, pix);
845         register vector unsigned char p0 = vec_ld(-1*stride, pix);
846         register vector unsigned char q0 = vec_ld(0, pix);
847         register vector unsigned char q1 = vec_ld(stride, pix);
848         register vector unsigned char q2 = vec_ld(2*stride, pix);
849         h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
850         vec_st(p1, -2*stride, pix);
851         vec_st(p0, -1*stride, pix);
852         vec_st(q0, 0, pix);
853         vec_st(q1, stride, pix);
854     }
855 }
856
857 static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
858
859     register vector unsigned char line0, line1, line2, line3, line4, line5;
860     if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
861         return;
862     readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
863     h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
864     transpose4x16(line1, line2, line3, line4);
865     write16x4(pix-2, stride, line1, line2, line3, line4);
866 }
867
868 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
869
870 #ifdef HAVE_ALTIVEC
871   if (has_altivec()) {
872     c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
873     c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
874     c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
875     c->h264_idct_add = ff_h264_idct_add_altivec;
876     c->h264_idct8_add = ff_h264_idct8_add_altivec;
877 #if 0 // some samples aren't decoded correctly while others are fine. What's wrong?
878     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
879     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
880 #endif
881
882 #define dspfunc(PFX, IDX, NUM) \
883     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
884     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
885     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
886     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
887     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
888     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
889     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
890     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
891     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
892     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
893     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
894     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
895     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
896     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
897     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
898     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec
899
900     dspfunc(put_h264_qpel, 0, 16);
901     dspfunc(avg_h264_qpel, 0, 16);
902 #undef dspfunc
903
904   } else
905 #endif /* HAVE_ALTIVEC */
906   {
907     // Non-AltiVec PPC optimisations
908
909     // ... pending ...
910   }
911 }