/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
#include "types_altivec.h"
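
/* The motion-compensation kernels are generated from h264_template_altivec.c:
 * the template is included twice below, once with OP_U8_ALTIVEC bound to a
 * plain store (the put_ variants) and once bound to vec_avg against the
 * existing destination (the avg_ variants), so a single implementation
 * yields both flavours. */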
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)

#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num

#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num
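
/* H264_MC expands to the 16 luma quarter-pel entry points (_mc00 .. _mc33,
 * one per (x,y) quarter-pel phase). Full-pel positions copy straight
 * through, half-pel positions run a single lowpass filter, and the
 * remaining quarter-pel positions average two intermediate predictions via
 * OPNAME ## pixels ## SIZE ## _l2. */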
#define H264_MC(OPNAME, SIZE, CODETYPE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}

/* this code assumes that stride % 16 == 0 */
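/* Bilinear chroma interpolation: each output pixel is a weighted sum of the
 * four neighbouring source pixels, with weights A = (8-x)(8-y), B = x(8-y),
 * C = (8-x)y and D = xy, so A+B+C+D == 64. v28ss below evaluates to
 * (1<<5) - 4 = 28, the rounding bias of this no-rounding variant (28
 * instead of 32 before the >>6). */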
void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
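
    /* vec_ld can only load from 16-byte-aligned addresses, so misaligned
     * sources are handled by loading one or two aligned vectors and
     * permuting: loadSecond is set when the 9 source bytes needed per row
     * cross a 16-byte boundary, and reallyBadAlign marks src % 16 == 15,
     * where vec_lvsl(1, src) would select entirely from the second vector. */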
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {

            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
}
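
/* For reference, a scalar sketch of what the two vector loops above compute
 * (an illustration, not part of the original code):
 *
 *     for (i = 0; i < h; i++) {
 *         int j;
 *         for (j = 0; j < 8; j++)
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] + 28) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 *
 * Since A+B+C+D == 64, the shifted sum is already in [0,255], which is why
 * the first loop's vec_sra/vec_packsu and the second loop's vec_sr/vec_pack
 * produce identical results here. */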

static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                    const uint8_t * src2, int dst_stride,
                                    int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {

        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(a, b);

        edges = vec_perm(tmp2, tmp1, mask);

        align = vec_lvsr(0, dst);

        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}
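
/* The read-merge-write sequence above implements an unaligned 16-byte
 * store: the current destination contents are loaded, vec_lvsr(0, dst)
 * produces the permute that rotates the averaged data into place, and the
 * two vec_st calls write back the merged low and high halves so that bytes
 * outside dst[0..15] are preserved. */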

static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                    const uint8_t * src2, int dst_stride,
                                    int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {

        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        edges = vec_perm(tmp2, tmp1, mask);

        align = vec_lvsr(0, dst);

        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}

/* Implemented but could be faster
#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
*/

H264_MC(put_, 16, altivec)
H264_MC(avg_, 16, altivec)

/****************************************************************************
 * IDCT transform:
 ****************************************************************************/
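
/* The 4x4 idct below follows the usual vector scheme: one VEC_1D_DCT pass
 * transforms the rows, VEC_TRANSPOSE_4 flips the block, a second pass
 * transforms the columns, and the result is scaled by >>6 (with the +32
 * rounding folded into the DC coefficient) before being added to dst. */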
#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3)               \
    /* 1st stage */                                               \
    vz0 = vec_add(vb0,vb2);       /* temp[0] = Y[0] + Y[2] */     \
    vz1 = vec_sub(vb0,vb2);       /* temp[1] = Y[0] - Y[2] */     \
    vz2 = vec_sra(vb1,vec_splat_u16(1));                          \
    vz2 = vec_sub(vz2,vb3);       /* temp[2] = Y[1].1/2 - Y[3] */ \
    vz3 = vec_sra(vb3,vec_splat_u16(1));                          \
    vz3 = vec_add(vb1,vz3);       /* temp[3] = Y[1] + Y[3].1/2 */ \
    /* 2nd stage: output */                                       \
    va0 = vec_add(vz0,vz3);       /* x[0] = temp[0] + temp[3] */  \
    va1 = vec_add(vz1,vz2);       /* x[1] = temp[1] + temp[2] */  \
    va2 = vec_sub(vz1,vz2);       /* x[2] = temp[1] - temp[2] */  \
    va3 = vec_sub(vz0,vz3)        /* x[3] = temp[0] - temp[3] */

#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    b0 = vec_mergeh( a0, a0 ); \
    b1 = vec_mergeh( a1, a0 ); \
    b2 = vec_mergeh( a2, a0 ); \
    b3 = vec_mergeh( a3, a0 ); \
    a0 = vec_mergeh( b0, b2 ); \
    a1 = vec_mergel( b0, b2 ); \
    a2 = vec_mergeh( b1, b3 ); \
    a3 = vec_mergel( b1, b3 ); \
    b0 = vec_mergeh( a0, a2 ); \
    b1 = vec_mergel( a0, a2 ); \
    b2 = vec_mergeh( a1, a3 ); \
    b3 = vec_mergel( a1, a3 )

#define VEC_LOAD_U8_ADD_S16_STORE_U8(va)             \
    vdst_orig = vec_ld(0, dst);                      \
    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
    vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst);\
    va = vec_add(va, vdst_ss);                       \
    va_u8 = vec_packsu(va, zero_s16v);               \
    va_u32 = vec_splat((vec_u32_t)va_u8, 0);         \
    vec_ste(va_u32, element, (uint32_t*)dst);
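
/* VEC_LOAD_U8_ADD_S16_STORE_U8 reconstructs one 4-pixel row: it loads the
 * prediction from dst, widens it to 16 bits, adds the residual in va, packs
 * back to bytes with unsigned saturation, and stores the result as a single
 * 32-bit element with vec_ste. */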

static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    vec_s16_t va0, va1, va2, va3;
    vec_s16_t vz0, vz1, vz2, vz3;
    vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8_t va_u8;
    vec_u32_t va_u32;
    vec_s16_t vdst_ss;
    const vec_u16_t v6us = vec_splat_u16(6);
    vec_u8_t vdst, vdst_orig;
    vec_u8_t vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32;  /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}

#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
    /* a0 = SRC(0) + SRC(4); */ \
    vec_s16_t a0v = vec_add(s0, s4); \
    /* a2 = SRC(0) - SRC(4); */ \
    vec_s16_t a2v = vec_sub(s0, s4); \
    /* a4 = (SRC(2)>>1) - SRC(6); */ \
    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
    /* a6 = (SRC(6)>>1) + SRC(2); */ \
    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
    /* b0 = a0 + a6; */ \
    vec_s16_t b0v = vec_add(a0v, a6v); \
    /* b2 = a2 + a4; */ \
    vec_s16_t b2v = vec_add(a2v, a4v); \
    /* b4 = a2 - a4; */ \
    vec_s16_t b4v = vec_sub(a2v, a4v); \
    /* b6 = a0 - a6; */ \
    vec_s16_t b6v = vec_sub(a0v, a6v); \
    /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
    /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
    /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
    /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
    /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
    /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
    /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
    /* b1 = (a7>>2) + a1; */ \
    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
    /* b3 = a3 + (a5>>2); */ \
    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
    /* b5 = (a3>>2) - a5; */ \
    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
    /* b7 = a7 - (a1>>2); */ \
    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
    /* DST(0, b0 + b7); */ \
    d0 = vec_add(b0v, b7v); \
    /* DST(1, b2 + b5); */ \
    d1 = vec_add(b2v, b5v); \
    /* DST(2, b4 + b3); */ \
    d2 = vec_add(b4v, b3v); \
    /* DST(3, b6 + b1); */ \
    d3 = vec_add(b6v, b1v); \
    /* DST(4, b6 - b1); */ \
    d4 = vec_sub(b6v, b1v); \
    /* DST(5, b4 - b3); */ \
    d5 = vec_sub(b4v, b3v); \
    /* DST(6, b2 - b5); */ \
    d6 = vec_sub(b2v, b5v); \
    /* DST(7, b0 - b7); */ \
    d7 = vec_sub(b0v, b7v); \
}

#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
    /* unaligned load */                                       \
    vec_u8_t hv = vec_ld( 0, dest );                           \
    vec_u8_t lv = vec_ld( 7, dest );                           \
    vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv );    \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                 \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);   \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);  \
    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);        \
    vec_u8_t edgehv;                                           \
    /* unaligned store */                                      \
    vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv ); \
    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );     \
    lv = vec_sel( lv, bodyv, edgelv );                         \
    vec_st( lv, 7, dest );                                     \
    hv = vec_ld( 0, dest );                                    \
    edgehv = vec_perm( zero_u8v, sel, perm_stv );              \
    hv = vec_sel( hv, bodyv, edgehv );                         \
    vec_st( hv, 0, dest );                                     \
}
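
/* ALTIVEC_STORE_SUM_CLIP adds an 8-pixel idct row to dest at arbitrary
 * alignment: sel marks which bytes of the two straddled vectors belong to
 * the row, and vec_sel merges the shifted result (bodyv) into the existing
 * destination bytes on both sides. */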

void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);

    const vec_u16_t onev = vec_splat_u16(1);
    const vec_u16_t twov = vec_splat_u16(2);
    const vec_u16_t sixv = vec_splat_u16(6);

    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
                                        -1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t*)dct);
    s1 = vec_ld(0x10, (int16_t*)dct);
    s2 = vec_ld(0x20, (int16_t*)dct);
    s3 = vec_ld(0x30, (int16_t*)dct);
    s4 = vec_ld(0x40, (int16_t*)dct);
    s5 = vec_ld(0x50, (int16_t*)dct);
    s6 = vec_ld(0x60, (int16_t*)dct);
    s7 = vec_ld(0x70, (int16_t*)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}

#define transpose4x16(r0, r1, r2, r3) {      \
    register vector unsigned char r4;        \
    register vector unsigned char r5;        \
    register vector unsigned char r6;        \
    register vector unsigned char r7;        \
                                             \
    r4 = vec_mergeh(r0, r2);  /*0, 2 set 0*/ \
    r5 = vec_mergel(r0, r2);  /*0, 2 set 1*/ \
    r6 = vec_mergeh(r1, r3);  /*1, 3 set 0*/ \
    r7 = vec_mergel(r1, r3);  /*1, 3 set 1*/ \
                                             \
    r0 = vec_mergeh(r4, r6);  /*all set 0*/  \
    r1 = vec_mergel(r4, r6);  /*all set 1*/  \
    r2 = vec_mergeh(r5, r7);  /*all set 2*/  \
    r3 = vec_mergel(r5, r7);  /*all set 3*/  \
}
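
/* transpose4x16 transposes four 16-byte rows in two merge stages; afterwards
 * each consecutive 4-byte group of r0..r3 holds one column of the input
 * (here p1, p0, q0, q1), ready for write16x4 below to scatter back to the
 * 16 image rows. */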

static inline void write16x4(uint8_t *dst, int dst_stride, register vector unsigned char r0,
                             register vector unsigned char r1, register vector unsigned char r2,
                             register vector unsigned char r3) {
    DECLARE_ALIGNED_16(unsigned char, result[64]);
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride/4;

    vec_st(r0,  0, result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);
    /* FIXME: there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int+   int_dst_stride) = *(src_int + 1);
    *(dst_int+ 2*int_dst_stride) = *(src_int + 2);
    *(dst_int+ 3*int_dst_stride) = *(src_int + 3);
    *(dst_int+ 4*int_dst_stride) = *(src_int + 4);
    *(dst_int+ 5*int_dst_stride) = *(src_int + 5);
    *(dst_int+ 6*int_dst_stride) = *(src_int + 6);
    *(dst_int+ 7*int_dst_stride) = *(src_int + 7);
    *(dst_int+ 8*int_dst_stride) = *(src_int + 8);
    *(dst_int+ 9*int_dst_stride) = *(src_int + 9);
    *(dst_int+10*int_dst_stride) = *(src_int + 10);
    *(dst_int+11*int_dst_stride) = *(src_int + 11);
    *(dst_int+12*int_dst_stride) = *(src_int + 12);
    *(dst_int+13*int_dst_stride) = *(src_int + 13);
    *(dst_int+14*int_dst_stride) = *(src_int + 14);
    *(dst_int+15*int_dst_stride) = *(src_int + 15);
}

/** \brief reads 16 rows from src and transposes them, returning the six
    16-byte vectors r8-r13 (the only transposed rows the loop filter needs;
    called on pix-3, they hold p2, p1, p0, q0, q1, q2)
    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
    out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
    register vector unsigned char r0  = unaligned_load(0,             src);\
    register vector unsigned char r1  = unaligned_load(   src_stride, src);\
    register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
    register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
    register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
    register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
    register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
    register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
    register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
    register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
                                                                           \
    r8  = unaligned_load( 8*src_stride, src);                              \
    r9  = unaligned_load( 9*src_stride, src);                              \
    r10 = unaligned_load(10*src_stride, src);                              \
    r11 = unaligned_load(11*src_stride, src);                              \
    r12 = unaligned_load(12*src_stride, src);                              \
    r13 = unaligned_load(13*src_stride, src);                              \
                                                                           \
    /*Merge first pairs*/                                                  \
    r0 = vec_mergeh(r0, r8);    /*0, 8*/                                   \
    r1 = vec_mergeh(r1, r9);    /*1, 9*/                                   \
    r2 = vec_mergeh(r2, r10);   /*2,10*/                                   \
    r3 = vec_mergeh(r3, r11);   /*3,11*/                                   \
    r4 = vec_mergeh(r4, r12);   /*4,12*/                                   \
    r5 = vec_mergeh(r5, r13);   /*5,13*/                                   \
    r6 = vec_mergeh(r6, r14);   /*6,14*/                                   \
    r7 = vec_mergeh(r7, r15);   /*7,15*/                                   \
                                                                           \
    /*Merge second pairs*/                                                 \
    r8  = vec_mergeh(r0, r4);   /*0,4, 8,12 set 0*/                        \
    r9  = vec_mergel(r0, r4);   /*0,4, 8,12 set 1*/                        \
    r10 = vec_mergeh(r1, r5);   /*1,5, 9,13 set 0*/                        \
    r11 = vec_mergel(r1, r5);   /*1,5, 9,13 set 1*/                        \
    r12 = vec_mergeh(r2, r6);   /*2,6,10,14 set 0*/                        \
    r13 = vec_mergel(r2, r6);   /*2,6,10,14 set 1*/                        \
    r14 = vec_mergeh(r3, r7);   /*3,7,11,15 set 0*/                        \
    r15 = vec_mergel(r3, r7);   /*3,7,11,15 set 1*/                        \
                                                                           \
    /*Third merge*/                                                        \
    r0 = vec_mergeh(r8,  r12);  /*0,2,4,6,8,10,12,14 set 0*/               \
    r1 = vec_mergel(r8,  r12);  /*0,2,4,6,8,10,12,14 set 1*/               \
    r2 = vec_mergeh(r9,  r13);  /*0,2,4,6,8,10,12,14 set 2*/               \
    r4 = vec_mergeh(r10, r14);  /*1,3,5,7,9,11,13,15 set 0*/               \
    r5 = vec_mergel(r10, r14);  /*1,3,5,7,9,11,13,15 set 1*/               \
    r6 = vec_mergeh(r11, r15);  /*1,3,5,7,9,11,13,15 set 2*/               \
    /* Don't need to compute 3 and 7*/                                     \
                                                                           \
    /*Final merge*/                                                        \
    r8  = vec_mergeh(r0, r4);   /*all set 0*/                              \
    r9  = vec_mergel(r0, r4);   /*all set 1*/                              \
    r10 = vec_mergeh(r1, r5);   /*all set 2*/                              \
    r11 = vec_mergel(r1, r5);   /*all set 3*/                              \
    r12 = vec_mergeh(r2, r6);   /*all set 4*/                              \
    r13 = vec_mergel(r2, r6);   /*all set 5*/                              \
    /* Don't need to compute 14 and 15*/                                   \
}

// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
                                                     register vector unsigned char y,
                                                     register vector unsigned char a) {

    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}
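
/* vec_subs saturates at zero, so one of the two differences is 0 and the
 * other is |x-y|; OR-ing them therefore yields the absolute difference
 * without a compare-and-select. */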

static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
                                                       register vector unsigned char p1,
                                                       register vector unsigned char q0,
                                                       register vector unsigned char q1,
                                                       register vector unsigned char alpha,
                                                       register vector unsigned char beta) {

    register vector unsigned char mask;
    register vector unsigned char tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = vec_and(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = vec_and(mask, tempmask);

    return mask;
}

// out: p1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
#define h264_deblock_q1(p0, p1, p2, q0, tc0) {                         \
                                                                       \
    register vector unsigned char average = vec_avg(p0, q0);           \
    register vector unsigned char temp;                                \
    register vector unsigned char unclipped;                           \
    register vector unsigned char ones;                                \
    register vector unsigned char max;                                 \
    register vector unsigned char min;                                 \
                                                                       \
    temp = vec_xor(average, p2);                                       \
    average = vec_avg(average, p2);      /* avg(p2, avg(p0, q0)) */    \
    ones = vec_splat_u8(1);                                            \
    temp = vec_and(temp, ones);          /* (p2^avg(p0, q0)) & 1 */    \
    unclipped = vec_subs(average, temp); /* (p2+((p0+q0+1)>>1))>>1 */  \
    max = vec_adds(p1, tc0);                                           \
    min = vec_subs(p1, tc0);                                           \
    p1 = vec_max(min, unclipped);                                      \
    p1 = vec_min(max, p1);                                             \
}
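
/* vec_avg computes (a+b+1)>>1, i.e. it rounds up. Subtracting
 * (p2 ^ avg(p0,q0)) & 1 removes the extra rounding bit of the second
 * average, turning the result into the floor value
 * (p2 + ((p0+q0+1)>>1)) >> 1 required by the formula above. */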

#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) {                                           \
                                                                                                  \
    const vec_u8_t A0v = (vec_u8_t) AVV(0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,                  \
                                        0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0,0xA0);                 \
                                                                                                  \
    register vector unsigned char pq0bit = vec_xor(p0,q0);                                        \
    register vector unsigned char temp;                                                           \
    register vector unsigned char q1minus;                                                        \
    register vector unsigned char p0minus;                                                        \
    register vector unsigned char stage1;                                                         \
    register vector unsigned char stage2;                                                         \
    register vector unsigned char vec160;                                                         \
    register vector unsigned char delta;                                                          \
    register vector unsigned char deltaneg;                                                       \
                                                                                                  \
    temp = (vector unsigned char)vec_cmpeq(p0, p0);                                               \
    q1minus = vec_xor(temp, q1);               /* 255 - q1 */                                     \
    stage1 = vec_avg(p1, q1minus);             /* (p1 - q1 + 256)>>1 */                           \
    stage2 = vec_sr(stage1, vec_splat_u8(1));  /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */     \
    p0minus = vec_xor(temp, p0);               /* 255 - p0 */                                     \
    stage1 = vec_avg(q0, p0minus);             /* (q0 - p0 + 256)>>1 */                           \
    pq0bit = vec_and(pq0bit, vec_splat_u8(1));                                                    \
    stage2 = vec_avg(stage2, pq0bit);          /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
    stage2 = vec_adds(stage2, stage1);         /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */  \
    vec160 = vec_ld(0, &A0v);                                                                     \
    deltaneg = vec_subs(vec160, stage2);       /* -d */                                           \
    delta = vec_subs(stage2, vec160);          /* d */                                            \
    deltaneg = vec_min(tc0masked, deltaneg);                                                      \
    delta = vec_min(tc0masked, delta);                                                            \
    p0 = vec_subs(p0, deltaneg);                                                                  \
    q0 = vec_subs(q0, delta);                                                                     \
    p0 = vec_adds(p0, delta);                                                                     \
    q0 = vec_adds(q0, deltaneg);                                                                  \
}

#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) {             \
    DECLARE_ALIGNED_16(unsigned char, temp[16]);                                              \
    register vector unsigned char alphavec;                                                   \
    register vector unsigned char betavec;                                                    \
    register vector unsigned char mask;                                                       \
    register vector unsigned char p1mask;                                                     \
    register vector unsigned char q1mask;                                                     \
    register vector signed char tc0vec;                                                       \
    register vector unsigned char finaltc0;                                                   \
    register vector unsigned char tc0masked;                                                  \
                                                                                              \
    temp[0] = alpha;                                                                          \
    temp[1] = beta;                                                                           \
    alphavec = vec_ld(0, temp);                                                               \
    betavec = vec_splat(alphavec, 0x1);                                                       \
    alphavec = vec_splat(alphavec, 0x0);                                                      \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /* if in block */            \
                                                                                              \
    *((int *)temp) = *((int *)tc0);                                                           \
    tc0vec = vec_ld(0, (signed char*)temp);                                                   \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                      \
    tc0vec = vec_mergeh(tc0vec, tc0vec);                                                      \
    mask = vec_and(mask, (vector unsigned char)vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask);      /* tc = tc0[i] */            \
                                                                                              \
    p1mask = diff_lt_altivec(p2, p0, betavec);                                                \
    p1mask = vec_and(p1mask, mask);                              /* if ( |p2 - p0| < beta) */ \
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec);                                \
    finaltc0 = vec_sub(finaltc0, p1mask);                        /* tc++ */                   \
    h264_deblock_q1(p0, p1, p2, q0, tc0masked);                                               \
                                                                                              \
    q1mask = diff_lt_altivec(q2, q0, betavec);                                                \
    q1mask = vec_and(q1mask, mask);                              /* if ( |q2 - q0| < beta ) */\
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec);                                \
    finaltc0 = vec_sub(finaltc0, q1mask);                        /* tc++ */                   \
    h264_deblock_q1(p0, q1, q2, q0, tc0masked);                                               \
                                                                                              \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0);                                             \
}

static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}

static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    register vector unsigned char line0, line1, line2, line3, line4, line5;
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;
    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
    transpose4x16(line1, line2, line3, line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}
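
/* The horizontal filter reuses the vertical one by working in a transposed
 * frame: 16 rows of the 6 relevant columns (p2..q2) are read and transposed,
 * filtered exactly as above, and only the four modified vectors
 * (p1, p0, q0, q1) are transposed back and written out. */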

void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
        c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
        c->h264_idct_add = ff_h264_idct_add_altivec;
        c->h264_idct8_add = ff_h264_idct8_add_altivec;
        c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec;
        c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_altivec;

#define dspfunc(PFX, IDX, NUM) \
        c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
        c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
        c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
        c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
        c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
        c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
        c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
        c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
        c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
        c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
        c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
        c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
        c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
        c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
        c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
        c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec

        dspfunc(put_h264_qpel, 0, 16);
        dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc

    } else
#endif /* HAVE_ALTIVEC */

// Non-AltiVec PPC optimisations