/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <assert.h> /* for ASSERT_ALIGNED in DEBUG builds */

#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
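
/*
 * This file is a template: it defines no functions under fixed names.
 * The including file is expected to define OP_U8_ALTIVEC (how a computed
 * pixel vector is combined with what is already in dst) and the
 * PREFIX_h264_qpel16_* macros naming each emitted function, then #include
 * this file once per variant. A minimal sketch of the assumed
 * instantiation (macro names as used by libavcodec/ppc/h264qpel.c; the
 * exact set may differ between FFmpeg versions):
 *
 *   #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
 *   #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)
 *
 *   #define OP_U8_ALTIVEC                         PUT_OP_U8_ALTIVEC
 *   #define PREFIX_h264_qpel16_h_lowpass_altivec  put_h264_qpel16_h_lowpass_altivec
 *   #define PREFIX_h264_qpel16_v_lowpass_altivec  put_h264_qpel16_v_lowpass_altivec
 *   #define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
 *   #include "h264qpel_template.c"
 *   ... and again with the AVG_* definitions for the avg_* variants.
 */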
#ifdef DEBUG
/* abort in DEBUG builds if ptr is not 16-byte aligned */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
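
/*
 * load_alignment fills srcM2..srcP3 with the six byte-shifted source
 * vectors src[-2..13] through src[3..18] needed by the 6-tap filter.
 * Big-endian AltiVec has no unaligned vector load, so two aligned vec_ld
 * are combined with vec_perm using masks precomputed by vec_lvsl. The
 * switch on the alignment of src-2 handles the cases 11..15 where a
 * shifted vector starts exactly at, or beyond, the second aligned load:
 * there the vec_lvsl mask wraps around, so that vector is taken directly
 * as srcR2, and the later ones are permuted from srcR2:srcR3 instead.
 */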
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
 }
#else
/* little-endian implies VSX, which has true unaligned vector loads */
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld( 0, s);\
    srcP1 = vec_vsx_ld( 1, s);\
    srcP2 = vec_vsx_ld( 2, s);\
    srcP3 = vec_vsx_ld( 3, s);\
 }
#endif /* HAVE_BIGENDIAN */
/* this code assumes stride % 16 == 0 */
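/*
 * Horizontal half-pel interpolation for a 16x16 block, per the H.264 luma
 * 6-tap filter with coefficients (1, -5, 20, 20, -5, 1). For each pixel,
 *   dst[x] = clip(((src[x-2] + src[x+3])
 *                  + 20*(src[x]   + src[x+1])
 *                  -  5*(src[x-1] + src[x+2]) + 16) >> 5)
 * computed below on the two halves (A/B) of each 16-pixel row widened to
 * 16-bit; v20ss/v5ss are the filter taps, v16ss the rounding term, and
 * vec_packsu performs the final clip to [0,255].
 */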
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;
#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        /* widen the six source vectors from u8 to s16 (high/low halves) */
        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        /* pair the taps that share a coefficient */
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */
/* this code assumes stride % 16 == 0 */
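/*
 * Vertical half-pel interpolation: the same (1, -5, 20, 20, -5, 1) filter
 * applied down each column. The six input rows are kept in registers as a
 * sliding window: each iteration loads only one new row (srcP3) and shifts
 * the earlier rows down one position, so every source row is loaded once.
 */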
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    /* prime the window with the first five rows, one row per load */
    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down by one row */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
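/*
 * Combined horizontal+vertical half-pel interpolation. Pass 1 runs the
 * horizontal 6-tap filter over 16+5 = 21 rows and stores the unrounded
 * 16-bit intermediates in tmp[]. Pass 2 runs the vertical filter over
 * tmp[] in 32-bit precision and rounds once at the end:
 *   dst = clip((vert_filter(tmp) + 512) >> 10)
 * matching the H.264 rule that the 2D half-pel sample is normalized with
 * a single (1 << 10) rounding step.
 */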
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); /* rounding term for >> 10 */
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));  /* shift count, see sum3 below */
    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    /* reinterleaves the even/odd results produced by vec_mule/vec_mulo */
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;
#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
    src -= (2 * srcStride);
    /* pass 1: horizontal filter over 21 rows, 16-bit results into tmp */
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
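
    /*
     * Pass 2: vertical 6-tap filter over the 16-bit intermediates in tmp,
     * carried out in 32-bit precision. First prime the six-row sliding
     * window with the first five rows of tmp.
     */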
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
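
        /*
         * vec_mule/vec_mulo yield the 32-bit products of the even and odd
         * 16-bit lanes respectively, so each A/B half proceeds as two
         * interleaved streams, reassembled after packing via mperm.
         */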
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
        /* sign-extend the even 16-bit lanes by shifting the 32-bit view */
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */