/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/util_altivec.h"

#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));
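
/*
 * load_alignment() fills srcM2 .. srcP3 with six byte-shifted views of a
 * 16-pixel row (offsets -2 .. +3 around the current pixel), which is what
 * the 6-tap filters below consume.  On big-endian AltiVec, vec_ld ignores
 * the low four address bits, so each view is assembled from aligned loads
 * combined with vec_perm; the switch on the row alignment decides when a
 * third aligned load (srcR3) is needed and when a view coincides exactly
 * with an aligned vector.  On little-endian, VSX has unaligned loads.
 */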
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
 }
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
 }
#endif /* HAVE_BIGENDIAN */
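
/*
 * The three lowpass functions below implement the H.264 6-tap luma
 * half-pel filter with taps (1, -5, 20, 20, -5, 1).  For the horizontal
 * and vertical cases the scalar reference is roughly:
 *
 *     dst[x] = av_clip_uint8((20 * (P0 + P1) - 5 * (M1 + P2)
 *                             + (M2 + P3) + 16) >> 5);
 *
 * For the hv case the unrounded horizontal result is kept in 16-bit tmp[]
 * and the vertical pass rounds once with (+512) >> 10.
 */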

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
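
    /*
     * Each iteration filters one 16-pixel row: the bytes are zero-extended
     * into two vectors of eight 16-bit samples (A = first eight pixels,
     * B = last eight), filtered, then packed back with unsigned saturation.
     */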

    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
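
        /* psum = 20 * sum1 - 5 * sum2 + sum3 + 16, then >> 5 with unsigned
         * saturation: the (1, -5, 20, 20, -5, 1) tap filter with half-pel
         * rounding. */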
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
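
    /*
     * The vertical filter keeps a sliding window of six rows in registers:
     * the five rows loaded above are rotated down each iteration, and only
     * the new bottom row (srcP3) is loaded inside the loop.
     */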

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* rotate the six-row window down by one row */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
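
    /*
     * Two passes: the horizontal 6-tap filter first writes unrounded
     * 16-bit intermediates to tmp[], then the vertical pass filters tmp[]
     * and applies the single (+512) >> 10 rounding.  mperm re-interleaves
     * the even/odd results produced by the second pass.
     */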
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) { /* 16 output rows + 5 extra for the vertical filter */
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
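
    /*
     * Second pass: vertical 6-tap filter over the 16-bit intermediates.
     * Here 20 * sum1 can overflow 16 bits, so the products are split into
     * even/odd 32-bit lanes with vec_mule/vec_mulo before the rounding
     * (+512) and the final >> 10.
     */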

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* rotate the six-row window of intermediates down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
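
        /*
         * vec_mule/vec_mulo give the even/odd 16x16 -> 32-bit products.
         * For the tap-1 term (sum3), the odd lanes come from vec_mulo and
         * the even lanes from an arithmetic shift right by 16 (a halfword
         * swap is needed first on little-endian, where the lane layout
         * within each word is reversed).
         */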
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A, vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B, vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */