2 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
4 * This file is part of Libav.
6 * Libav is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * Libav is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with Libav; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
/*
 * Debug helper for checking 16-byte alignment of a pointer; the second
 * definition expands to an empty statement (checking disabled).
 * NOTE(review): the #ifdef/#else/#endif lines that normally select between
 * these two definitions are missing from this excerpt -- as shown, the
 * second #define silently redefines the first.  Confirm against the full
 * file.
 * NOTE(review): the assert condition as written is true only when the low
 * four address bits are NON-zero, i.e. it passes for misaligned pointers --
 * presumably it should compare against zero; verify upstream before use.
 */
22 #define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F));
24 #define ASSERT_ALIGNED(ptr) ;
27 /* this code assume that stride % 16 == 0 */
/*
 * CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) -- per-row body of the 8-pixel-wide
 * chroma motion-compensation loops below.  It:
 *   - zero-extends the low 8 bytes of the freshly loaded rows vsrc2uc and
 *     vsrc3uc into 16-bit lanes (vec_mergeh with zero_u8v),
 *   - accumulates the four bilinear taps vA..vD via vec_mladd, starting
 *     from the rounding bias BIAS1,
 *   - scales the sum down by >> 6 (v6us),
 *   - packs back to bytes and merges the 8 result bytes into the aligned
 *     16-byte destination vector through the fperm permute, so only the
 *     addressed half of dst is replaced,
 *   - applies OP_U8_ALTIVEC (supplied by the including template file --
 *     presumably put vs. avg; confirm there) and stores.
 * NOTE(review): BIAS2 is not referenced in the lines visible here; in the
 * complete sources it post-processes psum (cf. add28 below).  The
 * continuation lines that use it, and the trailing dst/src advances, appear
 * to have been dropped from this excerpt -- confirm against the full file.
 */
29 #define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
30         vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
31         vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
33         psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
34         psum = vec_mladd(vB, vsrc1ssH, psum);\
35         psum = vec_mladd(vC, vsrc2ssH, psum);\
36         psum = vec_mladd(vD, vsrc3ssH, psum);\
38         psum = vec_sr(psum, v6us);\
40         vdst = vec_ld(0, dst);\
41         ppsum = (vec_u8)vec_pack(psum, psum);\
42         vfdst = vec_perm(vdst, ppsum, fperm);\
44         OP_U8_ALTIVEC(fsum, vfdst, vdst);\
46         vec_st(fsum, 0, dst);\
/*
 * CHROMA_MC8_ALTIVEC_CORE_SIMPLE -- two-tap degenerate variant of the core
 * above, used by the callers below when x == 0 or y == 0 (one pair of
 * bilinear weights collapses; the two remaining taps are vA and the
 * precombined vE = vB + vC).  Same bias (+32 via v32ss), >> 6 scale,
 * pack/permute/OP/store sequence as CHROMA_MC8_ALTIVEC_CORE.
 * NOTE(review): the trailing continuation lines (dst/src row advances) are
 * not visible in this excerpt -- confirm against the full file.
 */
54 #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
56         vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
57         vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
59         psum = vec_mladd(vA, vsrc0ssH, v32ss);\
60         psum = vec_mladd(vE, vsrc1ssH, psum);\
61         psum = vec_sr(psum, v6us);\
63         vdst = vec_ld(0, dst);\
64         ppsum = (vec_u8)vec_pack(psum, psum);\
65         vfdst = vec_perm(vdst, ppsum, fperm);\
67         OP_U8_ALTIVEC(fsum, vfdst, vdst);\
69         vec_st(fsum, 0, dst);\
/*
 * add28(a): adds the splatted constant 28 held in v28ss (set up as
 * (1 << 5) - 4 by the no-rnd VC-1 function below) to vector a.  Passed as
 * the BIAS2 hook of CHROMA_MC8_ALTIVEC_CORE to get the "no rounding"
 * variant (+28 instead of +32 before the >> 6).
 */
75 #define add28(a) vec_add(v28ss, a)
77 #ifdef PREFIX_h264_chroma_mc8_altivec
/*
 * 8-pixel-wide chroma motion compensation, h rows, with bilinear subpel
 * weights derived from (x, y).  Assumes stride % 16 == 0 (see comment at
 * the top of the core macro).  Handles arbitrary src alignment via
 * vec_lvsl permutes, and either half-aligned dst via the fperm merge.
 * NOTE(review): this excerpt is truncated -- the ABCD[] initializer body
 * (presumably {(8-x)*(8-y), (8-y)*x, (8-x)*y, x*y} as in the VC-1 variant
 * below), the declarations of fperm, the loop counter i and vsrcDuc,
 * several else/closing-brace lines and the matching #endif are missing.
 * Confirm every such gap against the complete file before editing.
 */
78 static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
79                                     int stride, int h, int x, int y) {
80     DECLARE_ALIGNED(16, signed int, ABCD)[4] =
    /* Splat the four 16-bit weights out of the aligned ABCD table; on
     * big-endian the useful short is the odd lane of each 32-bit entry. */
87     const vec_s32 vABCD = vec_ld(0, ABCD);
88     const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
89     const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
90     const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
91     const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    /* v32ss = 32, the rounding bias; v6us = 6, the final shift. */
93     const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
94     const vec_u16 v6us = vec_splat_u16(6);
    /* How far into a 16-byte chunk src starts decides whether one aligned
     * load covers the 9 bytes needed per row or a second load is needed. */
95     register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
96     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
98     vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
99     vec_u8 vsrc0uc, vsrc1uc;
100     vec_s16 vsrc0ssH, vsrc1ssH;
101     vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
102     vec_s16 vsrc2ssH, vsrc3ssH, psum;
103     vec_u8 vdst, ppsum, vfdst, fsum;
    /* Choose the merge permute so the 8 result bytes land in whichever
     * half of the 16-byte dst vector is actually addressed. */
105     if (((unsigned long)dst) % 16 == 0) {
106         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
107                          0x14, 0x15, 0x16, 0x17,
108                          0x08, 0x09, 0x0A, 0x0B,
109                          0x0C, 0x0D, 0x0E, 0x0F};
    /* NOTE(review): the else-line between the two initializers is missing
     * from this excerpt. */
111         fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
112                          0x04, 0x05, 0x06, 0x07,
113                          0x18, 0x19, 0x1A, 0x1B,
114                          0x1C, 0x1D, 0x1E, 0x1F};
    /* Prime the two-row sliding window (rows 0 and 1) for iteration 0. */
117     vsrcAuc = vec_ld(0, src);
120         vsrcBuc = vec_ld(16, src);
121     vsrcperm0 = vec_lvsl(0, src);
122     vsrcperm1 = vec_lvsl(1, src);
124     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
128         vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
130     vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
131     vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
    /* General (x != 0 && y != 0) four-tap path. */
134     if (!loadSecond) {// -> !reallyBadAlign
135         for (i = 0 ; i < h ; i++) {
136             vsrcCuc = vec_ld(stride + 0, src);
137             vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
138             vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
140             CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
    /* Badly aligned src: each row straddles two 16-byte chunks, so load
     * both.  NOTE(review): the else-header for this branch is missing. */
144         for (i = 0 ; i < h ; i++) {
145             vsrcCuc = vec_ld(stride + 0, src);
146             vsrcDuc = vec_ld(stride + 16, src);
147             vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
151                 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
153             CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
    /* Degenerate subpel case (x == 0 or y == 0): only two taps survive,
     * combined into vE.  NOTE(review): the enclosing else-header is not
     * visible in this excerpt. */
157         const vec_s16 vE = vec_add(vB, vC);
158         if (ABCD[2]) { // x == 0 B == 0
159             if (!loadSecond) {// -> !reallyBadAlign
160                 for (i = 0 ; i < h ; i++) {
161                     vsrcCuc = vec_ld(stride + 0, src);
162                     vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
163                     CHROMA_MC8_ALTIVEC_CORE_SIMPLE
169                 for (i = 0 ; i < h ; i++) {
170                     vsrcCuc = vec_ld(stride + 0, src);
171                     vsrcDuc = vec_ld(stride + 15, src);
172                     vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
173                     CHROMA_MC8_ALTIVEC_CORE_SIMPLE
178     } else { // y == 0 C == 0
179         if (!loadSecond) {// -> !reallyBadAlign
180             for (i = 0 ; i < h ; i++) {
181                 vsrcCuc = vec_ld(0, src);
182                 vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
183                 vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
185                 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
189             for (i = 0 ; i < h ; i++) {
190                 vsrcCuc = vec_ld(0, src);
191                 vsrcDuc = vec_ld(15, src);
192                 vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
196                     vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
198                 CHROMA_MC8_ALTIVEC_CORE_SIMPLE
206 /* this code assume that stride % 16 == 0 */
207 #ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
/*
 * VC-1 "no rounding" 8-pixel-wide chroma MC: same bilinear structure as
 * PREFIX_h264_chroma_mc8_altivec above, but the core is invoked with a
 * zero initial bias and the add28 hook, i.e. a +28 rounding constant
 * (v28ss = 32 - 4) instead of +32 before the >> 6.
 * NOTE(review): truncated excerpt -- the rest of the ABCD[] initializer,
 * the declarations of fperm, i and vsrcDuc, the else lines, closing braces
 * and the matching #endif are missing.  Confirm against the complete file.
 */
208 static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
209    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
210                        {((8 - x) * (8 - y)),
    /* Splat the four 16-bit bilinear weights from the aligned table. */
216     const vec_s32 vABCD = vec_ld(0, ABCD);
217     const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
218     const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
219     const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
220     const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    /* v28ss = (1 << 5) - 4 = 28, consumed through the add28 macro. */
222     const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
223     const vec_u16 v6us  = vec_splat_u16(6);
224     register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
225     register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
227     vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
228     vec_u8 vsrc0uc, vsrc1uc;
229     vec_s16 vsrc0ssH, vsrc1ssH;
230     vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
231     vec_s16 vsrc2ssH, vsrc3ssH, psum;
232     vec_u8 vdst, ppsum, vfdst, fsum;
    /* Merge permute: place the 8 output bytes in the addressed dst half. */
234     if (((unsigned long)dst) % 16 == 0) {
235         fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
236                          0x14, 0x15, 0x16, 0x17,
237                          0x08, 0x09, 0x0A, 0x0B,
238                          0x0C, 0x0D, 0x0E, 0x0F};
240         fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
241                          0x04, 0x05, 0x06, 0x07,
242                          0x18, 0x19, 0x1A, 0x1B,
243                          0x1C, 0x1D, 0x1E, 0x1F};
    /* Prime rows 0 and 1 of the sliding window. */
246     vsrcAuc = vec_ld(0, src);
249         vsrcBuc = vec_ld(16, src);
250     vsrcperm0 = vec_lvsl(0, src);
251     vsrcperm1 = vec_lvsl(1, src);
253     vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
257         vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
259     vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
260     vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
262     if (!loadSecond) {// -> !reallyBadAlign
263         for (i = 0 ; i < h ; i++) {
266             vsrcCuc = vec_ld(stride + 0, src);
268             vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
269             vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
271             CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
    /* Misaligned src path: two loads per row.  NOTE(review): the
     * else-header for this branch is missing from the excerpt. */
275         for (i = 0 ; i < h ; i++) {
276             vsrcCuc = vec_ld(stride + 0, src);
277             vsrcDuc = vec_ld(stride + 16, src);
279             vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
283                 vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
285             CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
293 #undef CHROMA_MC8_ALTIVEC_CORE
295 /* this code assume stride % 16 == 0 */
296 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
/*
 * Horizontal 6-tap lowpass over a 16-pixel-wide block, 16 rows:
 * per pixel the visible arithmetic computes
 *     satu8( (20*(P0+P1) + (M2+P3) + 16 - 5*(M1+P2)) >> 5 )
 * i.e. the H.264 half-pel filter taps [1 -5 20 20 -5 1] with +16 rounding
 * and >> 5, saturating to [0,255] via vec_packsu, then OP_U8_ALTIVEC
 * (supplied by the including template) and an aligned store.
 * NOTE(review): truncated excerpt -- the switch/if headers that pick one
 * of the alignment-dependent permute sequences (only their bodies remain
 * as the repeated vec_perm runs below), the declaration of i, the per-row
 * src/dst advances, closing braces and #endif are all missing.  Confirm
 * against the complete file before editing.
 */
297 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    /* Permutes to extract the six taps src[-2..+3] from the raw loads. */
301     const vec_u8 permM2 = vec_lvsl(-2, src);
302     const vec_u8 permM1 = vec_lvsl(-1, src);
303     const vec_u8 permP0 = vec_lvsl(+0, src);
304     const vec_u8 permP1 = vec_lvsl(+1, src);
305     const vec_u8 permP2 = vec_lvsl(+2, src);
306     const vec_u8 permP3 = vec_lvsl(+3, src);
307     const vec_s16 v5ss = vec_splat_s16(5);
308     const vec_u16 v5us = vec_splat_u16(5);
309     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
310     const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
312     vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    /* Alignment of src-2 within its 16-byte chunk selects how many extra
     * loads the six shifted views need. */
314     register int align = ((((unsigned long)src) - 2) % 16);
316     vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
317               srcP2A, srcP2B, srcP3A, srcP3B,
318               srcM1A, srcM1B, srcM2A, srcM2B,
319               sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
320               pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
321               psumA, psumB, sumA, sumB;
323     vec_u8 sum, vdst, fsum;
    /* One iteration per output row. */
325     for (i = 0 ; i < 16 ; i ++) {
326         vec_u8 srcR1 = vec_ld(-2, src);
327         vec_u8 srcR2 = vec_ld(14, src);
    /* NOTE(review): the following repeated vec_perm runs are the bodies of
     * the alignment cases whose case/if labels were lost in this excerpt;
     * each case derives the six taps from srcR1/srcR2 (and srcR3 where the
     * window spills into a third chunk). */
331             srcM2 = vec_perm(srcR1, srcR2, permM2);
332             srcM1 = vec_perm(srcR1, srcR2, permM1);
333             srcP0 = vec_perm(srcR1, srcR2, permP0);
334             srcP1 = vec_perm(srcR1, srcR2, permP1);
335             srcP2 = vec_perm(srcR1, srcR2, permP2);
336             srcP3 = vec_perm(srcR1, srcR2, permP3);
339             srcM2 = vec_perm(srcR1, srcR2, permM2);
340             srcM1 = vec_perm(srcR1, srcR2, permM1);
341             srcP0 = vec_perm(srcR1, srcR2, permP0);
342             srcP1 = vec_perm(srcR1, srcR2, permP1);
343             srcP2 = vec_perm(srcR1, srcR2, permP2);
347             vec_u8 srcR3 = vec_ld(30, src);
348             srcM2 = vec_perm(srcR1, srcR2, permM2);
349             srcM1 = vec_perm(srcR1, srcR2, permM1);
350             srcP0 = vec_perm(srcR1, srcR2, permP0);
351             srcP1 = vec_perm(srcR1, srcR2, permP1);
353             srcP3 = vec_perm(srcR2, srcR3, permP3);
356             vec_u8 srcR3 = vec_ld(30, src);
357             srcM2 = vec_perm(srcR1, srcR2, permM2);
358             srcM1 = vec_perm(srcR1, srcR2, permM1);
359             srcP0 = vec_perm(srcR1, srcR2, permP0);
361             srcP2 = vec_perm(srcR2, srcR3, permP2);
362             srcP3 = vec_perm(srcR2, srcR3, permP3);
365             vec_u8 srcR3 = vec_ld(30, src);
366             srcM2 = vec_perm(srcR1, srcR2, permM2);
367             srcM1 = vec_perm(srcR1, srcR2, permM1);
369             srcP1 = vec_perm(srcR2, srcR3, permP1);
370             srcP2 = vec_perm(srcR2, srcR3, permP2);
371             srcP3 = vec_perm(srcR2, srcR3, permP3);
374             vec_u8 srcR3 = vec_ld(30, src);
375             srcM2 = vec_perm(srcR1, srcR2, permM2);
377             srcP0 = vec_perm(srcR2, srcR3, permP0);
378             srcP1 = vec_perm(srcR2, srcR3, permP1);
379             srcP2 = vec_perm(srcR2, srcR3, permP2);
380             srcP3 = vec_perm(srcR2, srcR3, permP3);
    /* Zero-extend the six taps to 16-bit: A = low 8 pixels, B = high 8. */
384         srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
385         srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
386         srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
387         srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
389         srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
390         srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
391         srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
392         srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
394         srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
395         srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
396         srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
397         srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
    /* Pairwise tap sums: the filter is symmetric, so matching taps share
     * a coefficient (20, -5, 1 respectively). */
399         sum1A = vec_adds(srcP0A, srcP1A);
400         sum1B = vec_adds(srcP0B, srcP1B);
401         sum2A = vec_adds(srcM1A, srcP2A);
402         sum2B = vec_adds(srcM1B, srcP2B);
403         sum3A = vec_adds(srcM2A, srcP3A);
404         sum3B = vec_adds(srcM2B, srcP3B);
    /* 20*(P0+P1) + 16, then subtract 5*(M1+P2), add (M2+P3), >> 5. */
406         pp1A = vec_mladd(sum1A, v20ss, v16ss);
407         pp1B = vec_mladd(sum1B, v20ss, v16ss);
409         pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
410         pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
412         pp3A = vec_add(sum3A, pp1A);
413         pp3B = vec_add(sum3B, pp1B);
415         psumA = vec_sub(pp3A, pp2A);
416         psumB = vec_sub(pp3B, pp2B);
418         sumA = vec_sra(psumA, v5us);
419         sumB = vec_sra(psumB, v5us);
    /* Saturating pack to unsigned bytes clamps the result to [0, 255]. */
421         sum = vec_packsu(sumA, sumB);
424         vdst = vec_ld(0, dst);
426         OP_U8_ALTIVEC(fsum, sum, vdst);
428         vec_st(fsum, 0, dst);
436 /* this code assume stride % 16 == 0 */
437 #ifdef PREFIX_h264_qpel16_v_lowpass_altivec
/*
 * Vertical 6-tap lowpass over a 16-pixel-wide block, 16 rows: same
 * [1 -5 20 20 -5 1] arithmetic as the horizontal variant (+16, >> 5,
 * saturate), but the six taps are whole rows srcM2..srcP3.  The first five
 * rows are loaded up front; each iteration loads only the new row srcP3.
 * NOTE(review): truncated excerpt -- the declarations of i and
 * srcP3ssA/srcP3ssB, the end-of-loop window rotation (shifting the six
 * row registers up) and dst advance, closing braces and #endif are not
 * visible here.  Confirm against the complete file before editing.
 */
438 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
442     const vec_u8 perm = vec_lvsl(0, src);
443     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
444     const vec_u16 v5us = vec_splat_u16(5);
445     const vec_s16 v5ss = vec_splat_s16(5);
446     const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
    /* Rewind two rows so the window is centred on the output row. */
448     uint8_t *srcbis = src - (srcStride * 2);
    /* Preload rows -2 .. +2; each needs two aligned loads plus a permute
     * because src is not necessarily 16-byte aligned. */
450     const vec_u8 srcM2a = vec_ld(0, srcbis);
451     const vec_u8 srcM2b = vec_ld(16, srcbis);
452     const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
453     //srcbis += srcStride;
454     const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
455     const vec_u8 srcM1b = vec_ld(16, srcbis);
456     const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
457     //srcbis += srcStride;
458     const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
459     const vec_u8 srcP0b = vec_ld(16, srcbis);
460     const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
461     //srcbis += srcStride;
462     const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
463     const vec_u8 srcP1b = vec_ld(16, srcbis);
464     const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
465     //srcbis += srcStride;
466     const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
467     const vec_u8 srcP2b = vec_ld(16, srcbis);
468     const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
469     //srcbis += srcStride;
    /* Zero-extend the preloaded rows: A = low 8 pixels, B = high 8. */
471     vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
472     vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
473     vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
474     vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
475     vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
476     vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
477     vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
478     vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
479     vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
480     vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
482     vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
483               psumA, psumB, sumA, sumB,
485               sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
487     vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
489     for (i = 0 ; i < 16 ; i++) {
    /* Load only the newest row (+3); the other five come from the
     * sliding window registers. */
490         srcP3a = vec_ld(0, srcbis += srcStride);
491         srcP3b = vec_ld(16, srcbis);
492         srcP3 = vec_perm(srcP3a, srcP3b, perm);
493         srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
494         srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
495         //srcbis += srcStride;
    /* Symmetric tap pairs, coefficients 20, -5, 1 as in the h variant. */
497         sum1A = vec_adds(srcP0ssA, srcP1ssA);
498         sum1B = vec_adds(srcP0ssB, srcP1ssB);
499         sum2A = vec_adds(srcM1ssA, srcP2ssA);
500         sum2B = vec_adds(srcM1ssB, srcP2ssB);
501         sum3A = vec_adds(srcM2ssA, srcP3ssA);
502         sum3B = vec_adds(srcM2ssB, srcP3ssB);
    /* NOTE(review): the window-rotation assignments that normally sit here
     * (srcM2ss <- srcM1ss <- ... <- srcP3ss) are missing from the excerpt. */
515         pp1A = vec_mladd(sum1A, v20ss, v16ss);
516         pp1B = vec_mladd(sum1B, v20ss, v16ss);
518         pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
519         pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
521         pp3A = vec_add(sum3A, pp1A);
522         pp3B = vec_add(sum3B, pp1B);
524         psumA = vec_sub(pp3A, pp2A);
525         psumB = vec_sub(pp3B, pp2B);
527         sumA = vec_sra(psumA, v5us);
528         sumB = vec_sra(psumB, v5us);
    /* Saturating pack clamps to [0, 255] before the read-modify-write. */
530         sum = vec_packsu(sumA, sumB);
533         vdst = vec_ld(0, dst);
535         OP_U8_ALTIVEC(fsum, sum, vdst);
537         vec_st(fsum, 0, dst);
544 /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
545 #ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
/*
 * Combined horizontal+vertical 6-tap lowpass (the "hv" half-pel position),
 * in two passes over a 16-wide block:
 *   pass 1: 21 rows (16 output rows + 5 rows of filter margin; src is
 *           rewound by 2 rows) are filtered horizontally with taps
 *           [1 -5 20 20 -5 1] WITHOUT rounding or shifting, and the 16-bit
 *           intermediates are stored into tmp (32 bytes per row);
 *   pass 2: the same 6-tap filter is applied vertically over tmp at 32-bit
 *           precision (even/odd lane multiplies), then +512 and >> 10,
 *           saturating packs, and an mperm shuffle re-interleaves the
 *           even/odd bytes before OP_U8_ALTIVEC and the aligned store.
 * NOTE(review): truncated excerpt -- the declarations of i, tmpP2ssA/B and
 * srcP3ssA/B (?), the `tmpbis += tmpStride;` advances between the
 * preloads, the switch/if headers of the alignment cases in pass 1, the
 * end-of-loop src/dst/tmpbis advances and window rotations, closing braces
 * and #endif are all missing.  Confirm against the complete file.
 */
546 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    /* Permutes extracting the six horizontal taps src[-2..+3]. */
549     const vec_u8 permM2 = vec_lvsl(-2, src);
550     const vec_u8 permM1 = vec_lvsl(-1, src);
551     const vec_u8 permP0 = vec_lvsl(+0, src);
552     const vec_u8 permP1 = vec_lvsl(+1, src);
553     const vec_u8 permP2 = vec_lvsl(+2, src);
554     const vec_u8 permP3 = vec_lvsl(+3, src);
555     const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
556     const vec_u32 v10ui = vec_splat_u32(10);
557     const vec_s16 v5ss = vec_splat_s16(5);
558     const vec_s16 v1ss = vec_splat_s16(1);
    /* v512si = 512 (pass-2 rounding bias); v16ui = 16 (see the
     * sign-extension trick at pp3Ae below). */
559     const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
560     const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
562     register int align = ((((unsigned long)src) - 2) % 16);
564     vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
565               srcP2A, srcP2B, srcP3A, srcP3B,
566               srcM1A, srcM1B, srcM2A, srcM2B,
567               sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
568               pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    /* Re-interleave even/odd result bytes after the even/odd-lane math. */
570     const vec_u8 mperm = (const vec_u8)
571         {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
572          0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
573     int16_t *tmpbis = tmp;
575     vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
576               tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
579     vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
580               pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
581               pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
582               ssumAe, ssumAo, ssumBe, ssumBo;
583     vec_u8 fsum, sumv, sum, vdst;
584     vec_s16 ssume, ssumo;
    /* ---- pass 1: horizontal filter of 21 rows into tmp ---- */
586     src -= (2 * srcStride);
587     for (i = 0 ; i < 21 ; i ++) {
588         vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
589         vec_u8 srcR1 = vec_ld(-2, src);
590         vec_u8 srcR2 = vec_ld(14, src);
    /* NOTE(review): as in the h-lowpass function, the repeated vec_perm
     * runs below are alignment-case bodies whose case labels were lost. */
594             srcM2 = vec_perm(srcR1, srcR2, permM2);
595             srcM1 = vec_perm(srcR1, srcR2, permM1);
596             srcP0 = vec_perm(srcR1, srcR2, permP0);
597             srcP1 = vec_perm(srcR1, srcR2, permP1);
598             srcP2 = vec_perm(srcR1, srcR2, permP2);
599             srcP3 = vec_perm(srcR1, srcR2, permP3);
602             srcM2 = vec_perm(srcR1, srcR2, permM2);
603             srcM1 = vec_perm(srcR1, srcR2, permM1);
604             srcP0 = vec_perm(srcR1, srcR2, permP0);
605             srcP1 = vec_perm(srcR1, srcR2, permP1);
606             srcP2 = vec_perm(srcR1, srcR2, permP2);
610             vec_u8 srcR3 = vec_ld(30, src);
611             srcM2 = vec_perm(srcR1, srcR2, permM2);
612             srcM1 = vec_perm(srcR1, srcR2, permM1);
613             srcP0 = vec_perm(srcR1, srcR2, permP0);
614             srcP1 = vec_perm(srcR1, srcR2, permP1);
616             srcP3 = vec_perm(srcR2, srcR3, permP3);
619             vec_u8 srcR3 = vec_ld(30, src);
620             srcM2 = vec_perm(srcR1, srcR2, permM2);
621             srcM1 = vec_perm(srcR1, srcR2, permM1);
622             srcP0 = vec_perm(srcR1, srcR2, permP0);
624             srcP2 = vec_perm(srcR2, srcR3, permP2);
625             srcP3 = vec_perm(srcR2, srcR3, permP3);
628             vec_u8 srcR3 = vec_ld(30, src);
629             srcM2 = vec_perm(srcR1, srcR2, permM2);
630             srcM1 = vec_perm(srcR1, srcR2, permM1);
632             srcP1 = vec_perm(srcR2, srcR3, permP1);
633             srcP2 = vec_perm(srcR2, srcR3, permP2);
634             srcP3 = vec_perm(srcR2, srcR3, permP3);
637             vec_u8 srcR3 = vec_ld(30, src);
638             srcM2 = vec_perm(srcR1, srcR2, permM2);
640             srcP0 = vec_perm(srcR2, srcR3, permP0);
641             srcP1 = vec_perm(srcR2, srcR3, permP1);
642             srcP2 = vec_perm(srcR2, srcR3, permP2);
643             srcP3 = vec_perm(srcR2, srcR3, permP3);
    /* Zero-extend the six taps: A = low 8 pixels, B = high 8. */
647         srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
648         srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
649         srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
650         srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
652         srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
653         srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
654         srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
655         srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
657         srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
658         srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
659         srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
660         srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
662         sum1A = vec_adds(srcP0A, srcP1A);
663         sum1B = vec_adds(srcP0B, srcP1B);
664         sum2A = vec_adds(srcM1A, srcP2A);
665         sum2B = vec_adds(srcM1B, srcP2B);
666         sum3A = vec_adds(srcM2A, srcP3A);
667         sum3B = vec_adds(srcM2B, srcP3B);
    /* Unlike the one-pass filters: (M2+P3) is folded into the mladd as
     * the addend, and no bias/shift is applied -- the intermediate stays
     * at full 16-bit precision for pass 2. */
669         pp1A = vec_mladd(sum1A, v20ss, sum3A);
670         pp1B = vec_mladd(sum1B, v20ss, sum3B);
672         pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
673         pp2B = vec_mladd(sum2B, v5ss, zero_s16v);
675         psumA = vec_sub(pp1A, pp2A);
676         psumB = vec_sub(pp1B, pp2B);
    /* tmp is asserted properly aligned, so plain aligned stores work. */
678         vec_st(psumA, 0, tmp);
679         vec_st(psumB, 16, tmp);
682         tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    /* ---- pass 2: vertical filter over tmp in 32-bit precision ----
     * Preload the first five intermediate rows.  NOTE(review): the
     * `tmpbis += tmpStride;` advances between these preloads are missing
     * from this excerpt. */
685     tmpM2ssA = vec_ld(0, tmpbis);
686     tmpM2ssB = vec_ld(16, tmpbis);
688     tmpM1ssA = vec_ld(0, tmpbis);
689     tmpM1ssB = vec_ld(16, tmpbis);
691     tmpP0ssA = vec_ld(0, tmpbis);
692     tmpP0ssB = vec_ld(16, tmpbis);
694     tmpP1ssA = vec_ld(0, tmpbis);
695     tmpP1ssB = vec_ld(16, tmpbis);
697     tmpP2ssA = vec_ld(0, tmpbis);
698     tmpP2ssB = vec_ld(16, tmpbis);
701     for (i = 0 ; i < 16 ; i++) {
702         const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
703         const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
705         const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
706         const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
707         const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
708         const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
709         const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
710         const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
    /* NOTE(review): the window rotation (tmpM2ss <- tmpM1ss <- ...) and
     * tmpbis advance that normally sit here are missing from the excerpt.
     * Widen to 32 bit with separate even/odd-lane multiplies: */
725         pp1Ae = vec_mule(sum1A, v20ss);
726         pp1Ao = vec_mulo(sum1A, v20ss);
727         pp1Be = vec_mule(sum1B, v20ss);
728         pp1Bo = vec_mulo(sum1B, v20ss);
730         pp2Ae = vec_mule(sum2A, v5ss);
731         pp2Ao = vec_mulo(sum2A, v5ss);
732         pp2Be = vec_mule(sum2B, v5ss);
733         pp2Bo = vec_mulo(sum2B, v5ss);
    /* Even lanes of a x1 multiply: arithmetic >> 16 of the s32 view
     * extracts the even-index 16-bit element sign-extended (cheaper than
     * vec_mule with v1ss); odd lanes use vec_mulo with v1ss directly. */
735         pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
736         pp3Ao = vec_mulo(sum3A, v1ss);
737         pp3Be = vec_sra((vec_s32)sum3B, v16ui);
738         pp3Bo = vec_mulo(sum3B, v1ss);
    /* +512 rounding bias, combine terms, then >> 10 final scale. */
740         pp1cAe = vec_add(pp1Ae, v512si);
741         pp1cAo = vec_add(pp1Ao, v512si);
742         pp1cBe = vec_add(pp1Be, v512si);
743         pp1cBo = vec_add(pp1Bo, v512si);
745         pp32Ae = vec_sub(pp3Ae, pp2Ae);
746         pp32Ao = vec_sub(pp3Ao, pp2Ao);
747         pp32Be = vec_sub(pp3Be, pp2Be);
748         pp32Bo = vec_sub(pp3Bo, pp2Bo);
750         sumAe = vec_add(pp1cAe, pp32Ae);
751         sumAo = vec_add(pp1cAo, pp32Ao);
752         sumBe = vec_add(pp1cBe, pp32Be);
753         sumBo = vec_add(pp1cBo, pp32Bo);
755         ssumAe = vec_sra(sumAe, v10ui);
756         ssumAo = vec_sra(sumAo, v10ui);
757         ssumBe = vec_sra(sumBe, v10ui);
758         ssumBo = vec_sra(sumBo, v10ui);
    /* Pack 32->16 (signed saturate), 16->8 (unsigned saturate), then put
     * the even/odd bytes back in pixel order with mperm. */
760         ssume = vec_packs(ssumAe, ssumBe);
761         ssumo = vec_packs(ssumAo, ssumBo);
763         sumv = vec_packsu(ssume, ssumo);
764         sum = vec_perm(sumv, sumv, mperm);
767         vdst = vec_ld(0, dst);
769         OP_U8_ALTIVEC(fsum, sum, vdst);
771         vec_st(fsum, 0, dst);