/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
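
/* ASSERT_ALIGNED() is a debug-only check that a pointer is 16-byte aligned;
 * vec_ld()/vec_st() silently drop the low address bits, so a misaligned dst
 * would be read/written at the wrong (truncated) address rather than fault. */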

/* this code assumes that stride % 16 == 0 */
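
/*
 * The macro below vectorizes one row of the 8-wide H.264 chroma MC.  A rough
 * scalar sketch of the arithmetic it performs (illustrative only):
 *
 *     A = (8 - x) * (8 - y);   B = x * (8 - y);
 *     C = (8 - x) * y;         D = x * y;
 *     pix[i] = (A * src[i] + B * src[i + 1] +
 *               C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
 *
 * BIAS1 is the constant folded into the first multiply-accumulate (32 for the
 * standard H.264 rounding), BIAS2 is an extra step applied to the accumulated
 * sum (a no-op for H.264), and v6us holds the final shift count of 6.  The
 * result is combined with the destination through OP_U8_ALTIVEC, which the
 * including file defines as either put or avg.
 */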

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
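
/* The _SIMPLE variant below covers the degenerate cases where x == 0 or
 * y == 0: two of the four weights vanish and the interpolation collapses to
 * two taps, vA and vE = vB + vC (set up by the callers further down). */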

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)
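
/* The H.264 functions instantiate the core as CHROMA_MC8_ALTIVEC_CORE(v32ss,
 * noop): a bias of 32 and no extra step.  The VC-1/WMV3 no-rounding variant
 * uses CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) instead. */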

#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
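
    /* Each row only needs 9 consecutive source bytes.  If src falls in the
     * first half of its 16-byte block (loadSecond == 0) a single aligned
     * vec_ld() covers them; otherwise a second load supplies the tail.  When
     * src % 16 == 15 (reallyBadAlign) the +1 permute would come entirely from
     * that second vector, which is then used directly. */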

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
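
/* Same interpolation as above, but with the VC-1/WMV3 no-rounding bias:
 * see the add28() instantiation of the core macro below. */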
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
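
/*
 * Horizontal luma half-pel filter.  Per output pixel this is the standard
 * H.264 6-tap (1, -5, 20, 20, -5, 1) kernel; roughly, in scalar form:
 *
 *     dst[i] = av_clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
 *                             - 5*src[i+2] + src[i+3] + 16) >> 5);
 *
 * The vector code builds the three symmetric pair sums (P0+P1, M1+P2, M2+P3),
 * scales them with v20ss/v5ss, adds the rounding constant v16ss, shifts right
 * by v5us and lets vec_packsu() do the unsigned saturation.
 */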
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
        src += srcStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
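
/*
 * Vertical counterpart of the filter above: the same 6-tap kernel is applied
 * down each column.  Five unpacked source rows (M2, M1, P0, P1, P2) stay in
 * registers and only one new row (P3) is loaded per iteration.
 */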
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
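
    /* Each iteration consumes the five rows unpacked above plus the freshly
     * loaded P3, then slides the window down one row (M2 <- M1, M1 <- P0,
     * ..., P2 <- P3) so only one new row has to be read per output row. */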

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
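
/*
 * Combined horizontal + vertical half-pel filter.  Pass 1 runs the horizontal
 * 6-tap filter over 21 rows (16 output rows plus two above and three below)
 * and stores the unclipped 16-bit intermediates to tmp.  Pass 2 applies the
 * same kernel vertically to tmp; those products no longer fit in 16 bits, so
 * they are widened to 32 bits, rounded with +512 (v512si) and shifted right
 * by 10 (v10ui) before saturating back to bytes.
 */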
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
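
    /* Second pass: the same 6-tap kernel is applied vertically to the 16-bit
     * intermediates just written to tmp, read back through tmpbis. */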

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
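
        /* The vertical products do not fit in 16 bits, so each pair sum is
         * widened to 32-bit even/odd lanes: vec_mule()/vec_mulo() multiply by
         * 20 and 5 while widening, and sum3 is widened without a multiply via
         * the shift-by-16 reinterpretation (v16ui) for one set of lanes and a
         * multiply by 1 (v1ss) for the other.  The halves are re-interleaved
         * at the end with the mperm shuffle. */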
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);