/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/* this code assumes that stride % 16 == 0 */
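/* 8-wide chroma motion compensation: the 2D bilinear interpolation specified
 * by H.264,
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y. */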
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
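    /* Each row needs 9 source bytes (8 pixels plus one extra for the x tap).
     * loadSecond: set when those bytes may spill past the first aligned
     * 16-byte load, so a second load at offset 16 is required.
     * reallyBadAlign: src % 16 == 15, i.e. vec_lvsl(1, src) would select
     * entirely from the second vector, which is then used directly. */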
    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);
    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v32ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            vfdst = vec_perm(vdst, ppsum, fperm);

            OP_U8_ALTIVEC(fsum, vfdst, vdst);

            vec_st(fsum, 0, dst);

            /* row n+1 of this iteration becomes row n of the next one */
            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v32ss, psum);
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            vfdst = vec_perm(vdst, ppsum, fperm);

            OP_U8_ALTIVEC(fsum, vfdst, vdst);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
/* this code assumes stride % 16 == 0 */
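/* Horizontal 6-tap luma half-pel filter, 16 pixels per row:
 *   dst[i] = clip8((src[i-2] - 5*src[i-1] + 20*src[i] + 20*src[i+1]
 *                   - 5*src[i+2] + src[i+3] + 16) >> 5)
 * The 16 pixels are processed as two vectors of 8 shorts (the A/B halves). */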
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 =
                                (const vector unsigned char) vec_splat_s8(-1);

    const vector unsigned char dstmask =
                                vec_perm((const vector unsigned char)vzero,
                                         neg1, dstperm);
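    /* dst is not necessarily 16-byte aligned: the result is rotated with
     * dstperm (vec_lvsr(0, dst)) and merged into the two vectors covering the
     * destination with vec_sel and dstmask (0x00 before the start offset,
     * 0xFF after), so only the 16 output bytes are modified by the stores. */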
    vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                        srcP2A, srcP2B, srcP3A, srcP3B,
                        srcM1A, srcM1B, srcM2A, srcM2B,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                        pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                        psumA, psumB, sumA, sumB;

    vector unsigned char sum, dst1, dst2, vdst, fsum,
                         rsum, fdst1, fdst2;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);
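        /* Each row reads src-2 .. src+18.  align = (src-2) % 16 selects how
         * the six shifted copies are built: for align <= 10 the loads at
         * offsets -2 and 14 cover every tap; for align >= 11 some taps start
         * at or beyond the second 16-byte block, so they are taken from srcR2
         * directly (vec_lvsl wraps to 0 there) or permuted from srcR2/srcR3,
         * with srcR3 loaded at offset 30. */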
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
        srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
        srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
        srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
        srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
        srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
        srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
        srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
        srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
        srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        src += srcStride;
        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 */
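/* Vertical 6-tap luma half-pel filter: the same (1, -5, 20, 20, -5, 1) kernel
 * applied along the columns.  Six input rows are kept in registers and the
 * window slides down by one row per iteration, so only the new bottom row
 * (srcP3) is loaded each time. */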
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char perm = vec_lvsl(0, src);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vector unsigned short v5us = vec_splat_u16(5);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
    const vector unsigned char dstperm = vec_lvsr(0, dst);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
    uint8_t *srcbis = src - (srcStride * 2);

    const vector unsigned char srcM2a = vec_ld(0, srcbis);
    const vector unsigned char srcM2b = vec_ld(16, srcbis);
    const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcM1b = vec_ld(16, srcbis);
    const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP0b = vec_ld(16, srcbis);
    const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP1b = vec_ld(16, srcbis);
    const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
    // srcbis += srcStride;
    const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
    const vector unsigned char srcP2b = vec_ld(16, srcbis);
    const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
    // srcbis += srcStride;
    vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
    vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
    vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
    vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
    vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
    vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
    vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
    vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
    vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
    vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);

    vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
                        psumA, psumB, sumA, sumB,
                        srcP3ssA, srcP3ssB,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
                         srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        // srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down by one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
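/* Centre (half-pel in both x and y) filter: a first pass applies the
 * horizontal 6-tap kernel to 16 + 5 = 21 rows and stores the unclipped
 * 16-bit intermediates in tmp; a second pass runs the same kernel vertically
 * over tmp, rounding with (... + 512) >> 10 before clipping to 8 bits. */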
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;

    const vector signed int vzero = vec_splat_s32(0);
    const vector unsigned char permM2 = vec_lvsl(-2, src);
    const vector unsigned char permM1 = vec_lvsl(-1, src);
    const vector unsigned char permP0 = vec_lvsl(+0, src);
    const vector unsigned char permP1 = vec_lvsl(+1, src);
    const vector unsigned char permP2 = vec_lvsl(+2, src);
    const vector unsigned char permP3 = vec_lvsl(+3, src);
    const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vector unsigned int v10ui = vec_splat_u32(10);
    const vector signed short v5ss = vec_splat_s16(5);
    const vector signed short v1ss = vec_splat_s16(1);
    const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);
    const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);

    vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
                        srcP2A, srcP2B, srcP3A, srcP3B,
                        srcM1A, srcM1B, srcM2A, srcM2B,
                        sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
                        pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vector unsigned char dstperm = vec_lvsr(0, dst);

    const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);

    const vector unsigned char mperm = (const vector unsigned char)
        AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
            0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
                        tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
                        tmpP2ssA, tmpP2ssB;

    vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
                      pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
                      pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
                      ssumAe, ssumAo, ssumBe, ssumBo;
    vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
                         rsum, fdst1, fdst2;
    vector signed short ssume, ssumo;
    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vector unsigned char srcR1 = vec_ld(-2, src);
        vector unsigned char srcR2 = vec_ld(14, src);
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vector unsigned char srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
        srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
        srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
        srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
        srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
        srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
        srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
        srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
        srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
        srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
        srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
        srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
        pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        /* unclipped 16-bit horizontal result, consumed by the vertical pass below */
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
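    /* Vertical pass over tmp: the 16-bit intermediates can overflow a 16-bit
     * accumulator once multiplied by 20, so the products are computed as
     * even/odd 32-bit halves with vec_mule/vec_mulo and only recombined (via
     * mperm) after the final pack. */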
    for (i = 0 ; i < 16 ; i++) {
        const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
        const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;

        const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
        /* slide the six-row window of intermediates down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        /* even elements of sum3, sign-extended to 32 bits via an arithmetic
         * shift of the packed shorts; odd elements via a multiply by 1 */
        pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vector signed int)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
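        /* sumv holds the packed even results in bytes 0-7 and the odd results
         * in bytes 8-15; mperm interleaves them back into pixel order. */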
        sum = vec_perm(sumv, sumv, mperm);
        dst1 = vec_ld(0, dst);
        dst2 = vec_ld(16, dst);
        vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));

        OP_U8_ALTIVEC(fsum, sum, vdst);

        rsum = vec_perm(fsum, fsum, dstperm);
        fdst1 = vec_sel(dst1, rsum, dstmask);
        fdst2 = vec_sel(rsum, dst2, dstmask);

        vec_st(fdst1, 0, dst);
        vec_st(fdst2, 16, dst);

        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}