/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
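
/* H.264 chroma MC: every output pixel is a bilinear blend of a 2x2 source
 * neighbourhood, weighted by the eighth-pel fractional offsets (x, y).
 * A rough scalar equivalent of the kernel below, for illustration only
 * (the loop variables are ours, not part of this code):
 *
 *   const int A = (8 - x) * (8 - y), B = x * (8 - y),
 *             C = (8 - x) * y,       D = x * y;
 *   for (i = 0; i < h; i++, dst += stride, src += stride)
 *       for (j = 0; j < 8; j++)
 *           dst[j] = (A * src[j]          + B * src[j + 1] +
 *                     C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
 */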
/* this code assumes that stride % 16 == 0 */
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8_t fperm;
    const vec_s32_t vABCD = vec_ld(0, ABCD);
    /* splat the low 16 bits of each 32-bit weight (big-endian halfword lanes 1, 3, 5, 7) */
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
    LOAD_ZERO;
    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16_t v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
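
    /* Each row needs 9 consecutive source bytes. If src lies within the
     * first 8 bytes of its 16-byte line, a single vec_ld covers them
     * (loadSecond == 0); otherwise a second load is required. At offset 15,
     * src + 1 is 16-byte aligned, so vec_lvsl(1, src) wraps to the identity
     * permute and would select the wrong vector; that case (reallyBadAlign)
     * uses the second load directly. */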
    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8_t vsrc0uc, vsrc1uc;
    vec_s16_t vsrc0ssH, vsrc1ssH;
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
    vec_u8_t vdst, ppsum, vfdst, fsum;

  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
    if (((unsigned long)dst) % 16 == 0) {
      fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
                            0x14, 0x15, 0x16, 0x17,
                            0x08, 0x09, 0x0A, 0x0B,
                            0x0C, 0x0D, 0x0E, 0x0F);
    } else {
      fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
                            0x04, 0x05, 0x06, 0x07,
                            0x18, 0x19, 0x1A, 0x1B,
                            0x1C, 0x1D, 0x1E, 0x1F);
    }
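
    /* fperm later merges the 8 packed result bytes into whichever half of
     * the 16-byte destination vector the 8-byte-wide block occupies,
     * leaving the other half of vdst untouched. */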
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
      vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
      vsrc1uc = vsrcBuc;
    else
      vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);
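
    /* Two loop variants: the aligned path reloads one vector per row, the
     * unaligned path two. Either way the bottom row of each step is reused
     * as the top row of the next (vsrc0ssH/vsrc1ssH take over vsrc2ssH/
     * vsrc3ssH), so every source row is loaded only once. OP_U8_ALTIVEC is
     * the put/avg operation supplied by the including template. */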
    if (!loadSecond) {// -> !reallyBadAlign
      for (i = 0 ; i < h ; i++) {

        vsrcCuc = vec_ld(stride + 0, src);

        vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
        vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);

        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
        psum = vec_mladd(vB, vsrc1ssH, psum);
        psum = vec_mladd(vC, vsrc2ssH, psum);
        psum = vec_mladd(vD, vsrc3ssH, psum);
        psum = vec_add(v32ss, psum);
        psum = vec_sra(psum, v6us);

        vdst = vec_ld(0, dst);
        ppsum = (vec_u8_t)vec_packsu(psum, psum);
        vfdst = vec_perm(vdst, ppsum, fperm);
        OP_U8_ALTIVEC(fsum, vfdst, vdst);

        vec_st(fsum, 0, dst);

        vsrc0ssH = vsrc2ssH;
        vsrc1ssH = vsrc3ssH;

        dst += stride;
        src += stride;
      }
    } else {
        vec_u8_t vsrcDuc;
      for (i = 0 ; i < h ; i++) {
        vsrcCuc = vec_ld(stride +  0, src);
        vsrcDuc = vec_ld(stride + 16, src);

        vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
        if (reallyBadAlign)
          vsrc3uc = vsrcDuc;
        else
          vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);

        psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
        psum = vec_mladd(vB, vsrc1ssH, psum);
        psum = vec_mladd(vC, vsrc2ssH, psum);
        psum = vec_mladd(vD, vsrc3ssH, psum);
        psum = vec_add(v32ss, psum);
        psum = vec_sr(psum, v6us);

        vdst = vec_ld(0, dst);
        ppsum = (vec_u8_t)vec_pack(psum, psum);
        vfdst = vec_perm(vdst, ppsum, fperm);
        OP_U8_ALTIVEC(fsum, vfdst, vdst);

        vec_st(fsum, 0, dst);

        vsrc0ssH = vsrc2ssH;
        vsrc1ssH = vsrc3ssH;

        dst += stride;
        src += stride;
      }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
  register int i;

  LOAD_ZERO;
  const vec_u8_t permM2 = vec_lvsl(-2, src);
  const vec_u8_t permM1 = vec_lvsl(-1, src);
  const vec_u8_t permP0 = vec_lvsl(+0, src);
  const vec_u8_t permP1 = vec_lvsl(+1, src);
  const vec_u8_t permP2 = vec_lvsl(+2, src);
  const vec_u8_t permP3 = vec_lvsl(+3, src);
  const vec_s16_t v5ss = vec_splat_s16(5);
  const vec_u16_t v5us = vec_splat_u16(5);
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

  vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  register int align = ((((unsigned long)src) - 2) % 16);
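
  /* Each row of 16 output pixels reads 21 source bytes (src-2 .. src+18),
   * which span two or three 16-byte lines depending on where src-2 falls.
   * The switch on 'align' in the row loop below picks, per tap, either a
   * vec_perm of two loads or, when a tap is exactly 16-byte aligned, the
   * raw second load, adding a third load only for align 12..15. */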
  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

  vec_u8_t sum, vdst, fsum;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
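
  /* Each row applies the H.264 six-tap luma filter; roughly, per pixel:
   *   dst[j] = clip_uint8((20*(P0 + P1) - 5*(M1 + P2) + (M2 + P3) + 16) >> 5)
   * with Mk = src[j - k] and Pk = src[j + k]. The A/B variables below hold
   * the high and low eight 16-bit lanes of each 16-pixel row. */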
  for (i = 0 ; i < 16 ; i ++) {
    vec_u8_t srcR1 = vec_ld(-2, src);
    vec_u8_t srcR2 = vec_ld(14, src);

    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }
    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    sum = vec_packsu(sumA, sumB);
    ASSERT_ALIGNED(dst);
    vdst = vec_ld(0, dst);

    OP_U8_ALTIVEC(fsum, sum, vdst);

    vec_st(fsum, 0, dst);

    src += srcStride;
    dst += dstStride;
  }

  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
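
/* The vertical lowpass applies the same six-tap filter down each column.
 * Six source rows are kept live at a time; each iteration loads only row
 * P3, and the window slides by renaming registers instead of reloading. */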
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

  register int i;

  LOAD_ZERO;
  const vec_u8_t perm = vec_lvsl(0, src);
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  const vec_u16_t v5us = vec_splat_u16(5);
  const vec_s16_t v5ss = vec_splat_s16(5);
  const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

  uint8_t *srcbis = src - (srcStride * 2);

  const vec_u8_t srcM2a = vec_ld(0, srcbis);
  const vec_u8_t srcM2b = vec_ld(16, srcbis);
  const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
  // srcbis += srcStride;
  const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcM1b = vec_ld(16, srcbis);
  const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
  // srcbis += srcStride;
  const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcP0b = vec_ld(16, srcbis);
  const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
  // srcbis += srcStride;
  const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcP1b = vec_ld(16, srcbis);
  const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
  // srcbis += srcStride;
  const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
  const vec_u8_t srcP2b = vec_ld(16, srcbis);
  const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
  // srcbis += srcStride;
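
  /* Rows M2 .. P2 (five of the six taps) are loaded once up front; the
   * commented-out increments are folded into the vec_ld addresses above. */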
  vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
  vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
  vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
  vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
  vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
  vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
  vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
  vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
  vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
  vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
  vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
  vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
  for (i = 0 ; i < 16 ; i++) {
    srcP3a = vec_ld(0, srcbis += srcStride);
    srcP3b = vec_ld(16, srcbis);
    srcP3 = vec_perm(srcP3a, srcP3b, perm);
    srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
    srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
    // srcbis += srcStride;

    sum1A = vec_adds(srcP0ssA, srcP1ssA);
    sum1B = vec_adds(srcP0ssB, srcP1ssB);
    sum2A = vec_adds(srcM1ssA, srcP2ssA);
    sum2B = vec_adds(srcM1ssB, srcP2ssB);
    sum3A = vec_adds(srcM2ssA, srcP3ssA);
    sum3B = vec_adds(srcM2ssB, srcP3ssB);

    /* slide the six-row window down one row */
    srcM2ssA = srcM1ssA;
    srcM2ssB = srcM1ssB;
    srcM1ssA = srcP0ssA;
    srcM1ssB = srcP0ssB;
    srcP0ssA = srcP1ssA;
    srcP0ssB = srcP1ssB;
    srcP1ssA = srcP2ssA;
    srcP1ssB = srcP2ssB;
    srcP2ssA = srcP3ssA;
    srcP2ssB = srcP3ssB;

    pp1A = vec_mladd(sum1A, v20ss, v16ss);
    pp1B = vec_mladd(sum1B, v20ss, v16ss);

    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

    pp3A = vec_add(sum3A, pp1A);
    pp3B = vec_add(sum3B, pp1B);

    psumA = vec_sub(pp3A, pp2A);
    psumB = vec_sub(pp3B, pp2B);

    sumA = vec_sra(psumA, v5us);
    sumB = vec_sra(psumB, v5us);

    sum = vec_packsu(sumA, sumB);
    ASSERT_ALIGNED(dst);
    vdst = vec_ld(0, dst);

    OP_U8_ALTIVEC(fsum, sum, vdst);

    vec_st(fsum, 0, dst);

    dst += dstStride;
  }

  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
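
/* The 2D (hv) filter runs the horizontal six-tap pass first, storing the
 * unrounded 16-bit intermediates in tmp, then applies the vertical six-tap
 * pass over tmp in 32-bit precision; only then does it round with
 * (x + 512) >> 10. That is why the second loop below works on even/odd
 * 32-bit products from vec_mule/vec_mulo. */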
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  register int i;
  LOAD_ZERO;
  const vec_u8_t permM2 = vec_lvsl(-2, src);
  const vec_u8_t permM1 = vec_lvsl(-1, src);
  const vec_u8_t permP0 = vec_lvsl(+0, src);
  const vec_u8_t permP1 = vec_lvsl(+1, src);
  const vec_u8_t permP2 = vec_lvsl(+2, src);
  const vec_u8_t permP3 = vec_lvsl(+3, src);
  const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  const vec_u32_t v10ui = vec_splat_u32(10);
  const vec_s16_t v5ss = vec_splat_s16(5);
  const vec_s16_t v1ss = vec_splat_s16(1);
  const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
  const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
  register int align = ((((unsigned long)src) - 2) % 16);

  vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
  const vec_u8_t mperm = (const vec_u8_t)
    AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
        0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
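
  /* After the even/odd split in the second loop, sumv holds the eight even
   * results followed by the eight odd results; mperm interleaves them back
   * into pixel order. */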
  int16_t *tmpbis = tmp;

  vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;
  vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
  vec_u8_t fsum, sumv, sum, vdst;
  vec_s16_t ssume, ssumo;
  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  src -= (2 * srcStride);
  for (i = 0 ; i < 21 ; i ++) {
    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    vec_u8_t srcR1 = vec_ld(-2, src);
    vec_u8_t srcR2 = vec_ld(14, src);

    switch (align) {
    default: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = vec_perm(srcR1, srcR2, permP3);
    } break;
    case 11: {
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = vec_perm(srcR1, srcR2, permP2);
      srcP3 = srcR2;
    } break;
    case 12: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = vec_perm(srcR1, srcR2, permP1);
      srcP2 = srcR2;
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 13: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = vec_perm(srcR1, srcR2, permP0);
      srcP1 = srcR2;
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 14: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = vec_perm(srcR1, srcR2, permM1);
      srcP0 = srcR2;
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    case 15: {
      vec_u8_t srcR3 = vec_ld(30, src);
      srcM2 = vec_perm(srcR1, srcR2, permM2);
      srcM1 = srcR2;
      srcP0 = vec_perm(srcR2, srcR3, permP0);
      srcP1 = vec_perm(srcR2, srcR3, permP1);
      srcP2 = vec_perm(srcR2, srcR3, permP2);
      srcP3 = vec_perm(srcR2, srcR3, permP3);
    } break;
    }
    srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

    srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
    srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
    srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

    srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

    sum1A = vec_adds(srcP0A, srcP1A);
    sum1B = vec_adds(srcP0B, srcP1B);
    sum2A = vec_adds(srcM1A, srcP2A);
    sum2B = vec_adds(srcM1B, srcP2B);
    sum3A = vec_adds(srcM2A, srcP3A);
    sum3B = vec_adds(srcM2B, srcP3B);

    pp1A = vec_mladd(sum1A, v20ss, sum3A);
    pp1B = vec_mladd(sum1B, v20ss, sum3B);

    pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
    pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

    psumA = vec_sub(pp1A, pp2A);
    psumB = vec_sub(pp1B, pp2B);
    vec_st(psumA, 0, tmp);
    vec_st(psumB, 16, tmp);

    src += srcStride;
    tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  }
  tmpM2ssA = vec_ld(0, tmpbis);
  tmpM2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpM1ssA = vec_ld(0, tmpbis);
  tmpM1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP0ssA = vec_ld(0, tmpbis);
  tmpP0ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP1ssA = vec_ld(0, tmpbis);
  tmpP1ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
  tmpP2ssA = vec_ld(0, tmpbis);
  tmpP2ssB = vec_ld(16, tmpbis);
  tmpbis += tmpStride;
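
  /* Vertical pass over tmp. The 16-bit intermediates, filtered again with
   * six taps, can overflow 16 bits, so the products are widened to 32 bits
   * with vec_mule/vec_mulo (even/odd lanes) before the final
   * (x + 512) >> 10 rounding. */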
  for (i = 0 ; i < 16 ; i++) {
    const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
    const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

    const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
    const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
    const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
    const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
    const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
    const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

    tmpbis += tmpStride;

    /* slide the six-row window down one row */
    tmpM2ssA = tmpM1ssA;
    tmpM2ssB = tmpM1ssB;
    tmpM1ssA = tmpP0ssA;
    tmpM1ssB = tmpP0ssB;
    tmpP0ssA = tmpP1ssA;
    tmpP0ssB = tmpP1ssB;
    tmpP1ssA = tmpP2ssA;
    tmpP1ssB = tmpP2ssB;
    tmpP2ssA = tmpP3ssA;
    tmpP2ssB = tmpP3ssB;
    pp1Ae = vec_mule(sum1A, v20ss);
    pp1Ao = vec_mulo(sum1A, v20ss);
    pp1Be = vec_mule(sum1B, v20ss);
    pp1Bo = vec_mulo(sum1B, v20ss);

    pp2Ae = vec_mule(sum2A, v5ss);
    pp2Ao = vec_mulo(sum2A, v5ss);
    pp2Be = vec_mule(sum2B, v5ss);
    pp2Bo = vec_mulo(sum2B, v5ss);
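
    /* sum3 only needs weight 1: the even 32-bit halves come from an
     * arithmetic shift right by 16 (sign-extending each high lane), the
     * odd ones from vec_mulo with a vector of ones. */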
    pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
    pp3Ao = vec_mulo(sum3A, v1ss);
    pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
    pp3Bo = vec_mulo(sum3B, v1ss);

    pp1cAe = vec_add(pp1Ae, v512si);
    pp1cAo = vec_add(pp1Ao, v512si);
    pp1cBe = vec_add(pp1Be, v512si);
    pp1cBo = vec_add(pp1Bo, v512si);

    pp32Ae = vec_sub(pp3Ae, pp2Ae);
    pp32Ao = vec_sub(pp3Ao, pp2Ao);
    pp32Be = vec_sub(pp3Be, pp2Be);
    pp32Bo = vec_sub(pp3Bo, pp2Bo);
    sumAe = vec_add(pp1cAe, pp32Ae);
    sumAo = vec_add(pp1cAo, pp32Ao);
    sumBe = vec_add(pp1cBe, pp32Be);
    sumBo = vec_add(pp1cBo, pp32Bo);

    ssumAe = vec_sra(sumAe, v10ui);
    ssumAo = vec_sra(sumAo, v10ui);
    ssumBe = vec_sra(sumBe, v10ui);
    ssumBo = vec_sra(sumBo, v10ui);

    ssume = vec_packs(ssumAe, ssumBe);
    ssumo = vec_packs(ssumAo, ssumBo);

    sumv = vec_packsu(ssume, ssumo);
    sum = vec_perm(sumv, sumv, mperm);
    ASSERT_ALIGNED(dst);
    vdst = vec_ld(0, dst);

    OP_U8_ALTIVEC(fsum, sum, vdst);

    vec_st(fsum, 0, dst);

    dst += dstStride;
  }

  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}