/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_altivec.h"

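/* AltiVec has no unaligned vector loads: vec_ld() ignores the low four bits
 * of the effective address. Every function below therefore uses the classic
 * idiom of loading the two aligned quadwords that straddle the wanted bytes
 * and fusing them with vec_perm(), steered by the alignment pattern that
 * vec_lvsl() returns:
 *
 *     vector unsigned char perm = vec_lvsl(0, p);       // alignment shuffle
 *     vector unsigned char l    = vec_ld(0,  p);        // left quadword
 *     vector unsigned char r    = vec_ld(15, p);        // right quadword
 *     vector unsigned char v    = vec_perm(l, r, perm); // p[0] .. p[15]
 *
 * The second load uses offset 15 when only p[0]..p[15] are needed, so no
 * aligned quadword beyond the last required byte is touched; the x2/xy2
 * variants below load at offset 16 instead because they also need p[16]. */

/* SAD of a 16xh block against pix2 interpolated at the x half-pel position:
 * each reference pixel is avg(pix2[x], pix2[x + 1]), computed with vec_avg.
 * perm2 = perm1 + 1 reuses the two loaded quadwords to extract the
 * byte-shifted vector pix2[1] - pix2[16]. */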
static int sad16_x2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            int line_size, int h)
{
    int i, s;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix2v: pix2[0] - pix2[15]    pix2iv: pix2[1] - pix2[16] */
        vector unsigned char pix1v  = vec_ld(0,  pix1);
        vector unsigned char pix2l  = vec_ld(0,  pix2);
        vector unsigned char pix2r  = vec_ld(16, pix2);
        vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
        vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);

        /* Calculate the average vector. */
        vector unsigned char avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector:
         * |a - b| == max(a, b) - min(a, b) for unsigned values. */
        vector unsigned char t5 = vec_sub(vec_max(pix1v, avgv),
                                          vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

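/* SAD of a 16xh block against pix2 interpolated at the y half-pel position:
 * each reference pixel is avg(pix2[x], pix2[x + line_size]). */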
static int sad16_y2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            int line_size, int h)
{
    int i, s;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char pix1v, pix3v, avgv, t5;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;
    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, each
     * time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15] */
    vector unsigned char pix2l = vec_ld(0,  pix2);
    vector unsigned char pix2r = vec_ld(15, pix2);
    vector unsigned char pix2v = vec_perm(pix2l, pix2r, perm);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15] */
        pix1v = vec_ld(0, pix1);

        pix2l = vec_ld(0,  pix3);
        pix2r = vec_ld(15, pix3);
        pix3v = vec_perm(pix2l, pix2r, perm);

        /* Calculate the average vector. */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

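/* SAD of a 16xh block against pix2 interpolated at the x+y half-pel
 * position: each reference pixel is the correctly rounded four-pixel
 * average (pix2[x] + pix2[x + 1] + pix3[x] + pix3[x + 1] + 2) >> 2, with
 * pix3 = pix2 + line_size. The widening to 16-bit lanes, explained in the
 * loop below, is what makes this variant heavier than the others. */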
static int sad16_xy2_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
{
    int i, s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    const vector unsigned short two =
        (const vector unsigned short) vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix1v, pix3v, pix3iv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
     * iteration becomes pix2 in the next iteration. We can use this
     * fact to avoid a potentially expensive unaligned read, as well
     * as some splitting, and vector addition each time around the loop.
     * Read unaligned pixels into our vectors. The vectors are as follows:
     * pix2v: pix2[0] - pix2[15]    pix2iv: pix2[1] - pix2[16]
     * Split the pixel vectors into shorts. */
    vector unsigned char pix2l  = vec_ld(0,  pix2);
    vector unsigned char pix2r  = vec_ld(16, pix2);
    vector unsigned char pix2v  = vec_perm(pix2l, pix2r, perm1);
    vector unsigned char pix2iv = vec_perm(pix2l, pix2r, perm2);

    vector unsigned short pix2hv =
        (vector unsigned short) vec_mergeh(zero, pix2v);
    vector unsigned short pix2lv =
        (vector unsigned short) vec_mergel(zero, pix2v);
    vector unsigned short pix2ihv =
        (vector unsigned short) vec_mergeh(zero, pix2iv);
    vector unsigned short pix2ilv =
        (vector unsigned short) vec_mergel(zero, pix2iv);
    vector unsigned short t1 = vec_add(pix2hv, pix2ihv);
    vector unsigned short t2 = vec_add(pix2lv, pix2ilv);
    vector unsigned short t3, t4;
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
         * pix1v: pix1[0] - pix1[15]
         * pix3v: pix3[0] - pix3[15]    pix3iv: pix3[1] - pix3[16] */
        pix1v = vec_ld(0, pix1);

        pix2l  = vec_ld(0,  pix3);
        pix2r  = vec_ld(16, pix3);
        pix3v  = vec_perm(pix2l, pix2r, perm1);
        pix3iv = vec_perm(pix2l, pix2r, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
         * and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
         * rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
         * it should be 1. Instead, we have to split the pixel vectors into
         * vectors of shorts and do the averaging by hand. */

        /* Split the pixel vectors into shorts. */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them: the full four-pixel sum gets a single
         * +2 rounding term before the shift. */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result. */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector. */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2. */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

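/* Plain SAD of a 16xh block: no interpolation, just |pix1[x] - pix2[x]|
 * summed over the block. pix1 (the current block) is assumed to be 16-byte
 * aligned and is loaded directly; only pix2 needs the realignment dance. */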
static int sad16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

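/* SAD of an 8xh block. Same scheme as sad16_altivec, except that only the
 * first 8 bytes of each vector are wanted; permclear masks the other 8 to
 * zero, so they contribute nothing to the sum. */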
static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        int line_size, int h)
{
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld(0, pix1);
        vector unsigned char pix1r = vec_ld(7, pix1);
        vector unsigned char pix2l = vec_ld(0, pix2);
        vector unsigned char pix2r = vec_ld(7, pix2);
        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
                                          permclear);
        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
                                          permclear);

        /* Calculate a sum of abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

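/* Sum of squares of all 256 pixels of a 16x16 block: vec_msum() multiplies
 * the 16 unsigned bytes pairwise with themselves and accumulates the
 * products into four 32-bit partial sums. */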
static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sv = (vector unsigned int) vec_splat_u32(0);
    vector signed int sum;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned pixels. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum. */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced.
 * It's the sad8_altivec code above with squaring added. */
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        int line_size, int h)
{
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char)
        { 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0 };
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;
    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
         * Since we're reading 16 pixels, and actually only want 8,
         * mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld(0, pix1);
        vector unsigned char pix1r = vec_ld(7, pix1);
        vector unsigned char pix2l = vec_ld(0, pix2);
        vector unsigned char pix2r = vec_ld(7, pix2);
        vector unsigned char t1 = vec_and(vec_perm(pix1l, pix1r, perm1),
                                          permclear);
        vector unsigned char t2 = vec_and(vec_perm(pix2l, pix2r, perm2),
                                          permclear);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/* Sum of Squared Errors for a 16x16 block, AltiVec-enhanced.
 * It's the sad16_altivec code above with squaring added. */
static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned int sum = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumsqr;
    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2. */
        vector unsigned char pix2l = vec_ld(0,  pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        vector unsigned char t1 = vec_ld(0, pix1);
        vector unsigned char t2 = vec_perm(pix2l, pix2r, perm);

        /* Since we want to use unsigned chars, we can take advantage
         * of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector. */
        vector unsigned char t3 = vec_max(t1, t2);
        vector unsigned char t4 = vec_min(t1, t2);
        vector unsigned char t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum. */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

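/* Sum of all 256 pixels of a 16x16 block, reduced with the same
 * vec_sum4s()/vec_sums() pipeline as the SAD functions above. */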
static int pix_sum_altivec(uint8_t *pix, int line_size)
{
    int i, s;
    const vector unsigned int zero =
        (const vector unsigned int) vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned int sad = (vector unsigned int) vec_splat_u32(0);
    vector signed int sumdiffs;
    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1. */
        vector unsigned char pixl = vec_ld(0,  pix);
        vector unsigned char pixr = vec_ld(15, pix);
        vector unsigned char t1   = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad. */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s. */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

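/* Read an 8x8 block of 8-bit pixels and store it as an 8x8 block of 16-bit
 * values (the usual DCT input layout); vec_mergeh() with a zero vector does
 * the zero-extension. */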
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               int line_size)
{
    int i;
    vector unsigned char perm = vec_lvsl(0, pixels);
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char pixl  = vec_ld(0, pixels);
        vector unsigned char pixr  = vec_ld(7, pixels);
        vector unsigned char bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        vector signed short shorts = (vector signed short) vec_mergeh(zero,
                                                                      bytes);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vector signed short *) block);

        pixels += line_size;
    }
}

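/* block[i] = s1[i] - s2[i] for an 8x8 block, widened to 16 bits. The result
 * always fits: the difference of two unsigned 8-bit values lies in
 * [-255, 255]. The loop body handles two rows per iteration (a manual 2x
 * unroll), hence i < 4. */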
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm1 = vec_lvsl(0, s1);
    vector unsigned char perm2 = vec_lvsl(0, s2);
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    vector signed short shorts1, shorts2;
    for (i = 0; i < 4; i++) {
        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        vector unsigned char pixl  = vec_ld(0,  s1);
        vector unsigned char pixr  = vec_ld(15, s1);
        vector unsigned char bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
        /* The code below is a copy of the code above...
         * This is a manual unroll. */

        /* Read potentially unaligned pixels.
         * We're reading 16 pixels, and actually only want 8,
         * but we simply ignore the extras. */
        pixl  = vec_ld(0,  s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld(0,  s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short) vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}

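/* 8x8 Hadamard transform of (src - dst) with the absolute values of the
 * outputs summed (SATD). The horizontal butterflies are done branchlessly
 * in three multiply-add passes: in each pass, vec_perm() swaps the elements
 * that must be combined and vec_mladd() with a ±1 vector adds or subtracts
 * them in one shot. For the first pass, with a = but0:
 *
 *     op1  = vec_perm(a, a, perm1);     // { a1, a0, a3, a2, ... }
 *     but1 = vec_mladd(a, vprod1, op1); // { a0 + a1, a0 - a1, ... }
 *
 * since a0 * 1 + a1 = a0 + a1 and a1 * -1 + a0 = a0 - a1. perm2/vprod2 and
 * perm3/vprod3 repeat this at distances 2 and 4. The vertical butterflies
 * are the plain vec_add()/vec_sub() stages after the macro. */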
static int hadamard8_diff8x8_altivec(MpegEncContext *s, uint8_t *dst,
                                     uint8_t *src, int stride, int h)
{
    int sum;
    register const vector unsigned char vzero =
        (const vector unsigned char) vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
        register const vector signed short vprod1 =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
        register const vector signed short vprod2 =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
        register const vector signed short vprod3 =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
        register const vector unsigned char perm1 =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
        register const vector unsigned char perm2 =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
        register const vector unsigned char perm3 =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
#define ONEITERBUTTERFLY(i, res)                                           \
    {                                                                      \
        register vector unsigned char src1 = vec_ld(stride * i, src);      \
        register vector unsigned char src2 = vec_ld(stride * i + 15, src); \
        register vector unsigned char srcO =                               \
            vec_perm(src1, src2, vec_lvsl(stride * i, src));               \
        register vector unsigned char dst1 = vec_ld(stride * i, dst);      \
        register vector unsigned char dst2 = vec_ld(stride * i + 15, dst); \
        register vector unsigned char dstO =                               \
            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));               \
                                                                           \
        /* Promote the unsigned chars to signed shorts. */                 \
        /* We're in the 8x8 function, we only care about the first 8. */   \
        register vector signed short srcV =                                \
            (vector signed short) vec_mergeh((vector signed char) vzero,   \
                                             (vector signed char) srcO);   \
        register vector signed short dstV =                                \
            (vector signed short) vec_mergeh((vector signed char) vzero,   \
                                             (vector signed char) dstO);   \
                                                                           \
        /* subtractions inside the first butterfly */                      \
        register vector signed short but0 = vec_sub(srcV, dstV);           \
        register vector signed short op1  = vec_perm(but0, but0, perm1);   \
        register vector signed short but1 = vec_mladd(but0, vprod1, op1);  \
        register vector signed short op2  = vec_perm(but1, but1, perm2);   \
        register vector signed short but2 = vec_mladd(but1, vprod2, op2);  \
        register vector signed short op3  = vec_perm(but2, but2, perm3);   \
        res = vec_mladd(but2, vprod3, op3);                                \
    }
        ONEITERBUTTERFLY(0, temp0);
        ONEITERBUTTERFLY(1, temp1);
        ONEITERBUTTERFLY(2, temp2);
        ONEITERBUTTERFLY(3, temp3);
        ONEITERBUTTERFLY(4, temp4);
        ONEITERBUTTERFLY(5, temp5);
        ONEITERBUTTERFLY(6, temp6);
        ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);
        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);
        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
 * 16x8 works with 16 elements; it lets us avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 *
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
 * a LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It's not clean, but on a 7450 the resulting code is much faster
 * (best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3's on the 7450 (but uses
 * 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 */
static int hadamard8_diff16x8_altivec(MpegEncContext *s, uint8_t *dst,
                                      uint8_t *src, int stride, int h)
{
    int sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char) vec_splat_u8(0);
    {
        register const vector signed short vprod1 __asm__ ("v16") =
            (const vector signed short) { 1, -1, 1, -1, 1, -1, 1, -1 };
        register const vector signed short vprod2 __asm__ ("v17") =
            (const vector signed short) { 1, 1, -1, -1, 1, 1, -1, -1 };
        register const vector signed short vprod3 __asm__ ("v18") =
            (const vector signed short) { 1, 1, 1, 1, -1, -1, -1, -1 };
        register const vector unsigned char perm1 __asm__ ("v19") =
            (const vector unsigned char)
            { 0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
              0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D };
        register const vector unsigned char perm2 __asm__ ("v20") =
            (const vector unsigned char)
            { 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
              0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B };
        register const vector unsigned char perm3 __asm__ ("v21") =
            (const vector unsigned char)
            { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
              0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
#define ONEITERBUTTERFLY(i, res1, res2)                                  \
    {                                                                    \
        register vector unsigned char src1 __asm__ ("v22") =             \
            vec_ld(stride * i, src);                                     \
        register vector unsigned char src2 __asm__ ("v23") =             \
            vec_ld(stride * i + 16, src);                                \
        register vector unsigned char srcO __asm__ ("v22") =             \
            vec_perm(src1, src2, vec_lvsl(stride * i, src));             \
        register vector unsigned char dst1 __asm__ ("v24") =             \
            vec_ld(stride * i, dst);                                     \
        register vector unsigned char dst2 __asm__ ("v25") =             \
            vec_ld(stride * i + 16, dst);                                \
        register vector unsigned char dstO __asm__ ("v23") =             \
            vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));             \
                                                                         \
        /* Promote the unsigned chars to signed shorts. */               \
        register vector signed short srcV __asm__ ("v24") =              \
            (vector signed short) vec_mergeh((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstV __asm__ ("v25") =              \
            (vector signed short) vec_mergeh((vector signed char) vzero, \
                                             (vector signed char) dstO); \
        register vector signed short srcW __asm__ ("v26") =              \
            (vector signed short) vec_mergel((vector signed char) vzero, \
                                             (vector signed char) srcO); \
        register vector signed short dstW __asm__ ("v27") =              \
            (vector signed short) vec_mergel((vector signed char) vzero, \
                                             (vector signed char) dstO); \
                                                                         \
        /* subtractions inside the first butterfly */                    \
        register vector signed short but0 __asm__ ("v28") =              \
            vec_sub(srcV, dstV);                                         \
        register vector signed short but0S __asm__ ("v29") =             \
            vec_sub(srcW, dstW);                                         \
        register vector signed short op1 __asm__ ("v30") =               \
            vec_perm(but0, but0, perm1);                                 \
        register vector signed short but1 __asm__ ("v22") =              \
            vec_mladd(but0, vprod1, op1);                                \
        register vector signed short op1S __asm__ ("v23") =              \
            vec_perm(but0S, but0S, perm1);                               \
        register vector signed short but1S __asm__ ("v24") =             \
            vec_mladd(but0S, vprod1, op1S);                              \
        register vector signed short op2 __asm__ ("v25") =               \
            vec_perm(but1, but1, perm2);                                 \
        register vector signed short but2 __asm__ ("v26") =              \
            vec_mladd(but1, vprod2, op2);                                \
        register vector signed short op2S __asm__ ("v27") =              \
            vec_perm(but1S, but1S, perm2);                               \
        register vector signed short but2S __asm__ ("v28") =             \
            vec_mladd(but1S, vprod2, op2S);                              \
        register vector signed short op3 __asm__ ("v29") =               \
            vec_perm(but2, but2, perm3);                                 \
        register vector signed short op3S __asm__ ("v30") =              \
            vec_perm(but2S, but2S, perm3);                               \
        res1 = vec_mladd(but2, vprod3, op3);                             \
        res2 = vec_mladd(but2S, vprod3, op3S);                           \
    }
        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;

        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        register vector signed short line0S = vec_add(temp0S, temp1S);
        register vector signed short line1S = vec_sub(temp0S, temp1S);
        register vector signed short line2S = vec_add(temp2S, temp3S);
        register vector signed short line3S = vec_sub(temp2S, temp3S);
        register vector signed short line4S = vec_add(temp4S, temp5S);
        register vector signed short line5S = vec_sub(temp4S, temp5S);
        register vector signed short line6S = vec_add(temp6S, temp7S);
        register vector signed short line7S = vec_sub(temp6S, temp7S);

        register vector signed short line0BS = vec_add(line0S, line2S);
        register vector signed short line2BS = vec_sub(line0S, line2S);
        register vector signed short line1BS = vec_add(line1S, line3S);
        register vector signed short line3BS = vec_sub(line1S, line3S);
        register vector signed short line4BS = vec_add(line4S, line6S);
        register vector signed short line6BS = vec_sub(line4S, line6S);
        register vector signed short line5BS = vec_add(line5S, line7S);
        register vector signed short line7BS = vec_sub(line5S, line7S);

        register vector signed short line0CS = vec_add(line0BS, line4BS);
        register vector signed short line4CS = vec_sub(line0BS, line4BS);
        register vector signed short line1CS = vec_add(line1BS, line5BS);
        register vector signed short line5CS = vec_sub(line1BS, line5BS);
        register vector signed short line2CS = vec_add(line2BS, line6BS);
        register vector signed short line6CS = vec_sub(line2BS, line6BS);
        register vector signed short line3CS = vec_add(line3BS, line7BS);
        register vector signed short line7CS = vec_sub(line3BS, line7BS);
        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int) vzero);
        vsum = vec_splat(vsum, 3);
        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

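/* hadamard8_diff for 16-pixel-wide blocks: the 16x8 kernel above covers two
 * horizontally adjacent 8x8 transforms per call (the mergeh and mergel
 * halves), so a 16x16 block takes two calls, one per 8-row half. */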
static int hadamard8_diff16_altivec(MpegEncContext *s, uint8_t *dst,
                                    uint8_t *src, int stride, int h)
{
    int score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h == 16) {
        dst   += 8 * stride;
        src   += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}

av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx,
                                     unsigned high_bit_depth)
{
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;

    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->sse[0] = sse16_altivec;
    c->sse[1] = sse8_altivec;

    c->pix_norm1 = pix_norm1_altivec;
    c->pix_sum   = pix_sum_altivec;

    c->diff_pixels = diff_pixels_altivec;

    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
}