/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/attributes.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/dsputil.h"
#include "dsputil_altivec.h"

static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]    pix2iv: pix2[1]-pix2[16] */
        pix1v  = vec_ld( 0, pix1);
        pix2l  = vec_ld( 0, pix2);
        pix2r  = vec_ld(16, pix2);
        pix2v  = vec_perm(pix2l, pix2r, perm1);
        pix2iv = vec_perm(pix2l, pix2r, perm2);
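        /* The two aligned loads plus vec_perm above are the standard AltiVec
           unaligned-load idiom: vec_ld silently rounds the address down to a
           16-byte boundary, and vec_lvsl returns the shuffle that re-extracts
           the misaligned bytes. For example, if pix2 == 16n + 3, perm1 is
           {3, 4, ..., 18}, so vec_perm picks bytes 3-18 of the 32 loaded
           bytes, i.e. pix2[0]-pix2[15]; perm2 = perm1 + 1 likewise yields
           pix2[1]-pix2[16]. */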
        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
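    /* The three steps below are the usual AltiVec horizontal reduction:
       vec_sums adds the four 32-bit partial sums (plus element 3 of its
       second operand, zero here) and leaves the total in element 3,
       vec_splat broadcasts element 3 to all four elements, and vec_ste
       stores a single 32-bit element to the address of s, which works for
       any 4-byte-aligned scalar precisely because every element now holds
       the same value. */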
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] */
    pix2l = vec_ld( 0, pix2);
    pix2r = vec_ld(15, pix2);
    pix2v = vec_perm(pix2l, pix2r, perm);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        pix1v = vec_ld(0, pix1);

        pix2l = vec_ld( 0, pix3);
        pix2r = vec_ld(15, pix3);
        pix3v = vec_perm(pix2l, pix2r, perm);

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        /* This row's pix3 becomes next row's pix2, so no reload is needed. */
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char avgv, t5;
    vector unsigned char perm1 = vec_lvsl(0, pix2);
    vector unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
    vector unsigned char pix2l, pix2r;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting and vector addition, each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]    pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts. */
    pix2l  = vec_ld( 0, pix2);
    pix2r  = vec_ld(16, pix2);
    pix2v  = vec_perm(pix2l, pix2r, perm1);
    pix2iv = vec_perm(pix2l, pix2r, perm2);

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
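    /* vec_mergeh/vec_mergel interleave the zero vector with the pixel bytes,
       giving (0x00,p0, 0x00,p1, ...); on this big-endian vector unit that is
       exactly a zero-extension of each unsigned char to an unsigned short.
       mergeh widens elements 0-7, mergel widens elements 8-15. */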
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]    pix3iv: pix3[1]-pix3[16] */
        pix1v = vec_ld(0, pix1);

        pix2l  = vec_ld( 0, pix3);
        pix2r  = vec_ld(16, pix3);
        pix3v  = vec_perm(pix2l, pix2r, perm1);
        pix3iv = vec_perm(pix2l, pix2r, perm2);

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a, b), avg(c, d)), but the
           rounding would mean that, for example, avg(3, 0, 0, 1) = 2, when
           it should be 1. Instead, we have to split the pixel vectors into
           vectors of shorts and do the averaging by hand. */
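        /* Concretely, with round-up averaging avg(a, b) = (a + b + 1) >> 1:
           avg(avg(3, 0), avg(0, 1)) = avg(2, 1) = 2, whereas the correctly
           rounded four-way average is (3 + 0 + 0 + 1 + 2) >> 2 = 1. */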
        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
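        /* That is, avg = (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2
           per lane, with t1/t2 holding the pix2-row pair sums carried over
           from the previous iteration and t3/t4 the fresh pix3-row pair sums. */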
        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char){255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
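        /* After the AND with permclear, bytes 8-15 of both t1 and t2 are
           zero, so those lanes contribute |0 - 0| = 0 to the sum and only
           the 8 wanted pixels are counted. */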
        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        pixv = vec_perm(pixl, pixr, perm);

        /* Square the values, and add them to our sum */
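        /* vec_msum multiplies corresponding unsigned chars and adds each
           group of four products into the matching 32-bit element of the
           accumulator, so sv holds four partial sums of squares. */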
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    const vector unsigned char permclear =
        (vector unsigned char){255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0};
    vector unsigned char perm1 = vec_lvsl(0, pix1);
    vector unsigned char perm2 = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        vector unsigned char pix1l = vec_ld( 0, pix1);
        vector unsigned char pix1r = vec_ld(15, pix1);
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
        t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */
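        /* |a - b| fits in an unsigned char (at most 255), and vec_msum below
           widens the byte products into 32-bit accumulators, so squaring the
           absolute difference yields the exact squared error with no
           overflow at these block sizes (16 rows * 4 products * 255^2 is
           well below 2^31). */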
        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix2);
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        vector unsigned char pix2l = vec_ld( 0, pix2);
        vector unsigned char pix2r = vec_ld(15, pix2);
        t1 = vec_ld(0, pix1);
        t2 = vec_perm(pix2l, pix2r, perm);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a - b) ^ 2 = (a - b) ^ 2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

static int pix_sum_altivec(uint8_t *pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm = vec_lvsl(0, pix);
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        vector unsigned char pixl = vec_ld( 0, pix);
        vector unsigned char pixr = vec_ld(15, pix);
        t1 = vec_perm(pixl, pixr, perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm = vec_lvsl(0, pixels);
    vector unsigned char bytes;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        vector unsigned char pixl = vec_ld( 0, pixels);
        vector unsigned char pixr = vec_ld(15, pixels);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts, i * 16, (vector signed short *)block);

        pixels += line_size;
    }
}

static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm1 = vec_lvsl(0, s1);
    vector unsigned char perm2 = vec_lvsl(0, s2);
    vector unsigned char bytes, pixl, pixr;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        pixl  = vec_ld( 0, s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld( 0, s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *)block);

        s1    += stride;
        s2    += stride;
        block += 8;

        // The code below is a copy of the code above...
        // This is a manual unrolling.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        pixl  = vec_ld( 0, s1);
        pixr  = vec_ld(15, s1);
        bytes = vec_perm(pixl, pixr, perm1);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        pixl  = vec_ld( 0, s2);
        pixr  = vec_ld(15, s2);
        bytes = vec_perm(pixl, pixr, perm2);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short *)block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}

static void clear_block_altivec(int16_t *block)
{
    LOAD_ZERO;
    vec_st(zero_s16v,   0, block);
    vec_st(zero_s16v,  16, block);
    vec_st(zero_s16v,  32, block);
    vec_st(zero_s16v,  48, block);
    vec_st(zero_s16v,  64, block);
    vec_st(zero_s16v,  80, block);
    vec_st(zero_s16v,  96, block);
    vec_st(zero_s16v, 112, block);
}

static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
{
    int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char *)dst);
        vsrc = vec_ld(i, (unsigned char *)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char *)dst);
    }
    /* If w is not a multiple of 16, handle the remaining bytes one by one. */
    for (; i < w; i++) {
        dst[i] = dst[i] + src[i];
    }
}

static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int sum;
    register const vector unsigned char vzero =
        (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
    register const vector signed short vprod1 = (const vector signed short)
        { 1, -1, 1, -1, 1, -1, 1, -1 };
    register const vector signed short vprod2 = (const vector signed short)
        { 1, 1, -1, -1, 1, 1, -1, -1 };
    register const vector signed short vprod3 = (const vector signed short)
        { 1, 1, 1, 1, -1, -1, -1, -1 };
    register const vector unsigned char perm1 = (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 = (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 = (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
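    /* Each permN/vprodN pair implements one butterfly stage entirely in
       registers: permN swaps the two halves of every group of 2^N 16-bit
       lanes (expressed as a byte shuffle), and vec_mladd(x, vprodN, perm(x))
       computes x * (+/-1) + swapped(x), i.e. the sum in one lane and the
       difference in its partner. After the first stage, for example,
       but1 = { x0+x1, x0-x1, x2+x3, x2-x3, ... }. */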
#define ONEITERBUTTERFLY(i, res) \
{ \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    register vector signed short srcV, dstV; \
    register vector signed short but0, but1, but2, op1, op2, op3; \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 15, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 15, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care for the first 8 */ \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                           (vector signed char)srcO); \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                           (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    op1  = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op2  = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op3  = vec_perm(but2, but2, perm3); \
    res  = vec_mladd(but2, vprod3, op3); \
}
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
16x8 works with 16 elements; it avoids replicating loads and gives the
compiler more room for scheduling. It's only used from
inside hadamard8_diff16_altivec.

Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT
of spill code; it seems gcc (unlike xlc) cannot keep everything in registers
by itself. The following code includes hand-made register allocation. It's not
clean, but on a 7450 the resulting code is much faster (best case falls from
700+ cycles to 550).

xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% fewer
instructions...).

On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
xlc goes to around 660 on the regular C code...
*/

static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char)vec_splat_u8(0);

    register const vector signed short vprod1 __asm__ ("v16") =
        (const vector signed short){ 1, -1, 1, -1, 1, -1, 1, -1 };
    register const vector signed short vprod2 __asm__ ("v17") =
        (const vector signed short){ 1, 1, -1, -1, 1, 1, -1, -1 };
    register const vector signed short vprod3 __asm__ ("v18") =
        (const vector signed short){ 1, 1, 1, 1, -1, -1, -1, -1 };
    register const vector unsigned char perm1 __asm__ ("v19") =
        (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 __asm__ ("v20") =
        (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 __asm__ ("v21") =
        (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res1, res2) \
{ \
    register vector unsigned char src1 __asm__ ("v22"), \
                                  src2 __asm__ ("v23"), \
                                  dst1 __asm__ ("v24"), \
                                  dst2 __asm__ ("v25"), \
                                  srcO __asm__ ("v22"), \
                                  dstO __asm__ ("v23"); \
    \
    register vector signed short srcV  __asm__ ("v24"), \
                                 dstV  __asm__ ("v25"), \
                                 srcW  __asm__ ("v26"), \
                                 dstW  __asm__ ("v27"), \
                                 but0  __asm__ ("v28"), \
                                 but0S __asm__ ("v29"), \
                                 op1   __asm__ ("v30"), \
                                 but1  __asm__ ("v22"), \
                                 op1S  __asm__ ("v23"), \
                                 but1S __asm__ ("v24"), \
                                 op2   __asm__ ("v25"), \
                                 but2  __asm__ ("v26"), \
                                 op2S  __asm__ ("v27"), \
                                 but2S __asm__ ("v28"), \
                                 op3   __asm__ ("v29"), \
                                 op3S  __asm__ ("v30"); \
    \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                           (vector signed char)srcO); \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                           (vector signed char)dstO); \
    srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
                                           (vector signed char)srcO); \
    dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
                                           (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0  = vec_sub(srcV, dstV); \
    but0S = vec_sub(srcW, dstW); \
    op1   = vec_perm(but0, but0, perm1); \
    but1  = vec_mladd(but0, vprod1, op1); \
    op1S  = vec_perm(but0S, but0S, perm1); \
    but1S = vec_mladd(but0S, vprod1, op1S); \
    op2   = vec_perm(but1, but1, perm2); \
    but2  = vec_mladd(but1, vprod2, op2); \
    op2S  = vec_perm(but1S, but1S, perm2); \
    but2S = vec_mladd(but1S, vprod2, op2S); \
    op3   = vec_perm(but2, but2, perm3); \
    res1  = vec_mladd(but2, vprod3, op3); \
    op3S  = vec_perm(but2S, but2S, perm3); \
    res2  = vec_mladd(but2S, vprod3, op3S); \
}
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS, line2BS,
                                 line1BS, line3BS, line4BS, line6BS, line5BS,
                                 line7BS, line0CS, line4CS, line1CS, line5CS,
                                 line2CS, line6CS, line3CS, line7CS;

    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);

    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);

    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}

static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int score;
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}

av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1] = sse8_altivec;
    c->sse[0] = sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->add_bytes = add_bytes_altivec;
    if (!high_bit_depth) {
        c->get_pixels = get_pixels_altivec;
        c->clear_block = clear_block_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
}