git.sesse.net Git - ffmpeg/blob - libavcodec/ppc/dsputil_altivec.c

   1 /*
   2  * Copyright (c) 2002 Brian Foley
   3  * Copyright (c) 2002 Dieter Shirley
   4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  */
  20
  21 #include "../dsputil.h"
  22
  23 #include "gcc_fixes.h"
  24
  25 #include "dsputil_altivec.h"
  26
  27 #ifdef CONFIG_DARWIN
  28 #include <sys/sysctl.h>
  29 #else /* CONFIG_DARWIN */
  30 #include <signal.h>
  31 #include <setjmp.h>
  32
  33 static sigjmp_buf jmpbuf;
  34 static volatile sig_atomic_t canjump = 0;
  35
  36 static void sigill_handler (int sig)
  37 {
  38     if (!canjump) {
  39         signal (sig, SIG_DFL);
  40         raise (sig);
  41     }
  42
  43     canjump = 0;
  44     siglongjmp (jmpbuf, 1);
  45 }
  46 #endif /* CONFIG_DARWIN */
  47
  48 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  49 {
  50     int i;
  51     int s __attribute__((aligned(16)));
  52     const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  53     vector unsigned char *tv;
  54     vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  55     vector unsigned int sad;
  56     vector signed int sumdiffs;
  57
  58     s = 0;
  59     sad = (vector unsigned int)vec_splat_u32(0);
  60     for(i=0;i<h;i++) {
  61         /*
  62            Read unaligned pixels into our vectors. The vectors are as follows:
  63            pix1v: pix1[0]-pix1[15]
  64            pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
  65         */
  66         tv = (vector unsigned char *) pix1;
  67         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  68
  69         tv = (vector unsigned char *) &pix2[0];
  70         pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  71
  72         tv = (vector unsigned char *) &pix2[1];
  73         pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  74
  75         /* Calculate the average vector */
  76         avgv = vec_avg(pix2v, pix2iv);
  77
  78         /* Calculate a sum of abs differences vector */
  79         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  80
  81         /* Add each 4 pixel group together and put 4 results into sad */
  82         sad = vec_sum4s(t5, sad);
  83
  84         pix1 += line_size;
  85         pix2 += line_size;
  86     }
  87     /* Sum up the four partial sums, and put the result into s */
  88     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  89     sumdiffs = vec_splat(sumdiffs, 3);
  90     vec_ste(sumdiffs, 0, &s);
  91
  92     return s;
  93 }
  94
  95 int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  96 {
  97     int i;
  98     int s __attribute__((aligned(16)));
  99     const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
 100     vector unsigned char *tv;
 101     vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
 102     vector unsigned int sad;
 103     vector signed int sumdiffs;
 104     uint8_t *pix3 = pix2 + line_size;
 105
 106     s = 0;
 107     sad = (vector unsigned int)vec_splat_u32(0);
 108
 109     /*
 110        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 111        iteration becomes pix2 in the next iteration. We can use this
 112        fact to avoid a potentially expensive unaligned read, each
 113        time around the loop.
 114        Read unaligned pixels into our vectors. The vectors are as follows:
 115        pix2v: pix2[0]-pix2[15]
 116        Split the pixel vectors into shorts
 117     */
 118     tv = (vector unsigned char *) &pix2[0];
 119     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 120
 121     for(i=0;i<h;i++) {
 122         /*
 123            Read unaligned pixels into our vectors. The vectors are as follows:
 124            pix1v: pix1[0]-pix1[15]
 125            pix3v: pix3[0]-pix3[15]
 126         */
 127         tv = (vector unsigned char *) pix1;
 128         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 129
 130         tv = (vector unsigned char *) &pix3[0];
 131         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 132
 133         /* Calculate the average vector */
 134         avgv = vec_avg(pix2v, pix3v);
 135
 136         /* Calculate a sum of abs differences vector */
 137         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 138
 139         /* Add each 4 pixel group together and put 4 results into sad */
 140         sad = vec_sum4s(t5, sad);
 141
 142         pix1 += line_size;
 143         pix2v = pix3v;
 144         pix3 += line_size;
 145
 146     }
 147
 148     /* Sum up the four partial sums, and put the result into s */
 149     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 150     sumdiffs = vec_splat(sumdiffs, 3);
 151     vec_ste(sumdiffs, 0, &s);
 152     return s;
 153 }
 154
 155 int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 156 {
 157     int i;
 158     int s __attribute__((aligned(16)));
 159     uint8_t *pix3 = pix2 + line_size;
 160     const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
 161     const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
 162     vector unsigned char *tv, avgv, t5;
 163     vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
 164     vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
 165     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
 166     vector unsigned short avghv, avglv;
 167     vector unsigned short t1, t2, t3, t4;
 168     vector unsigned int sad;
 169     vector signed int sumdiffs;
 170
 171     sad = (vector unsigned int)vec_splat_u32(0);
 172
 173     s = 0;
 174
 175     /*
 176        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 177        iteration becomes pix2 in the next iteration. We can use this
 178        fact to avoid a potentially expensive unaligned read, as well
 179        as some splitting, and vector addition each time around the loop.
 180        Read unaligned pixels into our vectors. The vectors are as follows:
 181        pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
 182        Split the pixel vectors into shorts
 183     */
 184     tv = (vector unsigned char *) &pix2[0];
 185     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 186
 187     tv = (vector unsigned char *) &pix2[1];
 188     pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
 189
 190     pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
 191     pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
 192     pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
 193     pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
 194     t1 = vec_add(pix2hv, pix2ihv);
 195     t2 = vec_add(pix2lv, pix2ilv);
 196
 197     for(i=0;i<h;i++) {
 198         /*
 199            Read unaligned pixels into our vectors. The vectors are as follows:
 200            pix1v: pix1[0]-pix1[15]
 201            pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
 202         */
 203         tv = (vector unsigned char *) pix1;
 204         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 205
 206         tv = (vector unsigned char *) &pix3[0];
 207         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 208
 209         tv = (vector unsigned char *) &pix3[1];
 210         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
 211
 212         /*
 213           Note that Altivec does have vec_avg, but this works on vector pairs
 214           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
 215           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
 216           Instead, we have to split the pixel vectors into vectors of shorts,
 217           and do the averaging by hand.
 218         */
 219
 220         /* Split the pixel vectors into shorts */
 221         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
 222         pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
 223         pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
 224         pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
 225
 226         /* Do the averaging on them */
 227         t3 = vec_add(pix3hv, pix3ihv);
 228         t4 = vec_add(pix3lv, pix3ilv);
 229
 230         avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
 231         avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
 232
 233         /* Pack the shorts back into a result */
 234         avgv = vec_pack(avghv, avglv);
 235
 236         /* Calculate a sum of abs differences vector */
 237         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 238
 239         /* Add each 4 pixel group together and put 4 results into sad */
 240         sad = vec_sum4s(t5, sad);
 241
 242         pix1 += line_size;
 243         pix3 += line_size;
 244         /* Transfer the calculated values for pix3 into pix2 */
 245         t1 = t3;
 246         t2 = t4;
 247     }
 248     /* Sum up the four partial sums, and put the result into s */
 249     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 250     sumdiffs = vec_splat(sumdiffs, 3);
 251     vec_ste(sumdiffs, 0, &s);
 252
 253     return s;
 254 }
 255
 256 int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 257 {
 258     int i;
 259     int s __attribute__((aligned(16)));
 260     const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
 261     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 262     vector unsigned char t1, t2, t3,t4, t5;
 263     vector unsigned int sad;
 264     vector signed int sumdiffs;
 265
 266     sad = (vector unsigned int)vec_splat_u32(0);
 267
 268
 269     for(i=0;i<h;i++) {
 270         /* Read potentially unaligned pixels into t1 and t2 */
 271         perm1 = vec_lvsl(0, pix1);
 272         pix1v = (vector unsigned char *) pix1;
 273         perm2 = vec_lvsl(0, pix2);
 274         pix2v = (vector unsigned char *) pix2;
 275         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 276         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 277
 278         /* Calculate a sum of abs differences vector */
 279         t3 = vec_max(t1, t2);
 280         t4 = vec_min(t1, t2);
 281         t5 = vec_sub(t3, t4);
 282
 283         /* Add each 4 pixel group together and put 4 results into sad */
 284         sad = vec_sum4s(t5, sad);
 285
 286         pix1 += line_size;
 287         pix2 += line_size;
 288     }
 289
 290     /* Sum up the four partial sums, and put the result into s */
 291     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 292     sumdiffs = vec_splat(sumdiffs, 3);
 293     vec_ste(sumdiffs, 0, &s);
 294
 295     return s;
 296 }
 297
 298 int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 299 {
 300     int i;
 301     int s __attribute__((aligned(16)));
 302     const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
 303     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 304     vector unsigned char t1, t2, t3,t4, t5;
 305     vector unsigned int sad;
 306     vector signed int sumdiffs;
 307
 308     sad = (vector unsigned int)vec_splat_u32(0);
 309
 310     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 311
 312     for(i=0;i<h;i++) {
 313         /* Read potentially unaligned pixels into t1 and t2
 314            Since we're reading 16 pixels, and actually only want 8,
 315            mask out the last 8 pixels. The 0s don't change the sum. */
 316         perm1 = vec_lvsl(0, pix1);
 317         pix1v = (vector unsigned char *) pix1;
 318         perm2 = vec_lvsl(0, pix2);
 319         pix2v = (vector unsigned char *) pix2;
 320         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 321         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 322
 323         /* Calculate a sum of abs differences vector */
 324         t3 = vec_max(t1, t2);
 325         t4 = vec_min(t1, t2);
 326         t5 = vec_sub(t3, t4);
 327
 328         /* Add each 4 pixel group together and put 4 results into sad */
 329         sad = vec_sum4s(t5, sad);
 330
 331         pix1 += line_size;
 332         pix2 += line_size;
 333     }
 334
 335     /* Sum up the four partial sums, and put the result into s */
 336     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 337     sumdiffs = vec_splat(sumdiffs, 3);
 338     vec_ste(sumdiffs, 0, &s);
 339
 340     return s;
 341 }
 342
 343 int pix_norm1_altivec(uint8_t *pix, int line_size)
 344 {
 345     int i;
 346     int s __attribute__((aligned(16)));
 347     const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
 348     vector unsigned char *tv;
 349     vector unsigned char pixv;
 350     vector unsigned int sv;
 351     vector signed int sum;
 352
 353     sv = (vector unsigned int)vec_splat_u32(0);
 354
 355     s = 0;
 356     for (i = 0; i < 16; i++) {
 357         /* Read in the potentially unaligned pixels */
 358         tv = (vector unsigned char *) pix;
 359         pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
 360
 361         /* Square the values, and add them to our sum */
 362         sv = vec_msum(pixv, pixv, sv);
 363
 364         pix += line_size;
 365     }
 366     /* Sum up the four partial sums, and put the result into s */
 367     sum = vec_sums((vector signed int) sv, (vector signed int) zero);
 368     sum = vec_splat(sum, 3);
 369     vec_ste(sum, 0, &s);
 370
 371     return s;
 372 }
 373
 374 /**
 375  * Sum of Squared Errors for a 8x8 block.
 376  * AltiVec-enhanced.
 377  * It's the sad8_altivec code above w/ squaring added.
 378  */
 379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 380 {
 381     int i;
 382     int s __attribute__((aligned(16)));
 383     const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
 384     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 385     vector unsigned char t1, t2, t3,t4, t5;
 386     vector unsigned int sum;
 387     vector signed int sumsqr;
 388
 389     sum = (vector unsigned int)vec_splat_u32(0);
 390
 391     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 392
 393
 394     for(i=0;i<h;i++) {
 395         /* Read potentially unaligned pixels into t1 and t2
 396            Since we're reading 16 pixels, and actually only want 8,
 397            mask out the last 8 pixels. The 0s don't change the sum. */
 398         perm1 = vec_lvsl(0, pix1);
 399         pix1v = (vector unsigned char *) pix1;
 400         perm2 = vec_lvsl(0, pix2);
 401         pix2v = (vector unsigned char *) pix2;
 402         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 403         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 404
 405         /*
 406           Since we want to use unsigned chars, we can take advantage
 407           of the fact that abs(a-b)^2 = (a-b)^2.
 408         */
 409
 410         /* Calculate abs differences vector */
 411         t3 = vec_max(t1, t2);
 412         t4 = vec_min(t1, t2);
 413         t5 = vec_sub(t3, t4);
 414
 415         /* Square the values and add them to our sum */
 416         sum = vec_msum(t5, t5, sum);
 417
 418         pix1 += line_size;
 419         pix2 += line_size;
 420     }
 421
 422     /* Sum up the four partial sums, and put the result into s */
 423     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 424     sumsqr = vec_splat(sumsqr, 3);
 425     vec_ste(sumsqr, 0, &s);
 426
 427     return s;
 428 }
 429
 430 /**
 431  * Sum of Squared Errors for a 16x16 block.
 432  * AltiVec-enhanced.
 433  * It's the sad16_altivec code above w/ squaring added.
 434  */
 435 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 436 {
 437     int i;
 438     int s __attribute__((aligned(16)));
 439     const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
 440     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 441     vector unsigned char t1, t2, t3,t4, t5;
 442     vector unsigned int sum;
 443     vector signed int sumsqr;
 444
 445     sum = (vector unsigned int)vec_splat_u32(0);
 446
 447     for(i=0;i<h;i++) {
 448         /* Read potentially unaligned pixels into t1 and t2 */
 449         perm1 = vec_lvsl(0, pix1);
 450         pix1v = (vector unsigned char *) pix1;
 451         perm2 = vec_lvsl(0, pix2);
 452         pix2v = (vector unsigned char *) pix2;
 453         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 454         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 455
 456         /*
 457           Since we want to use unsigned chars, we can take advantage
 458           of the fact that abs(a-b)^2 = (a-b)^2.
 459         */
 460
 461         /* Calculate abs differences vector */
 462         t3 = vec_max(t1, t2);
 463         t4 = vec_min(t1, t2);
 464         t5 = vec_sub(t3, t4);
 465
 466         /* Square the values and add them to our sum */
 467         sum = vec_msum(t5, t5, sum);
 468
 469         pix1 += line_size;
 470         pix2 += line_size;
 471     }
 472
 473     /* Sum up the four partial sums, and put the result into s */
 474     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 475     sumsqr = vec_splat(sumsqr, 3);
 476     vec_ste(sumsqr, 0, &s);
 477
 478     return s;
 479 }
 480
 481 int pix_sum_altivec(uint8_t * pix, int line_size)
 482 {
 483     const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
 484     vector unsigned char perm, *pixv;
 485     vector unsigned char t1;
 486     vector unsigned int sad;
 487     vector signed int sumdiffs;
 488
 489     int i;
 490     int s __attribute__((aligned(16)));
 491
 492     sad = (vector unsigned int)vec_splat_u32(0);
 493
 494     for (i = 0; i < 16; i++) {
 495         /* Read the potentially unaligned 16 pixels into t1 */
 496         perm = vec_lvsl(0, pix);
 497         pixv = (vector unsigned char *) pix;
 498         t1 = vec_perm(pixv[0], pixv[1], perm);
 499
 500         /* Add each 4 pixel group together and put 4 results into sad */
 501         sad = vec_sum4s(t1, sad);
 502
 503         pix += line_size;
 504     }
 505
 506     /* Sum up the four partial sums, and put the result into s */
 507     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 508     sumdiffs = vec_splat(sumdiffs, 3);
 509     vec_ste(sumdiffs, 0, &s);
 510
 511     return s;
 512 }
 513
 514 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 515 {
 516     int i;
 517     vector unsigned char perm, bytes, *pixv;
 518     const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
 519     vector signed short shorts;
 520
 521     for(i=0;i<8;i++)
 522     {
 523         // Read potentially unaligned pixels.
 524         // We're reading 16 pixels, and actually only want 8,
 525         // but we simply ignore the extras.
 526         perm = vec_lvsl(0, pixels);
 527         pixv = (vector unsigned char *) pixels;
 528         bytes = vec_perm(pixv[0], pixv[1], perm);
 529
 530         // convert the bytes into shorts
 531         shorts = (vector signed short)vec_mergeh(zero, bytes);
 532
 533         // save the data to the block, we assume the block is 16-byte aligned
 534         vec_st(shorts, i*16, (vector signed short*)block);
 535
 536         pixels += line_size;
 537     }
 538 }
 539
 540 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
 541         const uint8_t *s2, int stride)
 542 {
 543     int i;
 544     vector unsigned char perm, bytes, *pixv;
 545     const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
 546     vector signed short shorts1, shorts2;
 547
 548     for(i=0;i<4;i++)
 549     {
 550         // Read potentially unaligned pixels
 551         // We're reading 16 pixels, and actually only want 8,
 552         // but we simply ignore the extras.
 553         perm = vec_lvsl(0, s1);
 554         pixv = (vector unsigned char *) s1;
 555         bytes = vec_perm(pixv[0], pixv[1], perm);
 556
 557         // convert the bytes into shorts
 558         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 559
 560         // Do the same for the second block of pixels
 561         perm = vec_lvsl(0, s2);
 562         pixv = (vector unsigned char *) s2;
 563         bytes = vec_perm(pixv[0], pixv[1], perm);
 564
 565         // convert the bytes into shorts
 566         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 567
 568         // Do the subtraction
 569         shorts1 = vec_sub(shorts1, shorts2);
 570
 571         // save the data to the block, we assume the block is 16-byte aligned
 572         vec_st(shorts1, 0, (vector signed short*)block);
 573
 574         s1 += stride;
 575         s2 += stride;
 576         block += 8;
 577
 578
 579         // The code below is a copy of the code above... This is a manual
 580         // unroll.
 581
 582         // Read potentially unaligned pixels
 583         // We're reading 16 pixels, and actually only want 8,
 584         // but we simply ignore the extras.
 585         perm = vec_lvsl(0, s1);
 586         pixv = (vector unsigned char *) s1;
 587         bytes = vec_perm(pixv[0], pixv[1], perm);
 588
 589         // convert the bytes into shorts
 590         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 591
 592         // Do the same for the second block of pixels
 593         perm = vec_lvsl(0, s2);
 594         pixv = (vector unsigned char *) s2;
 595         bytes = vec_perm(pixv[0], pixv[1], perm);
 596
 597         // convert the bytes into shorts
 598         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 599
 600         // Do the subtraction
 601         shorts1 = vec_sub(shorts1, shorts2);
 602
 603         // save the data to the block, we assume the block is 16-byte aligned
 604         vec_st(shorts1, 0, (vector signed short*)block);
 605
 606         s1 += stride;
 607         s2 += stride;
 608         block += 8;
 609     }
 610 }
 611
 612 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
 613 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 614     int i;
 615     for(i=0; i+7<w; i++){
 616         dst[i+0] += src[i+0];
 617         dst[i+1] += src[i+1];
 618         dst[i+2] += src[i+2];
 619         dst[i+3] += src[i+3];
 620         dst[i+4] += src[i+4];
 621         dst[i+5] += src[i+5];
 622         dst[i+6] += src[i+6];
 623         dst[i+7] += src[i+7];
 624     }
 625     for(; i<w; i++)
 626         dst[i+0] += src[i+0];
 627 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 628     register int i;
 629     register vector unsigned char vdst, vsrc;
 630
 631     /* dst and src are 16 bytes-aligned (guaranteed) */
 632     for(i = 0 ; (i + 15) < w ; i++)
 633     {
 634       vdst = vec_ld(i << 4, (unsigned char*)dst);
 635       vsrc = vec_ld(i << 4, (unsigned char*)src);
 636       vdst = vec_add(vsrc, vdst);
 637       vec_st(vdst, i << 4, (unsigned char*)dst);
 638     }
 639     /* if w is not a multiple of 16 */
 640     for (; (i < w) ; i++)
 641     {
 642       dst[i] = src[i];
 643     }
 644 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 645 }
 646
 647 /* next one assumes that ((line_size % 16) == 0) */
 648 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 649 {
 650 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
 651 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 652     int i;
 653
 654 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
 655
 656     for(i=0; i<h; i++) {
 657       *((uint32_t*)(block)) = LD32(pixels);
 658       *((uint32_t*)(block+4)) = LD32(pixels+4);
 659       *((uint32_t*)(block+8)) = LD32(pixels+8);
 660       *((uint32_t*)(block+12)) = LD32(pixels+12);
 661       pixels+=line_size;
 662       block +=line_size;
 663     }
 664
 665 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
 666
 667 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 668     register vector unsigned char pixelsv1, pixelsv2;
 669     register vector unsigned char pixelsv1B, pixelsv2B;
 670     register vector unsigned char pixelsv1C, pixelsv2C;
 671     register vector unsigned char pixelsv1D, pixelsv2D;
 672
 673     register vector unsigned char perm = vec_lvsl(0, pixels);
 674     int i;
 675     register int line_size_2 = line_size << 1;
 676     register int line_size_3 = line_size + line_size_2;
 677     register int line_size_4 = line_size << 2;
 678
 679 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
 680 // hand-unrolling the loop by 4 gains about 15%
 681 // mininum execution time goes from 74 to 60 cycles
 682 // it's faster than -funroll-loops, but using
 683 // -funroll-loops w/ this is bad - 74 cycles again.
 684 // all this is on a 7450, tuning for the 7450
 685 #if 0
 686     for(i=0; i<h; i++) {
 687       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 688       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 689       vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 690              0, (unsigned char*)block);
 691       pixels+=line_size;
 692       block +=line_size;
 693     }
 694 #else
 695     for(i=0; i<h; i+=4) {
 696       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 697       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 698       pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
 699       pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
 700       pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
 701       pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
 702       pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
 703       pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
 704       vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 705              0, (unsigned char*)block);
 706       vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
 707              line_size, (unsigned char*)block);
 708       vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
 709              line_size_2, (unsigned char*)block);
 710       vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
 711              line_size_3, (unsigned char*)block);
 712       pixels+=line_size_4;
 713       block +=line_size_4;
 714     }
 715 #endif
 716 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
 717
 718 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 719 }
 720
 721 /* next one assumes that ((line_size % 16) == 0) */
 722 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 723 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 724 {
 725 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
 726 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 727     int i;
 728
 729 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
 730
 731     for(i=0; i<h; i++) {
 732       op_avg(*((uint32_t*)(block)),LD32(pixels));
 733       op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
 734       op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
 735       op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
 736       pixels+=line_size;
 737       block +=line_size;
 738     }
 739
 740 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
 741
 742 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 743     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 744     register vector unsigned char perm = vec_lvsl(0, pixels);
 745     int i;
 746
 747 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
 748
 749     for(i=0; i<h; i++) {
 750       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 751       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 752       blockv = vec_ld(0, block);
 753       pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
 754       blockv = vec_avg(blockv,pixelsv);
 755       vec_st(blockv, 0, (unsigned char*)block);
 756       pixels+=line_size;
 757       block +=line_size;
 758     }
 759
 760 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
 761
 762 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 763 }
 764
 765 /* next one assumes that ((line_size % 8) == 0) */
 766 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 767 {
 768 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
 769 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 770     int i;
 771 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
 772     for (i = 0; i < h; i++) {
 773         *((uint32_t *) (block)) =
 774             (((*((uint32_t *) (block))) |
 775               ((((const struct unaligned_32 *) (pixels))->l))) -
 776              ((((*((uint32_t *) (block))) ^
 777                 ((((const struct unaligned_32 *) (pixels))->
 778                   l))) & 0xFEFEFEFEUL) >> 1));
 779         *((uint32_t *) (block + 4)) =
 780             (((*((uint32_t *) (block + 4))) |
 781               ((((const struct unaligned_32 *) (pixels + 4))->l))) -
 782              ((((*((uint32_t *) (block + 4))) ^
 783                 ((((const struct unaligned_32 *) (pixels +
 784                                                   4))->
 785                   l))) & 0xFEFEFEFEUL) >> 1));
 786         pixels += line_size;
 787         block += line_size;
 788     }
 789 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
 790
 791 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 792     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 793     int i;
 794
 795 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
 796
 797    for (i = 0; i < h; i++) {
 798      /*
 799        block is 8 bytes-aligned, so we're either in the
 800        left block (16 bytes-aligned) or in the right block (not)
 801      */
 802      int rightside = ((unsigned long)block & 0x0000000F);
 803
 804      blockv = vec_ld(0, block);
 805      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 806      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 807      pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
 808
 809      if (rightside)
 810      {
 811        pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
 812      }
 813      else
 814      {
 815        pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
 816      }
 817
 818      blockv = vec_avg(blockv, pixelsv);
 819
 820      vec_st(blockv, 0, block);
 821
 822      pixels += line_size;
 823      block += line_size;
 824    }
 825
 826 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
 827
 828 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 829 }
 830
 831 /* next one assumes that ((line_size % 8) == 0) */
     /*
      * put_pixels8_xy2_altivec
      *
      * For each of h rows, replaces 8 bytes at block with the rounded average
      * of the 2x2 source neighbourhood:
      *   block[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2
      * with p = pixels and x = 0..7. Source rows 0..h are consumed (one more
      * row than is written) and 9 source bytes per row contribute.
      *
      *   block     destination, advanced by line_size per row. The AltiVec
      *             path only distinguishes (block & 0xF) == 0 from != 0, so it
      *             appears to rely on block being 8-byte aligned -- TODO
      *             confirm against callers.
      *   pixels    source, any alignment, advanced by line_size per row.
      *   line_size byte stride shared by block and pixels.
      *   h         number of rows to write.
      *
      * POWERPC_PERF_* are macros defined outside this function; presumably
      * optional performance counters.
      */
 832 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 833 {
 834 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
 835 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
     /*
      * Portable reference path: two passes (j), each handling one 4-byte-wide
      * column as a single 32-bit word. Every byte is split into its low 2 bits
      * (l0/l1) and its high 6 bits pre-shifted right by 2 (h0/h1), so the
      * per-byte sums never carry into the neighbouring byte of the word.
      */
 836     int j;
 837 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 838     for (j = 0; j < 2; j++) {
 839       int i;
           /* row 0 of this column: horizontal pair sums, rounding bias 2 folded into l0 */
 840       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 841       const uint32_t b =
 842         (((const struct unaligned_32 *) (pixels + 1))->l);
 843       uint32_t l0 =
 844         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
 845       uint32_t h0 =
 846         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 847       uint32_t l1, h1;
 848       pixels += line_size;
           /* two output rows per iteration: (l0,h0) and (l1,h1) alternate as
              "previous row" and "current row" so each source row is read once */
 849       for (i = 0; i < h; i += 2) {
 850         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 851         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
 852         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
 853         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 854         *((uint32_t *) block) =
 855           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 856         pixels += line_size;
 857         block += line_size;
 858         a = (((const struct unaligned_32 *) (pixels))->l);
 859         b = (((const struct unaligned_32 *) (pixels + 1))->l);
 860         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
 861         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 862         *((uint32_t *) block) =
 863           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 864         pixels += line_size;
 865         block += line_size;
           /* rewind to the top row and step 4 bytes right for the next column;
              NOTE(review): the rewind amounts match the loop only for even h */
 866       } pixels += 4 - line_size * (h + 1);
 867       block += 4 - line_size * h;
 868     }
 869
 870 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 871
 872 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 873    register int i;
 874    register vector unsigned char
 875      pixelsv1, pixelsv2,
 876      pixelsavg;
 877    register vector unsigned char
 878      blockv, temp1, temp2;
 879    register vector unsigned short
 880      pixelssum1, pixelssum2, temp3;
 881    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
 882    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
 883
        /* Prologue: build pixelssum1 = p[x] + p[x+1] + 2 for row 0, as 16-bit lanes.
           temp1/temp2 are the two aligned vectors covering the unaligned row. */
 884    temp1 = vec_ld(0, pixels);
 885    temp2 = vec_ld(16, pixels);
 886    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
        /* When pixels sits at offset 15 of its 16-byte line, pixels + 1 is
           16-byte aligned: the wanted bytes are exactly temp2, whereas
           vec_lvsl(1, pixels) would yield a permute selecting temp1. */
 887    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
 888    {
 889      pixelsv2 = temp2;
 890    }
 891    else
 892    {
 893      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
 894    }
        /* interleaving with zero bytes widens the first 8 pixels to 16 bits */
 895    pixelsv1 = vec_mergeh(vczero, pixelsv1);
 896    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 897    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 898                         (vector unsigned short)pixelsv2);
 899    pixelssum1 = vec_add(pixelssum1, vctwo);
 900
 901 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 902    for (i = 0; i < h ; i++) {
          /* nonzero when block is not 16-byte aligned, i.e. the 8 output bytes
             belong to the second half of the vector loaded into blockv */
 903      int rightside = ((unsigned long)block & 0x0000000F);
 904      blockv = vec_ld(0, block);
 905
          /* same unaligned load + widen as the prologue, for the row below */
 906      temp1 = vec_ld(line_size, pixels);
 907      temp2 = vec_ld(line_size + 16, pixels);
 908      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 909      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
 910      {
 911        pixelsv2 = temp2;
 912      }
 913      else
 914      {
 915        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 916      }
 917
 918      pixelsv1 = vec_mergeh(vczero, pixelsv1);
 919      pixelsv2 = vec_mergeh(vczero, pixelsv2);
 920      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 921                           (vector unsigned short)pixelsv2);
          /* (upper row sums + 2) + lower row sums, then divide by 4; the sums
             are at most 4 * 255 + 2, so the top bit is never set for vec_sra */
 922      temp3 = vec_add(pixelssum1, pixelssum2);
 923      temp3 = vec_sra(temp3, vctwo);
          /* the lower row (plus bias) becomes the upper row of the next iteration */
 924      pixelssum1 = vec_add(pixelssum2, vctwo);
          /* pack back to bytes: the 8 results land in the first 8 bytes */
 925      pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
 926
          /* Merge the 8 new bytes into the half of blockv that block points at
             and keep the other half as loaded. vcprm is defined outside this
             chunk; presumably plain indices pick 32-bit words of the first
             operand and sN picks word N of the second -- TODO confirm. */
 927      if (rightside)
 928      {
 929        blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
 930      }
 931      else
 932      {
 933        blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
 934      }
 935
 936      vec_st(blockv, 0, block);
 937
 938      block += line_size;
 939      pixels += line_size;
 940    }
 941
 942 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 943 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 944 }
 945
 946 /* next one assumes that ((line_size % 8) == 0) */
     /*
      * put_no_rnd_pixels8_xy2_altivec
      *
      * For each of h rows, replaces 8 bytes at block with the 2x2 source
      * average using a bias of 1 instead of 2:
      *   block[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 1) >> 2
      * with p = pixels and x = 0..7. Source rows 0..h are consumed (one more
      * row than is written) and 9 source bytes per row contribute.
      *
      *   block     destination, advanced by line_size per row. The AltiVec
      *             path only distinguishes (block & 0xF) == 0 from != 0, so it
      *             appears to rely on block being 8-byte aligned -- TODO
      *             confirm against callers.
      *   pixels    source, any alignment, advanced by line_size per row.
      *   line_size byte stride shared by block and pixels.
      *   h         number of rows to write.
      */
 947 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 948 {
 949 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
 950 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
     /*
      * Portable reference path: two passes (j), each handling one 4-byte-wide
      * column as a single 32-bit word. Bytes are split into low 2 bits (l0/l1)
      * and high 6 bits pre-shifted by 2 (h0/h1) so per-byte sums cannot carry
      * into the neighbouring byte.
      */
 951     int j;
 952 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 953     for (j = 0; j < 2; j++) {
 954       int i;
           /* row 0 of this column: horizontal pair sums, bias 1 folded into l0 */
 955       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 956       const uint32_t b =
 957         (((const struct unaligned_32 *) (pixels + 1))->l);
 958       uint32_t l0 =
 959         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
 960       uint32_t h0 =
 961         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 962       uint32_t l1, h1;
 963       pixels += line_size;
           /* two output rows per iteration; (l0,h0) and (l1,h1) alternate roles */
 964       for (i = 0; i < h; i += 2) {
 965         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 966         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
 967         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
 968         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 969         *((uint32_t *) block) =
 970           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 971         pixels += line_size;
 972         block += line_size;
 973         a = (((const struct unaligned_32 *) (pixels))->l);
 974         b = (((const struct unaligned_32 *) (pixels + 1))->l);
 975         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
 976         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 977         *((uint32_t *) block) =
 978           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 979         pixels += line_size;
 980         block += line_size;
           /* rewind to the top row and step 4 bytes right for the next column;
              NOTE(review): the rewind amounts match the loop only for even h */
 981       } pixels += 4 - line_size * (h + 1);
 982       block += 4 - line_size * h;
 983     }
 984
 985 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 986
 987 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 988    register int i;
 989    register vector unsigned char
 990      pixelsv1, pixelsv2,
 991      pixelsavg;
 992    register vector unsigned char
 993      blockv, temp1, temp2;
 994    register vector unsigned short
 995      pixelssum1, pixelssum2, temp3;
 996    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
        /* vcone is the bias added to the sums, vctwo is the shift count */
 997    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
 998    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
 999
        /* Prologue: build pixelssum1 = p[x] + p[x+1] + 1 for row 0, as 16-bit lanes. */
1000    temp1 = vec_ld(0, pixels);
1001    temp2 = vec_ld(16, pixels);
1002    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
        /* When pixels sits at offset 15 of its 16-byte line, pixels + 1 is
           16-byte aligned: the wanted bytes are exactly temp2, whereas
           vec_lvsl(1, pixels) would yield a permute selecting temp1. */
1003    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
1004    {
1005      pixelsv2 = temp2;
1006    }
1007    else
1008    {
1009      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1010    }
        /* interleaving with zero bytes widens the first 8 pixels to 16 bits */
1011    pixelsv1 = vec_mergeh(vczero, pixelsv1);
1012    pixelsv2 = vec_mergeh(vczero, pixelsv2);
1013    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1014                         (vector unsigned short)pixelsv2);
1015    pixelssum1 = vec_add(pixelssum1, vcone);
1016
1017 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1018    for (i = 0; i < h ; i++) {
          /* nonzero when block is not 16-byte aligned, i.e. the 8 output bytes
             belong to the second half of the vector loaded into blockv */
1019      int rightside = ((unsigned long)block & 0x0000000F);
1020      blockv = vec_ld(0, block);
1021
          /* same unaligned load + widen as the prologue, for the row below */
1022      temp1 = vec_ld(line_size, pixels);
1023      temp2 = vec_ld(line_size + 16, pixels);
1024      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1025      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1026      {
1027        pixelsv2 = temp2;
1028      }
1029      else
1030      {
1031        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1032      }
1033
1034      pixelsv1 = vec_mergeh(vczero, pixelsv1);
1035      pixelsv2 = vec_mergeh(vczero, pixelsv2);
1036      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1037                           (vector unsigned short)pixelsv2);
          /* (upper row sums + 1) + lower row sums, then divide by 4 */
1038      temp3 = vec_add(pixelssum1, pixelssum2);
1039      temp3 = vec_sra(temp3, vctwo);
          /* the lower row (plus bias) becomes the upper row of the next iteration */
1040      pixelssum1 = vec_add(pixelssum2, vcone);
          /* pack back to bytes: the 8 results land in the first 8 bytes */
1041      pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1042
          /* Merge the 8 new bytes into the half of blockv that block points at
             and keep the other half as loaded. vcprm is defined outside this
             chunk; presumably plain indices pick 32-bit words of the first
             operand and sN picks word N of the second -- TODO confirm. */
1043      if (rightside)
1044      {
1045        blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1046      }
1047      else
1048      {
1049        blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1050      }
1051
1052      vec_st(blockv, 0, block);
1053
1054      block += line_size;
1055      pixels += line_size;
1056    }
1057
1058 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1059 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1060 }
1061
1062 /* next one assumes that ((line_size % 16) == 0) */
     /*
      * put_pixels16_xy2_altivec
      *
      * For each of h rows, writes 16 bytes at block holding the rounded average
      * of the 2x2 source neighbourhood:
      *   block[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2
      * with p = pixels and x = 0..15. Source rows 0..h are consumed (one more
      * row than is written) and 17 source bytes per row contribute.
      *
      *   block     destination, advanced by line_size per row. The AltiVec
      *             path stores a whole vector with vec_st, which ignores the
      *             low four address bits, so it appears to rely on block being
      *             16-byte aligned -- TODO confirm against callers.
      *   pixels    source, any alignment, advanced by line_size per row.
      *   line_size byte stride shared by block and pixels.
      *   h         number of rows to write.
      */
1063 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1064 {
1065 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
1066 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
     /*
      * Portable reference path: four passes (j), each handling one 4-byte-wide
      * column as a single 32-bit word. Bytes are split into low 2 bits (l0/l1)
      * and high 6 bits pre-shifted by 2 (h0/h1) so per-byte sums cannot carry
      * into the neighbouring byte.
      */
1067     int j;
1068 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1069       for (j = 0; j < 4; j++) {
1070       int i;
           /* row 0 of this column: horizontal pair sums, rounding bias 2 folded into l0 */
1071       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1072       const uint32_t b =
1073         (((const struct unaligned_32 *) (pixels + 1))->l);
1074       uint32_t l0 =
1075         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1076       uint32_t h0 =
1077         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1078       uint32_t l1, h1;
1079       pixels += line_size;
           /* two output rows per iteration; (l0,h0) and (l1,h1) alternate roles */
1080       for (i = 0; i < h; i += 2) {
1081         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1082         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1083         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1084         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1085         *((uint32_t *) block) =
1086           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1087         pixels += line_size;
1088         block += line_size;
1089         a = (((const struct unaligned_32 *) (pixels))->l);
1090         b = (((const struct unaligned_32 *) (pixels + 1))->l);
1091         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1092         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1093         *((uint32_t *) block) =
1094           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1095         pixels += line_size;
1096         block += line_size;
           /* rewind to the top row and step 4 bytes right for the next column;
              NOTE(review): the rewind amounts match the loop only for even h */
1097       } pixels += 4 - line_size * (h + 1);
1098       block += 4 - line_size * h;
1099     }
1100
1101 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1102
1103 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1104    register int i;
1105    register vector unsigned char
1106      pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1107    register vector unsigned char
1108      blockv, temp1, temp2;
        /* sum1/sum2/temp3 cover pixels 0..7, sum3/sum4/temp4 cover pixels 8..15 */
1109    register vector unsigned short
1110      pixelssum1, pixelssum2, temp3,
1111      pixelssum3, pixelssum4, temp4;
1112    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
1113    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
1114
1115 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1116
        /* Prologue: row 0 sums p[x] + p[x+1] + 2 for both 8-pixel halves. */
1117    temp1 = vec_ld(0, pixels);
1118    temp2 = vec_ld(16, pixels);
1119    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
        /* When pixels sits at offset 15 of its 16-byte line, pixels + 1 is
           16-byte aligned: the wanted bytes are exactly temp2, whereas
           vec_lvsl(1, pixels) would yield a permute selecting temp1. */
1120    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
1121    {
1122      pixelsv2 = temp2;
1123    }
1124    else
1125    {
1126      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1127    }
        /* widen to 16 bits: mergel takes the last 8 pixels, mergeh the first 8;
           the mergel results must be taken before pixelsv1/2 are overwritten */
1128    pixelsv3 = vec_mergel(vczero, pixelsv1);
1129    pixelsv4 = vec_mergel(vczero, pixelsv2);
1130    pixelsv1 = vec_mergeh(vczero, pixelsv1);
1131    pixelsv2 = vec_mergeh(vczero, pixelsv2);
1132    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1133                         (vector unsigned short)pixelsv4);
1134    pixelssum3 = vec_add(pixelssum3, vctwo);
1135    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1136                         (vector unsigned short)pixelsv2);
1137    pixelssum1 = vec_add(pixelssum1, vctwo);
1138
1139    for (i = 0; i < h ; i++) {
          /* NOTE(review): this loaded value is never read; blockv is
             overwritten by vec_packsu below before the store */
1140      blockv = vec_ld(0, block);
1141
          /* same unaligned load + widen as the prologue, for the row below */
1142      temp1 = vec_ld(line_size, pixels);
1143      temp2 = vec_ld(line_size + 16, pixels);
1144      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1145      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1146      {
1147        pixelsv2 = temp2;
1148      }
1149      else
1150      {
1151        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1152      }
1153
1154      pixelsv3 = vec_mergel(vczero, pixelsv1);
1155      pixelsv4 = vec_mergel(vczero, pixelsv2);
1156      pixelsv1 = vec_mergeh(vczero, pixelsv1);
1157      pixelsv2 = vec_mergeh(vczero, pixelsv2);
1158
1159      pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1160                           (vector unsigned short)pixelsv4);
1161      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1162                           (vector unsigned short)pixelsv2);
          /* (upper row sums + 2) + lower row sums, then divide by 4, per half */
1163      temp4 = vec_add(pixelssum3, pixelssum4);
1164      temp4 = vec_sra(temp4, vctwo);
1165      temp3 = vec_add(pixelssum1, pixelssum2);
1166      temp3 = vec_sra(temp3, vctwo);
1167
          /* the lower row (plus bias) becomes the upper row of the next iteration */
1168      pixelssum3 = vec_add(pixelssum4, vctwo);
1169      pixelssum1 = vec_add(pixelssum2, vctwo);
1170
          /* pack both halves back into one 16-byte vector: temp3 first, temp4 second */
1171      blockv = vec_packsu(temp3, temp4);
1172
1173      vec_st(blockv, 0, block);
1174
1175      block += line_size;
1176      pixels += line_size;
1177    }
1178
1179 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1180 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1181 }
1182
1183 /* next one assumes that ((line_size % 16) == 0) */
     /*
      * put_no_rnd_pixels16_xy2_altivec
      *
      * For each of h rows, writes 16 bytes at block holding the 2x2 source
      * average using a bias of 1 instead of 2:
      *   block[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 1) >> 2
      * with p = pixels and x = 0..15. Source rows 0..h are consumed (one more
      * row than is written) and 17 source bytes per row contribute.
      *
      *   block     destination, advanced by line_size per row. The AltiVec
      *             path stores a whole vector with vec_st, which ignores the
      *             low four address bits, so it appears to rely on block being
      *             16-byte aligned -- TODO confirm against callers.
      *   pixels    source, any alignment, advanced by line_size per row.
      *   line_size byte stride shared by block and pixels.
      *   h         number of rows to write.
      */
1184 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1185 {
1186 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1187 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
     /*
      * Portable reference path: four passes (j), each handling one 4-byte-wide
      * column as a single 32-bit word. Bytes are split into low 2 bits (l0/l1)
      * and high 6 bits pre-shifted by 2 (h0/h1) so per-byte sums cannot carry
      * into the neighbouring byte.
      */
1188     int j;
1189 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1190       for (j = 0; j < 4; j++) {
1191       int i;
           /* row 0 of this column: horizontal pair sums, bias 1 folded into l0 */
1192       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1193       const uint32_t b =
1194         (((const struct unaligned_32 *) (pixels + 1))->l);
1195       uint32_t l0 =
1196         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1197       uint32_t h0 =
1198         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1199       uint32_t l1, h1;
1200       pixels += line_size;
           /* two output rows per iteration; (l0,h0) and (l1,h1) alternate roles */
1201       for (i = 0; i < h; i += 2) {
1202         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1203         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1204         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1205         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1206         *((uint32_t *) block) =
1207           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1208         pixels += line_size;
1209         block += line_size;
1210         a = (((const struct unaligned_32 *) (pixels))->l);
1211         b = (((const struct unaligned_32 *) (pixels + 1))->l);
1212         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1213         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1214         *((uint32_t *) block) =
1215           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1216         pixels += line_size;
1217         block += line_size;
           /* rewind to the top row and step 4 bytes right for the next column;
              NOTE(review): the rewind amounts match the loop only for even h */
1218       } pixels += 4 - line_size * (h + 1);
1219       block += 4 - line_size * h;
1220     }
1221
1222 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1223
1224 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1225    register int i;
1226    register vector unsigned char
1227      pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1228    register vector unsigned char
1229      blockv, temp1, temp2;
        /* sum1/sum2/temp3 cover pixels 0..7, sum3/sum4/temp4 cover pixels 8..15 */
1230    register vector unsigned short
1231      pixelssum1, pixelssum2, temp3,
1232      pixelssum3, pixelssum4, temp4;
1233    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
        /* vcone is the bias added to the sums, vctwo is the shift count */
1234    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
1235    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
1236
1237 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1238
        /* Prologue: row 0 sums p[x] + p[x+1] + 1 for both 8-pixel halves. */
1239    temp1 = vec_ld(0, pixels);
1240    temp2 = vec_ld(16, pixels);
1241    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
        /* When pixels sits at offset 15 of its 16-byte line, pixels + 1 is
           16-byte aligned: the wanted bytes are exactly temp2, whereas
           vec_lvsl(1, pixels) would yield a permute selecting temp1. */
1242    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
1243    {
1244      pixelsv2 = temp2;
1245    }
1246    else
1247    {
1248      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1249    }
        /* widen to 16 bits: mergel takes the last 8 pixels, mergeh the first 8;
           the mergel results must be taken before pixelsv1/2 are overwritten */
1250    pixelsv3 = vec_mergel(vczero, pixelsv1);
1251    pixelsv4 = vec_mergel(vczero, pixelsv2);
1252    pixelsv1 = vec_mergeh(vczero, pixelsv1);
1253    pixelsv2 = vec_mergeh(vczero, pixelsv2);
1254    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1255                         (vector unsigned short)pixelsv4);
1256    pixelssum3 = vec_add(pixelssum3, vcone);
1257    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1258                         (vector unsigned short)pixelsv2);
1259    pixelssum1 = vec_add(pixelssum1, vcone);
1260
1261    for (i = 0; i < h ; i++) {
          /* NOTE(review): this loaded value is never read; blockv is
             overwritten by vec_packsu below before the store */
1262      blockv = vec_ld(0, block);
1263
          /* same unaligned load + widen as the prologue, for the row below */
1264      temp1 = vec_ld(line_size, pixels);
1265      temp2 = vec_ld(line_size + 16, pixels);
1266      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1267      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1268      {
1269        pixelsv2 = temp2;
1270      }
1271      else
1272      {
1273        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1274      }
1275
1276      pixelsv3 = vec_mergel(vczero, pixelsv1);
1277      pixelsv4 = vec_mergel(vczero, pixelsv2);
1278      pixelsv1 = vec_mergeh(vczero, pixelsv1);
1279      pixelsv2 = vec_mergeh(vczero, pixelsv2);
1280
1281      pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1282                           (vector unsigned short)pixelsv4);
1283      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1284                           (vector unsigned short)pixelsv2);
          /* (upper row sums + 1) + lower row sums, then divide by 4, per half */
1285      temp4 = vec_add(pixelssum3, pixelssum4);
1286      temp4 = vec_sra(temp4, vctwo);
1287      temp3 = vec_add(pixelssum1, pixelssum2);
1288      temp3 = vec_sra(temp3, vctwo);
1289
          /* the lower row (plus bias) becomes the upper row of the next iteration */
1290      pixelssum3 = vec_add(pixelssum4, vcone);
1291      pixelssum1 = vec_add(pixelssum2, vcone);
1292
          /* pack both halves back into one 16-byte vector: temp3 first, temp4 second */
1293      blockv = vec_packsu(temp3, temp4);
1294
1295      vec_st(blockv, 0, block);
1296
1297      block += line_size;
1298      pixels += line_size;
1299    }
1300
1301 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1302 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1303 }
1304
1305 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
     /*
      * Computes the 8x8 block of differences src - dst, runs three +/- butterfly
      * stages along each row and three more across the rows (a Hadamard-type
      * transform), and returns the sum of the absolute values of the 64
      * resulting coefficients.
      *
      *   s       not referenced in this function.
      *   dst/src the two 8x8 pixel blocks; 8 bytes are used from each of the 8
      *           rows, any alignment.
      *   stride  byte distance between rows, applied to both dst and src.
      *   h       not referenced in this function; 8 rows are always processed.
      */
1306 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
1307   int sum;
1308 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
1309   register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
       /* temp0..temp7: row i after the three horizontal butterfly stages */
1310   register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
     /*
      * The two preprocessor branches below define the same constants with two
      * different vector-literal spellings (parentheses vs. braces), presumably
      * to suit the compiler used on each platform.
      *   vprodN: +1/-1 multipliers for butterfly stage N
      *   perm1 : swaps neighbouring 16-bit elements
      *   perm2 : swaps neighbouring pairs of 16-bit elements
      *   perm3 : swaps the two 64-bit halves
      */
1311 #ifdef CONFIG_DARWIN
1312   {
1313     register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
1314     register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
1315     register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
1316     register const_vector unsigned char perm1 = (const_vector unsigned char)
1317       (0x02, 0x03, 0x00, 0x01,
1318        0x06, 0x07, 0x04, 0x05,
1319        0x0A, 0x0B, 0x08, 0x09,
1320        0x0E, 0x0F, 0x0C, 0x0D);
1321     register const_vector unsigned char perm2 = (const_vector unsigned char)
1322       (0x04, 0x05, 0x06, 0x07,
1323        0x00, 0x01, 0x02, 0x03,
1324        0x0C, 0x0D, 0x0E, 0x0F,
1325        0x08, 0x09, 0x0A, 0x0B);
1326     register const_vector unsigned char perm3 = (const_vector unsigned char)
1327       (0x08, 0x09, 0x0A, 0x0B,
1328        0x0C, 0x0D, 0x0E, 0x0F,
1329        0x00, 0x01, 0x02, 0x03,
1330        0x04, 0x05, 0x06, 0x07);
1331 #else
1332   {
1333     register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
1334     register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
1335     register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
1336     register const_vector unsigned char perm1 = (const_vector unsigned char)
1337       {0x02, 0x03, 0x00, 0x01,
1338        0x06, 0x07, 0x04, 0x05,
1339        0x0A, 0x0B, 0x08, 0x09,
1340        0x0E, 0x0F, 0x0C, 0x0D};
1341     register const_vector unsigned char perm2 = (const_vector unsigned char)
1342       {0x04, 0x05, 0x06, 0x07,
1343        0x00, 0x01, 0x02, 0x03,
1344        0x0C, 0x0D, 0x0E, 0x0F,
1345        0x08, 0x09, 0x0A, 0x0B};
1346     register const_vector unsigned char perm3 = (const_vector unsigned char)
1347       {0x08, 0x09, 0x0A, 0x0B,
1348        0x0C, 0x0D, 0x0E, 0x0F,
1349        0x00, 0x01, 0x02, 0x03,
1350        0x04, 0x05, 0x06, 0x07};
1351 #endif
1352
     /*
      * ONEITERBUTTERFLY(i, res): processes row i.
      *  - Loads the 8 bytes of src and dst at row i with the usual unaligned
      *    vec_ld + vec_perm(vec_lvsl) idiom. The second vec_ld is issued only
      *    when the row starts past byte 8 of its 16-byte line, i.e. when the 8
      *    wanted bytes straddle two aligned vectors.
      *    NOTE(review): otherwise src2/dst2 are passed to vec_perm while still
      *    unassigned. The first 8 result bytes then come from src1/dst1 only,
      *    and only those 8 are consumed by vec_mergeh, but compilers may warn
      *    about the uninitialized read.
      *  - Widens both rows to 16 bits and subtracts (src - dst).
      *  - Each butterfly stage computes x * vprodN + permN(x), so every element
      *    becomes the sum or difference of itself and its stage partner.
      *  - The result after three stages is assigned to res.
      */
1353 #define ONEITERBUTTERFLY(i, res)                                        \
1354     {                                                                   \
1355       register vector unsigned char src1, src2, srcO;                   \
1356       register vector unsigned char dst1, dst2, dstO;                   \
1357       src1 = vec_ld(stride * i, src);                                   \
1358       if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8)       \
1359         src2 = vec_ld((stride * i) + 16, src);                          \
1360       srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));           \
1361       dst1 = vec_ld(stride * i, dst);                                   \
1362       if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8)       \
1363         dst2 = vec_ld((stride * i) + 16, dst);                          \
1364       dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));           \
1365       /* promote the unsigned chars to signed shorts */                 \
1366       /* we're in the 8x8 function, we only care for the first 8 */     \
1367       register vector signed short srcV =                               \
1368         (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1369       register vector signed short dstV =                               \
1370         (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1371       /* subtractions inside the first butterfly */                     \
1372       register vector signed short but0 = vec_sub(srcV, dstV);          \
1373       register vector signed short op1 = vec_perm(but0, but0, perm1);   \
1374       register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
1375       register vector signed short op2 = vec_perm(but1, but1, perm2);   \
1376       register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
1377       register vector signed short op3 = vec_perm(but2, but2, perm3);   \
1378       res = vec_mladd(but2, vprod3, op3);                               \
1379     }
1380     ONEITERBUTTERFLY(0, temp0);
1381     ONEITERBUTTERFLY(1, temp1);
1382     ONEITERBUTTERFLY(2, temp2);
1383     ONEITERBUTTERFLY(3, temp3);
1384     ONEITERBUTTERFLY(4, temp4);
1385     ONEITERBUTTERFLY(5, temp5);
1386     ONEITERBUTTERFLY(6, temp6);
1387     ONEITERBUTTERFLY(7, temp7);
1388   }
1389 #undef ONEITERBUTTERFLY
1390   {
1391     register vector signed int vsum;
         /* vertical stage 1: sum/difference of neighbouring rows */
1392     register vector signed short line0 = vec_add(temp0, temp1);
1393     register vector signed short line1 = vec_sub(temp0, temp1);
1394     register vector signed short line2 = vec_add(temp2, temp3);
1395     register vector signed short line3 = vec_sub(temp2, temp3);
1396     register vector signed short line4 = vec_add(temp4, temp5);
1397     register vector signed short line5 = vec_sub(temp4, temp5);
1398     register vector signed short line6 = vec_add(temp6, temp7);
1399     register vector signed short line7 = vec_sub(temp6, temp7);
1400
         /* vertical stage 2: partners are two lines apart */
1401     register vector signed short line0B = vec_add(line0, line2);
1402     register vector signed short line2B = vec_sub(line0, line2);
1403     register vector signed short line1B = vec_add(line1, line3);
1404     register vector signed short line3B = vec_sub(line1, line3);
1405     register vector signed short line4B = vec_add(line4, line6);
1406     register vector signed short line6B = vec_sub(line4, line6);
1407     register vector signed short line5B = vec_add(line5, line7);
1408     register vector signed short line7B = vec_sub(line5, line7);
1409
         /* vertical stage 3: partners are four lines apart */
1410     register vector signed short line0C = vec_add(line0B, line4B);
1411     register vector signed short line4C = vec_sub(line0B, line4B);
1412     register vector signed short line1C = vec_add(line1B, line5B);
1413     register vector signed short line5C = vec_sub(line1B, line5B);
1414     register vector signed short line2C = vec_add(line2B, line6B);
1415     register vector signed short line6C = vec_sub(line2B, line6B);
1416     register vector signed short line3C = vec_add(line3B, line7B);
1417     register vector signed short line7C = vec_sub(line3B, line7B);
1418
         /* accumulate |coefficient| of every line into four 32-bit partial sums */
1419     vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1420     vsum = vec_sum4s(vec_abs(line1C), vsum);
1421     vsum = vec_sum4s(vec_abs(line2C), vsum);
1422     vsum = vec_sum4s(vec_abs(line3C), vsum);
1423     vsum = vec_sum4s(vec_abs(line4C), vsum);
1424     vsum = vec_sum4s(vec_abs(line5C), vsum);
1425     vsum = vec_sum4s(vec_abs(line6C), vsum);
1426     vsum = vec_sum4s(vec_abs(line7C), vsum);
         /* fold the four partial sums into element 3, replicate it to every
            element, then store a single element into the scalar sum */
1427     vsum = vec_sums(vsum, (vector signed int)vzero);
1428     vsum = vec_splat(vsum, 3);
1429     vec_ste(vsum, 0, &sum);
1430   }
1431 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
1432   return sum;
1433 }
1434
1435 /*
1436   16x8 works with 16 elements; this avoids replicating
1437   loads, and gives the compiler more room for scheduling.
1438   It's only used from inside hadamard8_diff16_altivec.
1439
1440   Unfortunately, it seems gcc-3.3 is a bit dumb, and
1441   the compiled code has a LOT of spill code; it seems
1442   gcc (unlike xlc) cannot keep everything in registers
1443   by itself. The following code includes hand-made
1444   register allocation. It's not clean, but on
1445   a 7450 the resulting code is much faster (best case
1446   falls from 700+ cycles to 550).
1447
1448   xlc doesn't add spill code, but it doesn't know how to
1449   schedule for the 7450, and its code isn't much faster than
1450   gcc-3.3 on the 7450 (but uses 25% fewer instructions...)
1451
1452   On the 970, the hand-made RA is still a win (around 690
1453   vs. around 780), but xlc goes to around 660 on the
1454   regular C code...
1455 */
1456
1457 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
1458   int sum;
1459   register vector signed short
1460     temp0 asm ("v0"),
1461     temp1 asm ("v1"),
1462     temp2 asm ("v2"),
1463     temp3 asm ("v3"),
1464     temp4 asm ("v4"),
1465     temp5 asm ("v5"),
1466     temp6 asm ("v6"),
1467     temp7 asm ("v7");
1468   register vector signed short
1469     temp0S asm ("v8"),
1470     temp1S asm ("v9"),
1471     temp2S asm ("v10"),
1472     temp3S asm ("v11"),
1473     temp4S asm ("v12"),
1474     temp5S asm ("v13"),
1475     temp6S asm ("v14"),
1476     temp7S asm ("v15");
1477   register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
1478   {
1479 #ifdef CONFIG_DARWIN
1480     register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
1481     register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
1482     register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
1483     register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
1484       (0x02, 0x03, 0x00, 0x01,
1485        0x06, 0x07, 0x04, 0x05,
1486        0x0A, 0x0B, 0x08, 0x09,
1487        0x0E, 0x0F, 0x0C, 0x0D);
1488     register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
1489       (0x04, 0x05, 0x06, 0x07,
1490        0x00, 0x01, 0x02, 0x03,
1491        0x0C, 0x0D, 0x0E, 0x0F,
1492        0x08, 0x09, 0x0A, 0x0B);
1493     register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
1494       (0x08, 0x09, 0x0A, 0x0B,
1495        0x0C, 0x0D, 0x0E, 0x0F,
1496        0x00, 0x01, 0x02, 0x03,
1497        0x04, 0x05, 0x06, 0x07);
1498 #else
1499     register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
1500     register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
1501     register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
1502     register const_vector unsigned char perm1 = (const_vector unsigned char)
1503       {0x02, 0x03, 0x00, 0x01,
1504        0x06, 0x07, 0x04, 0x05,
1505        0x0A, 0x0B, 0x08, 0x09,
1506        0x0E, 0x0F, 0x0C, 0x0D};
1507     register const_vector unsigned char perm2 = (const_vector unsigned char)
1508       {0x04, 0x05, 0x06, 0x07,
1509        0x00, 0x01, 0x02, 0x03,
1510        0x0C, 0x0D, 0x0E, 0x0F,
1511        0x08, 0x09, 0x0A, 0x0B};
1512     register const_vector unsigned char perm3 = (const_vector unsigned char)
1513       {0x08, 0x09, 0x0A, 0x0B,
1514        0x0C, 0x0D, 0x0E, 0x0F,
1515        0x00, 0x01, 0x02, 0x03,
1516        0x04, 0x05, 0x06, 0x07};
1517 #endif
1518 #define ONEITERBUTTERFLY(i, res1, res2)                                 \
1519     {                                                                   \
1520       register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
1521       register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
1522       src1 = vec_ld(stride * i, src);                                   \
1523       src2 = vec_ld((stride * i) + 16, src);                            \
1524       register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1525       dst1 = vec_ld(stride * i, dst);                                   \
1526       dst2 = vec_ld((stride * i) + 16, dst);                            \
1527       register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1528       /* promote the unsigned chars to signed shorts */                 \
1529       register vector signed short srcV asm ("v24") =                   \
1530         (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
1531       register vector signed short dstV asm ("v25") =                   \
1532         (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
1533       register vector signed short srcW asm ("v26") =                   \
1534         (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
1535       register vector signed short dstW asm ("v27") =                   \
1536         (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
1537       /* substractions inside the first butterfly */                    \
1538       register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
1539       register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
1540       register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
1541       register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
1542       register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
1543       register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
1544       register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
1545       register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
1546       register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
1547       register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
1548       register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
1549       res1 = vec_mladd(but2, vprod3, op3);                              \
1550       register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
1551       res2 = vec_mladd(but2S, vprod3, op3S);                            \
1552     }
1553     ONEITERBUTTERFLY(0, temp0, temp0S);
1554     ONEITERBUTTERFLY(1, temp1, temp1S);
1555     ONEITERBUTTERFLY(2, temp2, temp2S);
1556     ONEITERBUTTERFLY(3, temp3, temp3S);
1557     ONEITERBUTTERFLY(4, temp4, temp4S);
1558     ONEITERBUTTERFLY(5, temp5, temp5S);
1559     ONEITERBUTTERFLY(6, temp6, temp6S);
1560     ONEITERBUTTERFLY(7, temp7, temp7S);
1561   }
1562 #undef ONEITERBUTTERFLY
1563   {
1564     register vector signed int vsum;
1565     register vector signed short line0 = vec_add(temp0, temp1);
1566     register vector signed short line1 = vec_sub(temp0, temp1);
1567     register vector signed short line2 = vec_add(temp2, temp3);
1568     register vector signed short line3 = vec_sub(temp2, temp3);
1569     register vector signed short line4 = vec_add(temp4, temp5);
1570     register vector signed short line5 = vec_sub(temp4, temp5);
1571     register vector signed short line6 = vec_add(temp6, temp7);
1572     register vector signed short line7 = vec_sub(temp6, temp7);
1573
1574     register vector signed short line0B = vec_add(line0, line2);
1575     register vector signed short line2B = vec_sub(line0, line2);
1576     register vector signed short line1B = vec_add(line1, line3);
1577     register vector signed short line3B = vec_sub(line1, line3);
1578     register vector signed short line4B = vec_add(line4, line6);
1579     register vector signed short line6B = vec_sub(line4, line6);
1580     register vector signed short line5B = vec_add(line5, line7);
1581     register vector signed short line7B = vec_sub(line5, line7);
1582
1583     register vector signed short line0C = vec_add(line0B, line4B);
1584     register vector signed short line4C = vec_sub(line0B, line4B);
1585     register vector signed short line1C = vec_add(line1B, line5B);
1586     register vector signed short line5C = vec_sub(line1B, line5B);
1587     register vector signed short line2C = vec_add(line2B, line6B);
1588     register vector signed short line6C = vec_sub(line2B, line6B);
1589     register vector signed short line3C = vec_add(line3B, line7B);
1590     register vector signed short line7C = vec_sub(line3B, line7B);
1591
1592     vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1593     vsum = vec_sum4s(vec_abs(line1C), vsum);
1594     vsum = vec_sum4s(vec_abs(line2C), vsum);
1595     vsum = vec_sum4s(vec_abs(line3C), vsum);
1596     vsum = vec_sum4s(vec_abs(line4C), vsum);
1597     vsum = vec_sum4s(vec_abs(line5C), vsum);
1598     vsum = vec_sum4s(vec_abs(line6C), vsum);
1599     vsum = vec_sum4s(vec_abs(line7C), vsum);
1600
1601     register vector signed short line0S = vec_add(temp0S, temp1S);
1602     register vector signed short line1S = vec_sub(temp0S, temp1S);
1603     register vector signed short line2S = vec_add(temp2S, temp3S);
1604     register vector signed short line3S = vec_sub(temp2S, temp3S);
1605     register vector signed short line4S = vec_add(temp4S, temp5S);
1606     register vector signed short line5S = vec_sub(temp4S, temp5S);
1607     register vector signed short line6S = vec_add(temp6S, temp7S);
1608     register vector signed short line7S = vec_sub(temp6S, temp7S);
1609
1610     register vector signed short line0BS = vec_add(line0S, line2S);
1611     register vector signed short line2BS = vec_sub(line0S, line2S);
1612     register vector signed short line1BS = vec_add(line1S, line3S);
1613     register vector signed short line3BS = vec_sub(line1S, line3S);
1614     register vector signed short line4BS = vec_add(line4S, line6S);
1615     register vector signed short line6BS = vec_sub(line4S, line6S);
1616     register vector signed short line5BS = vec_add(line5S, line7S);
1617     register vector signed short line7BS = vec_sub(line5S, line7S);
1618
1619     register vector signed short line0CS = vec_add(line0BS, line4BS);
1620     register vector signed short line4CS = vec_sub(line0BS, line4BS);
1621     register vector signed short line1CS = vec_add(line1BS, line5BS);
1622     register vector signed short line5CS = vec_sub(line1BS, line5BS);
1623     register vector signed short line2CS = vec_add(line2BS, line6BS);
1624     register vector signed short line6CS = vec_sub(line2BS, line6BS);
1625     register vector signed short line3CS = vec_add(line3BS, line7BS);
1626     register vector signed short line7CS = vec_sub(line3BS, line7BS);
1627
1628     vsum = vec_sum4s(vec_abs(line0CS), vsum);
1629     vsum = vec_sum4s(vec_abs(line1CS), vsum);
1630     vsum = vec_sum4s(vec_abs(line2CS), vsum);
1631     vsum = vec_sum4s(vec_abs(line3CS), vsum);
1632     vsum = vec_sum4s(vec_abs(line4CS), vsum);
1633     vsum = vec_sum4s(vec_abs(line5CS), vsum);
1634     vsum = vec_sum4s(vec_abs(line6CS), vsum);
1635     vsum = vec_sum4s(vec_abs(line7CS), vsum);
1636     vsum = vec_sums(vsum, (vector signed int)vzero);
1637     vsum = vec_splat(vsum, 3);
1638     vec_ste(vsum, 0, &sum);
1639   }
1640   return sum;
1641 }
1642
1643 int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1644 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
1645   int score;
1646 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
1647   score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1648   if (h==16) {
1649     dst += 8*stride;
1650     src += 8*stride;
1651     score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
1652   }
1653 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
1654   return score;
1655 }
1656
1657 int has_altivec(void)
1658 {
1659 #ifdef CONFIG_DARWIN
1660     int sels[2] = {CTL_HW, HW_VECTORUNIT};
1661     int has_vu = 0;
1662     size_t len = sizeof(has_vu);
1663     int err;
1664
1665     err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1666
1667     if (err == 0) return (has_vu != 0);
1668 #else /* CONFIG_DARWIN */
1669 /* no Darwin, do it the brute-force way */
1670 /* this is borrowed from the libmpeg2 library */
1671     {
1672       signal (SIGILL, sigill_handler);
1673       if (sigsetjmp (jmpbuf, 1)) {
1674         signal (SIGILL, SIG_DFL);
1675       } else {
1676         canjump = 1;
1677
1678         asm volatile ("mtspr 256, %0\n\t"
1679                       "vand %%v0, %%v0, %%v0"
1680                       :
1681                       : "r" (-1));
1682
1683         signal (SIGILL, SIG_DFL);
1684         return 1;
1685       }
1686     }
1687 #endif /* CONFIG_DARWIN */
1688     return 0;
1689 }