git.sesse.net Git - ffmpeg/blob - libavcodec/ppc/dsputil_altivec.c

   1 /*
   2  * Copyright (c) 2002 Brian Foley
   3  * Copyright (c) 2002 Dieter Shirley
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  */
  19
  20 #include "../dsputil.h"
  21 #include "dsputil_altivec.h"
  22
  23 #if CONFIG_DARWIN
  24 #include <sys/sysctl.h>
  25 #endif
  26
  27 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT
  28 unsigned long long perfdata[altivec_perf_total][altivec_data_total];
  29 /* list below must match enum in dsputil_altivec.h */
  30 static unsigned char* perfname[] = {
  31   "fft_calc",
  32   "gmc1",
  33   "dct_unquantize_h263",
  34   "idct_add",
  35   "idct_put",
  36   "put_pixels_clamped",
  37   "put_pixels16",
  38   "avg_pixels16"
  39 };
  40 #include <stdio.h>
  41 #endif
  42
  43 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  44 {
  45     int i;
  46     int s __attribute__((aligned(16)));
  47     const vector unsigned char zero = (const vector unsigned char)(0);
  48     vector unsigned char *tv;
  49     vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  50     vector unsigned int sad;
  51     vector signed int sumdiffs;
  52
  53     s = 0;
  54     sad = (vector unsigned int)(0);
  55     for(i=0;i<16;i++) {
  56         /*
  57            Read unaligned pixels into our vectors. The vectors are as follows:
  58            pix1v: pix1[0]-pix1[15]
  59            pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
  60         */
  61         tv = (vector unsigned char *) pix1;
  62         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  63
  64         tv = (vector unsigned char *) &pix2[0];
  65         pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  66
  67         tv = (vector unsigned char *) &pix2[1];
  68         pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  69
  70         /* Calculate the average vector */
  71         avgv = vec_avg(pix2v, pix2iv);
  72
  73         /* Calculate a sum of abs differences vector */
  74         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  75
  76         /* Add each 4 pixel group together and put 4 results into sad */
  77         sad = vec_sum4s(t5, sad);
  78
  79         pix1 += line_size;
  80         pix2 += line_size;
  81     }
  82     /* Sum up the four partial sums, and put the result into s */
  83     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  84     sumdiffs = vec_splat(sumdiffs, 3);
  85     vec_ste(sumdiffs, 0, &s);
  86
  87     return s;
  88 }
  89
  90 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  91 {
  92     int i;
  93     int s __attribute__((aligned(16)));
  94     const vector unsigned char zero = (const vector unsigned char)(0);
  95     vector unsigned char *tv;
  96     vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
  97     vector unsigned int sad;
  98     vector signed int sumdiffs;
  99     uint8_t *pix3 = pix2 + line_size;
 100
 101     s = 0;
 102     sad = (vector unsigned int)(0);
 103
 104     /*
 105        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 106        iteration becomes pix2 in the next iteration. We can use this
 107        fact to avoid a potentially expensive unaligned read, each
 108        time around the loop.
 109        Read unaligned pixels into our vectors. The vectors are as follows:
 110        pix2v: pix2[0]-pix2[15]
 111        Split the pixel vectors into shorts
 112     */
 113     tv = (vector unsigned char *) &pix2[0];
 114     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 115
 116     for(i=0;i<16;i++) {
 117         /*
 118            Read unaligned pixels into our vectors. The vectors are as follows:
 119            pix1v: pix1[0]-pix1[15]
 120            pix3v: pix3[0]-pix3[15]
 121         */
 122         tv = (vector unsigned char *) pix1;
 123         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 124
 125         tv = (vector unsigned char *) &pix3[0];
 126         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 127
 128         /* Calculate the average vector */
 129         avgv = vec_avg(pix2v, pix3v);
 130
 131         /* Calculate a sum of abs differences vector */
 132         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 133
 134         /* Add each 4 pixel group together and put 4 results into sad */
 135         sad = vec_sum4s(t5, sad);
 136
 137         pix1 += line_size;
 138         pix2v = pix3v;
 139         pix3 += line_size;
 140
 141     }
 142
 143     /* Sum up the four partial sums, and put the result into s */
 144     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 145     sumdiffs = vec_splat(sumdiffs, 3);
 146     vec_ste(sumdiffs, 0, &s);
 147     return s;
 148 }
 149
 150 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 151 {
 152     int i;
 153     int s __attribute__((aligned(16)));
 154     uint8_t *pix3 = pix2 + line_size;
 155     const vector unsigned char zero = (const vector unsigned char)(0);
 156     const vector unsigned short two = (const vector unsigned short)(2);
 157     vector unsigned char *tv, avgv, t5;
 158     vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
 159     vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
 160     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
 161     vector unsigned short avghv, avglv;
 162     vector unsigned short t1, t2, t3, t4;
 163     vector unsigned int sad;
 164     vector signed int sumdiffs;
 165
 166     sad = (vector unsigned int)(0);
 167
 168     s = 0;
 169
 170     /*
 171        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 172        iteration becomes pix2 in the next iteration. We can use this
 173        fact to avoid a potentially expensive unaligned read, as well
 174        as some splitting, and vector addition each time around the loop.
 175        Read unaligned pixels into our vectors. The vectors are as follows:
 176        pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
 177        Split the pixel vectors into shorts
 178     */
 179     tv = (vector unsigned char *) &pix2[0];
 180     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 181
 182     tv = (vector unsigned char *) &pix2[1];
 183     pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
 184
 185     pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
 186     pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
 187     pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
 188     pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
 189     t1 = vec_add(pix2hv, pix2ihv);
 190     t2 = vec_add(pix2lv, pix2ilv);
 191
 192     for(i=0;i<16;i++) {
 193         /*
 194            Read unaligned pixels into our vectors. The vectors are as follows:
 195            pix1v: pix1[0]-pix1[15]
 196            pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
 197         */
 198         tv = (vector unsigned char *) pix1;
 199         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 200
 201         tv = (vector unsigned char *) &pix3[0];
 202         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 203
 204         tv = (vector unsigned char *) &pix3[1];
 205         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
 206
 207         /*
 208           Note that Altivec does have vec_avg, but this works on vector pairs
 209           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
 210           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
 211           Instead, we have to split the pixel vectors into vectors of shorts,
 212           and do the averaging by hand.
 213         */
 214
 215         /* Split the pixel vectors into shorts */
 216         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
 217         pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
 218         pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
 219         pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
 220
 221         /* Do the averaging on them */
 222         t3 = vec_add(pix3hv, pix3ihv);
 223         t4 = vec_add(pix3lv, pix3ilv);
 224
 225         avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
 226         avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
 227
 228         /* Pack the shorts back into a result */
 229         avgv = vec_pack(avghv, avglv);
 230
 231         /* Calculate a sum of abs differences vector */
 232         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 233
 234         /* Add each 4 pixel group together and put 4 results into sad */
 235         sad = vec_sum4s(t5, sad);
 236
 237         pix1 += line_size;
 238         pix3 += line_size;
 239         /* Transfer the calculated values for pix3 into pix2 */
 240         t1 = t3;
 241         t2 = t4;
 242     }
 243     /* Sum up the four partial sums, and put the result into s */
 244     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 245     sumdiffs = vec_splat(sumdiffs, 3);
 246     vec_ste(sumdiffs, 0, &s);
 247
 248     return s;
 249 }
 250
 251 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 252 {
 253     int i;
 254     int s __attribute__((aligned(16)));
 255     const vector unsigned int zero = (const vector unsigned int)(0);
 256     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 257     vector unsigned char t1, t2, t3,t4, t5;
 258     vector unsigned int sad;
 259     vector signed int sumdiffs;
 260
 261     sad = (vector unsigned int) (0);
 262
 263
 264     for(i=0;i<16;i++) {
 265         /* Read potentially unaligned pixels into t1 and t2 */
 266         perm1 = vec_lvsl(0, pix1);
 267         pix1v = (vector unsigned char *) pix1;
 268         perm2 = vec_lvsl(0, pix2);
 269         pix2v = (vector unsigned char *) pix2;
 270         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 271         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 272
 273         /* Calculate a sum of abs differences vector */
 274         t3 = vec_max(t1, t2);
 275         t4 = vec_min(t1, t2);
 276         t5 = vec_sub(t3, t4);
 277
 278         /* Add each 4 pixel group together and put 4 results into sad */
 279         sad = vec_sum4s(t5, sad);
 280
 281         pix1 += line_size;
 282         pix2 += line_size;
 283     }
 284
 285     /* Sum up the four partial sums, and put the result into s */
 286     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 287     sumdiffs = vec_splat(sumdiffs, 3);
 288     vec_ste(sumdiffs, 0, &s);
 289
 290     return s;
 291 }
 292
 293 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 294 {
 295     int i;
 296     int s __attribute__((aligned(16)));
 297     const vector unsigned int zero = (const vector unsigned int)(0);
 298     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 299     vector unsigned char t1, t2, t3,t4, t5;
 300     vector unsigned int sad;
 301     vector signed int sumdiffs;
 302
 303     sad = (vector unsigned int)(0);
 304     permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 305
 306     for(i=0;i<8;i++) {
 307         /* Read potentially unaligned pixels into t1 and t2
 308            Since we're reading 16 pixels, and actually only want 8,
 309            mask out the last 8 pixels. The 0s don't change the sum. */
 310         perm1 = vec_lvsl(0, pix1);
 311         pix1v = (vector unsigned char *) pix1;
 312         perm2 = vec_lvsl(0, pix2);
 313         pix2v = (vector unsigned char *) pix2;
 314         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 315         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 316
 317         /* Calculate a sum of abs differences vector */
 318         t3 = vec_max(t1, t2);
 319         t4 = vec_min(t1, t2);
 320         t5 = vec_sub(t3, t4);
 321
 322         /* Add each 4 pixel group together and put 4 results into sad */
 323         sad = vec_sum4s(t5, sad);
 324
 325         pix1 += line_size;
 326         pix2 += line_size;
 327     }
 328
 329     /* Sum up the four partial sums, and put the result into s */
 330     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 331     sumdiffs = vec_splat(sumdiffs, 3);
 332     vec_ste(sumdiffs, 0, &s);
 333
 334     return s;
 335 }
 336
 337 int pix_norm1_altivec(uint8_t *pix, int line_size)
 338 {
 339     int i;
 340     int s __attribute__((aligned(16)));
 341     const vector unsigned int zero = (const vector unsigned int)(0);
 342     vector unsigned char *tv;
 343     vector unsigned char pixv;
 344     vector unsigned int sv;
 345     vector signed int sum;
 346
 347     sv = (vector unsigned int)(0);
 348
 349     s = 0;
 350     for (i = 0; i < 16; i++) {
 351         /* Read in the potentially unaligned pixels */
 352         tv = (vector unsigned char *) pix;
 353         pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
 354
 355         /* Square the values, and add them to our sum */
 356         sv = vec_msum(pixv, pixv, sv);
 357
 358         pix += line_size;
 359     }
 360     /* Sum up the four partial sums, and put the result into s */
 361     sum = vec_sums((vector signed int) sv, (vector signed int) zero);
 362     sum = vec_splat(sum, 3);
 363     vec_ste(sum, 0, &s);
 364
 365     return s;
 366 }
 367
 368 /**
 369  * Sum of Squared Errors for a 8x8 block.
 370  * AltiVec-enhanced.
 371  * It's the pix_abs8x8_altivec code above w/ squaring added.
 372  */
 373 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 374 {
 375     int i;
 376     int s __attribute__((aligned(16)));
 377     const vector unsigned int zero = (const vector unsigned int)(0);
 378     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 379     vector unsigned char t1, t2, t3,t4, t5;
 380     vector unsigned int sum;
 381     vector signed int sumsqr;
 382
 383     sum = (vector unsigned int)(0);
 384     permclear = (vector unsigned char)(0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);
 385
 386     for(i=0;i<8;i++) {
 387         /* Read potentially unaligned pixels into t1 and t2
 388            Since we're reading 16 pixels, and actually only want 8,
 389            mask out the last 8 pixels. The 0s don't change the sum. */
 390         perm1 = vec_lvsl(0, pix1);
 391         pix1v = (vector unsigned char *) pix1;
 392         perm2 = vec_lvsl(0, pix2);
 393         pix2v = (vector unsigned char *) pix2;
 394         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 395         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 396
 397         /*
 398           Since we want to use unsigned chars, we can take advantage
 399           of the fact that abs(a-b)^2 = (a-b)^2.
 400         */
 401
 402         /* Calculate abs differences vector */
 403         t3 = vec_max(t1, t2);
 404         t4 = vec_min(t1, t2);
 405         t5 = vec_sub(t3, t4);
 406
 407         /* Square the values and add them to our sum */
 408         sum = vec_msum(t5, t5, sum);
 409
 410         pix1 += line_size;
 411         pix2 += line_size;
 412     }
 413
 414     /* Sum up the four partial sums, and put the result into s */
 415     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 416     sumsqr = vec_splat(sumsqr, 3);
 417     vec_ste(sumsqr, 0, &s);
 418
 419     return s;
 420 }
 421
 422 /**
 423  * Sum of Squared Errors for a 16x16 block.
 424  * AltiVec-enhanced.
 425  * It's the pix_abs16x16_altivec code above w/ squaring added.
 426  */
 427 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 428 {
 429     int i;
 430     int s __attribute__((aligned(16)));
 431     const vector unsigned int zero = (const vector unsigned int)(0);
 432     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 433     vector unsigned char t1, t2, t3,t4, t5;
 434     vector unsigned int sum;
 435     vector signed int sumsqr;
 436
 437     sum = (vector unsigned int)(0);
 438
 439     for(i=0;i<16;i++) {
 440         /* Read potentially unaligned pixels into t1 and t2 */
 441         perm1 = vec_lvsl(0, pix1);
 442         pix1v = (vector unsigned char *) pix1;
 443         perm2 = vec_lvsl(0, pix2);
 444         pix2v = (vector unsigned char *) pix2;
 445         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 446         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 447
 448         /*
 449           Since we want to use unsigned chars, we can take advantage
 450           of the fact that abs(a-b)^2 = (a-b)^2.
 451         */
 452
 453         /* Calculate abs differences vector */
 454         t3 = vec_max(t1, t2);
 455         t4 = vec_min(t1, t2);
 456         t5 = vec_sub(t3, t4);
 457
 458         /* Square the values and add them to our sum */
 459         sum = vec_msum(t5, t5, sum);
 460
 461         pix1 += line_size;
 462         pix2 += line_size;
 463     }
 464
 465     /* Sum up the four partial sums, and put the result into s */
 466     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 467     sumsqr = vec_splat(sumsqr, 3);
 468     vec_ste(sumsqr, 0, &s);
 469
 470     return s;
 471 }
 472
 473 int pix_sum_altivec(UINT8 * pix, int line_size)
 474 {
 475     const vector unsigned int zero = (const vector unsigned int)(0);
 476     vector unsigned char perm, *pixv;
 477     vector unsigned char t1;
 478     vector unsigned int sad;
 479     vector signed int sumdiffs;
 480
 481     int i;
 482     int s __attribute__((aligned(16)));
 483
 484     sad = (vector unsigned int) (0);
 485
 486     for (i = 0; i < 16; i++) {
 487         /* Read the potentially unaligned 16 pixels into t1 */
 488         perm = vec_lvsl(0, pix);
 489         pixv = (vector unsigned char *) pix;
 490         t1 = vec_perm(pixv[0], pixv[1], perm);
 491
 492         /* Add each 4 pixel group together and put 4 results into sad */
 493         sad = vec_sum4s(t1, sad);
 494
 495         pix += line_size;
 496     }
 497
 498     /* Sum up the four partial sums, and put the result into s */
 499     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 500     sumdiffs = vec_splat(sumdiffs, 3);
 501     vec_ste(sumdiffs, 0, &s);
 502
 503     return s;
 504 }
 505
 506 void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
 507 {
 508     int i;
 509     vector unsigned char perm, bytes, *pixv;
 510     const vector unsigned char zero = (const vector unsigned char) (0);
 511     vector signed short shorts;
 512
 513     for(i=0;i<8;i++)
 514     {
 515         // Read potentially unaligned pixels.
 516         // We're reading 16 pixels, and actually only want 8,
 517         // but we simply ignore the extras.
 518         perm = vec_lvsl(0, pixels);
 519         pixv = (vector unsigned char *) pixels;
 520         bytes = vec_perm(pixv[0], pixv[1], perm);
 521
 522         // convert the bytes into shorts
 523         shorts = (vector signed short)vec_mergeh(zero, bytes);
 524
 525         // save the data to the block, we assume the block is 16-byte aligned
 526         vec_st(shorts, i*16, (vector signed short*)block);
 527
 528         pixels += line_size;
 529     }
 530 }
 531
 532 void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
 533         const UINT8 *s2, int stride)
 534 {
 535     int i;
 536     vector unsigned char perm, bytes, *pixv;
 537     const vector unsigned char zero = (const vector unsigned char) (0);
 538     vector signed short shorts1, shorts2;
 539
 540     for(i=0;i<4;i++)
 541     {
 542         // Read potentially unaligned pixels
 543         // We're reading 16 pixels, and actually only want 8,
 544         // but we simply ignore the extras.
 545         perm = vec_lvsl(0, s1);
 546         pixv = (vector unsigned char *) s1;
 547         bytes = vec_perm(pixv[0], pixv[1], perm);
 548
 549         // convert the bytes into shorts
 550         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 551
 552         // Do the same for the second block of pixels
 553         perm = vec_lvsl(0, s2);
 554         pixv = (vector unsigned char *) s2;
 555         bytes = vec_perm(pixv[0], pixv[1], perm);
 556
 557         // convert the bytes into shorts
 558         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 559
 560         // Do the subtraction
 561         shorts1 = vec_sub(shorts1, shorts2);
 562
 563         // save the data to the block, we assume the block is 16-byte aligned
 564         vec_st(shorts1, 0, (vector signed short*)block);
 565
 566         s1 += stride;
 567         s2 += stride;
 568         block += 8;
 569
 570
 571         // The code below is a copy of the code above... This is a manual
 572         // unroll.
 573
 574         // Read potentially unaligned pixels
 575         // We're reading 16 pixels, and actually only want 8,
 576         // but we simply ignore the extras.
 577         perm = vec_lvsl(0, s1);
 578         pixv = (vector unsigned char *) s1;
 579         bytes = vec_perm(pixv[0], pixv[1], perm);
 580
 581         // convert the bytes into shorts
 582         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 583
 584         // Do the same for the second block of pixels
 585         perm = vec_lvsl(0, s2);
 586         pixv = (vector unsigned char *) s2;
 587         bytes = vec_perm(pixv[0], pixv[1], perm);
 588
 589         // convert the bytes into shorts
 590         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 591
 592         // Do the subtraction
 593         shorts1 = vec_sub(shorts1, shorts2);
 594
 595         // save the data to the block, we assume the block is 16-byte aligned
 596         vec_st(shorts1, 0, (vector signed short*)block);
 597
 598         s1 += stride;
 599         s2 += stride;
 600         block += 8;
 601     }
 602 }
 603
 604 int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
 605   return pix_abs16x16_altivec(a,b,stride);
 606 }
 607
 608 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
 609   return pix_abs8x8_altivec(a,b,stride);
 610 }
 611
 612 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
 613 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 614     int i;
 615     for(i=0; i+7<w; i++){
 616         dst[i+0] += src[i+0];
 617         dst[i+1] += src[i+1];
 618         dst[i+2] += src[i+2];
 619         dst[i+3] += src[i+3];
 620         dst[i+4] += src[i+4];
 621         dst[i+5] += src[i+5];
 622         dst[i+6] += src[i+6];
 623         dst[i+7] += src[i+7];
 624     }
 625     for(; i<w; i++)
 626         dst[i+0] += src[i+0];
 627 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 628     register int i;
 629     register vector unsigned char vdst, vsrc;
 630
 631     /* dst and src are 16 bytes-aligned (guaranteed) */
 632     for(i = 0 ; (i + 15) < w ; i++)
 633     {
 634       vdst = vec_ld(i << 4, (unsigned char*)dst);
 635       vsrc = vec_ld(i << 4, (unsigned char*)src);
 636       vdst = vec_add(vsrc, vdst);
 637       vec_st(vdst, i << 4, (unsigned char*)dst);
 638     }
 639     /* if w is not a multiple of 16 */
 640     for (; (i < w) ; i++)
 641     {
 642       dst[i] = src[i];
 643     }
 644 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 645 }
 646
 647 extern UINT8 cropTbl[];
 648 void put_pixels_clamped_altivec(const DCTELEM *block, UINT8 *restrict pixels,
 649                                 int line_size)
 650 {
 651 ALTIVEC_TBL_DECLARE(altivec_put_pixels_clamped_num, 1);
 652 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 653     int i;
 654     UINT8 *cm = cropTbl + MAX_NEG_CROP;
 655
 656 ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
 657
 658     /* read the pixels */
 659     for(i=0;i<8;i++) {
 660         pixels[0] = cm[block[0]];
 661         pixels[1] = cm[block[1]];
 662         pixels[2] = cm[block[2]];
 663         pixels[3] = cm[block[3]];
 664         pixels[4] = cm[block[4]];
 665         pixels[5] = cm[block[5]];
 666         pixels[6] = cm[block[6]];
 667         pixels[7] = cm[block[7]];
 668
 669         pixels += line_size;
 670         block += 8;
 671     }
 672
 673 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
 674
 675 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 676     register const vector short vczero = (const vector short)(0);
 677     register vector short
 678       blockv0, blockv1, blockv2, blockv3,
 679       blockv4, blockv5, blockv6, blockv7;
 680     register vector unsigned char
 681       pixelsv0, pixelsv1, pixelsv2, pixelsv3, pixelsv4,
 682       pixelsv0old, pixelsv4old;
 683
 684 ALTIVEC_TBL_START_COUNT(altivec_put_pixels_clamped_num, 1);
 685
 686     blockv0 = vec_ld(0, block);
 687     blockv1 = vec_ld(16, block);
 688     blockv2 = vec_ld(32, block);
 689     blockv3 = vec_ld(48, block);
 690     blockv4 = vec_ld(64, block);
 691     blockv5 = vec_ld(80, block);
 692     blockv6 = vec_ld(96, block);
 693     blockv7 = vec_ld(112, block);
 694     if (((unsigned long)pixels) & 0x0000000F)
 695     {
 696       pixelsv0old = vec_ld(-8, pixels);
 697       pixelsv4old = vec_ld(56, pixels);
 698       pixelsv0 = vec_packsu(vczero, blockv0);
 699       pixelsv1 = vec_packsu(blockv1, blockv2);
 700       pixelsv2 = vec_packsu(blockv3, blockv4);
 701       pixelsv3 = vec_packsu(blockv5, blockv6);
 702       pixelsv4 = vec_packsu(blockv5, vczero);
 703       pixelsv0 = vec_perm(pixelsv0old, pixelsv0, vcprm(0, 1, s2, s3));
 704       pixelsv4 = vec_perm(pixelsv4, pixelsv4old, vcprm(0, 1, s2, s3));
 705       vec_st(pixelsv0, -8, pixels);
 706       vec_st(pixelsv1, 8, pixels);
 707       vec_st(pixelsv2, 24, pixels);
 708       vec_st(pixelsv3, 40, pixels);
 709       vec_st(pixelsv4, 56, pixels);
 710     }
 711     else
 712     {
 713       pixelsv0 = vec_packsu(blockv0, blockv1);
 714       pixelsv1 = vec_packsu(blockv2, blockv3);
 715       pixelsv2 = vec_packsu(blockv4, blockv5);
 716       pixelsv3 = vec_packsu(blockv6, blockv7);
 717       vec_st(pixelsv0, 0, pixels);
 718       vec_st(pixelsv1, 16, pixels);
 719       vec_st(pixelsv2, 32, pixels);
 720       vec_st(pixelsv3, 48, pixels);
 721     }
 722
 723 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels_clamped_num, 1);
 724 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 725 }
 726
 727 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 728 {
 729 ALTIVEC_TBL_DECLARE(altivec_put_pixels16_num, 1);
 730 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 731     int i;
 732
 733 ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
 734
 735     for(i=0; i<h; i++) {
 736       *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
 737       *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
 738       *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
 739       *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
 740       pixels+=line_size;
 741       block +=line_size;
 742     }
 743
 744 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
 745
 746 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 747
 748     register vector unsigned char perm = vec_lvsl(0, pixels);
 749     register vector unsigned char pixelsv1, pixelsv2;
 750     int i;
 751
 752 ALTIVEC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
 753
 754     for(i=0; i<h; i++) {
 755       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 756       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 757       vec_st(vec_perm(pixelsv1, pixelsv2, perm), 0, (unsigned char*)block);
 758       pixels+=line_size;
 759       block +=line_size;
 760     }
 761
 762 ALTIVEC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
 763
 764 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 765 }
 766
 767 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 768 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 769 {
 770 ALTIVEC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
 771 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 772     int i;
 773
 774 ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
 775
 776     for(i=0; i<h; i++) {
 777       op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
 778       op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
 779       op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
 780       op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
 781       pixels+=line_size;
 782       block +=line_size;
 783     }
 784
 785 ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
 786
 787 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 788
 789     register vector unsigned char perm = vec_lvsl(0, pixels);
 790     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 791     int i;
 792
 793 ALTIVEC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
 794
 795     for(i=0; i<h; i++) {
 796       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 797       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 798       blockv = vec_ld(0, block);
 799       pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
 800       blockv = vec_avg(blockv,pixelsv);
 801       vec_st(blockv, 0, (unsigned char*)block);
 802       pixels+=line_size;
 803       block +=line_size;
 804     }
 805
 806 ALTIVEC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
 807
 808 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 809 }
 810
 811 int has_altivec(void)
 812 {
 813 #if CONFIG_DARWIN
 814     int sels[2] = {CTL_HW, HW_VECTORUNIT};
 815     int has_vu = 0;
 816     size_t len = sizeof(has_vu);
 817     int err;
 818
 819     err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
 820
 821     if (err == 0) return (has_vu != 0);
 822 #endif
 823     return 0;
 824 }
 825
 826 #ifdef ALTIVEC_TBL_PERFORMANCE_REPORT
 827 void altivec_display_perf_report(void)
 828 {
 829   int i;
 830   fprintf(stderr, "AltiVec performance report\n Values are from the Time Base register, and represent 4 bus cycles.\n");
 831   for(i = 0 ; i < altivec_perf_total ; i++)
 832   {
 833     if (perfdata[i][altivec_data_num] != (unsigned long long)0)
 834       fprintf(stderr, " Function \"%s\":\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
 835               perfname[i],
 836               perfdata[i][altivec_data_min],
 837               perfdata[i][altivec_data_max],
 838               (double)perfdata[i][altivec_data_sum] /
 839               (double)perfdata[i][altivec_data_num],
 840               perfdata[i][altivec_data_num]);
 841   }
 842 }
 843 #endif /* ALTIVEC_TBL_PERFORMANCE_REPORT */