git.sesse.net Git - ffmpeg/blob - libavcodec/ppc/dsputil_altivec.c

   1 /*
   2  * Copyright (c) 2002 Brian Foley
   3  * Copyright (c) 2002 Dieter Shirley
   4  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  */
  20
  21 #include "../dsputil.h"
  22
  23 #include "gcc_fixes.h"
  24
  25 #include "dsputil_altivec.h"
  26
  27 #ifdef CONFIG_DARWIN
  28 #include <sys/sysctl.h>
  29 #else /* CONFIG_DARWIN */
  30 #include <signal.h>
  31 #include <setjmp.h>
  32
  33 static sigjmp_buf jmpbuf;
  34 static volatile sig_atomic_t canjump = 0;
  35
  36 static void sigill_handler (int sig)
  37 {
  38     if (!canjump) {
  39         signal (sig, SIG_DFL);
  40         raise (sig);
  41     }
  42
  43     canjump = 0;
  44     siglongjmp (jmpbuf, 1);
  45 }
  46 #endif /* CONFIG_DARWIN */
  47
  48 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  49 {
  50     int i;
  51     int s __attribute__((aligned(16)));
  52     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
  53     vector unsigned char *tv;
  54     vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  55     vector unsigned int sad;
  56     vector signed int sumdiffs;
  57
  58     s = 0;
  59     sad = (vector unsigned int)vec_splat_u32(0);
  60     for(i=0;i<16;i++) {
  61         /*
  62            Read unaligned pixels into our vectors. The vectors are as follows:
  63            pix1v: pix1[0]-pix1[15]
  64            pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
  65         */
  66         tv = (vector unsigned char *) pix1;
  67         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  68
  69         tv = (vector unsigned char *) &pix2[0];
  70         pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  71
  72         tv = (vector unsigned char *) &pix2[1];
  73         pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  74
  75         /* Calculate the average vector */
  76         avgv = vec_avg(pix2v, pix2iv);
  77
  78         /* Calculate a sum of abs differences vector */
  79         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  80
  81         /* Add each 4 pixel group together and put 4 results into sad */
  82         sad = vec_sum4s(t5, sad);
  83
  84         pix1 += line_size;
  85         pix2 += line_size;
  86     }
  87     /* Sum up the four partial sums, and put the result into s */
  88     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  89     sumdiffs = vec_splat(sumdiffs, 3);
  90     vec_ste(sumdiffs, 0, &s);
  91
  92     return s;
  93 }
  94
  95 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  96 {
  97     int i;
  98     int s __attribute__((aligned(16)));
  99     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 100     vector unsigned char *tv;
 101     vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
 102     vector unsigned int sad;
 103     vector signed int sumdiffs;
 104     uint8_t *pix3 = pix2 + line_size;
 105
 106     s = 0;
 107     sad = (vector unsigned int)vec_splat_u32(0);
 108
 109     /*
 110        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 111        iteration becomes pix2 in the next iteration. We can use this
 112        fact to avoid a potentially expensive unaligned read, each
 113        time around the loop.
 114        Read unaligned pixels into our vectors. The vectors are as follows:
 115        pix2v: pix2[0]-pix2[15]
 116        Split the pixel vectors into shorts
 117     */
 118     tv = (vector unsigned char *) &pix2[0];
 119     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 120
 121     for(i=0;i<16;i++) {
 122         /*
 123            Read unaligned pixels into our vectors. The vectors are as follows:
 124            pix1v: pix1[0]-pix1[15]
 125            pix3v: pix3[0]-pix3[15]
 126         */
 127         tv = (vector unsigned char *) pix1;
 128         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 129
 130         tv = (vector unsigned char *) &pix3[0];
 131         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 132
 133         /* Calculate the average vector */
 134         avgv = vec_avg(pix2v, pix3v);
 135
 136         /* Calculate a sum of abs differences vector */
 137         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 138
 139         /* Add each 4 pixel group together and put 4 results into sad */
 140         sad = vec_sum4s(t5, sad);
 141
 142         pix1 += line_size;
 143         pix2v = pix3v;
 144         pix3 += line_size;
 145
 146     }
 147
 148     /* Sum up the four partial sums, and put the result into s */
 149     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 150     sumdiffs = vec_splat(sumdiffs, 3);
 151     vec_ste(sumdiffs, 0, &s);
 152     return s;
 153 }
 154
 155 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 156 {
 157     int i;
 158     int s __attribute__((aligned(16)));
 159     uint8_t *pix3 = pix2 + line_size;
 160     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 161     const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
 162     vector unsigned char *tv, avgv, t5;
 163     vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
 164     vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
 165     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
 166     vector unsigned short avghv, avglv;
 167     vector unsigned short t1, t2, t3, t4;
 168     vector unsigned int sad;
 169     vector signed int sumdiffs;
 170
 171     sad = (vector unsigned int)vec_splat_u32(0);
 172
 173     s = 0;
 174
 175     /*
 176        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 177        iteration becomes pix2 in the next iteration. We can use this
 178        fact to avoid a potentially expensive unaligned read, as well
 179        as some splitting, and vector addition each time around the loop.
 180        Read unaligned pixels into our vectors. The vectors are as follows:
 181        pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
 182        Split the pixel vectors into shorts
 183     */
 184     tv = (vector unsigned char *) &pix2[0];
 185     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 186
 187     tv = (vector unsigned char *) &pix2[1];
 188     pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
 189
 190     pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
 191     pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
 192     pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
 193     pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
 194     t1 = vec_add(pix2hv, pix2ihv);
 195     t2 = vec_add(pix2lv, pix2ilv);
 196
 197     for(i=0;i<16;i++) {
 198         /*
 199            Read unaligned pixels into our vectors. The vectors are as follows:
 200            pix1v: pix1[0]-pix1[15]
 201            pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
 202         */
 203         tv = (vector unsigned char *) pix1;
 204         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 205
 206         tv = (vector unsigned char *) &pix3[0];
 207         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 208
 209         tv = (vector unsigned char *) &pix3[1];
 210         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
 211
 212         /*
 213           Note that Altivec does have vec_avg, but this works on vector pairs
 214           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
 215           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
 216           Instead, we have to split the pixel vectors into vectors of shorts,
 217           and do the averaging by hand.
 218         */
 219
 220         /* Split the pixel vectors into shorts */
 221         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
 222         pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
 223         pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
 224         pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
 225
 226         /* Do the averaging on them */
 227         t3 = vec_add(pix3hv, pix3ihv);
 228         t4 = vec_add(pix3lv, pix3ilv);
 229
 230         avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
 231         avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
 232
 233         /* Pack the shorts back into a result */
 234         avgv = vec_pack(avghv, avglv);
 235
 236         /* Calculate a sum of abs differences vector */
 237         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 238
 239         /* Add each 4 pixel group together and put 4 results into sad */
 240         sad = vec_sum4s(t5, sad);
 241
 242         pix1 += line_size;
 243         pix3 += line_size;
 244         /* Transfer the calculated values for pix3 into pix2 */
 245         t1 = t3;
 246         t2 = t4;
 247     }
 248     /* Sum up the four partial sums, and put the result into s */
 249     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 250     sumdiffs = vec_splat(sumdiffs, 3);
 251     vec_ste(sumdiffs, 0, &s);
 252
 253     return s;
 254 }
 255
 256 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 257 {
 258     int i;
 259     int s __attribute__((aligned(16)));
 260     const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 261     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 262     vector unsigned char t1, t2, t3,t4, t5;
 263     vector unsigned int sad;
 264     vector signed int sumdiffs;
 265
 266     sad = (vector unsigned int)vec_splat_u32(0);
 267
 268
 269     for(i=0;i<16;i++) {
 270         /* Read potentially unaligned pixels into t1 and t2 */
 271         perm1 = vec_lvsl(0, pix1);
 272         pix1v = (vector unsigned char *) pix1;
 273         perm2 = vec_lvsl(0, pix2);
 274         pix2v = (vector unsigned char *) pix2;
 275         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 276         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 277
 278         /* Calculate a sum of abs differences vector */
 279         t3 = vec_max(t1, t2);
 280         t4 = vec_min(t1, t2);
 281         t5 = vec_sub(t3, t4);
 282
 283         /* Add each 4 pixel group together and put 4 results into sad */
 284         sad = vec_sum4s(t5, sad);
 285
 286         pix1 += line_size;
 287         pix2 += line_size;
 288     }
 289
 290     /* Sum up the four partial sums, and put the result into s */
 291     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 292     sumdiffs = vec_splat(sumdiffs, 3);
 293     vec_ste(sumdiffs, 0, &s);
 294
 295     return s;
 296 }
 297
 298 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 299 {
 300     int i;
 301     int s __attribute__((aligned(16)));
 302     const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 303     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 304     vector unsigned char t1, t2, t3,t4, t5;
 305     vector unsigned int sad;
 306     vector signed int sumdiffs;
 307
 308     sad = (vector unsigned int)vec_splat_u32(0);
 309
 310     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 311
 312     for(i=0;i<8;i++) {
 313         /* Read potentially unaligned pixels into t1 and t2
 314            Since we're reading 16 pixels, and actually only want 8,
 315            mask out the last 8 pixels. The 0s don't change the sum. */
 316         perm1 = vec_lvsl(0, pix1);
 317         pix1v = (vector unsigned char *) pix1;
 318         perm2 = vec_lvsl(0, pix2);
 319         pix2v = (vector unsigned char *) pix2;
 320         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 321         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 322
 323         /* Calculate a sum of abs differences vector */
 324         t3 = vec_max(t1, t2);
 325         t4 = vec_min(t1, t2);
 326         t5 = vec_sub(t3, t4);
 327
 328         /* Add each 4 pixel group together and put 4 results into sad */
 329         sad = vec_sum4s(t5, sad);
 330
 331         pix1 += line_size;
 332         pix2 += line_size;
 333     }
 334
 335     /* Sum up the four partial sums, and put the result into s */
 336     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 337     sumdiffs = vec_splat(sumdiffs, 3);
 338     vec_ste(sumdiffs, 0, &s);
 339
 340     return s;
 341 }
 342
 343 int pix_norm1_altivec(uint8_t *pix, int line_size)
 344 {
 345     int i;
 346     int s __attribute__((aligned(16)));
 347     const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 348     vector unsigned char *tv;
 349     vector unsigned char pixv;
 350     vector unsigned int sv;
 351     vector signed int sum;
 352
 353     sv = (vector unsigned int)vec_splat_u32(0);
 354
 355     s = 0;
 356     for (i = 0; i < 16; i++) {
 357         /* Read in the potentially unaligned pixels */
 358         tv = (vector unsigned char *) pix;
 359         pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
 360
 361         /* Square the values, and add them to our sum */
 362         sv = vec_msum(pixv, pixv, sv);
 363
 364         pix += line_size;
 365     }
 366     /* Sum up the four partial sums, and put the result into s */
 367     sum = vec_sums((vector signed int) sv, (vector signed int) zero);
 368     sum = vec_splat(sum, 3);
 369     vec_ste(sum, 0, &s);
 370
 371     return s;
 372 }
 373
 374 /**
 375  * Sum of Squared Errors for a 8x8 block.
 376  * AltiVec-enhanced.
 377  * It's the pix_abs8x8_altivec code above w/ squaring added.
 378  */
 379 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 380 {
 381     int i;
 382     int s __attribute__((aligned(16)));
 383     const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 384     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 385     vector unsigned char t1, t2, t3,t4, t5;
 386     vector unsigned int sum;
 387     vector signed int sumsqr;
 388
 389     sum = (vector unsigned int)vec_splat_u32(0);
 390
 391     permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 392
 393
 394     for(i=0;i<8;i++) {
 395         /* Read potentially unaligned pixels into t1 and t2
 396            Since we're reading 16 pixels, and actually only want 8,
 397            mask out the last 8 pixels. The 0s don't change the sum. */
 398         perm1 = vec_lvsl(0, pix1);
 399         pix1v = (vector unsigned char *) pix1;
 400         perm2 = vec_lvsl(0, pix2);
 401         pix2v = (vector unsigned char *) pix2;
 402         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 403         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 404
 405         /*
 406           Since we want to use unsigned chars, we can take advantage
 407           of the fact that abs(a-b)^2 = (a-b)^2.
 408         */
 409
 410         /* Calculate abs differences vector */
 411         t3 = vec_max(t1, t2);
 412         t4 = vec_min(t1, t2);
 413         t5 = vec_sub(t3, t4);
 414
 415         /* Square the values and add them to our sum */
 416         sum = vec_msum(t5, t5, sum);
 417
 418         pix1 += line_size;
 419         pix2 += line_size;
 420     }
 421
 422     /* Sum up the four partial sums, and put the result into s */
 423     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 424     sumsqr = vec_splat(sumsqr, 3);
 425     vec_ste(sumsqr, 0, &s);
 426
 427     return s;
 428 }
 429
 430 /**
 431  * Sum of Squared Errors for a 16x16 block.
 432  * AltiVec-enhanced.
 433  * It's the pix_abs16x16_altivec code above w/ squaring added.
 434  */
 435 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 436 {
 437     int i;
 438     int s __attribute__((aligned(16)));
 439     const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 440     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 441     vector unsigned char t1, t2, t3,t4, t5;
 442     vector unsigned int sum;
 443     vector signed int sumsqr;
 444
 445     sum = (vector unsigned int)vec_splat_u32(0);
 446
 447     for(i=0;i<16;i++) {
 448         /* Read potentially unaligned pixels into t1 and t2 */
 449         perm1 = vec_lvsl(0, pix1);
 450         pix1v = (vector unsigned char *) pix1;
 451         perm2 = vec_lvsl(0, pix2);
 452         pix2v = (vector unsigned char *) pix2;
 453         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 454         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 455
 456         /*
 457           Since we want to use unsigned chars, we can take advantage
 458           of the fact that abs(a-b)^2 = (a-b)^2.
 459         */
 460
 461         /* Calculate abs differences vector */
 462         t3 = vec_max(t1, t2);
 463         t4 = vec_min(t1, t2);
 464         t5 = vec_sub(t3, t4);
 465
 466         /* Square the values and add them to our sum */
 467         sum = vec_msum(t5, t5, sum);
 468
 469         pix1 += line_size;
 470         pix2 += line_size;
 471     }
 472
 473     /* Sum up the four partial sums, and put the result into s */
 474     sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 475     sumsqr = vec_splat(sumsqr, 3);
 476     vec_ste(sumsqr, 0, &s);
 477
 478     return s;
 479 }
 480
 481 int pix_sum_altivec(uint8_t * pix, int line_size)
 482 {
 483     const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 484     vector unsigned char perm, *pixv;
 485     vector unsigned char t1;
 486     vector unsigned int sad;
 487     vector signed int sumdiffs;
 488
 489     int i;
 490     int s __attribute__((aligned(16)));
 491
 492     sad = (vector unsigned int)vec_splat_u32(0);
 493
 494     for (i = 0; i < 16; i++) {
 495         /* Read the potentially unaligned 16 pixels into t1 */
 496         perm = vec_lvsl(0, pix);
 497         pixv = (vector unsigned char *) pix;
 498         t1 = vec_perm(pixv[0], pixv[1], perm);
 499
 500         /* Add each 4 pixel group together and put 4 results into sad */
 501         sad = vec_sum4s(t1, sad);
 502
 503         pix += line_size;
 504     }
 505
 506     /* Sum up the four partial sums, and put the result into s */
 507     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 508     sumdiffs = vec_splat(sumdiffs, 3);
 509     vec_ste(sumdiffs, 0, &s);
 510
 511     return s;
 512 }
 513
 514 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 515 {
 516     int i;
 517     vector unsigned char perm, bytes, *pixv;
 518     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 519     vector signed short shorts;
 520
 521     for(i=0;i<8;i++)
 522     {
 523         // Read potentially unaligned pixels.
 524         // We're reading 16 pixels, and actually only want 8,
 525         // but we simply ignore the extras.
 526         perm = vec_lvsl(0, pixels);
 527         pixv = (vector unsigned char *) pixels;
 528         bytes = vec_perm(pixv[0], pixv[1], perm);
 529
 530         // convert the bytes into shorts
 531         shorts = (vector signed short)vec_mergeh(zero, bytes);
 532
 533         // save the data to the block, we assume the block is 16-byte aligned
 534         vec_st(shorts, i*16, (vector signed short*)block);
 535
 536         pixels += line_size;
 537     }
 538 }
 539
 540 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
 541         const uint8_t *s2, int stride)
 542 {
 543     int i;
 544     vector unsigned char perm, bytes, *pixv;
 545     const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 546     vector signed short shorts1, shorts2;
 547
 548     for(i=0;i<4;i++)
 549     {
 550         // Read potentially unaligned pixels
 551         // We're reading 16 pixels, and actually only want 8,
 552         // but we simply ignore the extras.
 553         perm = vec_lvsl(0, s1);
 554         pixv = (vector unsigned char *) s1;
 555         bytes = vec_perm(pixv[0], pixv[1], perm);
 556
 557         // convert the bytes into shorts
 558         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 559
 560         // Do the same for the second block of pixels
 561         perm = vec_lvsl(0, s2);
 562         pixv = (vector unsigned char *) s2;
 563         bytes = vec_perm(pixv[0], pixv[1], perm);
 564
 565         // convert the bytes into shorts
 566         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 567
 568         // Do the subtraction
 569         shorts1 = vec_sub(shorts1, shorts2);
 570
 571         // save the data to the block, we assume the block is 16-byte aligned
 572         vec_st(shorts1, 0, (vector signed short*)block);
 573
 574         s1 += stride;
 575         s2 += stride;
 576         block += 8;
 577
 578
 579         // The code below is a copy of the code above... This is a manual
 580         // unroll.
 581
 582         // Read potentially unaligned pixels
 583         // We're reading 16 pixels, and actually only want 8,
 584         // but we simply ignore the extras.
 585         perm = vec_lvsl(0, s1);
 586         pixv = (vector unsigned char *) s1;
 587         bytes = vec_perm(pixv[0], pixv[1], perm);
 588
 589         // convert the bytes into shorts
 590         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 591
 592         // Do the same for the second block of pixels
 593         perm = vec_lvsl(0, s2);
 594         pixv = (vector unsigned char *) s2;
 595         bytes = vec_perm(pixv[0], pixv[1], perm);
 596
 597         // convert the bytes into shorts
 598         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 599
 600         // Do the subtraction
 601         shorts1 = vec_sub(shorts1, shorts2);
 602
 603         // save the data to the block, we assume the block is 16-byte aligned
 604         vec_st(shorts1, 0, (vector signed short*)block);
 605
 606         s1 += stride;
 607         s2 += stride;
 608         block += 8;
 609     }
 610 }
 611
 612 int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
 613   return pix_abs16x16_altivec(a,b,stride);
 614 }
 615
 616 int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
 617   return pix_abs8x8_altivec(a,b,stride);
 618 }
 619
 620 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
 621 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 622     int i;
 623     for(i=0; i+7<w; i++){
 624         dst[i+0] += src[i+0];
 625         dst[i+1] += src[i+1];
 626         dst[i+2] += src[i+2];
 627         dst[i+3] += src[i+3];
 628         dst[i+4] += src[i+4];
 629         dst[i+5] += src[i+5];
 630         dst[i+6] += src[i+6];
 631         dst[i+7] += src[i+7];
 632     }
 633     for(; i<w; i++)
 634         dst[i+0] += src[i+0];
 635 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 636     register int i;
 637     register vector unsigned char vdst, vsrc;
 638
 639     /* dst and src are 16 bytes-aligned (guaranteed) */
 640     for(i = 0 ; (i + 15) < w ; i++)
 641     {
 642       vdst = vec_ld(i << 4, (unsigned char*)dst);
 643       vsrc = vec_ld(i << 4, (unsigned char*)src);
 644       vdst = vec_add(vsrc, vdst);
 645       vec_st(vdst, i << 4, (unsigned char*)dst);
 646     }
 647     /* if w is not a multiple of 16 */
 648     for (; (i < w) ; i++)
 649     {
 650       dst[i] = src[i];
 651     }
 652 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 653 }
 654
 655 /* next one assumes that ((line_size % 16) == 0) */
 656 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 657 {
 658 POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
 659 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 660     int i;
 661
 662 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
 663
 664     for(i=0; i<h; i++) {
 665       *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
 666       *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
 667       *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
 668       *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
 669       pixels+=line_size;
 670       block +=line_size;
 671     }
 672
 673 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
 674
 675 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 676     register vector unsigned char pixelsv1, pixelsv2;
 677     register vector unsigned char perm = vec_lvsl(0, pixels);
 678     int i;
 679
 680 POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);
 681
 682     for(i=0; i<h; i++) {
 683       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 684       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 685       vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 686              0, (unsigned char*)block);
 687       pixels+=line_size;
 688       block +=line_size;
 689     }
 690
 691 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);
 692
 693 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 694 }
 695
 696 /* next one assumes that ((line_size % 16) == 0) */
 697 #define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 698 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 699 {
 700 POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
 701 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 702     int i;
 703
 704 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
 705
 706     for(i=0; i<h; i++) {
 707       op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
 708       op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
 709       op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
 710       op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
 711       pixels+=line_size;
 712       block +=line_size;
 713     }
 714
 715 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
 716
 717 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 718     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 719     register vector unsigned char perm = vec_lvsl(0, pixels);
 720     int i;
 721
 722 POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);
 723
 724     for(i=0; i<h; i++) {
 725       pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 726       pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 727       blockv = vec_ld(0, block);
 728       pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
 729       blockv = vec_avg(blockv,pixelsv);
 730       vec_st(blockv, 0, (unsigned char*)block);
 731       pixels+=line_size;
 732       block +=line_size;
 733     }
 734
 735 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);
 736
 737 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 738 }
 739
 740 /* next one assumes that ((line_size % 8) == 0) */
 741 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 742 {
 743 POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
 744 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 745     int i;
 746 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
 747     for (i = 0; i < h; i++) {
 748         *((uint32_t *) (block)) =
 749             (((*((uint32_t *) (block))) |
 750               ((((const struct unaligned_32 *) (pixels))->l))) -
 751              ((((*((uint32_t *) (block))) ^
 752                 ((((const struct unaligned_32 *) (pixels))->
 753                   l))) & 0xFEFEFEFEUL) >> 1));
 754         *((uint32_t *) (block + 4)) =
 755             (((*((uint32_t *) (block + 4))) |
 756               ((((const struct unaligned_32 *) (pixels + 4))->l))) -
 757              ((((*((uint32_t *) (block + 4))) ^
 758                 ((((const struct unaligned_32 *) (pixels +
 759                                                   4))->
 760                   l))) & 0xFEFEFEFEUL) >> 1));
 761         pixels += line_size;
 762         block += line_size;
 763     }
 764 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);
 765
 766 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 767     register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 768     int i;
 769
 770 POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);
 771
 772    for (i = 0; i < h; i++) {
 773      /*
 774        block is 8 bytes-aligned, so we're either in the
 775        left block (16 bytes-aligned) or in the right block (not)
 776      */
 777      int rightside = ((unsigned long)block & 0x0000000F);
 778
 779      blockv = vec_ld(0, block);
 780      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 781      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 782      pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
 783
 784      if (rightside)
 785      {
 786        pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
 787      }
 788      else
 789      {
 790        pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
 791      }
 792
 793      blockv = vec_avg(blockv, pixelsv);
 794
 795      vec_st(blockv, 0, block);
 796
 797      pixels += line_size;
 798      block += line_size;
 799    }
 800
 801 POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);
 802
 803 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 804 }
 805
 806 /* next one assumes that ((line_size % 8) == 0) */
 807 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 808 {
 809 POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
 810 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 811     int j;
 812 POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 813     for (j = 0; j < 2; j++) {
 814       int i;
 815       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 816       const uint32_t b =
 817         (((const struct unaligned_32 *) (pixels + 1))->l);
 818       uint32_t l0 =
 819         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
 820       uint32_t h0 =
 821         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 822       uint32_t l1, h1;
 823       pixels += line_size;
 824       for (i = 0; i < h; i += 2) {
 825         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 826         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
 827         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
 828         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 829         *((uint32_t *) block) =
 830           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 831         pixels += line_size;
 832         block += line_size;
 833         a = (((const struct unaligned_32 *) (pixels))->l);
 834         b = (((const struct unaligned_32 *) (pixels + 1))->l);
 835         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
 836         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 837         *((uint32_t *) block) =
 838           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 839         pixels += line_size;
 840         block += line_size;
 841       } pixels += 4 - line_size * (h + 1);
 842       block += 4 - line_size * h;
 843     }
 844
 845 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 846
 847 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 848    register int i;
 849    register vector unsigned char
 850      pixelsv1, pixelsv2,
 851      pixelsavg;
 852    register vector unsigned char
 853      blockv, temp1, temp2;
 854    register vector unsigned short
 855      pixelssum1, pixelssum2, temp3;
 856    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 857    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 858
 859    temp1 = vec_ld(0, pixels);
 860    temp2 = vec_ld(16, pixels);
 861    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
 862    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
 863    {
 864      pixelsv2 = temp2;
 865    }
 866    else
 867    {
 868      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
 869    }
 870    pixelsv1 = vec_mergeh(vczero, pixelsv1);
 871    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 872    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 873                         (vector unsigned short)pixelsv2);
 874    pixelssum1 = vec_add(pixelssum1, vctwo);
 875
 876 POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 877    for (i = 0; i < h ; i++) {
 878      int rightside = ((unsigned long)block & 0x0000000F);
 879      blockv = vec_ld(0, block);
 880
 881      temp1 = vec_ld(line_size, pixels);
 882      temp2 = vec_ld(line_size + 16, pixels);
 883      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 884      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
 885      {
 886        pixelsv2 = temp2;
 887      }
 888      else
 889      {
 890        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 891      }
 892
 893      pixelsv1 = vec_mergeh(vczero, pixelsv1);
 894      pixelsv2 = vec_mergeh(vczero, pixelsv2);
 895      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 896                           (vector unsigned short)pixelsv2);
 897      temp3 = vec_add(pixelssum1, pixelssum2);
 898      temp3 = vec_sra(temp3, vctwo);
 899      pixelssum1 = vec_add(pixelssum2, vctwo);
 900      pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
 901
 902      if (rightside)
 903      {
 904        blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
 905      }
 906      else
 907      {
 908        blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
 909      }
 910
 911      vec_st(blockv, 0, block);
 912
 913      block += line_size;
 914      pixels += line_size;
 915    }
 916
 917 POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 918 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 919 }
 920
 921 /* next one assumes that ((line_size % 8) == 0) */
 922 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 923 {
 924 POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
 925 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
 926     int j;
 927 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 928     for (j = 0; j < 2; j++) {
 929       int i;
 930       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 931       const uint32_t b =
 932         (((const struct unaligned_32 *) (pixels + 1))->l);
 933       uint32_t l0 =
 934         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
 935       uint32_t h0 =
 936         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 937       uint32_t l1, h1;
 938       pixels += line_size;
 939       for (i = 0; i < h; i += 2) {
 940         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 941         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
 942         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
 943         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 944         *((uint32_t *) block) =
 945           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 946         pixels += line_size;
 947         block += line_size;
 948         a = (((const struct unaligned_32 *) (pixels))->l);
 949         b = (((const struct unaligned_32 *) (pixels + 1))->l);
 950         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
 951         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 952         *((uint32_t *) block) =
 953           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 954         pixels += line_size;
 955         block += line_size;
 956       } pixels += 4 - line_size * (h + 1);
 957       block += 4 - line_size * h;
 958     }
 959
 960 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 961
 962 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
 963    register int i;
 964    register vector unsigned char
 965      pixelsv1, pixelsv2,
 966      pixelsavg;
 967    register vector unsigned char
 968      blockv, temp1, temp2;
 969    register vector unsigned short
 970      pixelssum1, pixelssum2, temp3;
 971    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 972    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
 973    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 974
 975    temp1 = vec_ld(0, pixels);
 976    temp2 = vec_ld(16, pixels);
 977    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
 978    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
 979    {
 980      pixelsv2 = temp2;
 981    }
 982    else
 983    {
 984      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
 985    }
 986    pixelsv1 = vec_mergeh(vczero, pixelsv1);
 987    pixelsv2 = vec_mergeh(vczero, pixelsv2);
 988    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 989                         (vector unsigned short)pixelsv2);
 990    pixelssum1 = vec_add(pixelssum1, vcone);
 991
 992 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 993    for (i = 0; i < h ; i++) {
 994      int rightside = ((unsigned long)block & 0x0000000F);
 995      blockv = vec_ld(0, block);
 996
 997      temp1 = vec_ld(line_size, pixels);
 998      temp2 = vec_ld(line_size + 16, pixels);
 999      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1000      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1001      {
1002        pixelsv2 = temp2;
1003      }
1004      else
1005      {
1006        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1007      }
1008
1009      pixelsv1 = vec_mergeh(vczero, pixelsv1);
1010      pixelsv2 = vec_mergeh(vczero, pixelsv2);
1011      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1012                           (vector unsigned short)pixelsv2);
1013      temp3 = vec_add(pixelssum1, pixelssum2);
1014      temp3 = vec_sra(temp3, vctwo);
1015      pixelssum1 = vec_add(pixelssum2, vcone);
1016      pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1017
1018      if (rightside)
1019      {
1020        blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1021      }
1022      else
1023      {
1024        blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1025      }
1026
1027      vec_st(blockv, 0, block);
1028
1029      block += line_size;
1030      pixels += line_size;
1031    }
1032
1033 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1034 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1035 }
1036
1037 /* next one assumes that ((line_size % 16) == 0) */
1038 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1039 {
1040 POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1);
1041 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1042     int j;
1043 POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1044       for (j = 0; j < 4; j++) {
1045       int i;
1046       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1047       const uint32_t b =
1048         (((const struct unaligned_32 *) (pixels + 1))->l);
1049       uint32_t l0 =
1050         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1051       uint32_t h0 =
1052         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1053       uint32_t l1, h1;
1054       pixels += line_size;
1055       for (i = 0; i < h; i += 2) {
1056         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1057         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1058         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1059         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1060         *((uint32_t *) block) =
1061           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1062         pixels += line_size;
1063         block += line_size;
1064         a = (((const struct unaligned_32 *) (pixels))->l);
1065         b = (((const struct unaligned_32 *) (pixels + 1))->l);
1066         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1067         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1068         *((uint32_t *) block) =
1069           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1070         pixels += line_size;
1071         block += line_size;
1072       } pixels += 4 - line_size * (h + 1);
1073       block += 4 - line_size * h;
1074     }
1075
1076 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1077
1078 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1079    register int i;
1080    register vector unsigned char
1081      pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1082    register vector unsigned char
1083      blockv, temp1, temp2;
1084    register vector unsigned short
1085      pixelssum1, pixelssum2, temp3,
1086      pixelssum3, pixelssum4, temp4;
1087    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1088    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1089
1090 POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1091
1092    temp1 = vec_ld(0, pixels);
1093    temp2 = vec_ld(16, pixels);
1094    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1095    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
1096    {
1097      pixelsv2 = temp2;
1098    }
1099    else
1100    {
1101      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1102    }
1103    pixelsv3 = vec_mergel(vczero, pixelsv1);
1104    pixelsv4 = vec_mergel(vczero, pixelsv2);
1105    pixelsv1 = vec_mergeh(vczero, pixelsv1);
1106    pixelsv2 = vec_mergeh(vczero, pixelsv2);
1107    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1108                         (vector unsigned short)pixelsv4);
1109    pixelssum3 = vec_add(pixelssum3, vctwo);
1110    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1111                         (vector unsigned short)pixelsv2);
1112    pixelssum1 = vec_add(pixelssum1, vctwo);
1113
1114    for (i = 0; i < h ; i++) {
1115      blockv = vec_ld(0, block);
1116
1117      temp1 = vec_ld(line_size, pixels);
1118      temp2 = vec_ld(line_size + 16, pixels);
1119      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1120      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1121      {
1122        pixelsv2 = temp2;
1123      }
1124      else
1125      {
1126        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1127      }
1128
1129      pixelsv3 = vec_mergel(vczero, pixelsv1);
1130      pixelsv4 = vec_mergel(vczero, pixelsv2);
1131      pixelsv1 = vec_mergeh(vczero, pixelsv1);
1132      pixelsv2 = vec_mergeh(vczero, pixelsv2);
1133
1134      pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1135                           (vector unsigned short)pixelsv4);
1136      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1137                           (vector unsigned short)pixelsv2);
1138      temp4 = vec_add(pixelssum3, pixelssum4);
1139      temp4 = vec_sra(temp4, vctwo);
1140      temp3 = vec_add(pixelssum1, pixelssum2);
1141      temp3 = vec_sra(temp3, vctwo);
1142
1143      pixelssum3 = vec_add(pixelssum4, vctwo);
1144      pixelssum1 = vec_add(pixelssum2, vctwo);
1145
1146      blockv = vec_packsu(temp3, temp4);
1147
1148      vec_st(blockv, 0, block);
1149
1150      block += line_size;
1151      pixels += line_size;
1152    }
1153
1154 POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1155 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1156 }
1157
1158 /* next one assumes that ((line_size % 16) == 0) */
1159 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1160 {
1161 POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1162 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
1163     int j;
1164 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1165       for (j = 0; j < 4; j++) {
1166       int i;
1167       const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1168       const uint32_t b =
1169         (((const struct unaligned_32 *) (pixels + 1))->l);
1170       uint32_t l0 =
1171         (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1172       uint32_t h0 =
1173         ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1174       uint32_t l1, h1;
1175       pixels += line_size;
1176       for (i = 0; i < h; i += 2) {
1177         uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1178         uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1179         l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1180         h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1181         *((uint32_t *) block) =
1182           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1183         pixels += line_size;
1184         block += line_size;
1185         a = (((const struct unaligned_32 *) (pixels))->l);
1186         b = (((const struct unaligned_32 *) (pixels + 1))->l);
1187         l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1188         h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1189         *((uint32_t *) block) =
1190           h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1191         pixels += line_size;
1192         block += line_size;
1193       } pixels += 4 - line_size * (h + 1);
1194       block += 4 - line_size * h;
1195     }
1196
1197 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1198
1199 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
1200    register int i;
1201    register vector unsigned char
1202      pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1203    register vector unsigned char
1204      blockv, temp1, temp2;
1205    register vector unsigned short
1206      pixelssum1, pixelssum2, temp3,
1207      pixelssum3, pixelssum4, temp4;
1208    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1209    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1210    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1211
1212 POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1213
1214    temp1 = vec_ld(0, pixels);
1215    temp2 = vec_ld(16, pixels);
1216    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1217    if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F)
1218    {
1219      pixelsv2 = temp2;
1220    }
1221    else
1222    {
1223      pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1224    }
1225    pixelsv3 = vec_mergel(vczero, pixelsv1);
1226    pixelsv4 = vec_mergel(vczero, pixelsv2);
1227    pixelsv1 = vec_mergeh(vczero, pixelsv1);
1228    pixelsv2 = vec_mergeh(vczero, pixelsv2);
1229    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1230                         (vector unsigned short)pixelsv4);
1231    pixelssum3 = vec_add(pixelssum3, vcone);
1232    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1233                         (vector unsigned short)pixelsv2);
1234    pixelssum1 = vec_add(pixelssum1, vcone);
1235
1236    for (i = 0; i < h ; i++) {
1237      blockv = vec_ld(0, block);
1238
1239      temp1 = vec_ld(line_size, pixels);
1240      temp2 = vec_ld(line_size + 16, pixels);
1241      pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1242      if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1243      {
1244        pixelsv2 = temp2;
1245      }
1246      else
1247      {
1248        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1249      }
1250
1251      pixelsv3 = vec_mergel(vczero, pixelsv1);
1252      pixelsv4 = vec_mergel(vczero, pixelsv2);
1253      pixelsv1 = vec_mergeh(vczero, pixelsv1);
1254      pixelsv2 = vec_mergeh(vczero, pixelsv2);
1255
1256      pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1257                           (vector unsigned short)pixelsv4);
1258      pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1259                           (vector unsigned short)pixelsv2);
1260      temp4 = vec_add(pixelssum3, pixelssum4);
1261      temp4 = vec_sra(temp4, vctwo);
1262      temp3 = vec_add(pixelssum1, pixelssum2);
1263      temp3 = vec_sra(temp3, vctwo);
1264
1265      pixelssum3 = vec_add(pixelssum4, vcone);
1266      pixelssum1 = vec_add(pixelssum2, vcone);
1267
1268      blockv = vec_packsu(temp3, temp4);
1269
1270      vec_st(blockv, 0, block);
1271
1272      block += line_size;
1273      pixels += line_size;
1274    }
1275
1276 POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1277 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1278 }
1279
1280 int has_altivec(void)
1281 {
1282 #ifdef CONFIG_DARWIN
1283     int sels[2] = {CTL_HW, HW_VECTORUNIT};
1284     int has_vu = 0;
1285     size_t len = sizeof(has_vu);
1286     int err;
1287
1288     err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1289
1290     if (err == 0) return (has_vu != 0);
1291 #else /* CONFIG_DARWIN */
1292 /* no Darwin, do it the brute-force way */
1293 /* this is borrowed from the libmpeg2 library */
1294     {
1295       signal (SIGILL, sigill_handler);
1296       if (sigsetjmp (jmpbuf, 1)) {
1297         signal (SIGILL, SIG_DFL);
1298       } else {
1299         canjump = 1;
1300
1301         asm volatile ("mtspr 256, %0\n\t"
1302                       "vand %%v0, %%v0, %%v0"
1303                       :
1304                       : "r" (-1));
1305
1306         signal (SIGILL, SIG_DFL);
1307         return 1;
1308       }
1309     }
1310 #endif /* CONFIG_DARWIN */
1311     return 0;
1312 }