1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /*
24 Convert I420 YV12 to RGB in various formats,
25   it rejects images that are not in 420 formats,
26   it rejects images that don't have widths of multiples of 16,
27   it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
29
30 Lots of optimizations to be done here.
31
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33    and adds, so we currently use max/min to clip.
34
35 2. The inefficient use of chroma loading needs a bit of brushing up.
36
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38    pipeline stalls.
39
40
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to some thing in swscale.
CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
46
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
48
49 March 27,2004
50 PERFORMANCE ANALYSIS
51
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
56
57 720 * 480 * 30  ~10MPS
58
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
61
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
64
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
71
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
74
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
79
80 GL2 libraries work now with patch for RGB32.
81
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
86 */
87
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/cpu.h"
98
99 #undef PROFILE_THE_BEAST
100 #undef INC_SCALING
101
102 typedef unsigned char ubyte;
103 typedef signed char   sbyte;
104
105
106 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
107    homogeneous vector registers x0,x1,x2 are interleaved with the
108    following technique:
109
110       o0 = vec_mergeh (x0,x1);
111       o1 = vec_perm (o0, x2, perm_rgb_0);
112       o2 = vec_perm (o0, x2, perm_rgb_1);
113       o3 = vec_mergel (x0,x1);
114       o4 = vec_perm (o3,o2,perm_rgb_2);
115       o5 = vec_perm (o3,o2,perm_rgb_3);
116
117   perm_rgb_0:   o0(RG).h v1(B) --> o1*
118               0   1  2   3   4
119              rgbr|gbrg|brgb|rgbr
120              0010 0100 1001 0010
121              0102 3145 2673 894A
122
123   perm_rgb_1:   o0(RG).h v1(B) --> o2
124               0   1  2   3   4
125              gbrg|brgb|bbbb|bbbb
126              0100 1001 1111 1111
127              B5CD 6EF7 89AB CDEF
128
129   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
130               0   1  2   3   4
131              gbrg|brgb|rgbr|gbrg
132              1111 1111 0010 0100
133              89AB CDEF 0182 3945
134
135   perm_rgb_2:   o3(RG).l o2(rgbB.l) ---> o5*
136               0   1  2   3   4
137              brgb|rgbr|gbrg|brgb
138              1001 0010 0100 1001
139              a67b 89cA BdCD eEFf
140
141 */
142 static
143 const vector unsigned char
144   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
145                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
146   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
147                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
148   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
149                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
150   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
151                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
152
153 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
154 do {                                        \
155     __typeof__(x0) o0,o2,o3;                \
156         o0 = vec_mergeh (x0,x1);            \
157         y0 = vec_perm (o0, x2, perm_rgb_0); \
158         o2 = vec_perm (o0, x2, perm_rgb_1); \
159         o3 = vec_mergel (x0,x1);            \
160         y1 = vec_perm (o3,o2,perm_rgb_2);   \
161         y2 = vec_perm (o3,o2,perm_rgb_3);   \
162 } while(0)
163
164 #define vec_mstbgr24(x0,x1,x2,ptr)      \
165 do {                                    \
166     __typeof__(x0) _0,_1,_2;            \
167     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
168     vec_st (_0, 0, ptr++);              \
169     vec_st (_1, 0, ptr++);              \
170     vec_st (_2, 0, ptr++);              \
171 }  while (0)
172
173 #define vec_mstrgb24(x0,x1,x2,ptr)      \
174 do {                                    \
175     __typeof__(x0) _0,_1,_2;            \
176     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
177     vec_st (_0, 0, ptr++);              \
178     vec_st (_1, 0, ptr++);              \
179     vec_st (_2, 0, ptr++);              \
180 }  while (0)
181
182 /* pack the pixels in rgb0 format
183    msb R
184    lsb 0
185 */
186 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
187 do {                                                                          \
188     T _0,_1,_2,_3;                                                            \
189     _0 = vec_mergeh (x0,x1);                                                  \
190     _1 = vec_mergeh (x2,x3);                                                  \
191     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
192     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
193     vec_st (_2, 0*16, (T *)ptr);                                              \
194     vec_st (_3, 1*16, (T *)ptr);                                              \
195     _0 = vec_mergel (x0,x1);                                                  \
196     _1 = vec_mergel (x2,x3);                                                  \
197     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
198     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
199     vec_st (_2, 2*16, (T *)ptr);                                              \
200     vec_st (_3, 3*16, (T *)ptr);                                              \
201     ptr += 4;                                                                 \
202 }  while (0)
203
204 /*
205
206   | 1     0       1.4021   | | Y |
207   | 1    -0.3441 -0.7142   |x| Cb|
208   | 1     1.7718  0        | | Cr|
209
210
211   Y:      [-128 127]
212   Cb/Cr : [-128 127]
213
214   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
215
216 */
217
218
219
220
221 #define vec_unh(x) \
222     (vector signed short) \
223         vec_perm(x,(__typeof__(x)){0}, \
224                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
225                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
226 #define vec_unl(x) \
227     (vector signed short) \
228         vec_perm(x,(__typeof__(x)){0}, \
229                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
230                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
231
232 #define vec_clip_s16(x) \
233     vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
234                          ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
235
236 #define vec_packclp(x,y) \
237     (vector unsigned char)vec_packs \
238         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
239          (vector unsigned short)vec_max (y,((vector signed short) {0})))
240
241 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
242
243
244 static inline void cvtyuvtoRGB (SwsContext *c,
245                                 vector signed short Y, vector signed short U, vector signed short V,
246                                 vector signed short *R, vector signed short *G, vector signed short *B)
247 {
248     vector signed   short vx,ux,uvx;
249
250     Y = vec_mradds (Y, c->CY, c->OY);
251     U  = vec_sub (U,(vector signed short)
252                     vec_splat((vector signed short){128},0));
253     V  = vec_sub (V,(vector signed short)
254                     vec_splat((vector signed short){128},0));
255
256     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
257     ux = vec_sl (U, c->CSHIFT);
258     *B = vec_mradds (ux, c->CBU, Y);
259
260     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
261     vx = vec_sl (V, c->CSHIFT);
262     *R = vec_mradds (vx, c->CRV, Y);
263
264     // uvx = ((CGU*u) + (CGV*v))>>15;
265     uvx = vec_mradds (U, c->CGU, Y);
266     *G  = vec_mradds (V, c->CGV, uvx);
267 }
268
269
270 /*
271   ------------------------------------------------------------------------------
272   CS converters
273   ------------------------------------------------------------------------------
274 */
275
276
277 #define DEFCSP420_CVT(name,out_pixels)                                  \
278 static int altivec_##name (SwsContext *c,                               \
279                            const unsigned char **in, int *instrides,    \
280                            int srcSliceY,        int srcSliceH,         \
281                            unsigned char **oplanes, int *outstrides)    \
282 {                                                                       \
283     int w = c->srcW;                                                    \
284     int h = srcSliceH;                                                  \
285     int i,j;                                                            \
286     int instrides_scl[3];                                               \
287     vector unsigned char y0,y1;                                         \
288                                                                         \
289     vector signed char  u,v;                                            \
290                                                                         \
291     vector signed short Y0,Y1,Y2,Y3;                                    \
292     vector signed short U,V;                                            \
293     vector signed short vx,ux,uvx;                                      \
294     vector signed short vx0,ux0,uvx0;                                   \
295     vector signed short vx1,ux1,uvx1;                                   \
296     vector signed short R0,G0,B0;                                       \
297     vector signed short R1,G1,B1;                                       \
298     vector unsigned char R,G,B;                                         \
299                                                                         \
300     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
301     vector unsigned char align_perm;                                    \
302                                                                         \
303     vector signed short                                                 \
304         lCY  = c->CY,                                                   \
305         lOY  = c->OY,                                                   \
306         lCRV = c->CRV,                                                  \
307         lCBU = c->CBU,                                                  \
308         lCGU = c->CGU,                                                  \
309         lCGV = c->CGV;                                                  \
310                                                                         \
311     vector unsigned short lCSHIFT = c->CSHIFT;                          \
312                                                                         \
313     const ubyte *y1i   = in[0];                                         \
314     const ubyte *y2i   = in[0]+instrides[0];                            \
315     const ubyte *ui    = in[1];                                         \
316     const ubyte *vi    = in[2];                                         \
317                                                                         \
318     vector unsigned char *oute                                          \
319         = (vector unsigned char *)                                      \
320             (oplanes[0]+srcSliceY*outstrides[0]);                       \
321     vector unsigned char *outo                                          \
322         = (vector unsigned char *)                                      \
323             (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
324                                                                         \
325                                                                         \
326     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
327     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
328     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
329                                                                         \
330                                                                         \
331     for (i=0;i<h/2;i++) {                                               \
332         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
333         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
334                                                                         \
335         for (j=0;j<w/16;j++) {                                          \
336                                                                         \
337             y1ivP = (vector unsigned char *)y1i;                        \
338             y2ivP = (vector unsigned char *)y2i;                        \
339             uivP  = (vector unsigned char *)ui;                         \
340             vivP  = (vector unsigned char *)vi;                         \
341                                                                         \
342             align_perm = vec_lvsl (0, y1i);                             \
343             y0 = (vector unsigned char)                                 \
344                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
345                                                                         \
346             align_perm = vec_lvsl (0, y2i);                             \
347             y1 = (vector unsigned char)                                 \
348                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
349                                                                         \
350             align_perm = vec_lvsl (0, ui);                              \
351             u = (vector signed char)                                    \
352                 vec_perm (uivP[0], uivP[1], align_perm);                \
353                                                                         \
354             align_perm = vec_lvsl (0, vi);                              \
355             v = (vector signed char)                                    \
356                 vec_perm (vivP[0], vivP[1], align_perm);                \
357                                                                         \
358             u  = (vector signed char)                                   \
359                  vec_sub (u,(vector signed char)                        \
360                           vec_splat((vector signed char){128},0));      \
361             v  = (vector signed char)                                   \
362                  vec_sub (v,(vector signed char)                        \
363                           vec_splat((vector signed char){128},0));      \
364                                                                         \
365             U  = vec_unpackh (u);                                       \
366             V  = vec_unpackh (v);                                       \
367                                                                         \
368                                                                         \
369             Y0 = vec_unh (y0);                                          \
370             Y1 = vec_unl (y0);                                          \
371             Y2 = vec_unh (y1);                                          \
372             Y3 = vec_unl (y1);                                          \
373                                                                         \
374             Y0 = vec_mradds (Y0, lCY, lOY);                             \
375             Y1 = vec_mradds (Y1, lCY, lOY);                             \
376             Y2 = vec_mradds (Y2, lCY, lOY);                             \
377             Y3 = vec_mradds (Y3, lCY, lOY);                             \
378                                                                         \
379             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
380             ux = vec_sl (U, lCSHIFT);                                   \
381             ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
382             ux0  = vec_mergeh (ux,ux);                                  \
383             ux1  = vec_mergel (ux,ux);                                  \
384                                                                         \
385             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
386             vx = vec_sl (V, lCSHIFT);                                   \
387             vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
388             vx0  = vec_mergeh (vx,vx);                                  \
389             vx1  = vec_mergel (vx,vx);                                  \
390                                                                         \
391             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
392             uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
393             uvx = vec_mradds (V, lCGV, uvx);                            \
394             uvx0 = vec_mergeh (uvx,uvx);                                \
395             uvx1 = vec_mergel (uvx,uvx);                                \
396                                                                         \
397             R0 = vec_add (Y0,vx0);                                      \
398             G0 = vec_add (Y0,uvx0);                                     \
399             B0 = vec_add (Y0,ux0);                                      \
400             R1 = vec_add (Y1,vx1);                                      \
401             G1 = vec_add (Y1,uvx1);                                     \
402             B1 = vec_add (Y1,ux1);                                      \
403                                                                         \
404             R  = vec_packclp (R0,R1);                                   \
405             G  = vec_packclp (G0,G1);                                   \
406             B  = vec_packclp (B0,B1);                                   \
407                                                                         \
408             out_pixels(R,G,B,oute);                                     \
409                                                                         \
410             R0 = vec_add (Y2,vx0);                                      \
411             G0 = vec_add (Y2,uvx0);                                     \
412             B0 = vec_add (Y2,ux0);                                      \
413             R1 = vec_add (Y3,vx1);                                      \
414             G1 = vec_add (Y3,uvx1);                                     \
415             B1 = vec_add (Y3,ux1);                                      \
416             R  = vec_packclp (R0,R1);                                   \
417             G  = vec_packclp (G0,G1);                                   \
418             B  = vec_packclp (B0,B1);                                   \
419                                                                         \
420                                                                         \
421             out_pixels(R,G,B,outo);                                     \
422                                                                         \
423             y1i  += 16;                                                 \
424             y2i  += 16;                                                 \
425             ui   += 8;                                                  \
426             vi   += 8;                                                  \
427                                                                         \
428         }                                                               \
429                                                                         \
430         outo  += (outstrides[0])>>4;                                    \
431         oute  += (outstrides[0])>>4;                                    \
432                                                                         \
433         ui    += instrides_scl[1];                                      \
434         vi    += instrides_scl[2];                                      \
435         y1i   += instrides_scl[0];                                      \
436         y2i   += instrides_scl[0];                                      \
437     }                                                                   \
438     return srcSliceH;                                                   \
439 }
440
441
442 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
443 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
444 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
445 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
446 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
447 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
448
449 DEFCSP420_CVT (yuv2_abgr, out_abgr)
450 #if 1
451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
452 #else
453 static int altivec_yuv2_bgra32 (SwsContext *c,
454                                 unsigned char **in, int *instrides,
455                                 int srcSliceY,        int srcSliceH,
456                                 unsigned char **oplanes, int *outstrides)
457 {
458     int w = c->srcW;
459     int h = srcSliceH;
460     int i,j;
461     int instrides_scl[3];
462     vector unsigned char y0,y1;
463
464     vector signed char  u,v;
465
466     vector signed short Y0,Y1,Y2,Y3;
467     vector signed short U,V;
468     vector signed short vx,ux,uvx;
469     vector signed short vx0,ux0,uvx0;
470     vector signed short vx1,ux1,uvx1;
471     vector signed short R0,G0,B0;
472     vector signed short R1,G1,B1;
473     vector unsigned char R,G,B;
474
475     vector unsigned char *uivP, *vivP;
476     vector unsigned char align_perm;
477
478     vector signed short
479         lCY  = c->CY,
480         lOY  = c->OY,
481         lCRV = c->CRV,
482         lCBU = c->CBU,
483         lCGU = c->CGU,
484         lCGV = c->CGV;
485
486     vector unsigned short lCSHIFT = c->CSHIFT;
487
488     ubyte *y1i   = in[0];
489     ubyte *y2i   = in[0]+w;
490     ubyte *ui    = in[1];
491     ubyte *vi    = in[2];
492
493     vector unsigned char *oute
494         = (vector unsigned char *)
495           (oplanes[0]+srcSliceY*outstrides[0]);
496     vector unsigned char *outo
497         = (vector unsigned char *)
498           (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
499
500
501     instrides_scl[0] = instrides[0];
502     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
503     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
504
505
506     for (i=0;i<h/2;i++) {
507         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
508         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
509
510         for (j=0;j<w/16;j++) {
511
512             y0 = vec_ldl (0,y1i);
513             y1 = vec_ldl (0,y2i);
514             uivP = (vector unsigned char *)ui;
515             vivP = (vector unsigned char *)vi;
516
517             align_perm = vec_lvsl (0, ui);
518             u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
519
520             align_perm = vec_lvsl (0, vi);
521             v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
522             u  = (vector signed char)
523                  vec_sub (u,(vector signed char)
524                           vec_splat((vector signed char){128},0));
525
526             v  = (vector signed char)
527                  vec_sub (v, (vector signed char)
528                           vec_splat((vector signed char){128},0));
529
530             U  = vec_unpackh (u);
531             V  = vec_unpackh (v);
532
533
534             Y0 = vec_unh (y0);
535             Y1 = vec_unl (y0);
536             Y2 = vec_unh (y1);
537             Y3 = vec_unl (y1);
538
539             Y0 = vec_mradds (Y0, lCY, lOY);
540             Y1 = vec_mradds (Y1, lCY, lOY);
541             Y2 = vec_mradds (Y2, lCY, lOY);
542             Y3 = vec_mradds (Y3, lCY, lOY);
543
544             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
545             ux = vec_sl (U, lCSHIFT);
546             ux = vec_mradds (ux, lCBU, (vector signed short){0});
547             ux0  = vec_mergeh (ux,ux);
548             ux1  = vec_mergel (ux,ux);
549
550             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
551             vx = vec_sl (V, lCSHIFT);
552             vx = vec_mradds (vx, lCRV, (vector signed short){0});
553             vx0  = vec_mergeh (vx,vx);
554             vx1  = vec_mergel (vx,vx);
555             /* uvx = ((CGU*u) + (CGV*v))>>15 */
556             uvx = vec_mradds (U, lCGU, (vector signed short){0});
557             uvx = vec_mradds (V, lCGV, uvx);
558             uvx0 = vec_mergeh (uvx,uvx);
559             uvx1 = vec_mergel (uvx,uvx);
560             R0 = vec_add (Y0,vx0);
561             G0 = vec_add (Y0,uvx0);
562             B0 = vec_add (Y0,ux0);
563             R1 = vec_add (Y1,vx1);
564             G1 = vec_add (Y1,uvx1);
565             B1 = vec_add (Y1,ux1);
566             R  = vec_packclp (R0,R1);
567             G  = vec_packclp (G0,G1);
568             B  = vec_packclp (B0,B1);
569
570             out_argb(R,G,B,oute);
571             R0 = vec_add (Y2,vx0);
572             G0 = vec_add (Y2,uvx0);
573             B0 = vec_add (Y2,ux0);
574             R1 = vec_add (Y3,vx1);
575             G1 = vec_add (Y3,uvx1);
576             B1 = vec_add (Y3,ux1);
577             R  = vec_packclp (R0,R1);
578             G  = vec_packclp (G0,G1);
579             B  = vec_packclp (B0,B1);
580
581             out_argb(R,G,B,outo);
582             y1i  += 16;
583             y2i  += 16;
584             ui   += 8;
585             vi   += 8;
586
587         }
588
589         outo  += (outstrides[0])>>4;
590         oute  += (outstrides[0])>>4;
591
592         ui    += instrides_scl[1];
593         vi    += instrides_scl[2];
594         y1i   += instrides_scl[0];
595         y2i   += instrides_scl[0];
596     }
597     return srcSliceH;
598 }
599
600 #endif
601
602
603 DEFCSP420_CVT (yuv2_rgba, out_rgba)
604 DEFCSP420_CVT (yuv2_argb, out_argb)
605 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
606 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
607
608
609 // uyvy|uyvy|uyvy|uyvy
610 // 0123 4567 89ab cdef
611 static
612 const vector unsigned char
613     demux_u = {0x10,0x00,0x10,0x00,
614                0x10,0x04,0x10,0x04,
615                0x10,0x08,0x10,0x08,
616                0x10,0x0c,0x10,0x0c},
617     demux_v = {0x10,0x02,0x10,0x02,
618                0x10,0x06,0x10,0x06,
619                0x10,0x0A,0x10,0x0A,
620                0x10,0x0E,0x10,0x0E},
621     demux_y = {0x10,0x01,0x10,0x03,
622                0x10,0x05,0x10,0x07,
623                0x10,0x09,0x10,0x0B,
624                0x10,0x0D,0x10,0x0F};
625
626 /*
627   this is so I can play live CCIR raw video
628 */
629 static int altivec_uyvy_rgb32 (SwsContext *c,
630                                const unsigned char **in, int *instrides,
631                                int srcSliceY,        int srcSliceH,
632                                unsigned char **oplanes, int *outstrides)
633 {
634     int w = c->srcW;
635     int h = srcSliceH;
636     int i,j;
637     vector unsigned char uyvy;
638     vector signed   short Y,U,V;
639     vector signed   short R0,G0,B0,R1,G1,B1;
640     vector unsigned char  R,G,B;
641     vector unsigned char *out;
642     const ubyte *img;
643
644     img = in[0];
645     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
646
647     for (i=0;i<h;i++) {
648         for (j=0;j<w/16;j++) {
649             uyvy = vec_ld (0, img);
650             U = (vector signed short)
651                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
652
653             V = (vector signed short)
654                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
655
656             Y = (vector signed short)
657                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
658
659             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
660
661             uyvy = vec_ld (16, img);
662             U = (vector signed short)
663                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
664
665             V = (vector signed short)
666                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
667
668             Y = (vector signed short)
669                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
670
671             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
672
673             R  = vec_packclp (R0,R1);
674             G  = vec_packclp (G0,G1);
675             B  = vec_packclp (B0,B1);
676
677             //      vec_mstbgr24 (R,G,B, out);
678             out_rgba (R,G,B,out);
679
680             img += 32;
681         }
682     }
683     return srcSliceH;
684 }
685
686
687
/* Currently the acceleration routine only supports
   inputs whose width is a multiple of 16
   and whose height is a multiple of 2.

   So we just fall back to the C code for anything else.
*/
694 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
695 {
696     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
697         return NULL;
698
699     /*
700       and this seems not to matter too much I tried a bunch of
701       videos with abnormal widths and MPlayer crashes elsewhere.
702       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
703       boom with X11 bad match.
704
705     */
706     if ((c->srcW & 0xf) != 0)    return NULL;
707
708     switch (c->srcFormat) {
709     case PIX_FMT_YUV410P:
710     case PIX_FMT_YUV420P:
711     /*case IMGFMT_CLPL:        ??? */
712     case PIX_FMT_GRAY8:
713     case PIX_FMT_NV12:
714     case PIX_FMT_NV21:
715         if ((c->srcH & 0x1) != 0)
716             return NULL;
717
718         switch(c->dstFormat) {
719         case PIX_FMT_RGB24:
720             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
721             return altivec_yuv2_rgb24;
722         case PIX_FMT_BGR24:
723             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
724             return altivec_yuv2_bgr24;
725         case PIX_FMT_ARGB:
726             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
727             return altivec_yuv2_argb;
728         case PIX_FMT_ABGR:
729             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
730             return altivec_yuv2_abgr;
731         case PIX_FMT_RGBA:
732             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
733             return altivec_yuv2_rgba;
734         case PIX_FMT_BGRA:
735             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
736             return altivec_yuv2_bgra;
737         default: return NULL;
738         }
739         break;
740
741     case PIX_FMT_UYVY422:
742         switch(c->dstFormat) {
743         case PIX_FMT_BGR32:
744             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
745             return altivec_uyvy_rgb32;
746         default: return NULL;
747         }
748         break;
749
750     }
751     return NULL;
752 }
753
754 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
755 {
756     union {
757         DECLARE_ALIGNED(16, signed short, tmp)[8];
758         vector signed short vec;
759     } buf;
760
761     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
762     buf.tmp[1] =  -256*brightness;                                      //oy
763     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
764     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
765     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
766     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
767
768
769     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
770     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
771     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
772     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
773     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
774     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
775     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
776     return;
777 }
778
779
/* Vertical-scale filter + packed RGB output, 16 pixels per iteration.
 *
 * Applies lumFilter/chrFilter (coefficients pre-vectorized in
 * c->vYCoeffsBank / c->vCCoeffsBank, indexed by dstY) over lumFilterSize /
 * chrFilterSize source lines, then converts the filtered YUV to the packed
 * RGB format selected by c->dstFormat and stores it into dest.
 * Unsupported dstFormats log an error once and return early; the caller
 * is expected to have routed those to yuv2packedXinC instead. */
void
ff_yuv2packedX_altivec(SwsContext *c,
                       const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                     uint8_t *dest, int dstW, int dstY)
{
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    /* RND = rounding bias added before the >>4 scale below (SCL). */
    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    /* aligned staging buffer for the final partial group of pixels */
    DECLARE_ALIGNED(16, unsigned long, scratch)[16];

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            /* vec_mradds: saturated ((X*coeff + 0x4000) >> 15) + acc */
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        /* chroma is horizontally subsampled (i/2); V plane sits VOFW
           elements after U in the same buffer */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+VOFW]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7

          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        /* duplicate each chroma sample to pair it with two luma samples */
        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        /* the out_* macros store 16 pixels and advance 'out' */
        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
        default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           sws_format_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    /* Tail: redo the last 16-pixel group into 'scratch' and copy only the
       valid remainder, so a dstW that is not a multiple of 16 does not
       write past the end of dest.
       NOTE(review): this branch looks unreachable — the loop above only
       exits once i >= dstW, so `i < dstW` can never hold here.  The guard
       was presumably meant to detect a leftover partial group (dstW & 15);
       verify against the loop condition. */
    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+VOFW]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
        default:
            /* Unreachable, I think. */
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                   sws_format_name(c->dstFormat));
            return;
        }

        /* NOTE(review): the byte count `(dstW-i)/4` looks wrong for the
           4-byte-per-pixel formats — (dstW-i) remaining pixels should be
           (dstW-i)*4 bytes — and the uint32_t indexing of dest does not fit
           the 3-byte rgb24/bgr24 cases either.  Confirm intent before use. */
        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

}