git.sesse.net Git - ffmpeg/blob - postproc/yuv2rgb_altivec.c

   1 /*
   2   marc.hoffman@analog.com    March 8, 2004
   3
   4   Altivec Acceleration for Color Space Conversion revision 0.2
   5
   6   convert I420 YV12 to RGB in various formats,
   7     it rejects images that are not in 420 formats
   8     it rejects images that don't have widths of multiples of 16
   9     it rejects images that don't have heights of multiples of 2
  10   reject defers to C simulation codes.
  11
  12   lots of optimizations to be done here
  13
  14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
  15      so we currently use max min to clip
  16
  17   2. the inefficient use of chroma loading needs a bit of brushing up
  18
  19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
  20
  21
  22   MODIFIED to calculate coeffs from currently selected color space.
  23   MODIFIED core to be a macro which you spec the output format.
  24   ADDED UYVY conversion which is never called due to something in SWSCALE.
  25   CORRECTED algorithm selection to be strict on input formats.
  26   ADDED runtime detection of altivec.
  27
  28   ADDED altivec_yuv2packedX vertical scl + RGB converter
  29
  30   March 27,2004
  31   PERFORMANCE ANALYSIS
  32
  33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
  34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
  35
  36   720*480*30  ~10MPS
  37
  38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
  39
  40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
  41
  42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
  43   guaranteed to have the input video frame it was just decompressed so
  44   it probably resides in L1 caches.  However we are creating the
  45   output video stream this needs to use the DSTST instruction to
  46   optimize for the cache.  We couple this with the fact that we are
  47   not going to be visiting the input buffer again so we mark it Least
  48   Recently Used.  This shaves 25% of the processor cycles off.
  49
  50   Now MEMCPY is the largest mips consumer in the system, probably due
  51   to the inefficient X11 stuff.
  52
  53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
  54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
  55   a versioning issue, however I have libGL.1.2.dylib for both
  56   machines. ((We need to figure this out now))
  57
  58   GL2 libraries work now with patch for RGB32
  59
  60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
  61
  62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
  63
  64 */
  65 #include <stdio.h>
  66 #include <stdlib.h>
  67 #include <string.h>
  68 #include <inttypes.h>
  69 #include <assert.h>
  70 #include "config.h"
  71 #include "rgb2rgb.h"
  72 #include "swscale.h"
  73 #include "swscale_internal.h"
  74 #include "../mangle.h"
  75 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
  76
  77 #undef PROFILE_THE_BEAST
  78 #undef INC_SCALING
  79
  80 typedef unsigned char ubyte;  /* unsigned 8-bit sample; used below for the plane pointers */
  81 typedef signed char   sbyte;  /* signed 8-bit counterpart */
  82
  83
  84 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
  85    homogeneous vector registers x0,x1,x2 are interleaved with the
  86    following technique:
  87
  88       o0 = vec_mergeh (x0,x1);
  89       o1 = vec_perm (o0, x2, perm_rgb_0);
  90       o2 = vec_perm (o0, x2, perm_rgb_1);
  91       o3 = vec_mergel (x0,x1);
  92       o4 = vec_perm (o3,o2,perm_rgb_2);
  93       o5 = vec_perm (o3,o2,perm_rgb_3);
  94
  95   perm_rgb_0:   o0(RG).h v1(B) --> o1*
  96               0   1  2   3   4
  97              rgbr|gbrg|brgb|rgbr
  98              0010 0100 1001 0010
  99              0102 3145 2673 894A
 100
 101   perm_rgb_1:   o0(RG).h v1(B) --> o2
 102               0   1  2   3   4
 103              gbrg|brgb|bbbb|bbbb
 104              0100 1001 1111 1111
 105              B5CD 6EF7 89AB CDEF
 106
 107   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
 108               0   1  2   3   4
 109              gbrg|brgb|rgbr|gbrg
 110              1111 1111 0010 0100
 111              89AB CDEF 0182 3945
 112
 113   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
 114               0   1  2   3   4
 115              brgb|rgbr|gbrg|brgb
 116              1001 0010 0100 1001
 117              a67b 89cA BdCD eEFf
 118
 119 */
 120 static /* vec_perm selectors used by vec_merge3: 0x00-0x0f index the first operand, 0x10-0x1f the second */
 121 const vector unsigned char
 122   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
 123                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), /* (o0,x2) -> y0 */
 124   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
 125                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), /* (o0,x2) -> o2, the part y0 did not consume */
 126   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 127                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), /* (o3,o2) -> y1 */
 128   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
 129                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); /* (o3,o2) -> y2 */
 130
 131 #define vec_merge3(x2,x1,x0,y0,y1,y2)    /* interleave three vectors into y0,y1,y2; note the inputs are named x2,x1,x0 in that order */ \
 132 do {                                     \
 133   typeof(x0) o0,o2,o3;                   \
 134       o0 = vec_mergeh (x0,x1);           /* pair up x0/x1, high halves */ \
 135       y0 = vec_perm (o0, x2, perm_rgb_0);/* first output vector */ \
 136       o2 = vec_perm (o0, x2, perm_rgb_1);/* remainder of o0 and x2, consumed by the two permutes below */ \
 137       o3 = vec_mergel (x0,x1);           /* pair up x0/x1, low halves */ \
 138       y1 = vec_perm (o3,o2,perm_rgb_2);  /* second output vector */ \
 139       y2 = vec_perm (o3,o2,perm_rgb_3);  /* third output vector */ \
 140 } while(0)
 141
 142 #define vec_mstrgb24(x0,x1,x2,ptr)        \
 143 do {                                     \
 144   typeof(x0) _0,_1,_2;                   \
 145   vec_merge3 (x0,x1,x2,_0,_1,_2);        \
 146   vec_st (_0, 0, ptr++);                 \
 147   vec_st (_1, 0, ptr++);                 \
 148   vec_st (_2, 0, ptr++);                 \
 149 }  while (0);
 150
 151 #define vec_mstbgr24(x0,x1,x2,ptr)       \
 152 do {                                     \
 153   typeof(x0) _0,_1,_2;                   \
 154   vec_merge3 (x2,x1,x0,_0,_1,_2);        \
 155   vec_st (_0, 0, ptr++);                 \
 156   vec_st (_1, 0, ptr++);                 \
 157   vec_st (_2, 0, ptr++);                 \
 158 }  while (0);
 159
 160 /* pack the pixels in rgb0 format
 161    msb R
 162    lsb 0
 163 */
 164 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
 165 do {                                                                                   \
 166   T _0,_1,_2,_3;                                                                       \
 167   _0 = vec_mergeh (x0,x1);                                                             \
 168   _1 = vec_mergeh (x2,x3);                                                             \
 169   _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
 170   _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
 171   vec_st (_2, 0*16, (T *)ptr);                                                         \
 172   vec_st (_3, 1*16, (T *)ptr);                                                         \
 173   _0 = vec_mergel (x0,x1);                                                             \
 174   _1 = vec_mergel (x2,x3);                                                             \
 175   _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
 176   _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
 177   vec_st (_2, 2*16, (T *)ptr);                                                         \
 178   vec_st (_3, 3*16, (T *)ptr);                                                         \
 179   ptr += 4;                                                                            \
 180 }  while (0);
 181
 182 /*
 183
 184   | 1     0       1.4021   | | Y |
 185   | 1    -0.3441 -0.7142   |x| Cb|
 186   | 1     1.7718  0        | | Cr|
 187
 188
 189   Y:      [-128 127]
 190   Cb/Cr : [-128 127]
 191
 192   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
 193
 194 */
 195
 196
 197
 198
 199 #define vec_unh(x) /* widen bytes 0..7 of x to eight 16-bit lanes; selector 0x10 picks a byte of the zero vector */ \
 200   (vector signed short) \
 201     vec_perm(x,(typeof(x))AVV(0),\
 202              (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
 203                                     0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
 204 #define vec_unl(x) /* widen bytes 8..15 of x to eight 16-bit lanes; selector 0x10 picks a byte of the zero vector */ \
 205   (vector signed short) \
 206     vec_perm(x,(typeof(x))AVV(0),\
 207              (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
 208                                     0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
 209
 210 #define vec_clip(x) /* clamp every element of x to the range [16,235] */ \
 211   vec_max (vec_min (x, (typeof(x))AVV(235)), (typeof(x))AVV(16))
 212
 213 #define vec_packclp_a(x,y) /* vec_clip both inputs, then vec_pack them into one byte vector */ \
 214   (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
 215
 216 #define vec_packclp(x,y) /* floor x and y at 0 with vec_max, then vec_packs the pair into one byte vector */ \
 217   (vector unsigned char)vec_packs \
 218       ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
 219        (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
 220
 221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
 222
 223 /* Convert one vector of 16-bit Y,U,V samples to R,G,B using the coefficients held in c; results are stored through R,G,B. */
 224 static inline void cvtyuvtoRGB (SwsContext *c,
 225                            vector signed short Y, vector signed short U, vector signed short V,
 226                            vector signed short *R, vector signed short *G, vector signed short *B)
 227 {
 228   vector signed   short vx,ux,uvx;
 229   /* scale/offset luma with CY/OY, then subtract 128 from both chroma vectors */
 230   Y = vec_mradds (Y, c->CY, c->OY);
 231   U  = vec_sub (U,(vector signed short)
 232                         vec_splat((vector signed short)AVV(128),0));
 233   V  = vec_sub (V,(vector signed short)
 234                         vec_splat((vector signed short)AVV(128),0));
 235   /* blue: pre-shifted U times CBU, accumulated onto Y */
 236   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
 237   ux = vec_sl (U, c->CSHIFT);
 238   *B = vec_mradds (ux, c->CBU, Y);
 239   /* red: pre-shifted V times CRV, accumulated onto Y */
 240   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
 241   vx = vec_sl (V, c->CSHIFT);
 242   *R = vec_mradds (vx, c->CRV, Y);
 243   /* green: the U term is accumulated onto Y first, then the V term onto that */
 244   // uvx = ((CGU*u) + (CGV*v))>>15;
 245   uvx = vec_mradds (U, c->CGU, Y);
 246   *G = vec_mradds (V, c->CGV, uvx);
 247 }
 248
 249
 250 /*
 251   ------------------------------------------------------------------------------
 252   CS converters
 253   ------------------------------------------------------------------------------
 254 */
 255
 256
 257 #define DEFCSP420_CVT(name,out_pixels)                                     \
 258 static int altivec_##name (SwsContext *c,                                  \
 259                                 unsigned char **in, int *instrides,        \
 260                                 int srcSliceY,  int srcSliceH,             \
 261                                 unsigned char **oplanes, int *outstrides)  \
 262 {                                                                          \
 263   int w = c->srcW;                                                         \
 264   int h = srcSliceH;                                                       \
 265   int i,j;                                                                 \
 266   int instrides_scl[3];                                                    \
 267   vector unsigned char y0,y1;                                              \
 268                                                                            \
 269   vector signed char  u,v;                                                 \
 270                                                                            \
 271   vector signed short Y0,Y1,Y2,Y3;                                         \
 272   vector signed short U,V;                                                 \
 273   vector signed short vx,ux,uvx;                                           \
 274   vector signed short vx0,ux0,uvx0;                                        \
 275   vector signed short vx1,ux1,uvx1;                                        \
 276   vector signed short R0,G0,B0;                                            \
 277   vector signed short R1,G1,B1;                                            \
 278   vector unsigned char R,G,B;                                              \
 279                                                                            \
 280   vector unsigned char *uivP, *vivP;                                       \
 281   vector unsigned char align_perm;                                         \
 282                                                                            \
 283   vector signed short                                                      \
 284     lCY  = c->CY,                                                          \
 285     lOY  = c->OY,                                                          \
 286     lCRV = c->CRV,                                                         \
 287     lCBU = c->CBU,                                                         \
 288     lCGU = c->CGU,                                                         \
 289     lCGV = c->CGV;                                                         \
 290                                                                            \
 291   vector unsigned short lCSHIFT = c->CSHIFT;                               \
 292                                                                            \
 293   ubyte *y1i   = in[0];                                                    \
 294   ubyte *y2i   = in[0]+w;                                                  \
 295   ubyte *ui    = in[1];                                                    \
 296   ubyte *vi    = in[2];                                                    \
 297                                                                            \
 298   vector unsigned char *oute                                               \
 299     = (vector unsigned char *)                                             \
 300         (oplanes[0]+srcSliceY*outstrides[0]);                              \
 301   vector unsigned char *outo                                               \
 302     = (vector unsigned char *)                                             \
 303         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                \
 304                                                                            \
 305                                                                            \
 306   instrides_scl[0] = instrides[0];                                         \
 307   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
 308   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
 309                                                                            \
 310                                                                            \
 311   for (i=0;i<h/2;i++) {                                                    \
 312     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
 313     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
 314                                                                            \
 315     for (j=0;j<w/16;j++) {                                                 \
 316                                                                            \
 317       y0 = vec_ldl (0,y1i);                                                \
 318       y1 = vec_ldl (0,y2i);                                                \
 319       uivP = (vector unsigned char *)ui;                                   \
 320       vivP = (vector unsigned char *)vi;                                   \
 321                                                                            \
 322       align_perm = vec_lvsl (0, ui);                                       \
 323       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
 324                                                                            \
 325       align_perm = vec_lvsl (0, vi);                                       \
 326       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
 327                                                                            \
 328       u  = (vector signed char)                                            \
 329                 vec_sub (u,(vector signed char)                            \
 330                                 vec_splat((vector signed char)AVV(128),0));\
 331       v  = (vector signed char)                                            \
 332                 vec_sub (v,(vector signed char)                            \
 333                                 vec_splat((vector signed char)AVV(128),0));\
 334                                                                            \
 335       U  = vec_unpackh (u);                                                \
 336       V  = vec_unpackh (v);                                                \
 337                                                                            \
 338                                                                            \
 339         Y0 = vec_unh (y0);                                                 \
 340         Y1 = vec_unl (y0);                                                 \
 341         Y2 = vec_unh (y1);                                                 \
 342         Y3 = vec_unl (y1);                                                 \
 343                                                                            \
 344         Y0 = vec_mradds (Y0, lCY, lOY);                                    \
 345         Y1 = vec_mradds (Y1, lCY, lOY);                                    \
 346         Y2 = vec_mradds (Y2, lCY, lOY);                                    \
 347         Y3 = vec_mradds (Y3, lCY, lOY);                                    \
 348                                                                            \
 349         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
 350         ux = vec_sl (U, lCSHIFT);                                          \
 351         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));                   \
 352         ux0  = vec_mergeh (ux,ux);                                         \
 353         ux1  = vec_mergel (ux,ux);                                         \
 354                                                                            \
 355         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
 356         vx = vec_sl (V, lCSHIFT);                                          \
 357         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));                   \
 358         vx0  = vec_mergeh (vx,vx);                                         \
 359         vx1  = vec_mergel (vx,vx);                                         \
 360                                                                            \
 361         /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
 362         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));                   \
 363         uvx = vec_mradds (V, lCGV, uvx);                                   \
 364         uvx0 = vec_mergeh (uvx,uvx);                                       \
 365         uvx1 = vec_mergel (uvx,uvx);                                       \
 366                                                                            \
 367         R0 = vec_add (Y0,vx0);                                             \
 368         G0 = vec_add (Y0,uvx0);                                            \
 369         B0 = vec_add (Y0,ux0);                                             \
 370         R1 = vec_add (Y1,vx1);                                             \
 371         G1 = vec_add (Y1,uvx1);                                            \
 372         B1 = vec_add (Y1,ux1);                                             \
 373                                                                            \
 374         R  = vec_packclp (R0,R1);                                          \
 375         G  = vec_packclp (G0,G1);                                          \
 376         B  = vec_packclp (B0,B1);                                          \
 377                                                                            \
 378         out_pixels(R,G,B,oute);                                            \
 379                                                                            \
 380         R0 = vec_add (Y2,vx0);                                             \
 381         G0 = vec_add (Y2,uvx0);                                            \
 382         B0 = vec_add (Y2,ux0);                                             \
 383         R1 = vec_add (Y3,vx1);                                             \
 384         G1 = vec_add (Y3,uvx1);                                            \
 385         B1 = vec_add (Y3,ux1);                                             \
 386         R  = vec_packclp (R0,R1);                                          \
 387         G  = vec_packclp (G0,G1);                                          \
 388         B  = vec_packclp (B0,B1);                                          \
 389                                                                            \
 390                                                                            \
 391         out_pixels(R,G,B,outo);                                            \
 392                                                                            \
 393       y1i  += 16;                                                          \
 394       y2i  += 16;                                                          \
 395       ui   += 8;                                                           \
 396       vi   += 8;                                                           \
 397                                                                            \
 398     }                                                                      \
 399                                                                            \
 400     outo += (outstrides[0])>>4;                                            \
 401     oute += (outstrides[0])>>4;                                            \
 402                                                                            \
 403     ui    += instrides_scl[1];                                             \
 404     vi    += instrides_scl[2];                                             \
 405     y1i   += instrides_scl[0];                                             \
 406     y2i   += instrides_scl[0];                                             \
 407   }                                                                        \
 408   return srcSliceH;                                                        \
 409 }
 410
 411
 412 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr) /* per pixel: 0,c,b,a */
 413 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr) /* per pixel: c,b,a,0 */
 414 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr) /* per pixel: a,b,c,0 */
 415 #define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr) /* per pixel: 0,a,b,c */
 416 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) /* ends up as vec_merge3(a,b,c,...) */
 417 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(c,b,a,ptr) /* NOTE(review): vec_mstbgr24 reverses again, so this is also vec_merge3(a,b,c,...), identical to out_rgb24 -- confirm the intended byte order */
 418
 419 DEFCSP420_CVT (yuv2_abgr32, out_abgr) /* defines altivec_yuv2_abgr32 */
 420 #if 1
 421 DEFCSP420_CVT (yuv2_bgra32, out_argb) /* defines altivec_yuv2_bgra32; NOTE(review): built with out_argb, not out_bgra -- confirm this is intended */
 422 #else
 423 static int altivec_yuv2_bgra32 (SwsContext *c,
 424                                 unsigned char **in, int *instrides,
 425                                 int srcSliceY,  int srcSliceH,
 426                                 unsigned char **oplanes, int *outstrides)
 427 { /* hand-expanded DEFCSP420_CVT body using out_argb; only compiled when the #if above is flipped. Returns srcSliceH. */
 428   int w = c->srcW;
 429   int h = srcSliceH;
 430   int i,j;
 431   int instrides_scl[3];
 432   vector unsigned char y0,y1;
 433
 434   vector signed char  u,v;
 435
 436   vector signed short Y0,Y1,Y2,Y3;
 437   vector signed short U,V;
 438   vector signed short vx,ux,uvx;
 439   vector signed short vx0,ux0,uvx0;
 440   vector signed short vx1,ux1,uvx1;
 441   vector signed short R0,G0,B0;
 442   vector signed short R1,G1,B1;
 443   vector unsigned char R,G,B;
 444
 445   vector unsigned char *uivP, *vivP;
 446   vector unsigned char align_perm;
 447
 448   vector signed short
 449     lCY  = c->CY,
 450     lOY  = c->OY,
 451     lCRV = c->CRV,
 452     lCBU = c->CBU,
 453     lCGU = c->CGU,
 454     lCGV = c->CGV;
 455
 456   vector unsigned short lCSHIFT = c->CSHIFT;
 457   /* y1i/y2i: even/odd luma row of a pair; the odd row is one stride on */
 458   ubyte *y1i   = in[0];
 459   ubyte *y2i   = in[0]+instrides[0];
 460   ubyte *ui    = in[1];
 461   ubyte *vi    = in[2];
 462   /* oute/outo: destination of the even/odd row of the pair */
 463   vector unsigned char *oute
 464     = (vector unsigned char *)
 465         (oplanes[0]+srcSliceY*outstrides[0]);
 466   vector unsigned char *outo
 467     = (vector unsigned char *)
 468         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
 469
 470   /* end-of-row corrections: luma skips two rows per pass, chroma one */
 471   instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */
 472   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
 473   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
 474
 475
 476   for (i=0;i<h/2;i++) {
 477     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
 478     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
 479
 480     for (j=0;j<w/16;j++) {
 481
 482       y0 = vec_ldl (0,y1i);
 483       y1 = vec_ldl (0,y2i);
 484       uivP = (vector unsigned char *)ui;
 485       vivP = (vector unsigned char *)vi;
 486
 487       align_perm = vec_lvsl (0, ui);
 488       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
 489
 490       align_perm = vec_lvsl (0, vi);
 491       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
 492       u  = (vector signed char)
 493                 vec_sub (u,(vector signed char)
 494                                 vec_splat((vector signed char)AVV(128),0));
 495
 496       v  = (vector signed char)
 497                 vec_sub (v, (vector signed char)
 498                                 vec_splat((vector signed char)AVV(128),0));
 499
 500       U  = vec_unpackh (u);
 501       V  = vec_unpackh (v);
 502
 503
 504         Y0 = vec_unh (y0);
 505         Y1 = vec_unl (y0);
 506         Y2 = vec_unh (y1);
 507         Y3 = vec_unl (y1);
 508
 509         Y0 = vec_mradds (Y0, lCY, lOY);
 510         Y1 = vec_mradds (Y1, lCY, lOY);
 511         Y2 = vec_mradds (Y2, lCY, lOY);
 512         Y3 = vec_mradds (Y3, lCY, lOY);
 513
 514         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
 515         ux = vec_sl (U, lCSHIFT);
 516         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
 517         ux0  = vec_mergeh (ux,ux);
 518         ux1  = vec_mergel (ux,ux);
 519
 520         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */
 521         vx = vec_sl (V, lCSHIFT);
 522         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
 523         vx0  = vec_mergeh (vx,vx);
 524         vx1  = vec_mergel (vx,vx);
 525         /* uvx = ((CGU*u) + (CGV*v))>>15 */
 526         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
 527         uvx = vec_mradds (V, lCGV, uvx);
 528         uvx0 = vec_mergeh (uvx,uvx);
 529         uvx1 = vec_mergel (uvx,uvx);
 530         R0 = vec_add (Y0,vx0);
 531         G0 = vec_add (Y0,uvx0);
 532         B0 = vec_add (Y0,ux0);
 533         R1 = vec_add (Y1,vx1);
 534         G1 = vec_add (Y1,uvx1);
 535         B1 = vec_add (Y1,ux1);
 536         R  = vec_packclp (R0,R1);
 537         G  = vec_packclp (G0,G1);
 538         B  = vec_packclp (B0,B1);
 539
 540         out_argb(R,G,B,oute);
 541         R0 = vec_add (Y2,vx0);
 542         G0 = vec_add (Y2,uvx0);
 543         B0 = vec_add (Y2,ux0);
 544         R1 = vec_add (Y3,vx1);
 545         G1 = vec_add (Y3,uvx1);
 546         B1 = vec_add (Y3,ux1);
 547         R  = vec_packclp (R0,R1);
 548         G  = vec_packclp (G0,G1);
 549         B  = vec_packclp (B0,B1);
 550
 551         out_argb(R,G,B,outo);
 552         y1i  += 16;
 553         y2i  += 16;
 554         ui   += 8;
 555         vi   += 8;
 556
 557     }
 558
 559     outo += (outstrides[0])>>4;
 560     oute += (outstrides[0])>>4;
 561
 562     ui    += instrides_scl[1];
 563     vi    += instrides_scl[2];
 564     y1i   += instrides_scl[0];
 565     y2i   += instrides_scl[0];
 566   }
 567   return srcSliceH;
 568 }
 569
 570 #endif
 571
 572
 573 DEFCSP420_CVT (yuv2_rgba32, out_rgba) /* defines altivec_yuv2_rgba32 */
 574 DEFCSP420_CVT (yuv2_argb32, out_argb) /* defines altivec_yuv2_argb32 */
 575 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24) /* defines altivec_yuv2_rgb24 */
 576 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24) /* defines altivec_yuv2_bgr24 */
 577
 578
 579 // uyvy|uyvy|uyvy|uyvy
 580 // 0123 4567 89ab cdef
 581 static /* vec_perm selectors that split a packed UYVY vector; 0x10 picks a byte of the second (zero) operand */
 582 const vector unsigned char
 583   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
 584                                    0x10,0x04,0x10,0x04,
 585                                    0x10,0x08,0x10,0x08,
 586                                    0x10,0x0c,0x10,0x0c), /* bytes 0,4,8,c, each placed in two adjacent 16-bit lanes */
 587   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
 588                                    0x10,0x06,0x10,0x06,
 589                                    0x10,0x0A,0x10,0x0A,
 590                                    0x10,0x0E,0x10,0x0E), /* bytes 2,6,A,E, each placed in two adjacent 16-bit lanes */
 591   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
 592                                    0x10,0x05,0x10,0x07,
 593                                    0x10,0x09,0x10,0x0B,
 594                                    0x10,0x0D,0x10,0x0F); /* the odd bytes 1,3,...,F, one per 16-bit lane */
 595
 596 /*
 597   Packed UYVY -> 32-bit RGB slice converter (written to play live CCIR raw video); stores pixels with out_rgba and returns srcSliceH.
 598 */
 599 static int altivec_uyvy_rgb32 (SwsContext *c,
 600                                unsigned char **in, int *instrides,
 601                                int srcSliceY,   int srcSliceH,
 602                                unsigned char **oplanes, int *outstrides)
 603 {
 604   int w = c->srcW;
 605   int h = srcSliceH;
 606   int i,j;
 607   vector unsigned char uyvy;
 608   vector signed   short Y,U,V;
 609   vector signed   short R0,G0,B0,R1,G1,B1;
 610   vector unsigned char  R,G,B;
 611   vector unsigned char *out;
 612   ubyte *img;
 613
 614   for (i=0;i<h;i++) {
 615     /* address each row through its stride so padded buffers work too */
 616     img = in[0]+i*instrides[0];
 617     out = (vector unsigned char *)(oplanes[0]+(srcSliceY+i)*outstrides[0]);
 618
 619     for (j=0;j<w/16;j++) {
 620       uyvy = vec_ld (0, img);
 621       U = (vector signed short)
 622         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
 623
 624       V = (vector signed short)
 625         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
 626
 627       Y = (vector signed short)
 628         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
 629
 630       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
 631
 632       uyvy = vec_ld (16, img);
 633       U = (vector signed short)
 634         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
 635
 636       V = (vector signed short)
 637         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
 638
 639       Y = (vector signed short)
 640         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
 641
 642       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
 643
 644       R  = vec_packclp (R0,R1);
 645       G  = vec_packclp (G0,G1);
 646       B  = vec_packclp (B0,B1);
 647
 648       //      vec_mstbgr24 (R,G,B, out);
 649       out_rgba (R,G,B,out);
 650
 651       img += 32;
 652     }
 653   }
 654   return srcSliceH;
 655 }
 656
 657
 658
 659 /* Ok currently the acceleration routine only supports
 660    inputs of widths a multiple of 16
 661    and heights a multiple of 2
 662
 663    So we just fall back to the C codes for this.
 664 */
 665 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
 666 {
 667   /* 1: planar source for the DEFCSP420_CVT converters, 0: packed UYVY */
 668   int planar;
 669
 670   /* without the AltiVec flag, or with a width that is not a multiple
 671      of 16, return NULL so the caller falls back to the C code */
 672   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC) || (c->srcW & 0xf))
 673     return NULL;
 674
 675   /* classify the source format first */
 676   switch (c->srcFormat) {
 677   case IMGFMT_UYVY:
 678     planar = 0;
 679     break;
 680   case IMGFMT_YVU9:
 681   case IMGFMT_IF09:
 682   case IMGFMT_YV12:
 683   case IMGFMT_I420:
 684   case IMGFMT_IYUV:
 685   case IMGFMT_CLPL:
 686   case IMGFMT_Y800:
 687   case IMGFMT_Y8:
 688   case IMGFMT_NV12:
 689   case IMGFMT_NV21:
 690     planar = 1;
 691     break;
 692   default:
 693     return NULL;
 694   }
 695
 696   if (!planar) {
 697     /* UYVY can only be turned into RGB32 here */
 698     if (c->dstFormat != IMGFMT_RGB32)
 699       return NULL;
 700     MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
 701     return altivec_uyvy_rgb32;
 702   }
 703
 704   /* the planar converters consume rows in pairs */
 705   if (c->srcH & 0x1)
 706     return NULL;
 707
 708   switch (c->dstFormat) {
 709   case IMGFMT_RGB24:
 710     MSG_WARN("ALTIVEC: Color Space RGB24\n");
 711     return altivec_yuv2_rgb24;
 712   case IMGFMT_BGR24:
 713     MSG_WARN("ALTIVEC: Color Space BGR24\n");
 714     return altivec_yuv2_bgr24;
 715   case IMGFMT_RGB32:
 716     MSG_WARN("ALTIVEC: Color Space ARGB32\n");
 717     return altivec_yuv2_argb32;
 718   case IMGFMT_BGR32:
 719     MSG_WARN("ALTIVEC: Color Space BGRA32\n");
 720     return altivec_yuv2_bgra32;
 721   default: return NULL;
 722   }
 723 }
 724
 725 static uint16_t roundToInt16(int64_t f){
 726         int r= (f + (1<<15))>>16;
 727              if(r<-0x7FFF) return 0x8000;
 728         else if(r> 0x7FFF) return 0x7FFF;
 729         else               return r;
 730 }
 731
 732 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
 733 {
 734   union {
 735         signed short tmp[8] __attribute__ ((aligned(16)));
 736         vector signed short vec;
 737         } buf;
 738
 739   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
 740   buf.tmp[1] =  -256*brightness;                                        //oy
 741   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
 742   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
 743   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
 744   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
 745
 746
 747   c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);
 748   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
 749   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
 750   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
 751   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
 752   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
 753   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
 754 #if 0
 755 {
 756 int i;
 757 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
 758 for (i=0; i<6;i++)
 759   printf("%s %d ", v[i],buf.tmp[i] );
 760   printf("\n");
 761 }
 762 #endif
 763  return;
 764 }
 765
 766
 767 void
 768 altivec_yuv2packedX (SwsContext *c,
 769                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
 770                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
 771                        uint8_t *dest, int dstW, int dstY)
 772 {
 773   int i,j;
 774   short tmp __attribute__((aligned (16)));
 775   int16_t *p;
 776   short *f;
 777   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
 778   vector signed short R0,G0,B0,R1,G1,B1;
 779
 780   vector unsigned char R,G,B,pels[3];
 781   vector unsigned char *out,*nout;
 782
 783   vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0);
 784   vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
 785   unsigned long scratch[16] __attribute__ ((aligned (16)));
 786
 787   vector signed short *vYCoeffsBank, *vCCoeffsBank;
 788
 789   vector signed short *YCoeffs, *CCoeffs;
 790
 791   vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
 792   vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
 793
 794   for (i=0;i<lumFilterSize*dstW;i++) {
 795     tmp = c->vLumFilter[i];
 796     p = &vYCoeffsBank[i];
 797     for (j=0;j<8;j++)
 798       p[j] = tmp;
 799   }
 800
 801   for (i=0;i<chrFilterSize*dstW;i++) {
 802     tmp = c->vChrFilter[i];
 803     p = &vCCoeffsBank[i];
 804     for (j=0;j<8;j++)
 805       p[j] = tmp;
 806   }
 807
 808   YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
 809   CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
 810
 811   out = (vector unsigned char *)dest;
 812
 813   for(i=0; i<dstW; i+=16){
 814     Y0 = RND;
 815     Y1 = RND;
 816     /* extract 16 coeffs from lumSrc */
 817     for(j=0; j<lumFilterSize; j++) {
 818       X0 = vec_ld (0,  &lumSrc[j][i]);
 819       X1 = vec_ld (16, &lumSrc[j][i]);
 820       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 821       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 822     }
 823
 824     U = RND;
 825     V = RND;
 826     /* extract 8 coeffs from U,V */
 827     for(j=0; j<chrFilterSize; j++) {
 828       X  = vec_ld (0, &chrSrc[j][i/2]);
 829       U  = vec_mradds (X, CCoeffs[j], U);
 830       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 831       V  = vec_mradds (X, CCoeffs[j], V);
 832     }
 833
 834     /* scale and clip signals */
 835     Y0 = vec_sra (Y0, SCL);
 836     Y1 = vec_sra (Y1, SCL);
 837     U  = vec_sra (U,  SCL);
 838     V  = vec_sra (V,  SCL);
 839
 840     Y0 = vec_clip (Y0);
 841     Y1 = vec_clip (Y1);
 842     U  = vec_clip (U);
 843     V  = vec_clip (V);
 844
 845     /* now we have
 846       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 847       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 848
 849       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 850       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 851       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 852     */
 853
 854     U0 = vec_mergeh (U,U);
 855     V0 = vec_mergeh (V,V);
 856
 857     U1 = vec_mergel (U,U);
 858     V1 = vec_mergel (V,V);
 859
 860     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 861     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 862
 863     R  = vec_packclp (R0,R1);
 864     G  = vec_packclp (G0,G1);
 865     B  = vec_packclp (B0,B1);
 866
 867     out_rgba (R,G,B,out);
 868   }
 869
 870   if (i < dstW) {
 871     i -= 16;
 872
 873     Y0 = RND;
 874     Y1 = RND;
 875     /* extract 16 coeffs from lumSrc */
 876     for(j=0; j<lumFilterSize; j++) {
 877       X0 = vec_ld (0,  &lumSrc[j][i]);
 878       X1 = vec_ld (16, &lumSrc[j][i]);
 879       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 880       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 881     }
 882
 883     U = RND;
 884     V = RND;
 885     /* extract 8 coeffs from U,V */
 886     for(j=0; j<chrFilterSize; j++) {
 887       X  = vec_ld (0, &chrSrc[j][i/2]);
 888       U  = vec_mradds (X, CCoeffs[j], U);
 889       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
 890       V  = vec_mradds (X, CCoeffs[j], V);
 891     }
 892
 893     /* scale and clip signals */
 894     Y0 = vec_sra (Y0, SCL);
 895     Y1 = vec_sra (Y1, SCL);
 896     U  = vec_sra (U,  SCL);
 897     V  = vec_sra (V,  SCL);
 898
 899     Y0 = vec_clip (Y0);
 900     Y1 = vec_clip (Y1);
 901     U  = vec_clip (U);
 902     V  = vec_clip (V);
 903
 904     /* now we have
 905        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 906        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 907
 908        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 909        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 910        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 911     */
 912
 913     U0 = vec_mergeh (U,U);
 914     V0 = vec_mergeh (V,V);
 915
 916     U1 = vec_mergel (U,U);
 917     V1 = vec_mergel (V,V);
 918
 919     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 920     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 921
 922     R  = vec_packclp (R0,R1);
 923     G  = vec_packclp (G0,G1);
 924     B  = vec_packclp (B0,B1);
 925
 926     nout = (vector unsigned char *)scratch;
 927     out_rgba (R,G,B,nout);
 928
 929     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
 930   }
 931
 932   if (vYCoeffsBank) free (vYCoeffsBank);
 933   if (vCCoeffsBank) free (vCCoeffsBank);
 934
 935 }