/*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
Convert I420/YV12 to RGB in various formats. It rejects images that are
not in a 420 format, whose width is not a multiple of 16, or whose
height is not a multiple of 2. Rejected images are deferred to the C
fallback code.

Lots of optimizations to be done here.

1. Need to fix the saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

2. The inefficient chroma loading needs a bit of brushing up.

3. Analysis of pipeline stalls needs to be done. Use Shark to identify
   pipeline stalls.


MODIFIED to calculate coefficients from the currently selected color space.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of AltiVec.

ADDED altivec_yuv2packedX vertical scale + RGB converter

March 27, 2004
PERFORMANCE ANALYSIS

The C version uses 25% of the processor, or ~250 MIPS, for D1 rawvideo
used as a test.
The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
same sequence.

720 * 480 * 30 ~ 10 Mpixel/s

so we have roughly 10 clocks per pixel. This is too high; something has
to be wrong.

OPTIMIZED clip code to use vec_max and vec_packs, removing the
need for vec_min.

OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
the input video frame: it was just decompressed, so it probably resides in the
L1 cache. However, we are creating the output video stream. This needs to use
the DSTST instruction to optimize for the cache. We couple this with the fact
that we are not going to be visiting the input buffer again, so we mark it
Least Recently Used. This shaves 25% of the processor cycles off.

Now memcpy is the largest MIPS consumer in the system, probably due
to the inefficient X11 stuff.

GL libraries seem to be very slow on this machine, a 1.33 GHz PowerBook running
Jaguar; this is not the case for my 1 GHz PowerBook. I thought it might be
a versioning issue, however I have libGL.1.2.dylib on both
machines. (We need to figure this out now.)

GL2 libraries work now with the patch for RGB32.

NOTE: the Quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.

Integrated luma prescaling for saturation/contrast/brightness
adjustment.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"

#undef PROFILE_THE_BEAST
#undef INC_SCALING

typedef unsigned char ubyte;
typedef signed char   sbyte;


/* RGB interleaver: 16 planar pels with 8-bit samples per channel in
   homogeneous vector registers x0,x1,x2 are interleaved with the
   following technique:

      o0 = vec_mergeh (x0,x1);
      o1 = vec_perm (o0, x2, perm_rgb_0);
      o2 = vec_perm (o0, x2, perm_rgb_1);
      o3 = vec_mergel (x0,x1);
      o4 = vec_perm (o3,o2,perm_rgb_2);
      o5 = vec_perm (o3,o2,perm_rgb_3);

  perm_rgb_0:   o0(RG).h v1(B) --> o1*
              0   1  2   3   4
             rgbr|gbrg|brgb|rgbr
             0010 0100 1001 0010
             0102 3145 2673 894A

  perm_rgb_1:   o0(RG).h v1(B) --> o2
              0   1  2   3   4
             gbrg|brgb|bbbb|bbbb
             0100 1001 1111 1111
             B5CD 6EF7 89AB CDEF

  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
              0   1  2   3   4
             gbrg|brgb|rgbr|gbrg
             1111 1111 0010 0100
             89AB CDEF 0182 3945

  perm_rgb_3:   o3(RG).l o2(rgbB.l) --> o5*
              0   1  2   3   4
             brgb|rgbr|gbrg|brgb
             1001 0010 0100 1001
             a67b 89cA BdCD eEFf

*/
static
const vector unsigned char
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};

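/* In a vec_perm control vector, indices 0x00-0x0f select bytes from the
   first operand and 0x10-0x1f from the second, so perm_rgb_0/perm_rgb_1
   pull R/G bytes from the merged o0 vector and B bytes from x2, matching
   the diagram above. */
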
#define vec_merge3(x2,x1,x0,y0,y1,y2)       \
do {                                        \
    __typeof__(x0) o0,o2,o3;                \
        o0 = vec_mergeh (x0,x1);            \
        y0 = vec_perm (o0, x2, perm_rgb_0); \
        o2 = vec_perm (o0, x2, perm_rgb_1); \
        o3 = vec_mergel (x0,x1);            \
        y1 = vec_perm (o3,o2,perm_rgb_2);   \
        y2 = vec_perm (o3,o2,perm_rgb_3);   \
} while(0)

#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

/* pack the pixels in rgb0 format
   msb R
   lsb 0
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
}  while (0)

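/* vec_mstrgb32 interleaves four 16-byte channel vectors x0..x3 into 16
   packed 4-byte pixels: the byte-wise mergeh/mergel steps pair x0/x1 and
   x2/x3, and the halfword merges then weave those pairs together, giving
   the x0,x1,x2,x3 byte order within each pixel. Four vector stores write
   64 bytes and ptr advances by four vectors. */
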
/*

  | 1     0       1.4021   | | Y |
  | 1    -0.3441 -0.7142   |x| Cb|
  | 1     1.7718  0        | | Cr|


  Y:      [-128 127]
  Cb/Cr : [-128 127]

  Typical YUV conversion works on Y in the 0-255 range; this version has
  been optimized for JPEG decode.

*/


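/* Scalar sketch of the fixed-point scheme used below (illustration only):
   vec_mradds(a, b, c) computes saturate(((a * b + 0x4000) >> 15) + c) per
   16-bit lane, so with the coefficients scaled by 2^15 the conversion is
   roughly

       y = ((Y * CY + 0x4000) >> 15) + OY;
       R = y + ((CRV * (V << CSHIFT) + 0x4000) >> 15);
       G = y + ((CGU *  U            + 0x4000) >> 15)
             + ((CGV *  V            + 0x4000) >> 15);
       B = y + ((CBU * (U << CSHIFT) + 0x4000) >> 15);

   after 128 has been subtracted from U and V. */
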
#define vec_unh(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
#define vec_unl(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))

#define vec_clip_s16(x) \
    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))

#define vec_packclp(x,y) \
    (vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0})))

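/* vec_unh/vec_unl zero-extend the high/low 8 bytes of a vector to 16-bit
   lanes (the 0x10 indices pull zero bytes from the second operand);
   vec_clip_s16 clamps each value to the 16..235 studio range; vec_packclp
   clamps negative values to 0 and then uses the saturating pack to produce
   16 unsigned 8-bit results from two signed 16-bit vectors. */
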
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)


static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    vector signed short vx,ux,uvx;

    Y = vec_mradds (Y, c->CY, c->OY);
    U  = vec_sub (U,(vector signed short)
                    vec_splat((vector signed short){128},0));
    V  = vec_sub (V,(vector signed short)
                    vec_splat((vector signed short){128},0));

    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
    ux = vec_sl (U, c->CSHIFT);
    *B = vec_mradds (ux, c->CBU, Y);

    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
    vx = vec_sl (V, c->CSHIFT);
    *R = vec_mradds (vx, c->CRV, Y);

    // uvx = ((CGU*u) + (CGV*v))>>15;
    uvx = vec_mradds (U, c->CGU, Y);
    *G  = vec_mradds (V, c->CGV, uvx);
}


/*
  ------------------------------------------------------------------------------
  CS converters
  ------------------------------------------------------------------------------
*/


#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           const unsigned char **in, int *instrides,    \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;            \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    const ubyte *y1i   = in[0];                                         \
    const ubyte *y2i   = in[0]+instrides[0];                            \
    const ubyte *ui    = in[1];                                         \
    const ubyte *vi    = in[2];                                         \
                                                                        \
    vector unsigned char *oute                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]);                       \
    vector unsigned char *outo                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (const vector unsigned char *)y1i;                  \
            y2ivP = (const vector unsigned char *)y2i;                  \
            uivP  = (const vector unsigned char *)ui;                   \
            vivP  = (const vector unsigned char *)vi;                   \
                                                                        \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        outo  += (outstrides[0])>>4;                                    \
        oute  += (outstrides[0])>>4;                                    \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}


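/* Each DEFCSP420_CVT(name, out_pixels) expansion below defines a complete
   4:2:0 planar -> packed RGB slice converter named altivec_<name>: it
   processes two luma rows and one chroma row per iteration using the same
   fixed-point math as cvtyuvtoRGB, then hands the packed R/G/B vectors to
   the out_pixels store macro, which determines the output byte order. */
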
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)

DEFCSP420_CVT (yuv2_abgr, out_abgr)
DEFCSP420_CVT (yuv2_bgra, out_bgra)
DEFCSP420_CVT (yuv2_rgba, out_rgba)
DEFCSP420_CVT (yuv2_argb, out_argb)
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)


// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static
const vector unsigned char
    demux_u = {0x10,0x00,0x10,0x00,
               0x10,0x04,0x10,0x04,
               0x10,0x08,0x10,0x08,
               0x10,0x0c,0x10,0x0c},
    demux_v = {0x10,0x02,0x10,0x02,
               0x10,0x06,0x10,0x06,
               0x10,0x0A,0x10,0x0A,
               0x10,0x0E,0x10,0x0E},
    demux_y = {0x10,0x01,0x10,0x03,
               0x10,0x05,0x10,0x07,
               0x10,0x09,0x10,0x0B,
               0x10,0x0D,0x10,0x0F};

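/* The demux_* permute vectors extract the U, V and Y bytes from a packed
   UYVY vector, pairing each sample byte with a zero byte (index 0x10
   selects from the zero vector passed as the second vec_perm operand) to
   widen the samples to 16-bit lanes; each chroma byte is duplicated so it
   lines up with the two luma samples it covers. */
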
/*
  this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
                               const unsigned char **in, int *instrides,
                               int srcSliceY,        int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    vector unsigned char uyvy;
    vector signed   short Y,U,V;
    vector signed   short R0,G0,B0,R1,G1,B1;
    vector unsigned char  R,G,B;
    vector unsigned char *out;
    const ubyte *img;

    img = in[0];
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);

    for (i=0;i<h;i++) {
        for (j=0;j<w/16;j++) {
            uyvy = vec_ld (0, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

            uyvy = vec_ld (16, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            //      vec_mstbgr24 (R,G,B, out);
            out_rgba (R,G,B,out);

            img += 32;
        }
    }
    return srcSliceH;
}



/* OK, currently the acceleration routine only supports
   inputs whose width is a multiple of 16
   and whose height is a multiple of 2.

   So we just fall back to the C code for anything else.
*/
SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
{
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return NULL;

    /*
      This seems not to matter too much; I tried a bunch of
      videos with abnormal widths and MPlayer crashes elsewhere.
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
      goes boom with an X11 bad match.
    */
    if ((c->srcW & 0xf) != 0)    return NULL;

    switch (c->srcFormat) {
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        if ((c->srcH & 0x1) != 0)
            return NULL;

        switch(c->dstFormat) {
        case PIX_FMT_RGB24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
            return altivec_yuv2_rgb24;
        case PIX_FMT_BGR24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
            return altivec_yuv2_bgr24;
        case PIX_FMT_ARGB:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
            return altivec_yuv2_argb;
        case PIX_FMT_ABGR:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
            return altivec_yuv2_abgr;
        case PIX_FMT_RGBA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
            return altivec_yuv2_rgba;
        case PIX_FMT_BGRA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
            return altivec_yuv2_bgra;
        default: return NULL;
        }
        break;

    case PIX_FMT_UYVY422:
        switch(c->dstFormat) {
        case PIX_FMT_BGR32:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
            return altivec_uyvy_rgb32;
        default: return NULL;
        }
        break;

    }
    return NULL;
}

void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
{
    union {
        DECLARE_ALIGNED(16, signed short, tmp)[8];
        vector signed short vec;
    } buf;

    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
    buf.tmp[1] =  -256*brightness;                                      //oy
    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv


    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
    return;
}


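/* Vertical scaling + packed RGB output: accumulate lumFilterSize luma taps
   and chrFilterSize chroma taps with vec_mradds, shift the sums back down
   by SCL, clip to 16..235, and convert 16 pixels per iteration with
   cvtyuvtoRGB. The block after the main loop is meant to handle a final
   partial group of pixels via the aligned scratch buffer and memcpy. */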
static av_always_inline void
ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
                       const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrUSrc,
                       const int16_t **chrVSrc, int chrFilterSize,
                       const int16_t **alpSrc, uint8_t *dest,
                       int dstW, int dstY, enum PixelFormat target)
{
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    DECLARE_ALIGNED(16, unsigned int, scratch)[16];

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrUSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrVSrc[j][i/2]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           and after the merges below
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        switch(target) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
        default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           av_get_pix_fmt_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrUSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrVSrc[j][i/2]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           and after the merges below
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(target) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
        default:
            /* Unreachable, I think. */
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                   av_get_pix_fmt_name(c->dstFormat));
            return;
        }

        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

}

#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
                            const int16_t **lumSrc, int lumFilterSize, \
                            const int16_t *chrFilter, const int16_t **chrUSrc, \
                            const int16_t **chrVSrc, int chrFilterSize, \
                            const int16_t **alpSrc, uint8_t *dest, \
                            int dstW, int dstY) \
{ \
    ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                           alpSrc, dest, dstW, dstY, pixfmt); \
}

YUV2PACKEDX_WRAPPER(abgr,  PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra,  PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb,  PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba,  PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);