git.sesse.net Git - ffmpeg/blob - libswscale/ppc/yuv2rgb_altivec.c

   1 /*
   2  * AltiVec acceleration for colorspace conversion
   3  *
   4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
   5  *
   6  * This file is part of Libav.
   7  *
   8  * Libav is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * Libav is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with Libav; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /*
  24 Convert I420 YV12 to RGB in various formats,
  25   it rejects images that are not in 420 formats,
  26   it rejects images that don't have widths of multiples of 16,
  27   it rejects images that don't have heights of multiples of 2.
  28 Reject defers to C simulation code.
  29
  30 Lots of optimizations to be done here.
  31
  32 1. Need to fix saturation code. I just couldn't get it to fly with packs
  33    and adds, so we currently use max/min to clip.
  34
  35 2. The inefficient use of chroma loading needs a bit of brushing up.
  36
  37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
  38    pipeline stalls.
  39
  40
  41 MODIFIED to calculate coeffs from currently selected color space.
  42 MODIFIED core to be a macro where you specify the output format.
  43 ADDED UYVY conversion which is never called due to some thing in swscale.
  44 CORRECTED algorithm selection to be strict on input formats.
  45 ADDED runtime detection of AltiVec.
  46
  47 ADDED altivec_yuv2packedX vertical scl + RGB converter
  48
  49 March 27,2004
  50 PERFORMANCE ANALYSIS
  51
  52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
  53 used as test.
  54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
  55 same sequence.
  56
  57 720 * 480 * 30  ~10MPS
  58
  59 so we have roughly 10 clocks per pixel. This is too high, something has
  60 to be wrong.
  61
  62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
  63 need for vec_min.
  64
  65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
  66 the input video frame, it was just decompressed so it probably resides in L1
  67 caches. However, we are creating the output video stream. This needs to use the
  68 DSTST instruction to optimize for the cache. We couple this with the fact that
  69 we are not going to be visiting the input buffer again so we mark it Least
  70 Recently Used. This shaves 25% of the processor cycles off.
  71
  72 Now memcpy is the largest mips consumer in the system, probably due
  73 to the inefficient X11 stuff.
  74
  75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
  76 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
  77 a versioning issue, however I have libGL.1.2.dylib for both
  78 machines. (We need to figure this out now.)
  79
  80 GL2 libraries work now with patch for RGB32.
  81
  82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
  83
  84 Integrated luma prescaling adjustment for saturation/contrast/brightness
  85 adjustment.
  86 */
  87
  88 #include <stdio.h>
  89 #include <stdlib.h>
  90 #include <string.h>
  91 #include <inttypes.h>
  92 #include <assert.h>
  93 #include "config.h"
  94 #include "libswscale/rgb2rgb.h"
  95 #include "libswscale/swscale.h"
  96 #include "libswscale/swscale_internal.h"
  97 #include "libavutil/cpu.h"
  98 #include "yuv2rgb_altivec.h"
  99
 100 #undef PROFILE_THE_BEAST
 101 #undef INC_SCALING
 102
 103 typedef unsigned char ubyte;
 104 typedef signed char   sbyte;
 105
 106
 107 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
 108    homogeneous vector registers x0,x1,x2 are interleaved with the
 109    following technique:
 110
 111       o0 = vec_mergeh (x0,x1);
 112       o1 = vec_perm (o0, x2, perm_rgb_0);
 113       o2 = vec_perm (o0, x2, perm_rgb_1);
 114       o3 = vec_mergel (x0,x1);
 115       o4 = vec_perm (o3,o2,perm_rgb_2);
 116       o5 = vec_perm (o3,o2,perm_rgb_3);
 117
 118   perm_rgb_0:   o0(RG).h v1(B) --> o1*
 119               0   1  2   3   4
 120              rgbr|gbrg|brgb|rgbr
 121              0010 0100 1001 0010
 122              0102 3145 2673 894A
 123
 124   perm_rgb_1:   o0(RG).h v1(B) --> o2
 125               0   1  2   3   4
 126              gbrg|brgb|bbbb|bbbb
 127              0100 1001 1111 1111
 128              B5CD 6EF7 89AB CDEF
 129
 130   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
 131               0   1  2   3   4
 132              gbrg|brgb|rgbr|gbrg
 133              1111 1111 0010 0100
 134              89AB CDEF 0182 3945
 135
 136   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
 137               0   1  2   3   4
 138              brgb|rgbr|gbrg|brgb
 139              1001 0010 0100 1001
 140              a67b 89cA BdCD eEFf
 141
 142 */
 143 static
 144 const vector unsigned char
 145   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
 146                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
 147   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
 148                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
 149   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 150                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
 151   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
 152                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
 153
 154 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
 155 do {                                        \
 156     __typeof__(x0) o0,o2,o3;                \
 157         o0 = vec_mergeh (x0,x1);            \
 158         y0 = vec_perm (o0, x2, perm_rgb_0); \
 159         o2 = vec_perm (o0, x2, perm_rgb_1); \
 160         o3 = vec_mergel (x0,x1);            \
 161         y1 = vec_perm (o3,o2,perm_rgb_2);   \
 162         y2 = vec_perm (o3,o2,perm_rgb_3);   \
 163 } while(0)
 164
 165 #define vec_mstbgr24(x0,x1,x2,ptr)      \
 166 do {                                    \
 167     __typeof__(x0) _0,_1,_2;            \
 168     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
 169     vec_st (_0, 0, ptr++);              \
 170     vec_st (_1, 0, ptr++);              \
 171     vec_st (_2, 0, ptr++);              \
 172 }  while (0)
 173
 174 #define vec_mstrgb24(x0,x1,x2,ptr)      \
 175 do {                                    \
 176     __typeof__(x0) _0,_1,_2;            \
 177     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
 178     vec_st (_0, 0, ptr++);              \
 179     vec_st (_1, 0, ptr++);              \
 180     vec_st (_2, 0, ptr++);              \
 181 }  while (0)
 182
 183 /* pack the pixels in rgb0 format
 184    msb R
 185    lsb 0
 186 */
 187 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
 188 do {                                                                          \
 189     T _0,_1,_2,_3;                                                            \
 190     _0 = vec_mergeh (x0,x1);                                                  \
 191     _1 = vec_mergeh (x2,x3);                                                  \
 192     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
 193     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
 194     vec_st (_2, 0*16, (T *)ptr);                                              \
 195     vec_st (_3, 1*16, (T *)ptr);                                              \
 196     _0 = vec_mergel (x0,x1);                                                  \
 197     _1 = vec_mergel (x2,x3);                                                  \
 198     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
 199     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
 200     vec_st (_2, 2*16, (T *)ptr);                                              \
 201     vec_st (_3, 3*16, (T *)ptr);                                              \
 202     ptr += 4;                                                                 \
 203 }  while (0)
 204
 205 /*
 206
 207   | 1     0       1.4021   | | Y |
 208   | 1    -0.3441 -0.7142   |x| Cb|
 209   | 1     1.7718  0        | | Cr|
 210
 211
 212   Y:      [-128 127]
 213   Cb/Cr : [-128 127]
 214
 215   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
 216
 217 */
 218
 219
 220
 221
 222 #define vec_unh(x) \
 223     (vector signed short) \
 224         vec_perm(x,(__typeof__(x)){0}, \
 225                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
 226                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
 227 #define vec_unl(x) \
 228     (vector signed short) \
 229         vec_perm(x,(__typeof__(x)){0}, \
 230                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
 231                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
 232
 233 #define vec_clip_s16(x) \
 234     vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
 235                          ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
 236
 237 #define vec_packclp(x,y) \
 238     (vector unsigned char)vec_packs \
 239         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
 240          (vector unsigned short)vec_max (y,((vector signed short) {0})))
 241
 242 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
 243
 244
 245 static inline void cvtyuvtoRGB (SwsContext *c,
 246                                 vector signed short Y, vector signed short U, vector signed short V,
 247                                 vector signed short *R, vector signed short *G, vector signed short *B)
 248 {
 249     vector signed   short vx,ux,uvx;
 250
 251     Y = vec_mradds (Y, c->CY, c->OY);
 252     U  = vec_sub (U,(vector signed short)
 253                     vec_splat((vector signed short){128},0));
 254     V  = vec_sub (V,(vector signed short)
 255                     vec_splat((vector signed short){128},0));
 256
 257     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
 258     ux = vec_sl (U, c->CSHIFT);
 259     *B = vec_mradds (ux, c->CBU, Y);
 260
 261     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
 262     vx = vec_sl (V, c->CSHIFT);
 263     *R = vec_mradds (vx, c->CRV, Y);
 264
 265     // uvx = ((CGU*u) + (CGV*v))>>15;
 266     uvx = vec_mradds (U, c->CGU, Y);
 267     *G  = vec_mradds (V, c->CGV, uvx);
 268 }
 269
 270
 271 /*
 272   ------------------------------------------------------------------------------
 273   CS converters
 274   ------------------------------------------------------------------------------
 275 */
 276
 277
 278 #define DEFCSP420_CVT(name,out_pixels)                                  \
 279 static int altivec_##name (SwsContext *c,                               \
 280                            const unsigned char **in, int *instrides,    \
 281                            int srcSliceY,        int srcSliceH,         \
 282                            unsigned char **oplanes, int *outstrides)    \
 283 {                                                                       \
 284     int w = c->srcW;                                                    \
 285     int h = srcSliceH;                                                  \
 286     int i,j;                                                            \
 287     int instrides_scl[3];                                               \
 288     vector unsigned char y0,y1;                                         \
 289                                                                         \
 290     vector signed char  u,v;                                            \
 291                                                                         \
 292     vector signed short Y0,Y1,Y2,Y3;                                    \
 293     vector signed short U,V;                                            \
 294     vector signed short vx,ux,uvx;                                      \
 295     vector signed short vx0,ux0,uvx0;                                   \
 296     vector signed short vx1,ux1,uvx1;                                   \
 297     vector signed short R0,G0,B0;                                       \
 298     vector signed short R1,G1,B1;                                       \
 299     vector unsigned char R,G,B;                                         \
 300                                                                         \
 301     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
 302     vector unsigned char align_perm;                                    \
 303                                                                         \
 304     vector signed short                                                 \
 305         lCY  = c->CY,                                                   \
 306         lOY  = c->OY,                                                   \
 307         lCRV = c->CRV,                                                  \
 308         lCBU = c->CBU,                                                  \
 309         lCGU = c->CGU,                                                  \
 310         lCGV = c->CGV;                                                  \
 311                                                                         \
 312     vector unsigned short lCSHIFT = c->CSHIFT;                          \
 313                                                                         \
 314     const ubyte *y1i   = in[0];                                         \
 315     const ubyte *y2i   = in[0]+instrides[0];                            \
 316     const ubyte *ui    = in[1];                                         \
 317     const ubyte *vi    = in[2];                                         \
 318                                                                         \
 319     vector unsigned char *oute                                          \
 320         = (vector unsigned char *)                                      \
 321             (oplanes[0]+srcSliceY*outstrides[0]);                       \
 322     vector unsigned char *outo                                          \
 323         = (vector unsigned char *)                                      \
 324             (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
 325                                                                         \
 326                                                                         \
 327     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
 328     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
 329     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
 330                                                                         \
 331                                                                         \
 332     for (i=0;i<h/2;i++) {                                               \
 333         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
 334         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
 335                                                                         \
 336         for (j=0;j<w/16;j++) {                                          \
 337                                                                         \
 338             y1ivP = (vector unsigned char *)y1i;                        \
 339             y2ivP = (vector unsigned char *)y2i;                        \
 340             uivP  = (vector unsigned char *)ui;                         \
 341             vivP  = (vector unsigned char *)vi;                         \
 342                                                                         \
 343             align_perm = vec_lvsl (0, y1i);                             \
 344             y0 = (vector unsigned char)                                 \
 345                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
 346                                                                         \
 347             align_perm = vec_lvsl (0, y2i);                             \
 348             y1 = (vector unsigned char)                                 \
 349                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
 350                                                                         \
 351             align_perm = vec_lvsl (0, ui);                              \
 352             u = (vector signed char)                                    \
 353                 vec_perm (uivP[0], uivP[1], align_perm);                \
 354                                                                         \
 355             align_perm = vec_lvsl (0, vi);                              \
 356             v = (vector signed char)                                    \
 357                 vec_perm (vivP[0], vivP[1], align_perm);                \
 358                                                                         \
 359             u  = (vector signed char)                                   \
 360                  vec_sub (u,(vector signed char)                        \
 361                           vec_splat((vector signed char){128},0));      \
 362             v  = (vector signed char)                                   \
 363                  vec_sub (v,(vector signed char)                        \
 364                           vec_splat((vector signed char){128},0));      \
 365                                                                         \
 366             U  = vec_unpackh (u);                                       \
 367             V  = vec_unpackh (v);                                       \
 368                                                                         \
 369                                                                         \
 370             Y0 = vec_unh (y0);                                          \
 371             Y1 = vec_unl (y0);                                          \
 372             Y2 = vec_unh (y1);                                          \
 373             Y3 = vec_unl (y1);                                          \
 374                                                                         \
 375             Y0 = vec_mradds (Y0, lCY, lOY);                             \
 376             Y1 = vec_mradds (Y1, lCY, lOY);                             \
 377             Y2 = vec_mradds (Y2, lCY, lOY);                             \
 378             Y3 = vec_mradds (Y3, lCY, lOY);                             \
 379                                                                         \
 380             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
 381             ux = vec_sl (U, lCSHIFT);                                   \
 382             ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
 383             ux0  = vec_mergeh (ux,ux);                                  \
 384             ux1  = vec_mergel (ux,ux);                                  \
 385                                                                         \
 386             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
 387             vx = vec_sl (V, lCSHIFT);                                   \
 388             vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
 389             vx0  = vec_mergeh (vx,vx);                                  \
 390             vx1  = vec_mergel (vx,vx);                                  \
 391                                                                         \
 392             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
 393             uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
 394             uvx = vec_mradds (V, lCGV, uvx);                            \
 395             uvx0 = vec_mergeh (uvx,uvx);                                \
 396             uvx1 = vec_mergel (uvx,uvx);                                \
 397                                                                         \
 398             R0 = vec_add (Y0,vx0);                                      \
 399             G0 = vec_add (Y0,uvx0);                                     \
 400             B0 = vec_add (Y0,ux0);                                      \
 401             R1 = vec_add (Y1,vx1);                                      \
 402             G1 = vec_add (Y1,uvx1);                                     \
 403             B1 = vec_add (Y1,ux1);                                      \
 404                                                                         \
 405             R  = vec_packclp (R0,R1);                                   \
 406             G  = vec_packclp (G0,G1);                                   \
 407             B  = vec_packclp (B0,B1);                                   \
 408                                                                         \
 409             out_pixels(R,G,B,oute);                                     \
 410                                                                         \
 411             R0 = vec_add (Y2,vx0);                                      \
 412             G0 = vec_add (Y2,uvx0);                                     \
 413             B0 = vec_add (Y2,ux0);                                      \
 414             R1 = vec_add (Y3,vx1);                                      \
 415             G1 = vec_add (Y3,uvx1);                                     \
 416             B1 = vec_add (Y3,ux1);                                      \
 417             R  = vec_packclp (R0,R1);                                   \
 418             G  = vec_packclp (G0,G1);                                   \
 419             B  = vec_packclp (B0,B1);                                   \
 420                                                                         \
 421                                                                         \
 422             out_pixels(R,G,B,outo);                                     \
 423                                                                         \
 424             y1i  += 16;                                                 \
 425             y2i  += 16;                                                 \
 426             ui   += 8;                                                  \
 427             vi   += 8;                                                  \
 428                                                                         \
 429         }                                                               \
 430                                                                         \
 431         outo  += (outstrides[0])>>4;                                    \
 432         oute  += (outstrides[0])>>4;                                    \
 433                                                                         \
 434         ui    += instrides_scl[1];                                      \
 435         vi    += instrides_scl[2];                                      \
 436         y1i   += instrides_scl[0];                                      \
 437         y2i   += instrides_scl[0];                                      \
 438     }                                                                   \
 439     return srcSliceH;                                                   \
 440 }
 441
 442
 443 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
 444 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
 445 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
 446 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
 447 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
 448 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
 449
 450 DEFCSP420_CVT (yuv2_abgr, out_abgr)
 451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
 452 DEFCSP420_CVT (yuv2_rgba, out_rgba)
 453 DEFCSP420_CVT (yuv2_argb, out_argb)
 454 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
 455 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
 456
 457
 458 // uyvy|uyvy|uyvy|uyvy
 459 // 0123 4567 89ab cdef
 460 static
 461 const vector unsigned char
 462     demux_u = {0x10,0x00,0x10,0x00,
 463                0x10,0x04,0x10,0x04,
 464                0x10,0x08,0x10,0x08,
 465                0x10,0x0c,0x10,0x0c},
 466     demux_v = {0x10,0x02,0x10,0x02,
 467                0x10,0x06,0x10,0x06,
 468                0x10,0x0A,0x10,0x0A,
 469                0x10,0x0E,0x10,0x0E},
 470     demux_y = {0x10,0x01,0x10,0x03,
 471                0x10,0x05,0x10,0x07,
 472                0x10,0x09,0x10,0x0B,
 473                0x10,0x0D,0x10,0x0F};
 474
 475 /*
 476   this is so I can play live CCIR raw video
 477 */
 478 static int altivec_uyvy_rgb32 (SwsContext *c,
 479                                const unsigned char **in, int *instrides,
 480                                int srcSliceY,        int srcSliceH,
 481                                unsigned char **oplanes, int *outstrides)
 482 {
 483     int w = c->srcW;
 484     int h = srcSliceH;
 485     int i,j;
 486     vector unsigned char uyvy;
 487     vector signed   short Y,U,V;
 488     vector signed   short R0,G0,B0,R1,G1,B1;
 489     vector unsigned char  R,G,B;
 490     vector unsigned char *out;
 491     const ubyte *img;
 492
 493     img = in[0];
 494     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
 495
 496     for (i=0;i<h;i++) {
 497         for (j=0;j<w/16;j++) {
 498             uyvy = vec_ld (0, img);
 499             U = (vector signed short)
 500                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
 501
 502             V = (vector signed short)
 503                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
 504
 505             Y = (vector signed short)
 506                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
 507
 508             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
 509
 510             uyvy = vec_ld (16, img);
 511             U = (vector signed short)
 512                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
 513
 514             V = (vector signed short)
 515                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
 516
 517             Y = (vector signed short)
 518                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
 519
 520             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
 521
 522             R  = vec_packclp (R0,R1);
 523             G  = vec_packclp (G0,G1);
 524             B  = vec_packclp (B0,B1);
 525
 526             //      vec_mstbgr24 (R,G,B, out);
 527             out_rgba (R,G,B,out);
 528
 529             img += 32;
 530         }
 531     }
 532     return srcSliceH;
 533 }
 534
 535
 536
 537 /* Ok currently the acceleration routine only supports
 538    inputs of widths a multiple of 16
 539    and heights a multiple 2
 540
 541    So we just fall back to the C codes for this.
 542 */
 543 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
 544 {
 545     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
 546         return NULL;
 547
 548     /*
 549       and this seems not to matter too much I tried a bunch of
 550       videos with abnormal widths and MPlayer crashes elsewhere.
 551       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
 552       boom with X11 bad match.
 553
 554     */
 555     if ((c->srcW & 0xf) != 0)    return NULL;
 556
 557     switch (c->srcFormat) {
 558     case PIX_FMT_YUV410P:
 559     case PIX_FMT_YUV420P:
 560     /*case IMGFMT_CLPL:        ??? */
 561     case PIX_FMT_GRAY8:
 562     case PIX_FMT_NV12:
 563     case PIX_FMT_NV21:
 564         if ((c->srcH & 0x1) != 0)
 565             return NULL;
 566
 567         switch(c->dstFormat) {
 568         case PIX_FMT_RGB24:
 569             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
 570             return altivec_yuv2_rgb24;
 571         case PIX_FMT_BGR24:
 572             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
 573             return altivec_yuv2_bgr24;
 574         case PIX_FMT_ARGB:
 575             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
 576             return altivec_yuv2_argb;
 577         case PIX_FMT_ABGR:
 578             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
 579             return altivec_yuv2_abgr;
 580         case PIX_FMT_RGBA:
 581             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
 582             return altivec_yuv2_rgba;
 583         case PIX_FMT_BGRA:
 584             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
 585             return altivec_yuv2_bgra;
 586         default: return NULL;
 587         }
 588         break;
 589
 590     case PIX_FMT_UYVY422:
 591         switch(c->dstFormat) {
 592         case PIX_FMT_BGR32:
 593             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
 594             return altivec_uyvy_rgb32;
 595         default: return NULL;
 596         }
 597         break;
 598
 599     }
 600     return NULL;
 601 }
 602
 603 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
 604 {
 605     union {
 606         DECLARE_ALIGNED(16, signed short, tmp)[8];
 607         vector signed short vec;
 608     } buf;
 609
 610     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
 611     buf.tmp[1] =  -256*brightness;                                      //oy
 612     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
 613     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
 614     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
 615     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
 616
 617
 618     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
 619     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
 620     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
 621     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
 622     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
 623     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
 624     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
 625     return;
 626 }
 627
 628
 629 static av_always_inline void
 630 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
 631                        const int16_t **lumSrc, int lumFilterSize,
 632                        const int16_t *chrFilter, const int16_t **chrUSrc,
 633                        const int16_t **chrVSrc, int chrFilterSize,
 634                        const int16_t **alpSrc, uint8_t *dest,
 635                        int dstW, int dstY, enum PixelFormat target)
 636 {
 637     int i,j;
 638     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
 639     vector signed short R0,G0,B0,R1,G1,B1;
 640
 641     vector unsigned char R,G,B;
 642     vector unsigned char *out,*nout;
 643
 644     vector signed short   RND = vec_splat_s16(1<<3);
 645     vector unsigned short SCL = vec_splat_u16(4);
 646     DECLARE_ALIGNED(16, unsigned int, scratch)[16];
 647
 648     vector signed short *YCoeffs, *CCoeffs;
 649
 650     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
 651     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
 652
 653     out = (vector unsigned char *)dest;
 654
 655     for (i=0; i<dstW; i+=16) {
 656         Y0 = RND;
 657         Y1 = RND;
 658         /* extract 16 coeffs from lumSrc */
 659         for (j=0; j<lumFilterSize; j++) {
 660             X0 = vec_ld (0,  &lumSrc[j][i]);
 661             X1 = vec_ld (16, &lumSrc[j][i]);
 662             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 663             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 664         }
 665
 666         U = RND;
 667         V = RND;
 668         /* extract 8 coeffs from U,V */
 669         for (j=0; j<chrFilterSize; j++) {
 670             X  = vec_ld (0, &chrUSrc[j][i/2]);
 671             U  = vec_mradds (X, CCoeffs[j], U);
 672             X  = vec_ld (0, &chrVSrc[j][i/2]);
 673             V  = vec_mradds (X, CCoeffs[j], V);
 674         }
 675
 676         /* scale and clip signals */
 677         Y0 = vec_sra (Y0, SCL);
 678         Y1 = vec_sra (Y1, SCL);
 679         U  = vec_sra (U,  SCL);
 680         V  = vec_sra (V,  SCL);
 681
 682         Y0 = vec_clip_s16 (Y0);
 683         Y1 = vec_clip_s16 (Y1);
 684         U  = vec_clip_s16 (U);
 685         V  = vec_clip_s16 (V);
 686
 687         /* now we have
 688           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 689           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
 690
 691           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 692           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 693           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 694         */
 695
 696         U0 = vec_mergeh (U,U);
 697         V0 = vec_mergeh (V,V);
 698
 699         U1 = vec_mergel (U,U);
 700         V1 = vec_mergel (V,V);
 701
 702         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 703         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 704
 705         R  = vec_packclp (R0,R1);
 706         G  = vec_packclp (G0,G1);
 707         B  = vec_packclp (B0,B1);
 708
 709         switch(target) {
 710         case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
 711         case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
 712         case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
 713         case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
 714         case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
 715         case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
 716         default:
 717             {
 718                 /* If this is reached, the caller should have called yuv2packedXinC
 719                    instead. */
 720                 static int printed_error_message;
 721                 if (!printed_error_message) {
 722                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
 723                            sws_format_name(c->dstFormat));
 724                     printed_error_message=1;
 725                 }
 726                 return;
 727             }
 728         }
 729     }
 730
 731     if (i < dstW) {
 732         i -= 16;
 733
 734         Y0 = RND;
 735         Y1 = RND;
 736         /* extract 16 coeffs from lumSrc */
 737         for (j=0; j<lumFilterSize; j++) {
 738             X0 = vec_ld (0,  &lumSrc[j][i]);
 739             X1 = vec_ld (16, &lumSrc[j][i]);
 740             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
 741             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
 742         }
 743
 744         U = RND;
 745         V = RND;
 746         /* extract 8 coeffs from U,V */
 747         for (j=0; j<chrFilterSize; j++) {
 748             X  = vec_ld (0, &chrUSrc[j][i/2]);
 749             U  = vec_mradds (X, CCoeffs[j], U);
 750             X  = vec_ld (0, &chrVSrc[j][i/2]);
 751             V  = vec_mradds (X, CCoeffs[j], V);
 752         }
 753
 754         /* scale and clip signals */
 755         Y0 = vec_sra (Y0, SCL);
 756         Y1 = vec_sra (Y1, SCL);
 757         U  = vec_sra (U,  SCL);
 758         V  = vec_sra (V,  SCL);
 759
 760         Y0 = vec_clip_s16 (Y0);
 761         Y1 = vec_clip_s16 (Y1);
 762         U  = vec_clip_s16 (U);
 763         V  = vec_clip_s16 (V);
 764
 765         /* now we have
 766            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
 767            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
 768
 769            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
 770            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
 771            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
 772         */
 773
 774         U0 = vec_mergeh (U,U);
 775         V0 = vec_mergeh (V,V);
 776
 777         U1 = vec_mergel (U,U);
 778         V1 = vec_mergel (V,V);
 779
 780         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
 781         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
 782
 783         R  = vec_packclp (R0,R1);
 784         G  = vec_packclp (G0,G1);
 785         B  = vec_packclp (B0,B1);
 786
 787         nout = (vector unsigned char *)scratch;
 788         switch(target) {
 789         case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
 790         case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
 791         case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
 792         case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
 793         case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
 794         case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
 795         default:
 796             /* Unreachable, I think. */
 797             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
 798                    sws_format_name(c->dstFormat));
 799             return;
 800         }
 801
 802         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
 803     }
 804
 805 }
 806
 807 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
 808 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
 809                             const int16_t **lumSrc, int lumFilterSize, \
 810                             const int16_t *chrFilter, const int16_t **chrUSrc, \
 811                             const int16_t **chrVSrc, int chrFilterSize, \
 812                             const int16_t **alpSrc, uint8_t *dest, \
 813                             int dstW, int dstY) \
 814 { \
 815     ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
 816                            chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 817                            alpSrc, dest, dstW, dstY, pixfmt); \
 818 }
 819
 820 YUV2PACKEDX_WRAPPER(abgr,  PIX_FMT_ABGR);
 821 YUV2PACKEDX_WRAPPER(bgra,  PIX_FMT_BGRA);
 822 YUV2PACKEDX_WRAPPER(argb,  PIX_FMT_ARGB);
 823 YUV2PACKEDX_WRAPPER(rgba,  PIX_FMT_RGBA);
 824 YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
 825 YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);