/*
 * AltiVec acceleration for colorspace conversion
 *
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
Convert I420/YV12 to RGB in various formats. It rejects images that are
not in a 420 format, whose width is not a multiple of 16, or whose
height is not a multiple of 2. Rejected images are deferred to the C
fallback code.

Lots of optimizations to be done here.

1. Need to fix the saturation code. I just couldn't get it to fly with packs
   and adds, so we currently use max/min to clip.

2. The inefficient chroma loading needs a bit of brushing up.

3. Analysis of pipeline stalls needs to be done. Use Shark to identify
   pipeline stalls.


MODIFIED to calculate coefficients from the currently selected color space.
MODIFIED core to be a macro where you specify the output format.
ADDED UYVY conversion which is never called due to something in swscale.
CORRECTED algorithm selection to be strict on input formats.
ADDED runtime detection of AltiVec.

ADDED altivec_yuv2packedX vertical scale + RGB converter

March 27, 2004
PERFORMANCE ANALYSIS

The C version uses 25% of the processor, or ~250 MIPS, for D1 rawvideo
used as a test.
The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
same sequence.

720 * 480 * 30 ~ 10 Mpixel/s

so we have roughly 10 clocks per pixel. This is too high; something has
to be wrong.

OPTIMIZED clip code to use vec_max and vec_packs, removing the
need for vec_min.

OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
the input video frame: it was just decompressed, so it probably resides in the
L1 cache. However, we are creating the output video stream. This needs to use
the DSTST instruction to optimize for the cache. We couple this with the fact
that we are not going to be visiting the input buffer again, so we mark it
Least Recently Used. This shaves 25% of the processor cycles off.

Now memcpy is the largest MIPS consumer in the system, probably due
to the inefficient X11 stuff.

GL libraries seem to be very slow on this machine, a 1.33 GHz PowerBook running
Jaguar; this is not the case for my 1 GHz PowerBook. I thought it might be
a versioning issue, however I have libGL.1.2.dylib on both
machines. (We need to figure this out now.)

GL2 libraries work now with the patch for RGB32.

NOTE: the Quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.

Integrated luma prescaling for saturation/contrast/brightness
adjustment.
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"

#undef PROFILE_THE_BEAST
#undef INC_SCALING

typedef unsigned char ubyte;
typedef signed char   sbyte;


/* RGB interleaver: 16 planar pels with 8-bit samples per channel in
   homogeneous vector registers x0,x1,x2 are interleaved with the
   following technique:

      o0 = vec_mergeh (x0,x1);
      o1 = vec_perm (o0, x2, perm_rgb_0);
      o2 = vec_perm (o0, x2, perm_rgb_1);
      o3 = vec_mergel (x0,x1);
      o4 = vec_perm (o3,o2,perm_rgb_2);
      o5 = vec_perm (o3,o2,perm_rgb_3);

  perm_rgb_0:   o0(RG).h v1(B) --> o1*
              0   1  2   3   4
             rgbr|gbrg|brgb|rgbr
             0010 0100 1001 0010
             0102 3145 2673 894A

  perm_rgb_1:   o0(RG).h v1(B) --> o2
              0   1  2   3   4
             gbrg|brgb|bbbb|bbbb
             0100 1001 1111 1111
             B5CD 6EF7 89AB CDEF

  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
              0   1  2   3   4
             gbrg|brgb|rgbr|gbrg
             1111 1111 0010 0100
             89AB CDEF 0182 3945

  perm_rgb_3:   o3(RG).l o2(rgbB.l) --> o5*
              0   1  2   3   4
             brgb|rgbr|gbrg|brgb
             1001 0010 0100 1001
             a67b 89cA BdCD eEFf

*/
static
const vector unsigned char
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};

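/* In a vec_perm control vector, indices 0x00-0x0f select bytes from the
   first operand and 0x10-0x1f from the second, so perm_rgb_0/perm_rgb_1
   pull R/G bytes from the merged o0 vector and B bytes from x2, matching
   the diagram above. */
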
#define vec_merge3(x2,x1,x0,y0,y1,y2)       \
do {                                        \
    __typeof__(x0) o0,o2,o3;                \
        o0 = vec_mergeh (x0,x1);            \
        y0 = vec_perm (o0, x2, perm_rgb_0); \
        o2 = vec_perm (o0, x2, perm_rgb_1); \
        o3 = vec_mergel (x0,x1);            \
        y1 = vec_perm (o3,o2,perm_rgb_2);   \
        y2 = vec_perm (o3,o2,perm_rgb_3);   \
} while(0)

#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)

/* pack the pixels in rgb0 format
   msb R
   lsb 0
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
}  while (0)

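/* vec_mstrgb32 interleaves four 16-byte channel vectors x0..x3 into 16
   packed 4-byte pixels: the byte-wise mergeh/mergel steps pair x0/x1 and
   x2/x3, and the halfword merges then weave those pairs together, giving
   the x0,x1,x2,x3 byte order within each pixel. Four vector stores write
   64 bytes and ptr advances by four vectors. */
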
/*

  | 1     0       1.4021   | | Y |
  | 1    -0.3441 -0.7142   |x| Cb|
  | 1     1.7718  0        | | Cr|


  Y:      [-128 127]
  Cb/Cr : [-128 127]

  Typical YUV conversion works on Y in the 0-255 range; this version has
  been optimized for JPEG decode.

*/


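/* Scalar sketch of the fixed-point scheme used below (illustration only):
   vec_mradds(a, b, c) computes saturate(((a * b + 0x4000) >> 15) + c) per
   16-bit lane, so with the coefficients scaled by 2^15 the conversion is
   roughly

       y = ((Y * CY + 0x4000) >> 15) + OY;
       R = y + ((CRV * (V << CSHIFT) + 0x4000) >> 15);
       G = y + ((CGU *  U            + 0x4000) >> 15)
             + ((CGV *  V            + 0x4000) >> 15);
       B = y + ((CBU * (U << CSHIFT) + 0x4000) >> 15);

   after 128 has been subtracted from U and V. */
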
#define vec_unh(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
#define vec_unl(x) \
    (vector signed short) \
        vec_perm(x,(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))

#define vec_clip_s16(x) \
    vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
                         ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))

#define vec_packclp(x,y) \
    (vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
         (vector unsigned short)vec_max (y,((vector signed short) {0})))

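/* vec_unh/vec_unl zero-extend the high/low 8 bytes of a vector to 16-bit
   lanes (the 0x10 indices pull zero bytes from the second operand);
   vec_clip_s16 clamps each value to the 16..235 studio range; vec_packclp
   clamps negative values to 0 and then uses the saturating pack to produce
   16 unsigned 8-bit results from two signed 16-bit vectors. */
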
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)


static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    vector signed short vx,ux,uvx;

    Y = vec_mradds (Y, c->CY, c->OY);
    U  = vec_sub (U,(vector signed short)
                    vec_splat((vector signed short){128},0));
    V  = vec_sub (V,(vector signed short)
                    vec_splat((vector signed short){128},0));

    //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
    ux = vec_sl (U, c->CSHIFT);
    *B = vec_mradds (ux, c->CBU, Y);

    // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
    vx = vec_sl (V, c->CSHIFT);
    *R = vec_mradds (vx, c->CRV, Y);

    // uvx = ((CGU*u) + (CGV*v))>>15;
    uvx = vec_mradds (U, c->CGU, Y);
    *G  = vec_mradds (V, c->CGV, uvx);
}


/*
  ------------------------------------------------------------------------------
  CS converters
  ------------------------------------------------------------------------------
*/


#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           const unsigned char **in, int *instrides,    \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;            \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    const ubyte *y1i   = in[0];                                         \
    const ubyte *y2i   = in[0]+instrides[0];                            \
    const ubyte *ui    = in[1];                                         \
    const ubyte *vi    = in[2];                                         \
                                                                        \
    vector unsigned char *oute                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]);                       \
    vector unsigned char *outo                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (const vector unsigned char *)y1i;                  \
            y2ivP = (const vector unsigned char *)y2i;                  \
            uivP  = (const vector unsigned char *)ui;                   \
            vivP  = (const vector unsigned char *)vi;                   \
                                                                        \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        outo  += (outstrides[0])>>4;                                    \
        oute  += (outstrides[0])>>4;                                    \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}


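/* Each DEFCSP420_CVT(name, out_pixels) expansion below defines a complete
   4:2:0 planar -> packed RGB slice converter named altivec_<name>: it
   processes two luma rows and one chroma row per iteration using the same
   fixed-point math as cvtyuvtoRGB, then hands the packed R/G/B vectors to
   the out_pixels store macro, which determines the output byte order. */
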
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)

DEFCSP420_CVT (yuv2_abgr, out_abgr)
DEFCSP420_CVT (yuv2_bgra, out_bgra)
DEFCSP420_CVT (yuv2_rgba, out_rgba)
DEFCSP420_CVT (yuv2_argb, out_argb)
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)


// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static
const vector unsigned char
    demux_u = {0x10,0x00,0x10,0x00,
               0x10,0x04,0x10,0x04,
               0x10,0x08,0x10,0x08,
               0x10,0x0c,0x10,0x0c},
    demux_v = {0x10,0x02,0x10,0x02,
               0x10,0x06,0x10,0x06,
               0x10,0x0A,0x10,0x0A,
               0x10,0x0E,0x10,0x0E},
    demux_y = {0x10,0x01,0x10,0x03,
               0x10,0x05,0x10,0x07,
               0x10,0x09,0x10,0x0B,
               0x10,0x0D,0x10,0x0F};

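/* The demux_* permute vectors extract the U, V and Y bytes from a packed
   UYVY vector, pairing each sample byte with a zero byte (index 0x10
   selects from the zero vector passed as the second vec_perm operand) to
   widen the samples to 16-bit lanes; each chroma byte is duplicated so it
   lines up with the two luma samples it covers. */
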
/*
  this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
                               const unsigned char **in, int *instrides,
                               int srcSliceY,        int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
    int w = c->srcW;
    int h = srcSliceH;
    int i,j;
    vector unsigned char uyvy;
    vector signed   short Y,U,V;
    vector signed   short R0,G0,B0,R1,G1,B1;
    vector unsigned char  R,G,B;
    vector unsigned char *out;
    const ubyte *img;

    img = in[0];
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);

    for (i=0;i<h;i++) {
        for (j=0;j<w/16;j++) {
            uyvy = vec_ld (0, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

            uyvy = vec_ld (16, img);
            U = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);

            V = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);

            Y = (vector signed short)
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);

            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

            R  = vec_packclp (R0,R1);
            G  = vec_packclp (G0,G1);
            B  = vec_packclp (B0,B1);

            //      vec_mstbgr24 (R,G,B, out);
            out_rgba (R,G,B,out);

            img += 32;
        }
    }
    return srcSliceH;
}



/* OK, currently the acceleration routine only supports
   inputs whose width is a multiple of 16
   and whose height is a multiple of 2.

   So we just fall back to the C code for anything else.
*/
SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
{
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return NULL;

    /*
      This seems not to matter too much; I tried a bunch of
      videos with abnormal widths and MPlayer crashes elsewhere.
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
      goes boom with an X11 bad match.
    */
    if ((c->srcW & 0xf) != 0)    return NULL;

    switch (c->srcFormat) {
    case PIX_FMT_YUV410P:
    case PIX_FMT_YUV420P:
    /*case IMGFMT_CLPL:        ??? */
    case PIX_FMT_GRAY8:
    case PIX_FMT_NV12:
    case PIX_FMT_NV21:
        if ((c->srcH & 0x1) != 0)
            return NULL;

        switch(c->dstFormat) {
        case PIX_FMT_RGB24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
            return altivec_yuv2_rgb24;
        case PIX_FMT_BGR24:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
            return altivec_yuv2_bgr24;
        case PIX_FMT_ARGB:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
            return altivec_yuv2_argb;
        case PIX_FMT_ABGR:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
            return altivec_yuv2_abgr;
        case PIX_FMT_RGBA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
            return altivec_yuv2_rgba;
        case PIX_FMT_BGRA:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
            return altivec_yuv2_bgra;
        default: return NULL;
        }
        break;

    case PIX_FMT_UYVY422:
        switch(c->dstFormat) {
        case PIX_FMT_BGR32:
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
            return altivec_uyvy_rgb32;
        default: return NULL;
        }
        break;

    }
    return NULL;
}

void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
{
    union {
        DECLARE_ALIGNED(16, signed short, tmp)[8];
        vector signed short vec;
    } buf;

    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
    buf.tmp[1] =  -256*brightness;                                      //oy
    buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
    buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
    buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
    buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv


    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY   = vec_splat ((vector signed short)buf.vec, 0);
    c->OY   = vec_splat ((vector signed short)buf.vec, 1);
    c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
    c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
    c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
    c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
    return;
}


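/* Vertical scaling + packed RGB output: accumulate lumFilterSize luma taps
   and chrFilterSize chroma taps with vec_mradds, shift the sums back down
   by SCL, clip to 16..235, and convert 16 pixels per iteration with
   cvtyuvtoRGB. The block after the main loop is meant to handle a final
   partial group of pixels via the aligned scratch buffer and memcpy. */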
static av_always_inline void
ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
                       const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrUSrc,
                       const int16_t **chrVSrc, int chrFilterSize,
                       const int16_t **alpSrc, uint8_t *dest,
                       int dstW, int dstY, enum PixelFormat target)
{
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    DECLARE_ALIGNED(16, unsigned int, scratch)[16];

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrUSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrVSrc[j][i/2]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           and after the merges below
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        switch(target) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
        default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           av_get_pix_fmt_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrUSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrVSrc[j][i/2]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           and after the merges below
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(target) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
        default:
            /* Unreachable, I think. */
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                   av_get_pix_fmt_name(c->dstFormat));
            return;
        }

        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

}

#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
                            const int16_t **lumSrc, int lumFilterSize, \
                            const int16_t *chrFilter, const int16_t **chrUSrc, \
                            const int16_t **chrVSrc, int chrFilterSize, \
                            const int16_t **alpSrc, uint8_t *dest, \
                            int dstW, int dstY) \
{ \
    ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                           alpSrc, dest, dstW, dstY, pixfmt); \
}

YUV2PACKEDX_WRAPPER(abgr,  PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra,  PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb,  PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba,  PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);