]> git.sesse.net Git - ffmpeg/blob - libswscale/ppc/yuv2rgb_altivec.c
cmdutils: Rename read_file to cmdutils_read_file
[ffmpeg] / libswscale / ppc / yuv2rgb_altivec.c
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of Libav.
7  *
8  * Libav is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * Libav is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with Libav; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /*
24 Convert I420 YV12 to RGB in various formats,
25   it rejects images that are not in 420 formats,
26   it rejects images that don't have widths of multiples of 16,
27   it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
29
30 Lots of optimizations to be done here.
31
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33    and adds, so we currently use max/min to clip.
34
35 2. The inefficient use of chroma loading needs a bit of brushing up.
36
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38    pipeline stalls.
39
40
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to something in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
46
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
48
49 March 27,2004
50 PERFORMANCE ANALYSIS
51
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
56
57 720 * 480 * 30  ~10MPS
58
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
61
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
64
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
71
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
74
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
79
80 GL2 libraries work now with patch for RGB32.
81
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
86 */
87
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/cpu.h"
98 #include "yuv2rgb_altivec.h"
99
100 #undef PROFILE_THE_BEAST
101 #undef INC_SCALING
102
/* Short aliases for the raw 8-bit sample types used by the converters. */
typedef unsigned char ubyte;   /* unsigned 8-bit sample */
typedef signed char   sbyte;   /* signed 8-bit sample   */
105
106
107 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
108    homogeneous vector registers x0,x1,x2 are interleaved with the
109    following technique:
110
111       o0 = vec_mergeh (x0,x1);
112       o1 = vec_perm (o0, x2, perm_rgb_0);
113       o2 = vec_perm (o0, x2, perm_rgb_1);
114       o3 = vec_mergel (x0,x1);
115       o4 = vec_perm (o3,o2,perm_rgb_2);
116       o5 = vec_perm (o3,o2,perm_rgb_3);
117
118   perm_rgb_0:   o0(RG).h v1(B) --> o1*
119               0   1  2   3   4
120              rgbr|gbrg|brgb|rgbr
121              0010 0100 1001 0010
122              0102 3145 2673 894A
123
124   perm_rgb_1:   o0(RG).h v1(B) --> o2
125               0   1  2   3   4
126              gbrg|brgb|bbbb|bbbb
127              0100 1001 1111 1111
128              B5CD 6EF7 89AB CDEF
129
130   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
131               0   1  2   3   4
132              gbrg|brgb|rgbr|gbrg
133              1111 1111 0010 0100
134              89AB CDEF 0182 3945
135
136   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
137               0   1  2   3   4
138              brgb|rgbr|gbrg|brgb
139              1001 0010 0100 1001
140              a67b 89cA BdCD eEFf
141
142 */
143 static
144 const vector unsigned char
145   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
146                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
147   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
148                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
149   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
150                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
151   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
152                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
153
/*
 * Interleave three 16-byte planar vectors into 48 packed bytes.
 *
 * Note the reversed parameter order: the THIRD macro argument (x0) supplies
 * the first byte of every triplet, the second argument (x1) the middle byte
 * and the first argument (x2) the last byte.  The packed result is returned
 * in y0 (bytes 0-15), y1 (bytes 16-31) and y2 (bytes 32-47).
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                         \
do {                                                          \
    __typeof__(x0) m3_head, m3_mix, m3_tail;                  \
    /* pairs (x0[i], x1[i]) for i = 0..7 */                   \
    m3_head = vec_mergeh (x0, x1);                            \
    y0      = vec_perm (m3_head, x2, perm_rgb_0);             \
    /* leftover pair bytes plus the upper half of x2 */       \
    m3_mix  = vec_perm (m3_head, x2, perm_rgb_1);             \
    /* pairs (x0[i], x1[i]) for i = 8..15 */                  \
    m3_tail = vec_mergel (x0, x1);                            \
    y1      = vec_perm (m3_tail, m3_mix, perm_rgb_2);         \
    y2      = vec_perm (m3_tail, m3_mix, perm_rgb_3);         \
} while (0)
164
/*
 * Interleave x0,x1,x2 with vec_merge3() in the given argument order and
 * store the three resulting vectors at ptr, advancing ptr by one element
 * after each store.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)                    \
do {                                                  \
    __typeof__(x0) st_a, st_b, st_c;                  \
    vec_merge3 (x0, x1, x2, st_a, st_b, st_c);        \
    vec_st (st_a, 0, ptr);                            \
    ptr++;                                            \
    vec_st (st_b, 0, ptr);                            \
    ptr++;                                            \
    vec_st (st_c, 0, ptr);                            \
    ptr++;                                            \
} while (0)
173
/*
 * Same as vec_mstbgr24() but passes the channels to vec_merge3() in the
 * opposite order (x2,x1,x0), so the packed bytes come out as x0,x1,x2 per
 * pixel.  Stores three vectors at ptr and advances ptr past them.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)                    \
do {                                                  \
    __typeof__(x0) st_a, st_b, st_c;                  \
    vec_merge3 (x2, x1, x0, st_a, st_b, st_c);        \
    vec_st (st_a, 0, ptr);                            \
    ptr++;                                            \
    vec_st (st_b, 0, ptr);                            \
    ptr++;                                            \
    vec_st (st_c, 0, ptr);                            \
    ptr++;                                            \
} while (0)
182
/*
 * Pack four planar byte vectors into 16 four-byte pixels and store them.
 *
 * T is the vector type of x0..x3.  For each pixel i the bytes written to
 * memory are x0[i], x1[i], x2[i], x3[i], in that order.  Four vectors
 * (64 bytes) are stored at ptr and ptr is then advanced by 4 elements.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                              \
do {                                                                                 \
    T pk_01, pk_23, pk_lo, pk_hi;                                                    \
    /* pixels 0..7: byte pairs (x0,x1) and (x2,x3), then merged as 16-bit units */   \
    pk_01 = vec_mergeh (x0, x1);                                                     \
    pk_23 = vec_mergeh (x2, x3);                                                     \
    pk_lo = (T)vec_mergeh ((vector unsigned short)pk_01,(vector unsigned short)pk_23); \
    pk_hi = (T)vec_mergel ((vector unsigned short)pk_01,(vector unsigned short)pk_23); \
    vec_st (pk_lo, 0*16, (T *)ptr);                                                  \
    vec_st (pk_hi, 1*16, (T *)ptr);                                                  \
    /* pixels 8..15 */                                                               \
    pk_01 = vec_mergel (x0, x1);                                                     \
    pk_23 = vec_mergel (x2, x3);                                                     \
    pk_lo = (T)vec_mergeh ((vector unsigned short)pk_01,(vector unsigned short)pk_23); \
    pk_hi = (T)vec_mergel ((vector unsigned short)pk_01,(vector unsigned short)pk_23); \
    vec_st (pk_lo, 2*16, (T *)ptr);                                                  \
    vec_st (pk_hi, 3*16, (T *)ptr);                                                  \
    ptr += 4;                                                                        \
} while (0)
204
205 /*
206
207   | 1     0       1.4021   | | Y |
208   | 1    -0.3441 -0.7142   |x| Cb|
209   | 1     1.7718  0        | | Cr|
210
211
212   Y:      [-128 127]
213   Cb/Cr : [-128 127]
214
215   Typical YUV conversion works on Y: 0-255; this version has been optimized for JPEG decode.
216
217 */
218
219
220
221
/*
 * Widen 8-bit samples to 16 bits by pairing every source byte with a zero
 * byte (selector 0x10 picks byte 0 of the all-zero second operand).
 * vec_unh() expands source bytes 0..7, vec_unl() expands bytes 8..15.
 */
#define vec_unh(x)                                                        \
    ((vector signed short)                                                \
     vec_perm((x), (__typeof__(x)){0},                                    \
              ((vector unsigned char){0x10,0x00, 0x10,0x01,               \
                                      0x10,0x02, 0x10,0x03,               \
                                      0x10,0x04, 0x10,0x05,               \
                                      0x10,0x06, 0x10,0x07})))
#define vec_unl(x)                                                        \
    ((vector signed short)                                                \
     vec_perm((x), (__typeof__(x)){0},                                    \
              ((vector unsigned char){0x10,0x08, 0x10,0x09,               \
                                      0x10,0x0A, 0x10,0x0B,               \
                                      0x10,0x0C, 0x10,0x0D,               \
                                      0x10,0x0E, 0x10,0x0F})))
232
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                                           \
    vec_max (vec_min ((x),                                                        \
                      ((vector signed short){235,235,235,235,235,235,235,235})),  \
             ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
236
/*
 * Pack two vectors of signed 16-bit values into one vector of 16 unsigned
 * bytes.  Negative lanes are first raised to 0 with vec_max(); the upper
 * bound is left to the saturating vec_packs().
 */
#define vec_packclp(x,y)                                                     \
    ((vector unsigned char)                                                  \
     vec_packs ((vector unsigned short)vec_max ((x), ((vector signed short){0})), \
                (vector unsigned short)vec_max ((y), ((vector signed short){0}))))
241
242 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
243
244
245 static inline void cvtyuvtoRGB (SwsContext *c,
246                                 vector signed short Y, vector signed short U, vector signed short V,
247                                 vector signed short *R, vector signed short *G, vector signed short *B)
248 {
249     vector signed   short vx,ux,uvx;
250
251     Y = vec_mradds (Y, c->CY, c->OY);
252     U  = vec_sub (U,(vector signed short)
253                     vec_splat((vector signed short){128},0));
254     V  = vec_sub (V,(vector signed short)
255                     vec_splat((vector signed short){128},0));
256
257     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
258     ux = vec_sl (U, c->CSHIFT);
259     *B = vec_mradds (ux, c->CBU, Y);
260
261     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
262     vx = vec_sl (V, c->CSHIFT);
263     *R = vec_mradds (vx, c->CRV, Y);
264
265     // uvx = ((CGU*u) + (CGV*v))>>15;
266     uvx = vec_mradds (U, c->CGU, Y);
267     *G  = vec_mradds (V, c->CGV, uvx);
268 }
269
270
271 /*
272   ------------------------------------------------------------------------------
273   CS converters
274   ------------------------------------------------------------------------------
275 */
276
277
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *     static int altivec_<name>(SwsContext *c, const unsigned char **in,
 *                               int *instrides, int srcSliceY, int srcSliceH,
 *                               unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar Y/U/V input (in[0], in[1], in[2]) to
 * packed RGB in oplanes[0].  Two luma rows are processed per outer
 * iteration and share one chroma row; the inner loop handles 16 pixels at
 * a time, so c->srcW is consumed in multiples of 16 and srcSliceH in
 * multiples of 2.  out_pixels(R,G,B,ptr) must store 16 pixels at ptr and
 * advance ptr past them.  The function returns srcSliceH.
 *
 * The destination row pointers are recomputed from outstrides[0] for every
 * row pair, so output buffers whose stride is larger than the packed row
 * width are addressed correctly.
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           const unsigned char **in, int *instrides,    \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    const ubyte *y1i   = in[0];                                         \
    const ubyte *y2i   = in[0]+instrides[0];                            \
    const ubyte *ui    = in[1];                                         \
    const ubyte *vi    = in[2];                                         \
                                                                        \
    /* first destination row of this slice */                           \
    ubyte *dst = oplanes[0]+srcSliceY*outstrides[0];                    \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even/odd destination rows of this pair, derived from the */  \
        /* stride so that padded output rows are handled            */  \
        oute = (vector unsigned char *)(dst+(2*i)*outstrides[0]);       \
        outo = (vector unsigned char *)(dst+(2*i+1)*outstrides[0]);     \
                                                                        \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            /* unaligned loads: two aligned vectors + permute */        \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            /* only the first 8 chroma samples are used per step */     \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* even row */                                              \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* odd row, same chroma */                                  \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
441
442
443 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
444 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
445 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
446 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
447 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
448 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
449
450 DEFCSP420_CVT (yuv2_abgr, out_abgr)
451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
452 DEFCSP420_CVT (yuv2_rgba, out_rgba)
453 DEFCSP420_CVT (yuv2_argb, out_argb)
454 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
455 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
456
457
458 // uyvy|uyvy|uyvy|uyvy
459 // 0123 4567 89ab cdef
460 static
461 const vector unsigned char
462     demux_u = {0x10,0x00,0x10,0x00,
463                0x10,0x04,0x10,0x04,
464                0x10,0x08,0x10,0x08,
465                0x10,0x0c,0x10,0x0c},
466     demux_v = {0x10,0x02,0x10,0x02,
467                0x10,0x06,0x10,0x06,
468                0x10,0x0A,0x10,0x0A,
469                0x10,0x0E,0x10,0x0E},
470     demux_y = {0x10,0x01,0x10,0x03,
471                0x10,0x05,0x10,0x07,
472                0x10,0x09,0x10,0x0B,
473                0x10,0x0D,0x10,0x0F};
474
475 /*
476   this is so I can play live CCIR raw video
477 */
478 static int altivec_uyvy_rgb32 (SwsContext *c,
479                                const unsigned char **in, int *instrides,
480                                int srcSliceY,        int srcSliceH,
481                                unsigned char **oplanes, int *outstrides)
482 {
483     int w = c->srcW;
484     int h = srcSliceH;
485     int i,j;
486     vector unsigned char uyvy;
487     vector signed   short Y,U,V;
488     vector signed   short R0,G0,B0,R1,G1,B1;
489     vector unsigned char  R,G,B;
490     vector unsigned char *out;
491     const ubyte *img;
492
493     img = in[0];
494     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
495
496     for (i=0;i<h;i++) {
497         for (j=0;j<w/16;j++) {
498             uyvy = vec_ld (0, img);
499             U = (vector signed short)
500                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
501
502             V = (vector signed short)
503                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
504
505             Y = (vector signed short)
506                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
507
508             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
509
510             uyvy = vec_ld (16, img);
511             U = (vector signed short)
512                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
513
514             V = (vector signed short)
515                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
516
517             Y = (vector signed short)
518                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
519
520             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
521
522             R  = vec_packclp (R0,R1);
523             G  = vec_packclp (G0,G1);
524             B  = vec_packclp (B0,B1);
525
526             //      vec_mstbgr24 (R,G,B, out);
527             out_rgba (R,G,B,out);
528
529             img += 32;
530         }
531     }
532     return srcSliceH;
533 }
534
535
536
537 /* Ok currently the acceleration routine only supports
538    inputs of widths a multiple of 16
539    and heights a multiple of 2
540
541    So we just fall back to the C codes for this.
542 */
543 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
544 {
545     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
546         return NULL;
547
548     /*
549       and this seems not to matter too much I tried a bunch of
550       videos with abnormal widths and MPlayer crashes elsewhere.
551       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
552       boom with X11 bad match.
553
554     */
555     if ((c->srcW & 0xf) != 0)    return NULL;
556
557     switch (c->srcFormat) {
558     case PIX_FMT_YUV410P:
559     case PIX_FMT_YUV420P:
560     /*case IMGFMT_CLPL:        ??? */
561     case PIX_FMT_GRAY8:
562     case PIX_FMT_NV12:
563     case PIX_FMT_NV21:
564         if ((c->srcH & 0x1) != 0)
565             return NULL;
566
567         switch(c->dstFormat) {
568         case PIX_FMT_RGB24:
569             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
570             return altivec_yuv2_rgb24;
571         case PIX_FMT_BGR24:
572             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
573             return altivec_yuv2_bgr24;
574         case PIX_FMT_ARGB:
575             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
576             return altivec_yuv2_argb;
577         case PIX_FMT_ABGR:
578             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
579             return altivec_yuv2_abgr;
580         case PIX_FMT_RGBA:
581             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
582             return altivec_yuv2_rgba;
583         case PIX_FMT_BGRA:
584             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
585             return altivec_yuv2_bgra;
586         default: return NULL;
587         }
588         break;
589
590     case PIX_FMT_UYVY422:
591         switch(c->dstFormat) {
592         case PIX_FMT_BGR32:
593             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
594             return altivec_uyvy_rgb32;
595         default: return NULL;
596         }
597         break;
598
599     }
600     return NULL;
601 }
602
603 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
604 {
605     union {
606         DECLARE_ALIGNED(16, signed short, tmp)[8];
607         vector signed short vec;
608     } buf;
609
610     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
611     buf.tmp[1] =  -256*brightness;                                      //oy
612     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
613     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
614     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
615     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
616
617
618     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
619     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
620     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
621     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
622     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
623     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
624     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
625     return;
626 }
627
628
629 static av_always_inline void
630 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
631                        const int16_t **lumSrc, int lumFilterSize,
632                        const int16_t *chrFilter, const int16_t **chrUSrc,
633                        const int16_t **chrVSrc, int chrFilterSize,
634                        const int16_t **alpSrc, uint8_t *dest,
635                        int dstW, int dstY, enum PixelFormat target)
636 {
637     int i,j;
638     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
639     vector signed short R0,G0,B0,R1,G1,B1;
640
641     vector unsigned char R,G,B;
642     vector unsigned char *out,*nout;
643
644     vector signed short   RND = vec_splat_s16(1<<3);
645     vector unsigned short SCL = vec_splat_u16(4);
646     DECLARE_ALIGNED(16, unsigned int, scratch)[16];
647
648     vector signed short *YCoeffs, *CCoeffs;
649
650     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
651     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
652
653     out = (vector unsigned char *)dest;
654
655     for (i=0; i<dstW; i+=16) {
656         Y0 = RND;
657         Y1 = RND;
658         /* extract 16 coeffs from lumSrc */
659         for (j=0; j<lumFilterSize; j++) {
660             X0 = vec_ld (0,  &lumSrc[j][i]);
661             X1 = vec_ld (16, &lumSrc[j][i]);
662             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
663             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
664         }
665
666         U = RND;
667         V = RND;
668         /* extract 8 coeffs from U,V */
669         for (j=0; j<chrFilterSize; j++) {
670             X  = vec_ld (0, &chrUSrc[j][i/2]);
671             U  = vec_mradds (X, CCoeffs[j], U);
672             X  = vec_ld (0, &chrVSrc[j][i/2]);
673             V  = vec_mradds (X, CCoeffs[j], V);
674         }
675
676         /* scale and clip signals */
677         Y0 = vec_sra (Y0, SCL);
678         Y1 = vec_sra (Y1, SCL);
679         U  = vec_sra (U,  SCL);
680         V  = vec_sra (V,  SCL);
681
682         Y0 = vec_clip_s16 (Y0);
683         Y1 = vec_clip_s16 (Y1);
684         U  = vec_clip_s16 (U);
685         V  = vec_clip_s16 (V);
686
687         /* now we have
688           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
689           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
690
691           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
692           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
693           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
694         */
695
696         U0 = vec_mergeh (U,U);
697         V0 = vec_mergeh (V,V);
698
699         U1 = vec_mergel (U,U);
700         V1 = vec_mergel (V,V);
701
702         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
703         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
704
705         R  = vec_packclp (R0,R1);
706         G  = vec_packclp (G0,G1);
707         B  = vec_packclp (B0,B1);
708
709         switch(target) {
710         case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
711         case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
712         case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
713         case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
714         case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
715         case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
716         default:
717             {
718                 /* If this is reached, the caller should have called yuv2packedXinC
719                    instead. */
720                 static int printed_error_message;
721                 if (!printed_error_message) {
722                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
723                            sws_format_name(c->dstFormat));
724                     printed_error_message=1;
725                 }
726                 return;
727             }
728         }
729     }
730
731     if (i < dstW) {
732         i -= 16;
733
734         Y0 = RND;
735         Y1 = RND;
736         /* extract 16 coeffs from lumSrc */
737         for (j=0; j<lumFilterSize; j++) {
738             X0 = vec_ld (0,  &lumSrc[j][i]);
739             X1 = vec_ld (16, &lumSrc[j][i]);
740             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
741             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
742         }
743
744         U = RND;
745         V = RND;
746         /* extract 8 coeffs from U,V */
747         for (j=0; j<chrFilterSize; j++) {
748             X  = vec_ld (0, &chrUSrc[j][i/2]);
749             U  = vec_mradds (X, CCoeffs[j], U);
750             X  = vec_ld (0, &chrVSrc[j][i/2]);
751             V  = vec_mradds (X, CCoeffs[j], V);
752         }
753
754         /* scale and clip signals */
755         Y0 = vec_sra (Y0, SCL);
756         Y1 = vec_sra (Y1, SCL);
757         U  = vec_sra (U,  SCL);
758         V  = vec_sra (V,  SCL);
759
760         Y0 = vec_clip_s16 (Y0);
761         Y1 = vec_clip_s16 (Y1);
762         U  = vec_clip_s16 (U);
763         V  = vec_clip_s16 (V);
764
765         /* now we have
766            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
767            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
768
769            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
770            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
771            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
772         */
773
774         U0 = vec_mergeh (U,U);
775         V0 = vec_mergeh (V,V);
776
777         U1 = vec_mergel (U,U);
778         V1 = vec_mergel (V,V);
779
780         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
781         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
782
783         R  = vec_packclp (R0,R1);
784         G  = vec_packclp (G0,G1);
785         B  = vec_packclp (B0,B1);
786
787         nout = (vector unsigned char *)scratch;
788         switch(target) {
789         case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
790         case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
791         case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
792         case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
793         case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
794         case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
795         default:
796             /* Unreachable, I think. */
797             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
798                    sws_format_name(c->dstFormat));
799             return;
800         }
801
802         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
803     }
804
805 }
806
807 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
808 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
809                             const int16_t **lumSrc, int lumFilterSize, \
810                             const int16_t *chrFilter, const int16_t **chrUSrc, \
811                             const int16_t **chrVSrc, int chrFilterSize, \
812                             const int16_t **alpSrc, uint8_t *dest, \
813                             int dstW, int dstY) \
814 { \
815     ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
816                            chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
817                            alpSrc, dest, dstW, dstY, pixfmt); \
818 }
819
820 YUV2PACKEDX_WRAPPER(abgr,  PIX_FMT_ABGR);
821 YUV2PACKEDX_WRAPPER(bgra,  PIX_FMT_BGRA);
822 YUV2PACKEDX_WRAPPER(argb,  PIX_FMT_ARGB);
823 YUV2PACKEDX_WRAPPER(rgba,  PIX_FMT_RGBA);
824 YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
825 YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);