]> git.sesse.net Git - ffmpeg/blob - libswscale/yuv2rgb_altivec.c
small consistency fixes for the ffserver manual
[ffmpeg] / libswscale / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   AltiVec acceleration for colorspace conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issue, however I have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
63 */
64
65 /*
66  * This file is part of FFmpeg.
67  *
68  * FFmpeg is free software; you can redistribute it and/or modify
69  * it under the terms of the GNU General Public License as published by
70  * the Free Software Foundation; either version 2 of the License, or
71  * (at your option) any later version.
72  *
73  * FFmpeg is distributed in the hope that it will be useful,
74  * but WITHOUT ANY WARRANTY; without even the implied warranty of
75  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
76  * GNU General Public License for more details.
77  *
78  * You should have received a copy of the GNU General Public License
79  * along with FFmpeg; if not, write to the Free Software
80  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
81  */
82
83 #include <stdio.h>
84 #include <stdlib.h>
85 #include <string.h>
86 #include <inttypes.h>
87 #include <assert.h>
88 #include "config.h"
89 #ifdef HAVE_MALLOC_H
90 #include <malloc.h>
91 #endif
92 #include "rgb2rgb.h"
93 #include "swscale.h"
94 #include "swscale_internal.h"
95
96 #undef PROFILE_THE_BEAST
97 #undef INC_SCALING
98
99 typedef unsigned char ubyte;
100 typedef signed char   sbyte;
101
102
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104    homogeneous vector registers x0,x1,x2 are interleaved with the
105    following technique:
106
107       o0 = vec_mergeh (x0,x1);
108       o1 = vec_perm (o0, x2, perm_rgb_0);
109       o2 = vec_perm (o0, x2, perm_rgb_1);
110       o3 = vec_mergel (x0,x1);
111       o4 = vec_perm (o3,o2,perm_rgb_2);
112       o5 = vec_perm (o3,o2,perm_rgb_3);
113
114   perm_rgb_0:   o0(RG).h v1(B) --> o1*
115               0   1  2   3   4
116              rgbr|gbrg|brgb|rgbr
117              0010 0100 1001 0010
118              0102 3145 2673 894A
119
120   perm_rgb_1:   o0(RG).h v1(B) --> o2
121               0   1  2   3   4
122              gbrg|brgb|bbbb|bbbb
123              0100 1001 1111 1111
124              B5CD 6EF7 89AB CDEF
125
126   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
127               0   1  2   3   4
128              gbrg|brgb|rgbr|gbrg
129              1111 1111 0010 0100
130              89AB CDEF 0182 3945
131
132   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
133               0   1  2   3   4
134              brgb|rgbr|gbrg|brgb
135              1001 0010 0100 1001
136              a67b 89cA BdCD eEFf
137
138 */
139 static
140 const vector unsigned char
141   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142                                                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144                                                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146                                                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148                                                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
149
/* Interleave three planar byte vectors into three vectors of packed
   3-byte pixels, written to y0, y1 and y2 in that order.
   NOTE: the parameters are listed as (x2,x1,x0,...).  Within each output
   pixel the byte taken from x0 comes first, then x1, then x2. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                       \
do {                                                        \
    typeof(x0) m3_pair_hi, m3_rest, m3_pair_lo;             \
    /* x0/x1 byte pairs for pixels 0-7 */                   \
    m3_pair_hi = vec_mergeh (x0,x1);                        \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);     \
    /* what perm_rgb_0 could not fit, plus x2 bytes 8-15 */ \
    m3_rest    = vec_perm (m3_pair_hi, x2, perm_rgb_1);     \
    /* x0/x1 byte pairs for pixels 8-15 */                  \
    m3_pair_lo = vec_mergel (x0,x1);                        \
    y1         = vec_perm (m3_pair_lo, m3_rest, perm_rgb_2);\
    y2         = vec_perm (m3_pair_lo, m3_rest, perm_rgb_3);\
} while(0)
160
/* Pack 16 pixels from three planar byte vectors into 48 bytes of 24-bit
   pixels and store them through ptr with three vec_st calls; ptr is
   post-incremented three times.  Per pixel the bytes are emitted in the
   order x2, x1, x0 (vec_merge3 takes its channel arguments reversed).
   The definition intentionally has no trailing semicolon so that
   "vec_mstbgr24(...);" is exactly one statement and stays valid inside
   an unbraced if/else. */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
169
/* Pack 16 pixels from three planar byte vectors into 48 bytes of 24-bit
   pixels and store them through ptr with three vec_st calls; ptr is
   post-incremented three times.  Per pixel the bytes are emitted in the
   order x0, x1, x2.
   The definition intentionally has no trailing semicolon so that
   "vec_mstrgb24(...);" is exactly one statement and stays valid inside
   an unbraced if/else. */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
} while (0)
178
/* Pack 16 pixels from four planar byte vectors into 32-bit pixels and
   store them as four vectors at byte offsets 0, 16, 32 and 48 from ptr,
   then advance ptr by 4.  T is the vector type of x0..x3.  Per pixel the
   bytes are emitted in the order x0, x1, x2, x3, e.g. for rgb0:
     msb R
     lsb 0
   The definition intentionally has no trailing semicolon so that
   "vec_mstrgb32(...);" is exactly one statement and stays valid inside
   an unbraced if/else. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* pixels 0-7: byte pairs (x0,x1) and (x2,x3), then merge as 16-bit */    \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)(ptr));                                            \
    vec_st (_3, 1*16, (T *)(ptr));                                            \
    /* pixels 8-15: same again on the low halves */                           \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)(ptr));                                            \
    vec_st (_3, 3*16, (T *)(ptr));                                            \
    (ptr) += 4;                                                               \
} while (0)
200
201 /*
202
203   | 1     0       1.4021   | | Y |
204   | 1    -0.3441 -0.7142   |x| Cb|
205   | 1     1.7718  0        | | Cr|
206
207
208   Y:      [-128 127]
209   Cb/Cr : [-128 127]
210
211   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
212
213 */
214
215
216
217
/* Widen bytes 0-7 of x to eight 16-bit lanes without sign extension:
   each lane is built from one byte of an all-zero vector (index 0x10)
   followed by one byte of x.  The whole expansion is parenthesized so
   the cast cannot re-associate with operators at the call site. */
#define vec_unh(x) \
    ((vector signed short) \
        vec_perm((x),(typeof(x))AVV(0),\
                 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                           0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/* Widen bytes 8-15 of x to eight 16-bit lanes without sign extension:
   each lane is built from one byte of an all-zero vector (index 0x10)
   followed by one byte of x.  The whole expansion is parenthesized so
   the cast cannot re-associate with operators at the call site. */
#define vec_unl(x) \
    ((vector signed short) \
        vec_perm((x),(typeof(x))AVV(0),\
                 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                           0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
228
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
   first cap at 235 with vec_min, then raise to at least 16 with vec_max. */
#define vec_clip_s16(x)                                                       \
    vec_max (                                                                 \
        vec_min (x,                                                           \
                 (vector signed short)AVV(235,235,235,235,235,235,235,235)),  \
        (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
232
/* Pack two vectors of signed 16-bit lanes into one vector of 16 unsigned
   bytes.  Negative lanes are first raised to 0 with vec_max; vec_packs
   then narrows the lanes to bytes.  The whole expansion is parenthesized
   so the cast cannot re-associate with operators at the call site. */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
         (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
237
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
239
240
241 static inline void cvtyuvtoRGB (SwsContext *c,
242                                 vector signed short Y, vector signed short U, vector signed short V,
243                                 vector signed short *R, vector signed short *G, vector signed short *B)
244 {
245     vector signed   short vx,ux,uvx;
246
247     Y = vec_mradds (Y, c->CY, c->OY);
248     U  = vec_sub (U,(vector signed short)
249                     vec_splat((vector signed short)AVV(128),0));
250     V  = vec_sub (V,(vector signed short)
251                     vec_splat((vector signed short)AVV(128),0));
252
253     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254     ux = vec_sl (U, c->CSHIFT);
255     *B = vec_mradds (ux, c->CBU, Y);
256
257     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258     vx = vec_sl (V, c->CSHIFT);
259     *R = vec_mradds (vx, c->CRV, Y);
260
261     // uvx = ((CGU*u) + (CGV*v))>>15;
262     uvx = vec_mradds (U, c->CGU, Y);
263     *G  = vec_mradds (V, c->CGV, uvx);
264 }
265
266
267 /*
268   ------------------------------------------------------------------------------
269   CS converters
270   ------------------------------------------------------------------------------
271 */
272
273
/*
  DEFCSP420_CVT(name, out_pixels) defines the converter

      static int altivec_<name>(SwsContext *c,
                                unsigned char **in, int *instrides,
                                int srcSliceY, int srcSliceH,
                                unsigned char **oplanes, int *outstrides)

  which converts a slice of planar input with one chroma sample per 2x2
  luma block (in[0]=Y, in[1]=U, in[2]=V) to a packed format.

  - Each outer iteration handles two luma rows that share one chroma row;
    each inner iteration handles 16 pixels (c->srcW / 16 iterations, so
    the width is expected to be a multiple of 16).
  - out_pixels(R,G,B,ptr) must store 16 pixels at ptr and advance ptr.
  - Output starts at row srcSliceY of oplanes[0]; vec_st is used, so the
    output rows are expected to be 16-byte aligned.
  - Returns srcSliceH.

  Differences from the earlier revision of this macro:
  - The two output row pointers are recomputed from outstrides[0] for
    every row pair.  Previously they were advanced by the stride on top
    of the bytes already written, which is only right when the stride
    equals the packed row width.
  - Unaligned source loads use vec_ld(0,p)/vec_ld(N-1,p) with N the number
    of bytes needed, so no vector beyond the last needed byte is read
    (the p[0]/p[1] form always touched the following 16 bytes).
*/
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char align_perm;                                    \
                                                                        \
    /* local copies of the conversion coefficients */                   \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                 /* even luma row */           \
    ubyte *y2i   = in[0]+instrides[0];    /* odd luma row  */           \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    /* first output row of this slice */                                \
    ubyte *dst   = oplanes[0]+srcSliceY*outstrides[0];                  \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even/odd output rows of this pair, derived from the stride */\
        oute = (vector unsigned char *)(dst+(2*i  )*outstrides[0]);     \
        outo = (vector unsigned char *)(dst+(2*i+1)*outstrides[0]);     \
                                                                        \
        /* cache hints for the rows about to be written */              \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            /* 16 luma bytes from each of the two rows */               \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (vec_ld (0, y1i), vec_ld (15, y1i),           \
                           align_perm);                                 \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (vec_ld (0, y2i), vec_ld (15, y2i),           \
                           align_perm);                                 \
                                                                        \
            /* 8 chroma bytes each; only bytes 0-7 of u/v are used */   \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (vec_ld (0, ui), vec_ld (7, ui), align_perm);  \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vec_ld (0, vi), vec_ld (7, vi), align_perm);  \
                                                                        \
            /* remove the 128 chroma bias */                            \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char)AVV(128),0));   \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char)AVV(128),0));   \
                                                                        \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));    \
            /* duplicate each chroma term for two adjacent pixels */    \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));    \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));    \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* even row */                                              \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* odd row, reusing the same chroma terms */                \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        /* step the source pointers to the next row pair */             \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
437
438
/* Output packers usable as the out_pixels argument of DEFCSP420_CVT.
   (a,b,c) are the packed R,G,B byte vectors and ptr is the output
   cursor, which the underlying store macro advances.  For the 32-bit
   layouts the fourth channel is an all-zero vector. */
#define out_abgr(a,b,c,ptr) \
    vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr) \
    vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
#define out_rgba(a,b,c,ptr) \
    vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
#define out_argb(a,b,c,ptr) \
    vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) \
    vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) \
    vec_mstbgr24(a,b,c,ptr)
445
446 DEFCSP420_CVT (yuv2_abgr, out_abgr)
447 #if 1
448 DEFCSP420_CVT (yuv2_bgra, out_bgra)
449 #else
450 static int altivec_yuv2_bgra32 (SwsContext *c,
451                                 unsigned char **in, int *instrides,
452                                 int srcSliceY,        int srcSliceH,
453                                 unsigned char **oplanes, int *outstrides)
454 {
455     int w = c->srcW;
456     int h = srcSliceH;
457     int i,j;
458     int instrides_scl[3];
459     vector unsigned char y0,y1;
460
461     vector signed char  u,v;
462
463     vector signed short Y0,Y1,Y2,Y3;
464     vector signed short U,V;
465     vector signed short vx,ux,uvx;
466     vector signed short vx0,ux0,uvx0;
467     vector signed short vx1,ux1,uvx1;
468     vector signed short R0,G0,B0;
469     vector signed short R1,G1,B1;
470     vector unsigned char R,G,B;
471
472     vector unsigned char *uivP, *vivP;
473     vector unsigned char align_perm;
474
475     vector signed short
476         lCY  = c->CY,
477         lOY  = c->OY,
478         lCRV = c->CRV,
479         lCBU = c->CBU,
480         lCGU = c->CGU,
481         lCGV = c->CGV;
482
483     vector unsigned short lCSHIFT = c->CSHIFT;
484
485     ubyte *y1i   = in[0];
486     ubyte *y2i   = in[0]+w;
487     ubyte *ui    = in[1];
488     ubyte *vi    = in[2];
489
490     vector unsigned char *oute
491         = (vector unsigned char *)
492           (oplanes[0]+srcSliceY*outstrides[0]);
493     vector unsigned char *outo
494         = (vector unsigned char *)
495           (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
496
497
498     instrides_scl[0] = instrides[0];
499     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
500     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
501
502
503     for (i=0;i<h/2;i++) {
504         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
505         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
506
507         for (j=0;j<w/16;j++) {
508
509             y0 = vec_ldl (0,y1i);
510             y1 = vec_ldl (0,y2i);
511             uivP = (vector unsigned char *)ui;
512             vivP = (vector unsigned char *)vi;
513
514             align_perm = vec_lvsl (0, ui);
515             u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
516
517             align_perm = vec_lvsl (0, vi);
518             v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
519             u  = (vector signed char)
520                  vec_sub (u,(vector signed char)
521                           vec_splat((vector signed char)AVV(128),0));
522
523             v  = (vector signed char)
524                  vec_sub (v, (vector signed char)
525                           vec_splat((vector signed char)AVV(128),0));
526
527             U  = vec_unpackh (u);
528             V  = vec_unpackh (v);
529
530
531             Y0 = vec_unh (y0);
532             Y1 = vec_unl (y0);
533             Y2 = vec_unh (y1);
534             Y3 = vec_unl (y1);
535
536             Y0 = vec_mradds (Y0, lCY, lOY);
537             Y1 = vec_mradds (Y1, lCY, lOY);
538             Y2 = vec_mradds (Y2, lCY, lOY);
539             Y3 = vec_mradds (Y3, lCY, lOY);
540
541             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
542             ux = vec_sl (U, lCSHIFT);
543             ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
544             ux0  = vec_mergeh (ux,ux);
545             ux1  = vec_mergel (ux,ux);
546
547             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
548             vx = vec_sl (V, lCSHIFT);
549             vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
550             vx0  = vec_mergeh (vx,vx);
551             vx1  = vec_mergel (vx,vx);
552             /* uvx = ((CGU*u) + (CGV*v))>>15 */
553             uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
554             uvx = vec_mradds (V, lCGV, uvx);
555             uvx0 = vec_mergeh (uvx,uvx);
556             uvx1 = vec_mergel (uvx,uvx);
557             R0 = vec_add (Y0,vx0);
558             G0 = vec_add (Y0,uvx0);
559             B0 = vec_add (Y0,ux0);
560             R1 = vec_add (Y1,vx1);
561             G1 = vec_add (Y1,uvx1);
562             B1 = vec_add (Y1,ux1);
563             R  = vec_packclp (R0,R1);
564             G  = vec_packclp (G0,G1);
565             B  = vec_packclp (B0,B1);
566
567             out_argb(R,G,B,oute);
568             R0 = vec_add (Y2,vx0);
569             G0 = vec_add (Y2,uvx0);
570             B0 = vec_add (Y2,ux0);
571             R1 = vec_add (Y3,vx1);
572             G1 = vec_add (Y3,uvx1);
573             B1 = vec_add (Y3,ux1);
574             R  = vec_packclp (R0,R1);
575             G  = vec_packclp (G0,G1);
576             B  = vec_packclp (B0,B1);
577
578             out_argb(R,G,B,outo);
579             y1i  += 16;
580             y2i  += 16;
581             ui   += 8;
582             vi   += 8;
583
584         }
585
586         outo  += (outstrides[0])>>4;
587         oute  += (outstrides[0])>>4;
588
589         ui    += instrides_scl[1];
590         vi    += instrides_scl[2];
591         y1i   += instrides_scl[0];
592         y2i   += instrides_scl[0];
593     }
594     return srcSliceH;
595 }
596
597 #endif
598
599
600 DEFCSP420_CVT (yuv2_rgba, out_rgba)
601 DEFCSP420_CVT (yuv2_argb, out_argb)
602 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
603 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
604
605
606 // uyvy|uyvy|uyvy|uyvy
607 // 0123 4567 89ab cdef
608 static
609 const vector unsigned char
610     demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
611                                               0x10,0x04,0x10,0x04,
612                                               0x10,0x08,0x10,0x08,
613                                               0x10,0x0c,0x10,0x0c),
614     demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
615                                               0x10,0x06,0x10,0x06,
616                                               0x10,0x0A,0x10,0x0A,
617                                               0x10,0x0E,0x10,0x0E),
618     demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
619                                               0x10,0x05,0x10,0x07,
620                                               0x10,0x09,0x10,0x0B,
621                                               0x10,0x0D,0x10,0x0F);
622
623 /*
624   this is so I can play live CCIR raw video
625 */
626 static int altivec_uyvy_rgb32 (SwsContext *c,
627                                unsigned char **in, int *instrides,
628                                int srcSliceY,        int srcSliceH,
629                                unsigned char **oplanes, int *outstrides)
630 {
631     int w = c->srcW;
632     int h = srcSliceH;
633     int i,j;
634     vector unsigned char uyvy;
635     vector signed   short Y,U,V;
636     vector signed   short R0,G0,B0,R1,G1,B1;
637     vector unsigned char  R,G,B;
638     vector unsigned char *out;
639     ubyte *img;
640
641     img = in[0];
642     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
643
644     for (i=0;i<h;i++) {
645         for (j=0;j<w/16;j++) {
646             uyvy = vec_ld (0, img);
647             U = (vector signed short)
648                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
649
650             V = (vector signed short)
651                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
652
653             Y = (vector signed short)
654                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
655
656             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
657
658             uyvy = vec_ld (16, img);
659             U = (vector signed short)
660                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
661
662             V = (vector signed short)
663                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
664
665             Y = (vector signed short)
666                 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
667
668             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
669
670             R  = vec_packclp (R0,R1);
671             G  = vec_packclp (G0,G1);
672             B  = vec_packclp (B0,B1);
673
674             //      vec_mstbgr24 (R,G,B, out);
675             out_rgba (R,G,B,out);
676
677             img += 32;
678         }
679     }
680     return srcSliceH;
681 }
682
683
684
685 /* Ok currently the acceleration routine only supports
686    inputs of widths a multiple of 16
687    and heights a multiple 2
688
689    So we just fall back to the C codes for this.
690 */
691 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
692 {
693     if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
694         return NULL;
695
696     /*
697       and this seems not to matter too much I tried a bunch of
698       videos with abnormal widths and MPlayer crashes elsewhere.
699       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
700       boom with X11 bad match.
701
702     */
703     if ((c->srcW & 0xf) != 0)    return NULL;
704
705     switch (c->srcFormat) {
706     case PIX_FMT_YUV410P:
707     case PIX_FMT_YUV420P:
708     /*case IMGFMT_CLPL:        ??? */
709     case PIX_FMT_GRAY8:
710     case PIX_FMT_NV12:
711     case PIX_FMT_NV21:
712         if ((c->srcH & 0x1) != 0)
713             return NULL;
714
715         switch(c->dstFormat){
716         case PIX_FMT_RGB24:
717             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
718             return altivec_yuv2_rgb24;
719         case PIX_FMT_BGR24:
720             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
721             return altivec_yuv2_bgr24;
722         case PIX_FMT_ARGB:
723             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
724             return altivec_yuv2_argb;
725         case PIX_FMT_ABGR:
726             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
727             return altivec_yuv2_abgr;
728         case PIX_FMT_RGBA:
729             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
730             return altivec_yuv2_rgba;
731         case PIX_FMT_BGRA:
732             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
733             return altivec_yuv2_bgra;
734         default: return NULL;
735         }
736         break;
737
738     case PIX_FMT_UYVY422:
739         switch(c->dstFormat){
740         case PIX_FMT_BGR32:
741             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
742             return altivec_uyvy_rgb32;
743         default: return NULL;
744         }
745         break;
746
747     }
748     return NULL;
749 }
750
751 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
752 {
753     union {
754         signed short tmp[8] __attribute__ ((aligned(16)));
755         vector signed short vec;
756     } buf;
757
758     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
759     buf.tmp[1] =  -256*brightness;                                      //oy
760     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
761     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
762     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
763     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
764
765
766     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
767     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
768     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
769     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
770     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
771     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
772     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
773 #if 0
774     {
775     int i;
776     char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
777     for (i=0; i<6; i++)
778         printf("%s %d ", v[i],buf.tmp[i] );
779         printf("\n");
780     }
781 #endif
782     return;
783 }
784
785
786 void
787 altivec_yuv2packedX (SwsContext *c,
788                      int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
789                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
790                      uint8_t *dest, int dstW, int dstY)
791 {
792     int i,j;
793     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
794     vector signed short R0,G0,B0,R1,G1,B1;
795
796     vector unsigned char R,G,B;
797     vector unsigned char *out,*nout;
798
799     vector signed short   RND = vec_splat_s16(1<<3);
800     vector unsigned short SCL = vec_splat_u16(4);
801     unsigned long scratch[16] __attribute__ ((aligned (16)));
802
803     vector signed short *YCoeffs, *CCoeffs;
804
805     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
806     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
807
808     out = (vector unsigned char *)dest;
809
810     for (i=0; i<dstW; i+=16){
811         Y0 = RND;
812         Y1 = RND;
813         /* extract 16 coeffs from lumSrc */
814         for (j=0; j<lumFilterSize; j++) {
815             X0 = vec_ld (0,  &lumSrc[j][i]);
816             X1 = vec_ld (16, &lumSrc[j][i]);
817             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
818             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
819         }
820
821         U = RND;
822         V = RND;
823         /* extract 8 coeffs from U,V */
824         for (j=0; j<chrFilterSize; j++) {
825             X  = vec_ld (0, &chrSrc[j][i/2]);
826             U  = vec_mradds (X, CCoeffs[j], U);
827             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
828             V  = vec_mradds (X, CCoeffs[j], V);
829         }
830
831         /* scale and clip signals */
832         Y0 = vec_sra (Y0, SCL);
833         Y1 = vec_sra (Y1, SCL);
834         U  = vec_sra (U,  SCL);
835         V  = vec_sra (V,  SCL);
836
837         Y0 = vec_clip_s16 (Y0);
838         Y1 = vec_clip_s16 (Y1);
839         U  = vec_clip_s16 (U);
840         V  = vec_clip_s16 (V);
841
842         /* now we have
843           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
844           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
845
846           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
847           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
848           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
849         */
850
851         U0 = vec_mergeh (U,U);
852         V0 = vec_mergeh (V,V);
853
854         U1 = vec_mergel (U,U);
855         V1 = vec_mergel (V,V);
856
857         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
858         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
859
860         R  = vec_packclp (R0,R1);
861         G  = vec_packclp (G0,G1);
862         B  = vec_packclp (B0,B1);
863
864         switch(c->dstFormat) {
865             case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
866             case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
867             case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
868             case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
869             case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
870             case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
871             default:
872             {
873                 /* If this is reached, the caller should have called yuv2packedXinC
874                    instead. */
875                 static int printed_error_message;
876                 if (!printed_error_message) {
877                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
878                            sws_format_name(c->dstFormat));
879                     printed_error_message=1;
880                 }
881                 return;
882             }
883         }
884     }
885
886     if (i < dstW) {
887         i -= 16;
888
889         Y0 = RND;
890         Y1 = RND;
891         /* extract 16 coeffs from lumSrc */
892         for (j=0; j<lumFilterSize; j++) {
893             X0 = vec_ld (0,  &lumSrc[j][i]);
894             X1 = vec_ld (16, &lumSrc[j][i]);
895             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
896             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
897         }
898
899         U = RND;
900         V = RND;
901         /* extract 8 coeffs from U,V */
902         for (j=0; j<chrFilterSize; j++) {
903             X  = vec_ld (0, &chrSrc[j][i/2]);
904             U  = vec_mradds (X, CCoeffs[j], U);
905             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
906             V  = vec_mradds (X, CCoeffs[j], V);
907         }
908
909         /* scale and clip signals */
910         Y0 = vec_sra (Y0, SCL);
911         Y1 = vec_sra (Y1, SCL);
912         U  = vec_sra (U,  SCL);
913         V  = vec_sra (V,  SCL);
914
915         Y0 = vec_clip_s16 (Y0);
916         Y1 = vec_clip_s16 (Y1);
917         U  = vec_clip_s16 (U);
918         V  = vec_clip_s16 (V);
919
920         /* now we have
921            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
922            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
923
924            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
925            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
926            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
927         */
928
929         U0 = vec_mergeh (U,U);
930         V0 = vec_mergeh (V,V);
931
932         U1 = vec_mergel (U,U);
933         V1 = vec_mergel (V,V);
934
935         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
936         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
937
938         R  = vec_packclp (R0,R1);
939         G  = vec_packclp (G0,G1);
940         B  = vec_packclp (B0,B1);
941
942         nout = (vector unsigned char *)scratch;
943         switch(c->dstFormat) {
944             case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
945             case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
946             case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
947             case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
948             case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
949             case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
950             default:
951                 /* Unreachable, I think. */
952                 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
953                        sws_format_name(c->dstFormat));
954                 return;
955         }
956
957         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
958     }
959
960 }