]> git.sesse.net Git - ffmpeg/blob - postproc/yuv2rgb_altivec.c
fixing title on PSP (0x60 != 'a') someone needs a 10x10km ascii table ...
[ffmpeg] / postproc / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issue; however, I have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63
64 */
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <inttypes.h>
69 #include <assert.h>
70 #include "config.h"
71 #ifdef HAVE_MALLOC_H
72 #include <malloc.h>
73 #endif
74 #include "rgb2rgb.h"
75 #include "swscale.h"
76 #include "swscale_internal.h"
77 #include "mangle.h"
78 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff
79
80 #undef PROFILE_THE_BEAST
81 #undef INC_SCALING
82
83 typedef unsigned char ubyte;
84 typedef signed char   sbyte;
85
86
87 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
88    homogeneous vector registers x0,x1,x2 are interleaved with the
89    following technique:
90
91       o0 = vec_mergeh (x0,x1);
92       o1 = vec_perm (o0, x2, perm_rgb_0);
93       o2 = vec_perm (o0, x2, perm_rgb_1);
94       o3 = vec_mergel (x0,x1);
95       o4 = vec_perm (o3,o2,perm_rgb_2);
96       o5 = vec_perm (o3,o2,perm_rgb_3);
97
98   perm_rgb_0:   o0(RG).h v1(B) --> o1*
99               0   1  2   3   4
100              rgbr|gbrg|brgb|rgbr
101              0010 0100 1001 0010
102              0102 3145 2673 894A
103
104   perm_rgb_1:   o0(RG).h v1(B) --> o2
105               0   1  2   3   4
106              gbrg|brgb|bbbb|bbbb
107              0100 1001 1111 1111
108              B5CD 6EF7 89AB CDEF
109
110   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
111               0   1  2   3   4
112              gbrg|brgb|rgbr|gbrg
113              1111 1111 0010 0100
114              89AB CDEF 0182 3945
115
116   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
117               0   1  2   3   4
118              brgb|rgbr|gbrg|brgb
119              1001 0010 0100 1001
120              a67b 89cA BdCD eEFf
121
122 */
123 static
124 const vector unsigned char
125   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
126                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
127   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
128                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
129   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
130                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
131   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
132                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
133
/* Interleave three 16-byte channel vectors into 48 bytes of packed pixels.
   Note the parameter order: the FIRST argument is x2 and the THIRD is x0.
   The packed byte order is x0[i], x1[i], x2[i] for i = 0..15; the result is
   returned in y0 (bytes 0-15), y1 (bytes 16-31) and y2 (bytes 32-47).
   y0..y2 must be lvalues of the same vector type as x0. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                   \
do {                                                    \
  typeof(x0) m3_pair_hi, m3_carry, m3_pair_lo;          \
  /* x0/x1 byte pairs for pixels 0..7 */                \
  m3_pair_hi = vec_mergeh (x0,x1);                      \
  y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);   \
  /* leftovers of the high pairs plus x2 bytes 8..15 */ \
  m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);   \
  /* x0/x1 byte pairs for pixels 8..15 */               \
  m3_pair_lo = vec_mergel (x0,x1);                      \
  y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2); \
  y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3); \
} while(0)
144
/* Interleave the channel vectors x0,x1,x2 into 16 packed 24-bit pixels and
   store them as three consecutive vectors at ptr, leaving ptr advanced by
   three vectors.  Because vec_merge3 takes its channels in reverse parameter
   order, the bytes land in memory as x2, x1, x0 per pixel.

   ptr must be a modifiable pointer to the vector type of x0.

   The expansion deliberately does NOT end in a semicolon, so that
   "if (c) vec_mstbgr24 (...); else ..." parses as a single statement. */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, (ptr)++);               \
  vec_st (_1, 0, (ptr)++);               \
  vec_st (_2, 0, (ptr)++);               \
} while (0)
153
/* Interleave the channel vectors x0,x1,x2 into 16 packed 24-bit pixels and
   store them as three consecutive vectors at ptr, leaving ptr advanced by
   three vectors.  The channels are handed to vec_merge3 reversed, which
   makes the bytes land in memory as x0, x1, x2 per pixel.

   ptr must be a modifiable pointer to the vector type of x0.

   The expansion deliberately does NOT end in a semicolon, so that
   "if (c) vec_mstrgb24 (...); else ..." parses as a single statement. */
#define vec_mstrgb24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, (ptr)++);               \
  vec_st (_1, 0, (ptr)++);               \
  vec_st (_2, 0, (ptr)++);               \
} while (0)
162
/* Pack four 16-byte channel vectors into 16 pixels of 32 bits each and store
   them as four consecutive vectors (64 bytes) at ptr.

   T      vector type of the channels (also used for the stores)
   x0..x3 channel vectors; byte i of each becomes pixel i, laid out in
          memory as x0[i], x1[i], x2[i], x3[i]
   ptr    modifiable pointer to the output; advanced by 4 elements afterwards

   The bytes are first merged pairwise (x0/x1 and x2/x3) and the resulting
   pairs are then merged as 16-bit units.

   The expansion deliberately does NOT end in a semicolon, so that
   "if (c) vec_mstrgb32 (...); else ..." parses as a single statement. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  /* pixels 0..7 */                                                                    \
  _0 = vec_mergeh (x0,x1);                                                             \
  _1 = vec_mergeh (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)(ptr));                                                       \
  vec_st (_3, 1*16, (T *)(ptr));                                                       \
  /* pixels 8..15 */                                                                   \
  _0 = vec_mergel (x0,x1);                                                             \
  _1 = vec_mergel (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)(ptr));                                                       \
  vec_st (_3, 3*16, (T *)(ptr));                                                       \
  (ptr) += 4;                                                                          \
} while (0)
184
185 /*
186
187   | 1     0       1.4021   | | Y |
188   | 1    -0.3441 -0.7142   |x| Cb|
189   | 1     1.7718  0        | | Cr|
190
191
192   Y:      [-128 127]
193   Cb/Cr : [-128 127]
194
195   Typical YUV conversions work on Y: 0-255; this version has been optimized for JPEG decode.
196
197 */
198
199
200
201
/* Widen bytes 0..7 of x to eight 16-bit lanes without sign extension.
   Each lane is assembled by vec_perm from byte 0 of the second operand
   (index 0x10; AVV(0) is presumably an all-zero vector) followed by byte i
   of x, and the result is reinterpreted as vector signed short. */
#define vec_unh(x)                                                          \
  ((vector signed short)                                                    \
     vec_perm ((x), (typeof(x))AVV(0),                                      \
               (vector unsigned char)AVV(0x10,0x00, 0x10,0x01,              \
                                         0x10,0x02, 0x10,0x03,              \
                                         0x10,0x04, 0x10,0x05,              \
                                         0x10,0x06, 0x10,0x07)))
/* Widen bytes 8..15 of x to eight 16-bit lanes without sign extension.
   Each lane is assembled by vec_perm from byte 0 of the second operand
   (index 0x10; AVV(0) is presumably an all-zero vector) followed by byte
   8+i of x, and the result is reinterpreted as vector signed short. */
#define vec_unl(x)                                                          \
  ((vector signed short)                                                    \
     vec_perm ((x), (typeof(x))AVV(0),                                      \
               (vector unsigned char)AVV(0x10,0x08, 0x10,0x09,              \
                                         0x10,0x0A, 0x10,0x0B,              \
                                         0x10,0x0C, 0x10,0x0D,              \
                                         0x10,0x0E, 0x10,0x0F)))
212
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
   first limit from above with vec_min, then from below with vec_max. */
#define vec_clip_s16(x)                                                          \
  (vec_max (vec_min ((x),                                                        \
                     (vector signed short)AVV(235,235,235,235,235,235,235,235)), \
            (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 )))
216
/* Narrow two vectors of signed 16-bit lanes (x then y) into one vector of
   16 unsigned bytes.  Negative lanes are raised to 0 with vec_max first;
   the lanes are then treated as unsigned and narrowed with vec_packs, which
   takes care of the upper bound. */
#define vec_packclp(x,y)                                                      \
  ((vector unsigned char)                                                     \
     vec_packs ((vector unsigned short)                                       \
                  vec_max ((x), (vector signed short) AVV(0)),                \
                (vector unsigned short)                                       \
                  vec_max ((y), (vector signed short) AVV(0))))
221
222 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
223
224
225 static inline void cvtyuvtoRGB (SwsContext *c,
226                            vector signed short Y, vector signed short U, vector signed short V,
227                            vector signed short *R, vector signed short *G, vector signed short *B)
228 {
229   vector signed   short vx,ux,uvx;
230
231   Y = vec_mradds (Y, c->CY, c->OY);
232   U  = vec_sub (U,(vector signed short)
233                         vec_splat((vector signed short)AVV(128),0));
234   V  = vec_sub (V,(vector signed short)
235                         vec_splat((vector signed short)AVV(128),0));
236
237   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
238   ux = vec_sl (U, c->CSHIFT);
239   *B = vec_mradds (ux, c->CBU, Y);
240
241   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
242   vx = vec_sl (V, c->CSHIFT);
243   *R = vec_mradds (vx, c->CRV, Y);
244
245   // uvx = ((CGU*u) + (CGV*v))>>15;
246   uvx = vec_mradds (U, c->CGU, Y);
247   *G = vec_mradds (V, c->CGV, uvx);
248 }
249
250
251 /*
252   ------------------------------------------------------------------------------
253   CS converters
254   ------------------------------------------------------------------------------
255 */
256
257
/*
  DEFCSP420_CVT (name, out_pixels)

  Expands to the definition of

    static int altivec_<name> (SwsContext *c,
                               unsigned char **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               unsigned char **oplanes, int *outstrides)

  which converts a slice of planar YUV 4:2:0 to a packed RGB format.

    c            supplies srcW and the coefficient vectors CY, OY, CRV, CBU,
                 CGU, CGV and CSHIFT
    in           in[0] = Y plane, in[1] = U plane, in[2] = V plane, pointing
                 at the first row of the slice
    instrides    byte strides of the three input planes
    srcSliceY    first output row of the slice
    srcSliceH    number of rows in the slice; srcSliceH/2 row pairs are done
    oplanes      oplanes[0] is the packed output image
    outstrides   outstrides[0] is the byte stride of the output image
    out_pixels   statement macro out_pixels (R,G,B,ptr) that stores 16 pixels
                 at ptr and advances ptr past them

  The function returns srcSliceH.

  Each outer iteration handles two luma rows that share one chroma row; each
  inner iteration handles 16 pixels of both rows (w/16 iterations, so any
  remainder of w modulo 16 is not converted).  Inputs are loaded through
  vec_lvsl/vec_perm so they need not be 16-byte aligned; this reads the
  vector following the current one as well.  Output is written by
  out_pixels.

  The output row pointers are re-derived from per-row byte addresses that
  advance by 2*outstrides[0] per row pair, so output images whose stride is
  larger than the packed row width are handled correctly.
*/
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the conversion coefficients */                        \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i   = in[0];                                                    \
  ubyte *y2i   = in[0]+instrides[0];                                       \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  /* byte addresses of the current even and odd output rows */             \
  ubyte *oute_row = oplanes[0]+srcSliceY*outstrides[0];                    \
  ubyte *outo_row = oute_row+outstrides[0];                                \
  vector unsigned char *oute, *outo;                                       \
                                                                           \
                                                                           \
  instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */  \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
                                                                           \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    /* out_pixels advances these while the row is being written */         \
    oute = (vector unsigned char *)oute_row;                               \
    outo = (vector unsigned char *)outo_row;                               \
                                                                           \
    /* cache hints for the two rows about to be written */                 \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y1ivP = (vector unsigned char *)y1i;                                 \
      y2ivP = (vector unsigned char *)y2i;                                 \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* possibly unaligned loads of 16 bytes from each plane */           \
      align_perm = vec_lvsl (0, y1i);                                      \
      y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
                                                                           \
      align_perm = vec_lvsl (0, y2i);                                      \
      y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
                                                                           \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* remove the 128 bias from chroma */                                \
      u  = (vector signed char)                                            \
                vec_sub (u,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)                                            \
                vec_sub (v,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
                                                                           \
      /* only the first 8 chroma samples are needed for 16 pixels */       \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
                                                                           \
        Y0 = vec_unh (y0);                                                 \
        Y1 = vec_unl (y0);                                                 \
        Y2 = vec_unh (y1);                                                 \
        Y3 = vec_unl (y1);                                                 \
                                                                           \
        Y0 = vec_mradds (Y0, lCY, lOY);                                    \
        Y1 = vec_mradds (Y1, lCY, lOY);                                    \
        Y2 = vec_mradds (Y2, lCY, lOY);                                    \
        Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                           \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
        ux = vec_sl (U, lCSHIFT);                                          \
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));           \
        /* duplicate each chroma term for two horizontal pixels */         \
        ux0  = vec_mergeh (ux,ux);                                         \
        ux1  = vec_mergel (ux,ux);                                         \
                                                                           \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
        vx = vec_sl (V, lCSHIFT);                                          \
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));           \
        vx0  = vec_mergeh (vx,vx);                                         \
        vx1  = vec_mergel (vx,vx);                                         \
                                                                           \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));           \
        uvx = vec_mradds (V, lCGV, uvx);                                   \
        uvx0 = vec_mergeh (uvx,uvx);                                       \
        uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                           \
        /* even row */                                                     \
        R0 = vec_add (Y0,vx0);                                             \
        G0 = vec_add (Y0,uvx0);                                            \
        B0 = vec_add (Y0,ux0);                                             \
        R1 = vec_add (Y1,vx1);                                             \
        G1 = vec_add (Y1,uvx1);                                            \
        B1 = vec_add (Y1,ux1);                                             \
                                                                           \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
        out_pixels(R,G,B,oute);                                            \
                                                                           \
        /* odd row, same chroma terms */                                   \
        R0 = vec_add (Y2,vx0);                                             \
        G0 = vec_add (Y2,uvx0);                                            \
        B0 = vec_add (Y2,ux0);                                             \
        R1 = vec_add (Y3,vx1);                                             \
        G1 = vec_add (Y3,uvx1);                                            \
        B1 = vec_add (Y3,ux1);                                             \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
                                                                           \
        out_pixels(R,G,B,outo);                                            \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    /* step to the next pair of output rows, independent of how far */     \
    /* out_pixels moved oute/outo within the row                     */    \
    oute_row += 2*outstrides[0];                                           \
    outo_row += 2*outstrides[0];                                           \
                                                                           \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
417
418
/* Pixel writers usable as the out_pixels argument of DEFCSP420_CVT.
   r, g and b are 16-byte channel vectors; ptr is advanced past the 16
   pixels written.  The name gives the per-pixel byte order in memory,
   where "a" is a zero filler byte. */

/* 32 bits per pixel: 0, B, G, R */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
/* 32 bits per pixel: B, G, R, 0 */
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
/* 32 bits per pixel: R, G, B, 0 */
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
/* 32 bits per pixel: 0, R, G, B */
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
/* 24 bits per pixel: R, G, B */
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
/* 24 bits per pixel: B, G, R */
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
425
426 DEFCSP420_CVT (yuv2_abgr, out_abgr)
427 #if 1
428 DEFCSP420_CVT (yuv2_bgra, out_bgra)
429 #else
430 static int altivec_yuv2_bgra32 (SwsContext *c,                                  
431                                 unsigned char **in, int *instrides,        
432                                 int srcSliceY,  int srcSliceH,             
433                                 unsigned char **oplanes, int *outstrides)  
434 {                                                                          
435   int w = c->srcW;                                                         
436   int h = srcSliceH;                                                       
437   int i,j;                                                                 
438   int instrides_scl[3];                                                    
439   vector unsigned char y0,y1;                                              
440                                                                            
441   vector signed char  u,v;                                                 
442                                                                            
443   vector signed short Y0,Y1,Y2,Y3;                                         
444   vector signed short U,V;                                                 
445   vector signed short vx,ux,uvx;                                           
446   vector signed short vx0,ux0,uvx0;                                        
447   vector signed short vx1,ux1,uvx1;                                        
448   vector signed short R0,G0,B0;                                            
449   vector signed short R1,G1,B1;                                            
450   vector unsigned char R,G,B;                                              
451                                                                            
452   vector unsigned char *uivP, *vivP;                                       
453   vector unsigned char align_perm;                                         
454                                                                            
455   vector signed short                                                      
456     lCY  = c->CY,                                                          
457     lOY  = c->OY,                                                          
458     lCRV = c->CRV,                                                         
459     lCBU = c->CBU,                                                         
460     lCGU = c->CGU,                                                         
461     lCGV = c->CGV;                                                         
462                                                                            
463   vector unsigned short lCSHIFT = c->CSHIFT;                               
464                                                                            
465   ubyte *y1i   = in[0];                                                    
466   ubyte *y2i   = in[0]+w;                                                  
467   ubyte *ui    = in[1];                                                    
468   ubyte *vi    = in[2];                                                    
469                                                                            
470   vector unsigned char *oute                                               
471     = (vector unsigned char *)                                             
472         (oplanes[0]+srcSliceY*outstrides[0]);                              
473   vector unsigned char *outo                                               
474     = (vector unsigned char *)                                             
475         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                
476                                                                            
477                                                                            
478   instrides_scl[0] = instrides[0];                                         
479   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     
480   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     
481                                                                            
482                                                                            
483   for (i=0;i<h/2;i++) {                                                    
484     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 
485     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 
486                                                                            
487     for (j=0;j<w/16;j++) {                                                 
488                                                                            
489       y0 = vec_ldl (0,y1i);                                                
490       y1 = vec_ldl (0,y2i);                                                
491       uivP = (vector unsigned char *)ui;                                   
492       vivP = (vector unsigned char *)vi;                                   
493                                                                            
494       align_perm = vec_lvsl (0, ui);                                       
495       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     
496                                                                            
497       align_perm = vec_lvsl (0, vi);                                       
498       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
499       u  = (vector signed char)
500                 vec_sub (u,(vector signed char)
501                                 vec_splat((vector signed char)AVV(128),0));
502       
503       v  = (vector signed char)
504                 vec_sub (v, (vector signed char)
505                                 vec_splat((vector signed char)AVV(128),0));
506       
507       U  = vec_unpackh (u);                                                
508       V  = vec_unpackh (v);                                                
509                                                                            
510                                                                            
511         Y0 = vec_unh (y0);                                                 
512         Y1 = vec_unl (y0);                                                 
513         Y2 = vec_unh (y1);                                                 
514         Y3 = vec_unl (y1);                                                 
515                                                                            
516         Y0 = vec_mradds (Y0, lCY, lOY);                                    
517         Y1 = vec_mradds (Y1, lCY, lOY);                                    
518         Y2 = vec_mradds (Y2, lCY, lOY);                                    
519         Y3 = vec_mradds (Y3, lCY, lOY);                                    
520                                                                            
521         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         
522         ux = vec_sl (U, lCSHIFT);                                          
523         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
524         ux0  = vec_mergeh (ux,ux);                                         
525         ux1  = vec_mergel (ux,ux);                                         
526                                                                            
527         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         
528         vx = vec_sl (V, lCSHIFT);                                          
529         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
530         vx0  = vec_mergeh (vx,vx);
531         vx1  = vec_mergel (vx,vx);
532         /* uvx = ((CGU*u) + (CGV*v))>>15 */
533         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
534         uvx = vec_mradds (V, lCGV, uvx);
535         uvx0 = vec_mergeh (uvx,uvx);
536         uvx1 = vec_mergel (uvx,uvx);
537         R0 = vec_add (Y0,vx0);
538         G0 = vec_add (Y0,uvx0);
539         B0 = vec_add (Y0,ux0);
540         R1 = vec_add (Y1,vx1);
541         G1 = vec_add (Y1,uvx1);
542         B1 = vec_add (Y1,ux1);
543         R  = vec_packclp (R0,R1);
544         G  = vec_packclp (G0,G1);
545         B  = vec_packclp (B0,B1);
546         
547         out_argb(R,G,B,oute);
548         R0 = vec_add (Y2,vx0);
549         G0 = vec_add (Y2,uvx0);
550         B0 = vec_add (Y2,ux0);
551         R1 = vec_add (Y3,vx1);
552         G1 = vec_add (Y3,uvx1);
553         B1 = vec_add (Y3,ux1);
554         R  = vec_packclp (R0,R1);
555         G  = vec_packclp (G0,G1);
556         B  = vec_packclp (B0,B1);
557         
558         out_argb(R,G,B,outo);
559         y1i  += 16;                                                        
560         y2i  += 16;                                                        
561         ui   += 8;
562         vi   += 8;                                                         
563                                                                            
564     }                                                                      
565                                                                            
566     outo += (outstrides[0])>>4;                                            
567     oute += (outstrides[0])>>4;                                            
568                                                                            
569     ui    += instrides_scl[1];                                             
570     vi    += instrides_scl[2];                                             
571     y1i   += instrides_scl[0];                                             
572     y2i   += instrides_scl[0];                                             
573   }                                                                        
574   return srcSliceH;                                                        
575 }
576
577 #endif
578
579
580 DEFCSP420_CVT (yuv2_rgba, out_rgba)
581 DEFCSP420_CVT (yuv2_argb, out_argb)
582 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
583 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
584
585
586 // uyvy|uyvy|uyvy|uyvy
587 // 0123 4567 89ab cdef
588 static
589 const vector unsigned char
590   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
591                                    0x10,0x04,0x10,0x04,
592                                    0x10,0x08,0x10,0x08,
593                                    0x10,0x0c,0x10,0x0c),
594   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
595                                    0x10,0x06,0x10,0x06,
596                                    0x10,0x0A,0x10,0x0A,
597                                    0x10,0x0E,0x10,0x0E),
598   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
599                                    0x10,0x05,0x10,0x07,
600                                    0x10,0x09,0x10,0x0B,
601                                    0x10,0x0D,0x10,0x0F);
602
603 /*
604   this is so I can play live CCIR raw video
605 */
606 static int altivec_uyvy_rgb32 (SwsContext *c,
607                                unsigned char **in, int *instrides,
608                                int srcSliceY,   int srcSliceH,
609                                unsigned char **oplanes, int *outstrides)
610 {
611   int w = c->srcW;
612   int h = srcSliceH;
613   int i,j;
614   vector unsigned char uyvy;
615   vector signed   short Y,U,V;
616   vector signed   short vx,ux,uvx;
617   vector signed   short R0,G0,B0,R1,G1,B1;
618   vector unsigned char  R,G,B;
619   vector unsigned char *out;
620   ubyte *img;
621
622   img = in[0];
623   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
624
625   for (i=0;i<h;i++) {
626     for (j=0;j<w/16;j++) {
627       uyvy = vec_ld (0, img);
628       U = (vector signed short)
629         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
630
631       V = (vector signed short)
632         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
633
634       Y = (vector signed short)
635         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
636
637       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
638
639       uyvy = vec_ld (16, img);
640       U = (vector signed short)
641         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
642
643       V = (vector signed short)
644         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
645
646       Y = (vector signed short)
647         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
648
649       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
650
651       R  = vec_packclp (R0,R1);
652       G  = vec_packclp (G0,G1);
653       B  = vec_packclp (B0,B1);
654
655       //      vec_mstbgr24 (R,G,B, out);
656       out_rgba (R,G,B,out);
657
658       img += 32;
659     }
660   }
661   return srcSliceH;
662 }
663
664
665
666 /* Ok currently the acceleration routine only supports
667    inputs of widths a multiple of 16
668    and heights a multiple 2
669
670    So we just fall back to the C codes for this.
671 */
672 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
673 {
674   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
675     return NULL;
676
677   /*
678     and this seems not to matter too much I tried a bunch of 
679     videos with abnormal widths and mplayer crashes else where.
680     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
681     boom with X11 bad match.
682     
683   */
684   if ((c->srcW & 0xf) != 0)    return NULL;
685
686   switch (c->srcFormat) {
687   case IMGFMT_YVU9:
688   case IMGFMT_IF09:
689   case IMGFMT_YV12:
690   case IMGFMT_I420:
691   case IMGFMT_IYUV:
692   case IMGFMT_CLPL:
693   case IMGFMT_Y800:
694   case IMGFMT_Y8:
695   case IMGFMT_NV12:
696   case IMGFMT_NV21:
697     if ((c->srcH & 0x1) != 0)
698       return NULL;
699
700     switch(c->dstFormat){
701     case IMGFMT_RGB24:
702       MSG_WARN("ALTIVEC: Color Space RGB24\n");
703       return altivec_yuv2_rgb24;
704     case IMGFMT_BGR24:
705       MSG_WARN("ALTIVEC: Color Space BGR24\n");
706       return altivec_yuv2_bgr24;
707     case IMGFMT_ARGB:
708       MSG_WARN("ALTIVEC: Color Space ARGB\n");
709       return altivec_yuv2_argb;
710     case IMGFMT_ABGR:
711       MSG_WARN("ALTIVEC: Color Space ABGR\n");
712       return altivec_yuv2_abgr;
713     case IMGFMT_RGBA:
714       MSG_WARN("ALTIVEC: Color Space RGBA\n");
715       return altivec_yuv2_rgba;
716     case IMGFMT_BGRA:
717       MSG_WARN("ALTIVEC: Color Space BGRA\n");
718       return altivec_yuv2_bgra;
719     default: return NULL;
720     }
721     break;
722
723   case IMGFMT_UYVY:
724     switch(c->dstFormat){
725     case IMGFMT_RGB32:
726       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
727       return altivec_uyvy_rgb32;
728     default: return NULL;
729     }
730     break;
731
732   }
733   return NULL;
734 }
735
/*
 * Convert a 16.16 fixed-point value to a 16-bit integer, rounding half up
 * and saturating.
 *
 * Returns the bit pattern of the signed result: values above 0x7FFF give
 * 0x7FFF, values below -0x7FFF give 0x8000 (i.e. -32768), everything else
 * is returned as its two's-complement 16-bit pattern.
 *
 * The intermediate stays 64 bits wide so that the saturation test sees the
 * real magnitude; narrowing to int first would let large inputs wrap around
 * and slip past the clamp.
 */
static uint16_t roundToInt16(int64_t f){
        int64_t r;

        /* adding the rounding constant would overflow: saturate directly */
        if(f > INT64_MAX - (1<<15)) return 0x7FFF;

        /* relies on >> of a negative value being an arithmetic shift */
        r= (f + (1<<15))>>16;

             if(r<-0x7FFF) return 0x8000;
        else if(r> 0x7FFF) return 0x7FFF;
        else               return (uint16_t)r;
}
742
743 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
744 {
745   union {
746         signed short tmp[8] __attribute__ ((aligned(16)));
747         vector signed short vec;
748         } buf;
749
750   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
751   buf.tmp[1] =  -256*brightness;                                        //oy
752   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
753   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
754   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
755   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
756
757
758   c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
759   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
760   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
761   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
762   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
763   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
764   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
765 #if 0
766 {
767 int i;
768 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
769 for (i=0; i<6;i++)
770   printf("%s %d ", v[i],buf.tmp[i] );
771   printf("\n");
772 }
773 #endif
774  return;
775 }
776
777
778 void
779 altivec_yuv2packedX (SwsContext *c,
780                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
781                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
782                        uint8_t *dest, int dstW, int dstY)
783 {
784   int i,j;
785   short *f;
786   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
787   vector signed short R0,G0,B0,R1,G1,B1;
788
789   vector unsigned char R,G,B,pels[3];
790   vector unsigned char *out,*nout;
791
792   vector signed short   RND = vec_splat_s16(1<<3);
793   vector unsigned short SCL = vec_splat_u16(4);
794   unsigned long scratch[16] __attribute__ ((aligned (16)));
795
796   vector signed short *YCoeffs, *CCoeffs;
797
798   YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
799   CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
800
801   out = (vector unsigned char *)dest;
802
803   for(i=0; i<dstW; i+=16){
804     Y0 = RND;
805     Y1 = RND;
806     /* extract 16 coeffs from lumSrc */
807     for(j=0; j<lumFilterSize; j++) {
808       X0 = vec_ld (0,  &lumSrc[j][i]);
809       X1 = vec_ld (16, &lumSrc[j][i]);
810       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
811       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
812     }
813
814     U = RND;
815     V = RND;
816     /* extract 8 coeffs from U,V */
817     for(j=0; j<chrFilterSize; j++) {
818       X  = vec_ld (0, &chrSrc[j][i/2]);
819       U  = vec_mradds (X, CCoeffs[j], U);
820       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
821       V  = vec_mradds (X, CCoeffs[j], V);
822     }
823
824     /* scale and clip signals */
825     Y0 = vec_sra (Y0, SCL);
826     Y1 = vec_sra (Y1, SCL);
827     U  = vec_sra (U,  SCL);
828     V  = vec_sra (V,  SCL);
829
830     Y0 = vec_clip_s16 (Y0);
831     Y1 = vec_clip_s16 (Y1);
832     U  = vec_clip_s16 (U);
833     V  = vec_clip_s16 (V);
834
835     /* now we have
836       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
837       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
838
839       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
840       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
841       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
842     */
843
844     U0 = vec_mergeh (U,U);
845     V0 = vec_mergeh (V,V);
846
847     U1 = vec_mergel (U,U);
848     V1 = vec_mergel (V,V);
849
850     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
851     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
852
853     R  = vec_packclp (R0,R1);
854     G  = vec_packclp (G0,G1);
855     B  = vec_packclp (B0,B1);
856
857     switch(c->dstFormat) {
858       case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
859       case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
860       case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
861       case IMGFMT_ARGB: out_argb (R,G,B,out); break;
862       case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
863       case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
864       default:
865         {
866           /* If this is reached, the caller should have called yuv2packedXinC
867              instead. */
868           static int printed_error_message;
869           if(!printed_error_message) {
870             MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
871                     vo_format_name(c->dstFormat));
872             printed_error_message=1;
873           }
874           return;
875         }
876     }
877   }
878
879   if (i < dstW) {
880     i -= 16;
881
882     Y0 = RND;
883     Y1 = RND;
884     /* extract 16 coeffs from lumSrc */
885     for(j=0; j<lumFilterSize; j++) {
886       X0 = vec_ld (0,  &lumSrc[j][i]);
887       X1 = vec_ld (16, &lumSrc[j][i]);
888       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
889       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
890     }
891
892     U = RND;
893     V = RND;
894     /* extract 8 coeffs from U,V */
895     for(j=0; j<chrFilterSize; j++) {
896       X  = vec_ld (0, &chrSrc[j][i/2]);
897       U  = vec_mradds (X, CCoeffs[j], U);
898       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
899       V  = vec_mradds (X, CCoeffs[j], V);
900     }
901
902     /* scale and clip signals */
903     Y0 = vec_sra (Y0, SCL);
904     Y1 = vec_sra (Y1, SCL);
905     U  = vec_sra (U,  SCL);
906     V  = vec_sra (V,  SCL);
907
908     Y0 = vec_clip_s16 (Y0);
909     Y1 = vec_clip_s16 (Y1);
910     U  = vec_clip_s16 (U);
911     V  = vec_clip_s16 (V);
912
913     /* now we have
914        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
915        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
916
917        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
918        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
919        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
920     */
921
922     U0 = vec_mergeh (U,U);
923     V0 = vec_mergeh (V,V);
924
925     U1 = vec_mergel (U,U);
926     V1 = vec_mergel (V,V);
927
928     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
929     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
930
931     R  = vec_packclp (R0,R1);
932     G  = vec_packclp (G0,G1);
933     B  = vec_packclp (B0,B1);
934
935     nout = (vector unsigned char *)scratch;
936     switch(c->dstFormat) {
937       case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
938       case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
939       case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
940       case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
941       case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
942       case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
943       default:
944         /* Unreachable, I think. */
945         MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
946                 vo_format_name(c->dstFormat));
947         return;
948     }
949
950     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
951   }
952
953 }