]> git.sesse.net Git - ffmpeg/blob - libswscale/yuv2rgb_altivec.c
total_size should be 64bit (1 hunk of the asf seeking patch from DrDivx / Steve Lhomme)
[ffmpeg] / libswscale / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issues, however i have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63
64 */
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <inttypes.h>
69 #include <assert.h>
70 #include "config.h"
71 #ifdef HAVE_MALLOC_H
72 #include <malloc.h>
73 #endif
74 #include "rgb2rgb.h"
75 #include "swscale.h"
76 #include "swscale_internal.h"
77 #include "libmpcodecs/img_format.h" //FIXME try to reduce dependency of such stuff
78
79 #undef PROFILE_THE_BEAST
80 #undef INC_SCALING
81
82 typedef unsigned char ubyte;
83 typedef signed char   sbyte;
84
85
86 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
87    homogeneous vector registers x0,x1,x2 are interleaved with the
88    following technique:
89
90       o0 = vec_mergeh (x0,x1);
91       o1 = vec_perm (o0, x2, perm_rgb_0);
92       o2 = vec_perm (o0, x2, perm_rgb_1);
93       o3 = vec_mergel (x0,x1);
94       o4 = vec_perm (o3,o2,perm_rgb_2);
95       o5 = vec_perm (o3,o2,perm_rgb_3);
96
97   perm_rgb_0:   o0(RG).h v1(B) --> o1*
98               0   1  2   3   4
99              rgbr|gbrg|brgb|rgbr
100              0010 0100 1001 0010
101              0102 3145 2673 894A
102
103   perm_rgb_1:   o0(RG).h v1(B) --> o2
104               0   1  2   3   4
105              gbrg|brgb|bbbb|bbbb
106              0100 1001 1111 1111
107              B5CD 6EF7 89AB CDEF
108
109   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
110               0   1  2   3   4
111              gbrg|brgb|rgbr|gbrg
112              1111 1111 0010 0100
113              89AB CDEF 0182 3945
114
115   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
116               0   1  2   3   4
117              brgb|rgbr|gbrg|brgb
118              1001 0010 0100 1001
119              a67b 89cA BdCD eEFf
120
121 */
122 static
123 const vector unsigned char
124   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
125                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
126   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
127                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
128   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
129                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
130   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
131                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
132
/*
 * Interleave three 16-byte planes into 48 bytes of packed triples.
 *
 * Each output triple is (x0[i], x1[i], x2[i]); note that the parameter list
 * names the planes in the opposite order (x2 first).  The three result
 * vectors are assigned to y0, y1, y2, which must be lvalues of the same
 * vector type as x0.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                      \
do {                                                       \
  typeof(x0) pairs_hi, carry, pairs_lo;                    \
  /* x0/x1 pairs for elements 0..7 */                      \
  pairs_hi = vec_mergeh (x0,x1);                           \
  y0       = vec_perm (pairs_hi, x2, perm_rgb_0);          \
  /* leftover of the high half plus x2 bytes 8..15 */      \
  carry    = vec_perm (pairs_hi, x2, perm_rgb_1);          \
  /* x0/x1 pairs for elements 8..15 */                     \
  pairs_lo = vec_mergel (x0,x1);                           \
  y1       = vec_perm (pairs_lo, carry, perm_rgb_2);       \
  y2       = vec_perm (pairs_lo, carry, perm_rgb_3);       \
} while(0)
143
/*
 * Interleave three 16-byte planes and store them through ptr as 16 packed
 * 3-byte pixels (48 bytes).  Each pixel is written as x2,x1,x0, i.e. the
 * reverse of the argument order.
 *
 * ptr must be a plain pointer-to-vector lvalue: it is post-incremented three
 * times, once per stored vector.
 *
 * The expansion deliberately ends without a semicolon so that the macro
 * behaves as a single statement, including inside an unbraced if/else.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
} while (0)
152
/*
 * Interleave three 16-byte planes and store them through ptr as 16 packed
 * 3-byte pixels (48 bytes).  Each pixel is written as x0,x1,x2, i.e. in
 * argument order.
 *
 * ptr must be a plain pointer-to-vector lvalue: it is post-incremented three
 * times, once per stored vector.
 *
 * The expansion deliberately ends without a semicolon so that the macro
 * behaves as a single statement, including inside an unbraced if/else.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
} while (0)
161
/*
 * Interleave four 16-byte planes into 16 packed 4-byte pixels and store them
 * as four consecutive vectors (64 bytes) starting at ptr.  Each pixel is
 * written as x0,x1,x2,x3; e.g. (R,G,B,0) gives the "rgb0" layout with R in
 * the most significant byte and 0 in the least significant one.
 *
 * T   is the vector type of the operands and of the temporaries.
 * ptr is advanced by 4 elements of its own type after the stores, so it must
 *     be a modifiable lvalue.
 *
 * The expansion deliberately ends without a semicolon so that the macro
 * behaves as a single statement, including inside an unbraced if/else.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  /* elements 0..7: byte pairs (x0,x1) and (x2,x3), then merged as 16-bit units */     \
  _0 = vec_mergeh (x0,x1);                                                             \
  _1 = vec_mergeh (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)(ptr));                                                       \
  vec_st (_3, 1*16, (T *)(ptr));                                                       \
  /* elements 8..15: same scheme on the low halves */                                  \
  _0 = vec_mergel (x0,x1);                                                             \
  _1 = vec_mergel (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)(ptr));                                                       \
  vec_st (_3, 3*16, (T *)(ptr));                                                       \
  (ptr) += 4;                                                                          \
} while (0)
183
184 /*
185
186   | 1     0       1.4021   | | Y |
187   | 1    -0.3441 -0.7142   |x| Cb|
188   | 1     1.7718  0        | | Cr|
189
190
191   Y:      [-128 127]
192   Cb/Cr : [-128 127]
193
194   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
195
196 */
197
198
199
200
/*
 * Widen bytes 0..7 of x to eight 16-bit lanes.  Every lane is assembled from
 * one byte of an all-zero vector (index 0x10) followed by one byte of x, so
 * the source bytes are zero-extended rather than sign-extended (this relies
 * on the zero byte landing in the high half of the lane, i.e. big-endian
 * lane layout).
 *
 * The whole expansion is parenthesised so the cast cannot bind to anything
 * else when the macro is used inside a larger expression.
 */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                    0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/*
 * Widen bytes 8..15 of x to eight 16-bit lanes.  Every lane is assembled from
 * one byte of an all-zero vector (index 0x10) followed by one byte of x, so
 * the source bytes are zero-extended rather than sign-extended (this relies
 * on the zero byte landing in the high half of the lane, i.e. big-endian
 * lane layout).
 *
 * The whole expansion is parenthesised so the cast cannot bind to anything
 * else when the macro is used inside a larger expression.
 */
#define vec_unl(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                    0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
211
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
   the upper bound is applied first (vec_min), then the lower one (vec_max). */
#define vec_clip_s16(x)                                                   \
  vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,          \
                                                235,235,235,235)),        \
           (vector signed short)AVV(16, 16, 16, 16,                       \
                                    16, 16, 16, 16 ))
215
/*
 * Pack two vectors of eight signed 16-bit lanes (x first, then y) into one
 * vector of sixteen unsigned bytes.  Negative lanes are raised to 0 with
 * vec_max before the lanes are reinterpreted as unsigned and handed to
 * vec_packs, which performs the narrowing pack.
 *
 * The whole expansion is parenthesised so the leading cast cannot bind to
 * anything else when the macro is used inside a larger expression.
 */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
220
221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
222
223
224 static inline void cvtyuvtoRGB (SwsContext *c,
225                            vector signed short Y, vector signed short U, vector signed short V,
226                            vector signed short *R, vector signed short *G, vector signed short *B)
227 {
228   vector signed   short vx,ux,uvx;
229
230   Y = vec_mradds (Y, c->CY, c->OY);
231   U  = vec_sub (U,(vector signed short)
232                         vec_splat((vector signed short)AVV(128),0));
233   V  = vec_sub (V,(vector signed short)
234                         vec_splat((vector signed short)AVV(128),0));
235
236   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
237   ux = vec_sl (U, c->CSHIFT);
238   *B = vec_mradds (ux, c->CBU, Y);
239
240   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
241   vx = vec_sl (V, c->CSHIFT);
242   *R = vec_mradds (vx, c->CRV, Y);
243
244   // uvx = ((CGU*u) + (CGV*v))>>15;
245   uvx = vec_mradds (U, c->CGU, Y);
246   *G = vec_mradds (V, c->CGV, uvx);
247 }
248
249
250 /*
251   ------------------------------------------------------------------------------
252   CS converters
253   ------------------------------------------------------------------------------
254 */
255
256
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name> (SwsContext *c,
 *                              unsigned char **in, int *instrides,
 *                              int srcSliceY, int srcSliceH,
 *                              unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar YUV with 2x2 subsampled chroma
 * (in[0]=Y, in[1]=U, in[2]=V) into packed pixels written to oplanes[0],
 * starting at output row srcSliceY.
 *
 *   out_pixels(R,G,B,ptr)  statement macro that stores 16 pixels through ptr
 *                          and advances ptr past them (see the out_* macros).
 *
 * Work is done on two luma rows at a time, sharing one chroma row, and on
 * 16 pixels per inner iteration; each of the 8 chroma samples consumed is
 * duplicated horizontally.  Only w/16 blocks per row and h/2 row pairs are
 * processed, so a width remainder or a trailing odd row is left untouched.
 * Input rows are read with the lvsl/perm idiom, so the input pointers need
 * not be 16-byte aligned.  Output is stored with vec_st.
 *
 * The output rows of every pair are located by row index from the slice
 * base.  This keeps the result correct when outstrides[0] is larger than
 * the bytes actually written per row, and is identical to advancing by one
 * stride after the row when the two are equal.
 *
 * Returns srcSliceH.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the conversion coefficients */                        \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  /* two luma rows per pass, one shared chroma row */                      \
  ubyte *y1i   = in[0];                                                    \
  ubyte *y2i   = in[0]+instrides[0];                                       \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  /* first output row of this slice; row pairs are addressed from here */  \
  ubyte *outbase = oplanes[0]+srcSliceY*outstrides[0];                     \
  vector unsigned char *oute, *outo;                                       \
                                                                           \
  instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */  \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
                                                                           \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    /* even and odd output row of this pair, by row index */               \
    oute = (vector unsigned char *)(outbase+(2*i  )*outstrides[0]);        \
    outo = (vector unsigned char *)(outbase+(2*i+1)*outstrides[0]);        \
                                                                           \
    /* store-stream hints for both output rows */                          \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y1ivP = (vector unsigned char *)y1i;                                 \
      y2ivP = (vector unsigned char *)y2i;                                 \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* unaligned loads: two aligned vectors merged with an lvsl mask */  \
      align_perm = vec_lvsl (0, y1i);                                      \
      y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
                                                                           \
      align_perm = vec_lvsl (0, y2i);                                      \
      y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
                                                                           \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* centre chroma around zero before sign-extending it */             \
      u  = (vector signed char)                                            \
                vec_sub (u,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)                                            \
                vec_sub (v,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
                                                                           \
      /* only the first 8 chroma samples are used per 16 pixels */         \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
                                                                           \
        Y0 = vec_unh (y0);                                                 \
        Y1 = vec_unl (y0);                                                 \
        Y2 = vec_unh (y1);                                                 \
        Y3 = vec_unl (y1);                                                 \
                                                                           \
        Y0 = vec_mradds (Y0, lCY, lOY);                                    \
        Y1 = vec_mradds (Y1, lCY, lOY);                                    \
        Y2 = vec_mradds (Y2, lCY, lOY);                                    \
        Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                           \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
        ux = vec_sl (U, lCSHIFT);                                          \
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));           \
        /* duplicate each chroma term for its two horizontal pixels */     \
        ux0  = vec_mergeh (ux,ux);                                         \
        ux1  = vec_mergel (ux,ux);                                         \
                                                                           \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
        vx = vec_sl (V, lCSHIFT);                                          \
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));           \
        vx0  = vec_mergeh (vx,vx);                                         \
        vx1  = vec_mergel (vx,vx);                                         \
                                                                           \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));           \
        uvx = vec_mradds (V, lCGV, uvx);                                   \
        uvx0 = vec_mergeh (uvx,uvx);                                       \
        uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                           \
        /* first luma row of the pair */                                   \
        R0 = vec_add (Y0,vx0);                                             \
        G0 = vec_add (Y0,uvx0);                                            \
        B0 = vec_add (Y0,ux0);                                             \
        R1 = vec_add (Y1,vx1);                                             \
        G1 = vec_add (Y1,uvx1);                                            \
        B1 = vec_add (Y1,ux1);                                             \
                                                                           \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
        out_pixels(R,G,B,oute);                                            \
                                                                           \
        /* second luma row, same chroma terms */                           \
        R0 = vec_add (Y2,vx0);                                             \
        G0 = vec_add (Y2,uvx0);                                            \
        B0 = vec_add (Y2,ux0);                                             \
        R1 = vec_add (Y3,vx1);                                             \
        G1 = vec_add (Y3,uvx1);                                            \
        B1 = vec_add (Y3,ux1);                                             \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
                                                                           \
        out_pixels(R,G,B,outo);                                            \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    /* skip the input padding and, for luma, the row already consumed */   \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
416
417
/*
 * Pixel writers usable as the out_pixels argument of DEFCSP420_CVT.
 * Arguments are always (a,b,c,ptr) = (R,G,B,destination); the macro name
 * gives the byte order stored per pixel, with the "a" position of the
 * 32-bit layouts filled with zero bytes.
 */

/* 32-bit layouts: zero byte leading (abgr, argb) or trailing (bgra, rgba) */
#define out_abgr(a,b,c,ptr)  \
  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  \
  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
#define out_rgba(a,b,c,ptr)  \
  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
#define out_argb(a,b,c,ptr)  \
  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)

/* 24-bit layouts */
#define out_rgb24(a,b,c,ptr) \
  vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) \
  vec_mstbgr24(a,b,c,ptr)
424
425 DEFCSP420_CVT (yuv2_abgr, out_abgr)
426 #if 1
427 DEFCSP420_CVT (yuv2_bgra, out_bgra)
428 #else
429 static int altivec_yuv2_bgra32 (SwsContext *c,                                  
430                                 unsigned char **in, int *instrides,        
431                                 int srcSliceY,  int srcSliceH,             
432                                 unsigned char **oplanes, int *outstrides)  
433 {                                                                          
434   int w = c->srcW;                                                         
435   int h = srcSliceH;                                                       
436   int i,j;                                                                 
437   int instrides_scl[3];                                                    
438   vector unsigned char y0,y1;                                              
439                                                                            
440   vector signed char  u,v;                                                 
441                                                                            
442   vector signed short Y0,Y1,Y2,Y3;                                         
443   vector signed short U,V;                                                 
444   vector signed short vx,ux,uvx;                                           
445   vector signed short vx0,ux0,uvx0;                                        
446   vector signed short vx1,ux1,uvx1;                                        
447   vector signed short R0,G0,B0;                                            
448   vector signed short R1,G1,B1;                                            
449   vector unsigned char R,G,B;                                              
450                                                                            
451   vector unsigned char *uivP, *vivP;                                       
452   vector unsigned char align_perm;                                         
453                                                                            
454   vector signed short                                                      
455     lCY  = c->CY,                                                          
456     lOY  = c->OY,                                                          
457     lCRV = c->CRV,                                                         
458     lCBU = c->CBU,                                                         
459     lCGU = c->CGU,                                                         
460     lCGV = c->CGV;                                                         
461                                                                            
462   vector unsigned short lCSHIFT = c->CSHIFT;                               
463                                                                            
464   ubyte *y1i   = in[0];                                                    
465   ubyte *y2i   = in[0]+w;                                                  
466   ubyte *ui    = in[1];                                                    
467   ubyte *vi    = in[2];                                                    
468                                                                            
469   vector unsigned char *oute                                               
470     = (vector unsigned char *)                                             
471         (oplanes[0]+srcSliceY*outstrides[0]);                              
472   vector unsigned char *outo                                               
473     = (vector unsigned char *)                                             
474         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                
475                                                                            
476                                                                            
477   instrides_scl[0] = instrides[0];                                         
478   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     
479   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     
480                                                                            
481                                                                            
482   for (i=0;i<h/2;i++) {                                                    
483     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 
484     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 
485                                                                            
486     for (j=0;j<w/16;j++) {                                                 
487                                                                            
488       y0 = vec_ldl (0,y1i);                                                
489       y1 = vec_ldl (0,y2i);                                                
490       uivP = (vector unsigned char *)ui;                                   
491       vivP = (vector unsigned char *)vi;                                   
492                                                                            
493       align_perm = vec_lvsl (0, ui);                                       
494       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     
495                                                                            
496       align_perm = vec_lvsl (0, vi);                                       
497       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
498       u  = (vector signed char)
499                 vec_sub (u,(vector signed char)
500                                 vec_splat((vector signed char)AVV(128),0));
501       
502       v  = (vector signed char)
503                 vec_sub (v, (vector signed char)
504                                 vec_splat((vector signed char)AVV(128),0));
505       
506       U  = vec_unpackh (u);                                                
507       V  = vec_unpackh (v);                                                
508                                                                            
509                                                                            
510         Y0 = vec_unh (y0);                                                 
511         Y1 = vec_unl (y0);                                                 
512         Y2 = vec_unh (y1);                                                 
513         Y3 = vec_unl (y1);                                                 
514                                                                            
515         Y0 = vec_mradds (Y0, lCY, lOY);                                    
516         Y1 = vec_mradds (Y1, lCY, lOY);                                    
517         Y2 = vec_mradds (Y2, lCY, lOY);                                    
518         Y3 = vec_mradds (Y3, lCY, lOY);                                    
519                                                                            
520         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         
521         ux = vec_sl (U, lCSHIFT);                                          
522         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
523         ux0  = vec_mergeh (ux,ux);                                         
524         ux1  = vec_mergel (ux,ux);                                         
525                                                                            
526         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         
527         vx = vec_sl (V, lCSHIFT);                                          
528         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
529         vx0  = vec_mergeh (vx,vx);
530         vx1  = vec_mergel (vx,vx);
531         /* uvx = ((CGU*u) + (CGV*v))>>15 */
532         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
533         uvx = vec_mradds (V, lCGV, uvx);
534         uvx0 = vec_mergeh (uvx,uvx);
535         uvx1 = vec_mergel (uvx,uvx);
536         R0 = vec_add (Y0,vx0);
537         G0 = vec_add (Y0,uvx0);
538         B0 = vec_add (Y0,ux0);
539         R1 = vec_add (Y1,vx1);
540         G1 = vec_add (Y1,uvx1);
541         B1 = vec_add (Y1,ux1);
542         R  = vec_packclp (R0,R1);
543         G  = vec_packclp (G0,G1);
544         B  = vec_packclp (B0,B1);
545         
546         out_argb(R,G,B,oute);
547         R0 = vec_add (Y2,vx0);
548         G0 = vec_add (Y2,uvx0);
549         B0 = vec_add (Y2,ux0);
550         R1 = vec_add (Y3,vx1);
551         G1 = vec_add (Y3,uvx1);
552         B1 = vec_add (Y3,ux1);
553         R  = vec_packclp (R0,R1);
554         G  = vec_packclp (G0,G1);
555         B  = vec_packclp (B0,B1);
556         
557         out_argb(R,G,B,outo);
558         y1i  += 16;                                                        
559         y2i  += 16;                                                        
560         ui   += 8;
561         vi   += 8;                                                         
562                                                                            
563     }                                                                      
564                                                                            
565     outo += (outstrides[0])>>4;                                            
566     oute += (outstrides[0])>>4;                                            
567                                                                            
568     ui    += instrides_scl[1];                                             
569     vi    += instrides_scl[2];                                             
570     y1i   += instrides_scl[0];                                             
571     y2i   += instrides_scl[0];                                             
572   }                                                                        
573   return srcSliceH;                                                        
574 }
575
576 #endif
577
578
/*
  Instantiate one converter per packed RGB output layout.  Each line pairs a
  converter name with the out_* store macro used to write its pixels.

  NOTE(review): DEFCSP420_CVT is defined outside this part of the file;
  presumably it emits a function called altivec_<name>, which is the spelling
  yuv2rgb_init_altivec() uses to return these converters -- confirm against
  the macro definition.
*/
DEFCSP420_CVT (yuv2_rgba, out_rgba)
DEFCSP420_CVT (yuv2_argb, out_argb)
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
583
584
585 // uyvy|uyvy|uyvy|uyvy
586 // 0123 4567 89ab cdef
587 static
588 const vector unsigned char
589   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
590                                    0x10,0x04,0x10,0x04,
591                                    0x10,0x08,0x10,0x08,
592                                    0x10,0x0c,0x10,0x0c),
593   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
594                                    0x10,0x06,0x10,0x06,
595                                    0x10,0x0A,0x10,0x0A,
596                                    0x10,0x0E,0x10,0x0E),
597   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
598                                    0x10,0x05,0x10,0x07,
599                                    0x10,0x09,0x10,0x0B,
600                                    0x10,0x0D,0x10,0x0F);
601
602 /*
603   this is so I can play live CCIR raw video
604 */
605 static int altivec_uyvy_rgb32 (SwsContext *c,
606                                unsigned char **in, int *instrides,
607                                int srcSliceY,   int srcSliceH,
608                                unsigned char **oplanes, int *outstrides)
609 {
610   int w = c->srcW;
611   int h = srcSliceH;
612   int i,j;
613   vector unsigned char uyvy;
614   vector signed   short Y,U,V;
615   vector signed   short vx,ux,uvx;
616   vector signed   short R0,G0,B0,R1,G1,B1;
617   vector unsigned char  R,G,B;
618   vector unsigned char *out;
619   ubyte *img;
620
621   img = in[0];
622   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
623
624   for (i=0;i<h;i++) {
625     for (j=0;j<w/16;j++) {
626       uyvy = vec_ld (0, img);
627       U = (vector signed short)
628         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
629
630       V = (vector signed short)
631         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
632
633       Y = (vector signed short)
634         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
635
636       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
637
638       uyvy = vec_ld (16, img);
639       U = (vector signed short)
640         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
641
642       V = (vector signed short)
643         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
644
645       Y = (vector signed short)
646         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
647
648       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
649
650       R  = vec_packclp (R0,R1);
651       G  = vec_packclp (G0,G1);
652       B  = vec_packclp (B0,B1);
653
654       //      vec_mstbgr24 (R,G,B, out);
655       out_rgba (R,G,B,out);
656
657       img += 32;
658     }
659   }
660   return srcSliceH;
661 }
662
663
664
665 /* Ok currently the acceleration routine only supports
666    inputs of widths a multiple of 16
667    and heights a multiple 2
668
669    So we just fall back to the C codes for this.
670 */
671 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
672 {
673   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
674     return NULL;
675
676   /*
677     and this seems not to matter too much I tried a bunch of 
678     videos with abnormal widths and mplayer crashes else where.
679     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
680     boom with X11 bad match.
681     
682   */
683   if ((c->srcW & 0xf) != 0)    return NULL;
684
685   switch (c->srcFormat) {
686   case IMGFMT_YVU9:
687   case IMGFMT_IF09:
688   case IMGFMT_YV12:
689   case IMGFMT_I420:
690   case IMGFMT_IYUV:
691   case IMGFMT_CLPL:
692   case IMGFMT_Y800:
693   case IMGFMT_Y8:
694   case IMGFMT_NV12:
695   case IMGFMT_NV21:
696     if ((c->srcH & 0x1) != 0)
697       return NULL;
698
699     switch(c->dstFormat){
700     case IMGFMT_RGB24:
701       MSG_WARN("ALTIVEC: Color Space RGB24\n");
702       return altivec_yuv2_rgb24;
703     case IMGFMT_BGR24:
704       MSG_WARN("ALTIVEC: Color Space BGR24\n");
705       return altivec_yuv2_bgr24;
706     case IMGFMT_ARGB:
707       MSG_WARN("ALTIVEC: Color Space ARGB\n");
708       return altivec_yuv2_argb;
709     case IMGFMT_ABGR:
710       MSG_WARN("ALTIVEC: Color Space ABGR\n");
711       return altivec_yuv2_abgr;
712     case IMGFMT_RGBA:
713       MSG_WARN("ALTIVEC: Color Space RGBA\n");
714       return altivec_yuv2_rgba;
715     case IMGFMT_BGRA:
716       MSG_WARN("ALTIVEC: Color Space BGRA\n");
717       return altivec_yuv2_bgra;
718     default: return NULL;
719     }
720     break;
721
722   case IMGFMT_UYVY:
723     switch(c->dstFormat){
724     case IMGFMT_RGB32:
725       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
726       return altivec_uyvy_rgb32;
727     default: return NULL;
728     }
729     break;
730
731   }
732   return NULL;
733 }
734
/*
  Round a value with 16 fractional bits to the nearest integer and saturate
  it to the signed 16 bit range.

  f : value scaled by 1<<16

  Returns the bit pattern of the saturated result: 0x7FFF at the top of the
  range, 0x8000 at the bottom, otherwise the rounded value converted to
  uint16_t (so negative results come back in two's complement form).

  Saturation is decided on the 64 bit input.  Narrowing the shifted value to
  int first would let large magnitudes wrap around and slip past the range
  checks, and adding the rounding constant to a value near INT64_MAX would
  overflow.
*/
static uint16_t roundToInt16(int64_t f){
        const int64_t hi =   (int64_t)0x7FFF << 16;
        const int64_t lo = -((int64_t)0x8000 << 16);

        if(f >= hi) return 0x7FFF;
        if(f <= lo) return 0x8000;

        /* lo < f < hi here, so the rounded value lies in [-0x8000, 0x7FFF] */
        return (uint16_t)(int)((f + (1<<15))>>16);
}
741
742 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
743 {
744   union {
745         signed short tmp[8] __attribute__ ((aligned(16)));
746         vector signed short vec;
747         } buf;
748
749   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
750   buf.tmp[1] =  -256*brightness;                                        //oy
751   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
752   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
753   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
754   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
755
756
757   c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
758   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
759   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
760   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
761   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
762   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
763   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
764 #if 0
765 {
766 int i;
767 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
768 for (i=0; i<6;i++)
769   printf("%s %d ", v[i],buf.tmp[i] );
770   printf("\n");
771 }
772 #endif
773  return;
774 }
775
776
777 void
778 altivec_yuv2packedX (SwsContext *c,
779                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
780                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
781                        uint8_t *dest, int dstW, int dstY)
782 {
783   int i,j;
784   short *f;
785   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
786   vector signed short R0,G0,B0,R1,G1,B1;
787
788   vector unsigned char R,G,B,pels[3];
789   vector unsigned char *out,*nout;
790
791   vector signed short   RND = vec_splat_s16(1<<3);
792   vector unsigned short SCL = vec_splat_u16(4);
793   unsigned long scratch[16] __attribute__ ((aligned (16)));
794
795   vector signed short *YCoeffs, *CCoeffs;
796
797   YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
798   CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
799
800   out = (vector unsigned char *)dest;
801
802   for(i=0; i<dstW; i+=16){
803     Y0 = RND;
804     Y1 = RND;
805     /* extract 16 coeffs from lumSrc */
806     for(j=0; j<lumFilterSize; j++) {
807       X0 = vec_ld (0,  &lumSrc[j][i]);
808       X1 = vec_ld (16, &lumSrc[j][i]);
809       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
810       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
811     }
812
813     U = RND;
814     V = RND;
815     /* extract 8 coeffs from U,V */
816     for(j=0; j<chrFilterSize; j++) {
817       X  = vec_ld (0, &chrSrc[j][i/2]);
818       U  = vec_mradds (X, CCoeffs[j], U);
819       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
820       V  = vec_mradds (X, CCoeffs[j], V);
821     }
822
823     /* scale and clip signals */
824     Y0 = vec_sra (Y0, SCL);
825     Y1 = vec_sra (Y1, SCL);
826     U  = vec_sra (U,  SCL);
827     V  = vec_sra (V,  SCL);
828
829     Y0 = vec_clip_s16 (Y0);
830     Y1 = vec_clip_s16 (Y1);
831     U  = vec_clip_s16 (U);
832     V  = vec_clip_s16 (V);
833
834     /* now we have
835       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
836       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
837
838       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
839       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
840       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
841     */
842
843     U0 = vec_mergeh (U,U);
844     V0 = vec_mergeh (V,V);
845
846     U1 = vec_mergel (U,U);
847     V1 = vec_mergel (V,V);
848
849     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
850     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
851
852     R  = vec_packclp (R0,R1);
853     G  = vec_packclp (G0,G1);
854     B  = vec_packclp (B0,B1);
855
856     switch(c->dstFormat) {
857       case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
858       case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
859       case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
860       case IMGFMT_ARGB: out_argb (R,G,B,out); break;
861       case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
862       case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
863       default:
864         {
865           /* If this is reached, the caller should have called yuv2packedXinC
866              instead. */
867           static int printed_error_message;
868           if(!printed_error_message) {
869             MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
870                     sws_format_name(c->dstFormat));
871             printed_error_message=1;
872           }
873           return;
874         }
875     }
876   }
877
878   if (i < dstW) {
879     i -= 16;
880
881     Y0 = RND;
882     Y1 = RND;
883     /* extract 16 coeffs from lumSrc */
884     for(j=0; j<lumFilterSize; j++) {
885       X0 = vec_ld (0,  &lumSrc[j][i]);
886       X1 = vec_ld (16, &lumSrc[j][i]);
887       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
888       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
889     }
890
891     U = RND;
892     V = RND;
893     /* extract 8 coeffs from U,V */
894     for(j=0; j<chrFilterSize; j++) {
895       X  = vec_ld (0, &chrSrc[j][i/2]);
896       U  = vec_mradds (X, CCoeffs[j], U);
897       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
898       V  = vec_mradds (X, CCoeffs[j], V);
899     }
900
901     /* scale and clip signals */
902     Y0 = vec_sra (Y0, SCL);
903     Y1 = vec_sra (Y1, SCL);
904     U  = vec_sra (U,  SCL);
905     V  = vec_sra (V,  SCL);
906
907     Y0 = vec_clip_s16 (Y0);
908     Y1 = vec_clip_s16 (Y1);
909     U  = vec_clip_s16 (U);
910     V  = vec_clip_s16 (V);
911
912     /* now we have
913        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
914        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
915
916        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
917        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
918        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
919     */
920
921     U0 = vec_mergeh (U,U);
922     V0 = vec_mergeh (V,V);
923
924     U1 = vec_mergel (U,U);
925     V1 = vec_mergel (V,V);
926
927     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
928     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
929
930     R  = vec_packclp (R0,R1);
931     G  = vec_packclp (G0,G1);
932     B  = vec_packclp (B0,B1);
933
934     nout = (vector unsigned char *)scratch;
935     switch(c->dstFormat) {
936       case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
937       case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
938       case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
939       case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
940       case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
941       case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
942       default:
943         /* Unreachable, I think. */
944         MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
945                 sws_format_name(c->dstFormat));
946         return;
947     }
948
949     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
950   }
951
952 }