]> git.sesse.net Git - ffmpeg/blob - postproc/yuv2rgb_altivec.c
altivec_yuv2packedX() ignores the requested output format and unconditionally
[ffmpeg] / postproc / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to something in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issues, however i have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63
64 */
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <inttypes.h>
69 #include <assert.h>
70 #include "config.h"
71 #ifdef HAVE_MALLOC_H
72 #include <malloc.h>
73 #endif
74 #include "rgb2rgb.h"
75 #include "swscale.h"
76 #include "swscale_internal.h"
77 #include "mangle.h"
78 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff
79
80 #undef PROFILE_THE_BEAST
81 #undef INC_SCALING
82
/* Shorthand names for unsigned and signed 8-bit sample types. */
typedef unsigned char ubyte;
typedef signed char   sbyte;
85
86
87 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
88    homogeneous vector registers x0,x1,x2 are interleaved with the
89    following technique:
90
91       o0 = vec_mergeh (x0,x1);
92       o1 = vec_perm (o0, x2, perm_rgb_0);
93       o2 = vec_perm (o0, x2, perm_rgb_1);
94       o3 = vec_mergel (x0,x1);
95       o4 = vec_perm (o3,o2,perm_rgb_2);
96       o5 = vec_perm (o3,o2,perm_rgb_3);
97
98   perm_rgb_0:   o0(RG).h v1(B) --> o1*
99               0   1  2   3   4
100              rgbr|gbrg|brgb|rgbr
101              0010 0100 1001 0010
102              0102 3145 2673 894A
103
104   perm_rgb_1:   o0(RG).h v1(B) --> o2
105               0   1  2   3   4
106              gbrg|brgb|bbbb|bbbb
107              0100 1001 1111 1111
108              B5CD 6EF7 89AB CDEF
109
110   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
111               0   1  2   3   4
112              gbrg|brgb|rgbr|gbrg
113              1111 1111 0010 0100
114              89AB CDEF 0182 3945
115
116   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
117               0   1  2   3   4
118              brgb|rgbr|gbrg|brgb
119              1001 0010 0100 1001
120              a67b 89cA BdCD eEFf
121
122 */
123 static
124 const vector unsigned char
125   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
126                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
127   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
128                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
129   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
130                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
131   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
132                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
133
/* Interleave three planar 16-byte vectors into three packed output vectors
   (48 bytes, 16 pixels of 3 bytes each).

   Note the parameter order: the FIRST macro argument is named x2 and the
   THIRD is named x0.  Each output pixel is laid out as (x0, x1, x2), i.e.
   third argument first, first argument last.

   y0, y1, y2 are lvalues receiving the first, second and third 16 output
   bytes.  The m3_* temporaries are local to the macro; arguments must not
   use those names. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                    \
do {                                                     \
  typeof(x0) m3_head, m3_carry, m3_tail;                 \
  m3_head  = vec_mergeh (x0,x1);                         \
  y0       = vec_perm (m3_head, x2, perm_rgb_0);         \
  m3_carry = vec_perm (m3_head, x2, perm_rgb_1);         \
  m3_tail  = vec_mergel (x0,x1);                         \
  y1       = vec_perm (m3_tail, m3_carry, perm_rgb_2);   \
  y2       = vec_perm (m3_tail, m3_carry, perm_rgb_3);   \
} while(0)
144
/* Interleave three planar component vectors and store the resulting 48 bytes
   (16 pixels, 3 bytes each) through ptr, advancing ptr by three vectors.

   The arguments are forwarded to vec_merge3 in the same positional order
   (x0,x1,x2).  vec_merge3 emits its third argument first, so each stored
   pixel is laid out in memory as (x2, x1, x0).

   ptr must be a modifiable pointer to vector; vec_st requires (and enforces
   by truncation) 16-byte alignment.

   The expansion deliberately has no trailing semicolon, so the macro behaves
   like a single statement: the caller supplies the ';'. */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
}  while (0)
153
/* Interleave three planar component vectors and store the resulting 48 bytes
   (16 pixels, 3 bytes each) through ptr, advancing ptr by three vectors.

   The arguments are forwarded to vec_merge3 in REVERSED positional order
   (x2,x1,x0).  vec_merge3 emits its third argument first, so each stored
   pixel is laid out in memory as (x0, x1, x2).

   ptr must be a modifiable pointer to vector; vec_st requires (and enforces
   by truncation) 16-byte alignment.

   The expansion deliberately has no trailing semicolon, so the macro behaves
   like a single statement: the caller supplies the ';'. */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
}  while (0)
162
/* Interleave four planar component vectors into 16 four-byte pixels and
   store them (64 bytes) at ptr.

   T      vector type used for the temporaries and for the store casts
   x0..x3 component vectors; each pixel is stored in memory as the bytes
          (x0, x1, x2, x3), so x0 is the first (most significant on a
          big-endian target) byte of the 32-bit pixel
   ptr    modifiable destination pointer; the four stores go to byte offsets
          0, 16, 32 and 48, then ptr is advanced with "ptr += 4" (64 bytes
          when ptr is a pointer to vector, as in the callers in this file)

   The bytes are first merged pairwise (x0/x1 and x2/x3) and the pairs are
   then merged as 16-bit units, once for the high and once for the low half
   of the inputs.

   The expansion deliberately has no trailing semicolon, so the macro behaves
   like a single statement: the caller supplies the ';'. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  _0 = vec_mergeh (x0,x1);                                                             \
  _1 = vec_mergeh (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)ptr);                                                         \
  vec_st (_3, 1*16, (T *)ptr);                                                         \
  _0 = vec_mergel (x0,x1);                                                             \
  _1 = vec_mergel (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)ptr);                                                         \
  vec_st (_3, 3*16, (T *)ptr);                                                         \
  ptr += 4;                                                                            \
}  while (0)
184
185 /*
186
187   | 1     0       1.4021   | | Y |
188   | 1    -0.3441 -0.7142   |x| Cb|
189   | 1     1.7718  0        | | Cr|
190
191
192   Y:      [-128 127]
193   Cb/Cr : [-128 127]
194
195   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
196
197 */
198
199
200
201
/* Zero-extend 8-bit elements of x to 16 bits.

   vec_perm index 0x10 selects byte 0 of the second source, which is an
   all-zero vector; it becomes the high byte of every 16-bit result element.
   The low byte comes from x: bytes 0-7 for vec_unh, bytes 8-15 for vec_unl.
   The result is typed vector signed short.

   The whole expansion and the argument are parenthesized so the macros can
   be used safely inside larger expressions. */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                    0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
#define vec_unl(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                    0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
212
/* Clamp every element of x to the range [16, 235]. */
#define vec_clip(x) \
  (vec_max (vec_min ((x), (typeof(x))AVV(235)), (typeof(x))AVV(16)))

/* Clamp x and y to [16, 235] with vec_clip, then narrow both with vec_pack
   (modulo pack) into one vector of unsigned bytes: x supplies the first
   half of the result, y the second. */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))

/* Narrow two vectors of signed shorts to one vector of unsigned bytes,
   clamped to [0, 255]: negative values are raised to 0 with vec_max, the
   result is reinterpreted as unsigned short, and the saturating vec_packs
   limits the upper end.  x supplies the first half, y the second. */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
223
224 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
225
226
227 static inline void cvtyuvtoRGB (SwsContext *c,
228                            vector signed short Y, vector signed short U, vector signed short V,
229                            vector signed short *R, vector signed short *G, vector signed short *B)
230 {
231   vector signed   short vx,ux,uvx;
232
233   Y = vec_mradds (Y, c->CY, c->OY);
234   U  = vec_sub (U,(vector signed short)
235                         vec_splat((vector signed short)AVV(128),0));
236   V  = vec_sub (V,(vector signed short)
237                         vec_splat((vector signed short)AVV(128),0));
238
239   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
240   ux = vec_sl (U, c->CSHIFT);
241   *B = vec_mradds (ux, c->CBU, Y);
242
243   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
244   vx = vec_sl (V, c->CSHIFT);
245   *R = vec_mradds (vx, c->CRV, Y);
246
247   // uvx = ((CGU*u) + (CGV*v))>>15;
248   uvx = vec_mradds (U, c->CGU, Y);
249   *G = vec_mradds (V, c->CGV, uvx);
250 }
251
252
253 /*
254   ------------------------------------------------------------------------------
255   CS converters
256   ------------------------------------------------------------------------------
257 */
258
259
/*
  DEFCSP420_CVT(name, out_pixels) defines

      static int altivec_<name> (SwsContext *c,
                                 unsigned char **in, int *instrides,
                                 int srcSliceY, int srcSliceH,
                                 unsigned char **oplanes, int *outstrides)

  which converts a slice of planar 4:2:0 YUV to packed RGB.

    c           supplies srcW and the coefficient vectors CY, OY, CRV, CBU,
                CGU, CGV and CSHIFT
    in          in[0] = Y plane, in[1] = U plane, in[2] = V plane
    instrides   byte stride of each input plane
    srcSliceY   first output row; used only to locate the destination
    srcSliceH   number of rows to convert
    oplanes     oplanes[0] is the packed output image
    outstrides  outstrides[0] is the output byte stride
    out_pixels  macro (R,G,B,ptr) that stores 16 pixels and advances ptr

  Returns srcSliceH.

  Rows are processed in pairs (h/2 passes) and pixels in groups of 16
  (w/16 passes); a leftover row or leftover pixels are not converted.
  Each pass loads 16 luma samples from two adjacent rows and uses 8 U and
  8 V samples: every chroma value is duplicated horizontally
  (vec_mergeh/vec_mergel of a vector with itself) and shared by both rows.

  The second luma row is addressed with instrides[0], and the luma pointers
  advance by two full strides per row pair, so a Y plane whose stride is
  larger than the width is handled.  With instrides[0] == w the addresses
  are the same as in the earlier "in[0]+w" form.

  Alignment: luma is read with vec_ldl, which ignores the low four address
  bits, so in[0] and instrides[0] should be multiples of 16.  Chroma is read
  through the lvsl/vec_perm unaligned-load idiom, which touches the 16 bytes
  following the current block as well.

  NOTE(review): after the inner loop has advanced oute/outo across the row
  they are moved on by only one outstrides[0], which lands on the next row
  pair only if the bytes written per row equal outstrides[0] -- confirm the
  caller guarantees that.  The vec_dstst prefetch length is computed from
  w*3 for every output format.
*/
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the conversion coefficients */                        \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i   = in[0];                                                    \
  /* the odd row starts one full stride (not one width) after the even */  \
  ubyte *y2i   = in[0]+instrides[0];                                       \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  /* oute/outo: destinations of the even/odd row of the current pair */    \
  vector unsigned char *oute                                               \
    = (vector unsigned char *)                                             \
        (oplanes[0]+srcSliceY*outstrides[0]);                              \
  vector unsigned char *outo                                               \
    = (vector unsigned char *)                                             \
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                \
                                                                           \
                                                                           \
  /* per-row-pair corrections applied after the inner loop */              \
  instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y1i/y2i by w */  \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
                                                                           \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    /* cache hints for the two output rows about to be written */          \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* unaligned chroma loads */                                         \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* remove the 128 chroma bias */                                     \
      u  = (vector signed char)                                            \
                vec_sub (u,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)                                            \
                vec_sub (v,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
                                                                           \
      /* only the first 8 chroma samples are used per pass */              \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
                                                                           \
        Y0 = vec_unh (y0);                                                 \
        Y1 = vec_unl (y0);                                                 \
        Y2 = vec_unh (y1);                                                 \
        Y3 = vec_unl (y1);                                                 \
                                                                           \
        Y0 = vec_mradds (Y0, lCY, lOY);                                    \
        Y1 = vec_mradds (Y1, lCY, lOY);                                    \
        Y2 = vec_mradds (Y2, lCY, lOY);                                    \
        Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                           \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
        ux = vec_sl (U, lCSHIFT);                                          \
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));           \
        ux0  = vec_mergeh (ux,ux);                                         \
        ux1  = vec_mergel (ux,ux);                                         \
                                                                           \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
        vx = vec_sl (V, lCSHIFT);                                          \
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));           \
        vx0  = vec_mergeh (vx,vx);                                         \
        vx1  = vec_mergel (vx,vx);                                         \
                                                                           \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));           \
        uvx = vec_mradds (V, lCGV, uvx);                                   \
        uvx0 = vec_mergeh (uvx,uvx);                                       \
        uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                           \
        /* even row */                                                     \
        R0 = vec_add (Y0,vx0);                                             \
        G0 = vec_add (Y0,uvx0);                                            \
        B0 = vec_add (Y0,ux0);                                             \
        R1 = vec_add (Y1,vx1);                                             \
        G1 = vec_add (Y1,uvx1);                                            \
        B1 = vec_add (Y1,ux1);                                             \
                                                                           \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
        out_pixels(R,G,B,oute);                                            \
                                                                           \
        /* odd row, reusing the same chroma terms */                       \
        R0 = vec_add (Y2,vx0);                                             \
        G0 = vec_add (Y2,uvx0);                                            \
        B0 = vec_add (Y2,ux0);                                             \
        R1 = vec_add (Y3,vx1);                                             \
        G1 = vec_add (Y3,uvx1);                                            \
        B1 = vec_add (Y3,ux1);                                             \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
                                                                           \
        out_pixels(R,G,B,outo);                                            \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    outo += (outstrides[0])>>4;                                            \
    oute += (outstrides[0])>>4;                                            \
                                                                           \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
413
414
/* Output adaptors with the (R,G,B,ptr) shape that DEFCSP420_CVT expects for
   its out_pixels argument.  (a,b,c) are the three colour vectors in the
   order the caller passes them.

   The 32-bit variants forward to vec_mstrgb32, whose four component
   arguments become the four bytes of each pixel in memory order; the unused
   byte is filled from an all-zero vector.

   NOTE(review): out_rgb24 and out_bgr24 currently produce the same bytes.
   out_rgb24(R,G,B) -> vec_mstrgb24(R,G,B) -> vec_merge3(R,G,B,...), while
   out_bgr24(R,G,B) -> vec_mstbgr24(B,G,R) -> vec_merge3(R,G,B,...) because
   vec_mstbgr24 reverses its arguments a second time.  One of the two is
   presumably wrong; confirm the intended byte order before changing it. */
#define out_abgr(a,b,c,ptr)  \
  vec_mstrgb32 (typeof (a), ((typeof (a))AVV(0)), c, b, a, ptr)
#define out_bgra(a,b,c,ptr)  \
  vec_mstrgb32 (typeof (a), c, b, a, ((typeof (a))AVV(0)), ptr)
#define out_rgba(a,b,c,ptr)  \
  vec_mstrgb32 (typeof (a), a, b, c, ((typeof (a))AVV(0)), ptr)
#define out_argb(a,b,c,ptr)  \
  vec_mstrgb32 (typeof (a), ((typeof (a))AVV(0)), a, b, c, ptr)
#define out_rgb24(a,b,c,ptr) \
  vec_mstrgb24 (a, b, c, ptr)
#define out_bgr24(a,b,c,ptr) \
  vec_mstbgr24 (c, b, a, ptr)
421
422 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
423 #if 1
424 DEFCSP420_CVT (yuv2_bgra32, out_argb)
425 #else
426 static int altivec_yuv2_bgra32 (SwsContext *c,                                  
427                                 unsigned char **in, int *instrides,        
428                                 int srcSliceY,  int srcSliceH,             
429                                 unsigned char **oplanes, int *outstrides)  
430 {                                                                          
431   int w = c->srcW;                                                         
432   int h = srcSliceH;                                                       
433   int i,j;                                                                 
434   int instrides_scl[3];                                                    
435   vector unsigned char y0,y1;                                              
436                                                                            
437   vector signed char  u,v;                                                 
438                                                                            
439   vector signed short Y0,Y1,Y2,Y3;                                         
440   vector signed short U,V;                                                 
441   vector signed short vx,ux,uvx;                                           
442   vector signed short vx0,ux0,uvx0;                                        
443   vector signed short vx1,ux1,uvx1;                                        
444   vector signed short R0,G0,B0;                                            
445   vector signed short R1,G1,B1;                                            
446   vector unsigned char R,G,B;                                              
447                                                                            
448   vector unsigned char *uivP, *vivP;                                       
449   vector unsigned char align_perm;                                         
450                                                                            
451   vector signed short                                                      
452     lCY  = c->CY,                                                          
453     lOY  = c->OY,                                                          
454     lCRV = c->CRV,                                                         
455     lCBU = c->CBU,                                                         
456     lCGU = c->CGU,                                                         
457     lCGV = c->CGV;                                                         
458                                                                            
459   vector unsigned short lCSHIFT = c->CSHIFT;                               
460                                                                            
461   ubyte *y1i   = in[0];                                                    
462   ubyte *y2i   = in[0]+w;                                                  
463   ubyte *ui    = in[1];                                                    
464   ubyte *vi    = in[2];                                                    
465                                                                            
466   vector unsigned char *oute                                               
467     = (vector unsigned char *)                                             
468         (oplanes[0]+srcSliceY*outstrides[0]);                              
469   vector unsigned char *outo                                               
470     = (vector unsigned char *)                                             
471         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                
472                                                                            
473                                                                            
474   instrides_scl[0] = instrides[0];                                         
475   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     
476   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     
477                                                                            
478                                                                            
479   for (i=0;i<h/2;i++) {                                                    
480     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 
481     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 
482                                                                            
483     for (j=0;j<w/16;j++) {                                                 
484                                                                            
485       y0 = vec_ldl (0,y1i);                                                
486       y1 = vec_ldl (0,y2i);                                                
487       uivP = (vector unsigned char *)ui;                                   
488       vivP = (vector unsigned char *)vi;                                   
489                                                                            
490       align_perm = vec_lvsl (0, ui);                                       
491       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     
492                                                                            
493       align_perm = vec_lvsl (0, vi);                                       
494       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
495       u  = (vector signed char)
496                 vec_sub (u,(vector signed char)
497                                 vec_splat((vector signed char)AVV(128),0));
498       
499       v  = (vector signed char)
500                 vec_sub (v, (vector signed char)
501                                 vec_splat((vector signed char)AVV(128),0));
502       
503       U  = vec_unpackh (u);                                                
504       V  = vec_unpackh (v);                                                
505                                                                            
506                                                                            
507         Y0 = vec_unh (y0);                                                 
508         Y1 = vec_unl (y0);                                                 
509         Y2 = vec_unh (y1);                                                 
510         Y3 = vec_unl (y1);                                                 
511                                                                            
512         Y0 = vec_mradds (Y0, lCY, lOY);                                    
513         Y1 = vec_mradds (Y1, lCY, lOY);                                    
514         Y2 = vec_mradds (Y2, lCY, lOY);                                    
515         Y3 = vec_mradds (Y3, lCY, lOY);                                    
516                                                                            
517         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         
518         ux = vec_sl (U, lCSHIFT);                                          
519         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
520         ux0  = vec_mergeh (ux,ux);                                         
521         ux1  = vec_mergel (ux,ux);                                         
522                                                                            
523         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         
524         vx = vec_sl (V, lCSHIFT);                                          
525         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
526         vx0  = vec_mergeh (vx,vx);
527         vx1  = vec_mergel (vx,vx);
528         /* uvx = ((CGU*u) + (CGV*v))>>15 */
529         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
530         uvx = vec_mradds (V, lCGV, uvx);
531         uvx0 = vec_mergeh (uvx,uvx);
532         uvx1 = vec_mergel (uvx,uvx);
533         R0 = vec_add (Y0,vx0);
534         G0 = vec_add (Y0,uvx0);
535         B0 = vec_add (Y0,ux0);
536         R1 = vec_add (Y1,vx1);
537         G1 = vec_add (Y1,uvx1);
538         B1 = vec_add (Y1,ux1);
539         R  = vec_packclp (R0,R1);
540         G  = vec_packclp (G0,G1);
541         B  = vec_packclp (B0,B1);
542         
543         out_argb(R,G,B,oute);
544         R0 = vec_add (Y2,vx0);
545         G0 = vec_add (Y2,uvx0);
546         B0 = vec_add (Y2,ux0);
547         R1 = vec_add (Y3,vx1);
548         G1 = vec_add (Y3,uvx1);
549         B1 = vec_add (Y3,ux1);
550         R  = vec_packclp (R0,R1);
551         G  = vec_packclp (G0,G1);
552         B  = vec_packclp (B0,B1);
553         
554         out_argb(R,G,B,outo);
555         y1i  += 16;                                                        
556         y2i  += 16;                                                        
557         ui   += 8;
558         vi   += 8;                                                         
559                                                                            
560     }                                                                      
561                                                                            
562     outo += (outstrides[0])>>4;                                            
563     oute += (outstrides[0])>>4;                                            
564                                                                            
565     ui    += instrides_scl[1];                                             
566     vi    += instrides_scl[2];                                             
567     y1i   += instrides_scl[0];                                             
568     y2i   += instrides_scl[0];                                             
569   }                                                                        
570   return srcSliceH;                                                        
571 }
572
573 #endif
574
575
576 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
577 DEFCSP420_CVT (yuv2_argb32, out_argb)
578 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
579 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
580
581
582 // uyvy|uyvy|uyvy|uyvy
583 // 0123 4567 89ab cdef
584 static
585 const vector unsigned char
586   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
587                                    0x10,0x04,0x10,0x04,
588                                    0x10,0x08,0x10,0x08,
589                                    0x10,0x0c,0x10,0x0c),
590   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
591                                    0x10,0x06,0x10,0x06,
592                                    0x10,0x0A,0x10,0x0A,
593                                    0x10,0x0E,0x10,0x0E),
594   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
595                                    0x10,0x05,0x10,0x07,
596                                    0x10,0x09,0x10,0x0B,
597                                    0x10,0x0D,0x10,0x0F);
598
599 /*
600   this is so I can play live CCIR raw video
601 */
602 static int altivec_uyvy_rgb32 (SwsContext *c,
603                                unsigned char **in, int *instrides,
604                                int srcSliceY,   int srcSliceH,
605                                unsigned char **oplanes, int *outstrides)
606 {
607   int w = c->srcW;
608   int h = srcSliceH;
609   int i,j;
610   vector unsigned char uyvy;
611   vector signed   short Y,U,V;
612   vector signed   short vx,ux,uvx;
613   vector signed   short R0,G0,B0,R1,G1,B1;
614   vector unsigned char  R,G,B;
615   vector unsigned char *out;
616   ubyte *img;
617
618   img = in[0];
619   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
620
621   for (i=0;i<h;i++) {
622     for (j=0;j<w/16;j++) {
623       uyvy = vec_ld (0, img);
624       U = (vector signed short)
625         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
626
627       V = (vector signed short)
628         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
629
630       Y = (vector signed short)
631         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
632
633       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
634
635       uyvy = vec_ld (16, img);
636       U = (vector signed short)
637         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
638
639       V = (vector signed short)
640         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
641
642       Y = (vector signed short)
643         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
644
645       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
646
647       R  = vec_packclp (R0,R1);
648       G  = vec_packclp (G0,G1);
649       B  = vec_packclp (B0,B1);
650
651       //      vec_mstbgr24 (R,G,B, out);
652       out_rgba (R,G,B,out);
653
654       img += 32;
655     }
656   }
657   return srcSliceH;
658 }
659
660
661
662 /* Ok currently the acceleration routine only supports
663    inputs of widths a multiple of 16
664    and heights a multiple 2
665
666    So we just fall back to the C codes for this.
667 */
668 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
669 {
670   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
671     return NULL;
672
673   /*
674     and this seems not to matter too much I tried a bunch of 
675     videos with abnormal widths and mplayer crashes else where.
676     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
677     boom with X11 bad match.
678     
679   */
680   if ((c->srcW & 0xf) != 0)    return NULL;
681
682   switch (c->srcFormat) {
683   case IMGFMT_YVU9:
684   case IMGFMT_IF09:
685   case IMGFMT_YV12:
686   case IMGFMT_I420:
687   case IMGFMT_IYUV:
688   case IMGFMT_CLPL:
689   case IMGFMT_Y800:
690   case IMGFMT_Y8:
691   case IMGFMT_NV12:
692   case IMGFMT_NV21:
693     if ((c->srcH & 0x1) != 0)
694       return NULL;
695
696     switch(c->dstFormat){
697     case IMGFMT_RGB24:
698       MSG_WARN("ALTIVEC: Color Space RGB24\n");
699       return altivec_yuv2_rgb24;
700     case IMGFMT_BGR24:
701       MSG_WARN("ALTIVEC: Color Space BGR24\n");
702       return altivec_yuv2_bgr24;
703     case IMGFMT_RGB32:
704       MSG_WARN("ALTIVEC: Color Space ARGB32\n");
705       return altivec_yuv2_argb32;
706     case IMGFMT_BGR32:
707       MSG_WARN("ALTIVEC: Color Space BGRA32\n");
708       //      return profile_altivec_bgra32;
709
710       return altivec_yuv2_bgra32;
711     default: return NULL;
712     }
713     break;
714
715   case IMGFMT_UYVY:
716     switch(c->dstFormat){
717     case IMGFMT_RGB32:
718       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
719       return altivec_uyvy_rgb32;
720     default: return NULL;
721     }
722     break;
723
724   }
725   return NULL;
726 }
727
/*
  Convert a 16.16 fixed-point value to the nearest integer and saturate it
  to 16 bits.

  f       value with 16 fractional bits
  return  the rounded value as a 16-bit pattern; results below -0x7FFF
          give 0x8000 and results above 0x7FFF give 0x7FFF

  The rounded value is kept in 64 bits until after the range check, so
  large magnitudes saturate instead of wrapping in a narrower int.
*/
static uint16_t roundToInt16(int64_t f){
        /* Same value as (f + (1<<15)) >> 16, but without an addition that
           could overflow for f close to INT64_MAX: bit 15 is the rounding
           bit that would carry into the integer part. */
        int64_t r = (f >> 16) + ((f >> 15) & 1);

        if (r < -0x7FFF) return 0x8000;
        if (r >  0x7FFF) return 0x7FFF;
        return (uint16_t)r;
}
734
735 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
736 {
737   union {
738         signed short tmp[8] __attribute__ ((aligned(16)));
739         vector signed short vec;
740         } buf;
741
742   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
743   buf.tmp[1] =  -256*brightness;                                        //oy
744   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
745   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
746   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
747   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
748
749
750   c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);
751   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
752   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
753   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
754   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
755   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
756   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
757 #if 0
758 {
759 int i;
760 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
761 for (i=0; i<6;i++)
762   printf("%s %d ", v[i],buf.tmp[i] );
763   printf("\n");
764 }
765 #endif
766  return;
767 }
768
769
770 void
771 altivec_yuv2packedX (SwsContext *c,
772                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
773                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
774                        uint8_t *dest, int dstW, int dstY)
775 {
776   int i,j;
777   short tmp __attribute__((aligned (16)));
778   int16_t *p;
779   short *f;
780   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
781   vector signed short R0,G0,B0,R1,G1,B1;
782
783   vector unsigned char R,G,B,pels[3];
784   vector unsigned char *out,*nout;
785
786   vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0);
787   vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
788   unsigned long scratch[16] __attribute__ ((aligned (16)));
789
790   vector signed short *vYCoeffsBank, *vCCoeffsBank;
791
792   vector signed short *YCoeffs, *CCoeffs;
793
794   vYCoeffsBank = memalign (16, sizeof (vector signed short)*lumFilterSize*c->dstH);
795   vCCoeffsBank = memalign (16, sizeof (vector signed short)*chrFilterSize*c->dstH);
796
797   for (i=0;i<lumFilterSize*c->dstH;i++) {
798     tmp = c->vLumFilter[i];
799     p = &vYCoeffsBank[i];
800     for (j=0;j<8;j++)
801       p[j] = tmp;
802   }
803
804   for (i=0;i<chrFilterSize*c->dstH;i++) {
805     tmp = c->vChrFilter[i];
806     p = &vCCoeffsBank[i];
807     for (j=0;j<8;j++)
808       p[j] = tmp;
809   }
810
811   YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
812   CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
813
814   out = (vector unsigned char *)dest;
815
816   for(i=0; i<dstW; i+=16){
817     Y0 = RND;
818     Y1 = RND;
819     /* extract 16 coeffs from lumSrc */
820     for(j=0; j<lumFilterSize; j++) {
821       X0 = vec_ld (0,  &lumSrc[j][i]);
822       X1 = vec_ld (16, &lumSrc[j][i]);
823       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
824       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
825     }
826
827     U = RND;
828     V = RND;
829     /* extract 8 coeffs from U,V */
830     for(j=0; j<chrFilterSize; j++) {
831       X  = vec_ld (0, &chrSrc[j][i/2]);
832       U  = vec_mradds (X, CCoeffs[j], U);
833       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
834       V  = vec_mradds (X, CCoeffs[j], V);
835     }
836
837     /* scale and clip signals */
838     Y0 = vec_sra (Y0, SCL);
839     Y1 = vec_sra (Y1, SCL);
840     U  = vec_sra (U,  SCL);
841     V  = vec_sra (V,  SCL);
842
843     Y0 = vec_clip (Y0);
844     Y1 = vec_clip (Y1);
845     U  = vec_clip (U);
846     V  = vec_clip (V);
847
848     /* now we have
849       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
850       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
851
852       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
853       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
854       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
855     */
856
857     U0 = vec_mergeh (U,U);
858     V0 = vec_mergeh (V,V);
859
860     U1 = vec_mergel (U,U);
861     V1 = vec_mergel (V,V);
862
863     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
864     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
865
866     R  = vec_packclp (R0,R1);
867     G  = vec_packclp (G0,G1);
868     B  = vec_packclp (B0,B1);
869
870     switch(c->dstFormat) {
871       case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
872       case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
873       case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
874       case IMGFMT_ARGB: out_argb (R,G,B,out); break;
875       case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
876       case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
877       default:
878         {
879           /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
880           static int printed_error_message;
881           if(!printed_error_message) {
882             MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
883                     vo_format_name(c->dstFormat));
884             printed_error_message=1;
885           }
886           return;
887         }
888     }
889   }
890
891   if (i < dstW) {
892     i -= 16;
893
894     Y0 = RND;
895     Y1 = RND;
896     /* extract 16 coeffs from lumSrc */
897     for(j=0; j<lumFilterSize; j++) {
898       X0 = vec_ld (0,  &lumSrc[j][i]);
899       X1 = vec_ld (16, &lumSrc[j][i]);
900       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
902     }
903
904     U = RND;
905     V = RND;
906     /* extract 8 coeffs from U,V */
907     for(j=0; j<chrFilterSize; j++) {
908       X  = vec_ld (0, &chrSrc[j][i/2]);
909       U  = vec_mradds (X, CCoeffs[j], U);
910       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
911       V  = vec_mradds (X, CCoeffs[j], V);
912     }
913
914     /* scale and clip signals */
915     Y0 = vec_sra (Y0, SCL);
916     Y1 = vec_sra (Y1, SCL);
917     U  = vec_sra (U,  SCL);
918     V  = vec_sra (V,  SCL);
919
920     Y0 = vec_clip (Y0);
921     Y1 = vec_clip (Y1);
922     U  = vec_clip (U);
923     V  = vec_clip (V);
924
925     /* now we have
926        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
927        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
928
929        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
930        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
931        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
932     */
933
934     U0 = vec_mergeh (U,U);
935     V0 = vec_mergeh (V,V);
936
937     U1 = vec_mergel (U,U);
938     V1 = vec_mergel (V,V);
939
940     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
942
943     R  = vec_packclp (R0,R1);
944     G  = vec_packclp (G0,G1);
945     B  = vec_packclp (B0,B1);
946
947     nout = (vector unsigned char *)scratch;
948     switch(c->dstFormat) {
949       case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
950       case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
951       case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
952       case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
953       case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
954       case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
955       default:
956         /* Unreachable, I think. */
957         MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
958                 vo_format_name(c->dstFormat));
959         return;
960     }
961
962     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
963   }
964
965   if (vYCoeffsBank) free (vYCoeffsBank);
966   if (vCCoeffsBank) free (vCCoeffsBank);
967
968 }