]> git.sesse.net Git - ffmpeg/blob - postproc/yuv2rgb_altivec.c
100l, forgot to change an ifdef on last commit
[ffmpeg] / postproc / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issue; however, I have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63
64 */
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <inttypes.h>
69 #include <assert.h>
70 #include "config.h"
71 #include "rgb2rgb.h"
72 #include "swscale.h"
73 #include "swscale_internal.h"
74 #include "../mangle.h"
75 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
76
77 #undef PROFILE_THE_BEAST
78 #undef INC_SCALING
79
/* Short aliases for the 8-bit sample types used throughout this file. */
typedef signed char   sbyte;
typedef unsigned char ubyte;
82
83
84 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
85    homogeneous vector registers x0,x1,x2 are interleaved with the
86    following technique:
87
88       o0 = vec_mergeh (x0,x1);
89       o1 = vec_perm (o0, x2, perm_rgb_0);
90       o2 = vec_perm (o0, x2, perm_rgb_1);
91       o3 = vec_mergel (x0,x1);
92       o4 = vec_perm (o3,o2,perm_rgb_2);
93       o5 = vec_perm (o3,o2,perm_rgb_3);
94
95   perm_rgb_0:   o0(RG).h v1(B) --> o1*
96               0   1  2   3   4
97              rgbr|gbrg|brgb|rgbr
98              0010 0100 1001 0010
99              0102 3145 2673 894A
100
101   perm_rgb_1:   o0(RG).h v1(B) --> o2
102               0   1  2   3   4
103              gbrg|brgb|bbbb|bbbb
104              0100 1001 1111 1111
105              B5CD 6EF7 89AB CDEF
106
107   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
108               0   1  2   3   4
109              gbrg|brgb|rgbr|gbrg
110              1111 1111 0010 0100
111              89AB CDEF 0182 3945
112
113   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
114               0   1  2   3   4
115              brgb|rgbr|gbrg|brgb
116              1001 0010 0100 1001
117              a67b 89cA BdCD eEFf
118
119 */
120 static
121 const vector unsigned char
122   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
130
/* Interleave three 16-byte channel vectors into three packed vectors.
   Mind the reversed parameter list: the THIRD argument (x0) supplies the
   first byte of every output triplet, the second argument (x1) the middle
   byte and the FIRST argument (x2) the last byte.
   y0, y1 and y2 receive bytes 0-15, 16-31 and 32-47 of the interleaved
   stream.  All six arguments are expanded textually, so pass plain
   variables. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                     \
do {                                                      \
  typeof(x0) m3_pair_hi, m3_carry, m3_pair_lo;            \
  /* x0/x1 pairs for elements 0-7 */                      \
  m3_pair_hi = vec_mergeh (x0,x1);                        \
  y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);     \
  /* leftovers of the high half + x2 bytes 8-15 */        \
  m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);     \
  /* x0/x1 pairs for elements 8-15 */                     \
  m3_pair_lo = vec_mergel (x0,x1);                        \
  y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2); \
  y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3); \
} while(0)
141
/* Interleave three channel vectors with vec_merge3 (arguments forwarded in
   the order given) and store the resulting 48 bytes through ptr.
   ptr is post-incremented once per store, i.e. three times in total, so it
   must be a plain pointer lvalue.
   The expansion intentionally carries no trailing semicolon: the caller's
   own ';' completes the statement, which keeps the macro usable as the
   body of an unbraced if/else. */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
}  while (0)
150
/* Same as vec_mstrgb24 but hands the three channel vectors to vec_merge3
   in reversed order (x2,x1,x0), then stores the resulting 48 bytes through
   ptr.  ptr is post-incremented once per store (three times), so it must
   be a plain pointer lvalue.
   The expansion intentionally carries no trailing semicolon: the caller's
   own ';' completes the statement, which keeps the macro usable as the
   body of an unbraced if/else. */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
}  while (0)
159
/* Pack four 16-byte channel vectors into 16 four-byte pixels and store them.

   T      vector type of x0..x3 (also used for the store pointer cast)
   x0..x3 channel vectors; within each output pixel the bytes appear in the
          order x0, x1, x2, x3 (e.g. "rgb0": R first, zero last)
   ptr    output pointer lvalue; 64 bytes are stored at offsets 0, 16, 32
          and 48 from it, then it is advanced by 4 elements

   Byte pairs (x0,x1) and (x2,x3) are merged first, then the pairs are
   merged again as 16-bit units, high halves before low halves.
   The expansion intentionally carries no trailing semicolon: the caller's
   own ';' completes the statement, which keeps the macro usable as the
   body of an unbraced if/else. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  /* pixels 0-7 */                                                                     \
  _0 = vec_mergeh (x0,x1);                                                             \
  _1 = vec_mergeh (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)ptr);                                                         \
  vec_st (_3, 1*16, (T *)ptr);                                                         \
  /* pixels 8-15 */                                                                    \
  _0 = vec_mergel (x0,x1);                                                             \
  _1 = vec_mergel (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)ptr);                                                         \
  vec_st (_3, 3*16, (T *)ptr);                                                         \
  ptr += 4;                                                                            \
}  while (0)
181
182 /*
183
184   | 1     0       1.4021   | | Y |
185   | 1    -0.3441 -0.7142   |x| Cb|
186   | 1     1.7718  0        | | Cr|
187
188
189   Y:      [-128 127]
190   Cb/Cr : [-128 127]
191
192   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
193
194 */
195
196
197
198
/* Widen bytes 0-7 of x to eight 16-bit lanes.  Each lane is built from
   byte 0x10 (the first byte of the AVV(0) operand, presumably zero) followed
   by one byte of x, i.e. a zero-extension on big-endian AltiVec. */
#define vec_unh(x)                                                             \
  ((vector signed short)                                                       \
     vec_perm ((x), (typeof(x))AVV(0),                                         \
               (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/* Widen bytes 8-15 of x to eight 16-bit lanes.  Each lane is built from
   byte 0x10 (the first byte of the AVV(0) operand, presumably zero) followed
   by one byte of x, i.e. a zero-extension on big-endian AltiVec. */
#define vec_unl(x)                                                             \
  ((vector signed short)                                                       \
     vec_perm ((x), (typeof(x))AVV(0),                                         \
               (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
209
/* Clamp every element of x to the range [16, 235]. */
#define vec_clip(x) \
  (vec_max (vec_min ((x), (typeof(x))AVV(235)), (typeof(x))AVV(16)))

/* Clamp x and y with vec_clip, then narrow both into one vector of
   unsigned bytes (x supplies the first half, y the second). */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))
215
/* Narrow two vectors of signed 16-bit values into one vector of unsigned
   bytes.  Negative inputs are raised to 0 with vec_max first; the upper
   bound is left to the saturating pack (vec_packs on unsigned shorts), so
   no explicit vec_min is needed. */
#define vec_packclp(x,y)                                                   \
  ((vector unsigned char)vec_packs                                         \
      ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)),  \
       (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
220
221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
222
223
224 static inline void cvtyuvtoRGB (SwsContext *c,
225                            vector signed short Y, vector signed short U, vector signed short V,
226                            vector signed short *R, vector signed short *G, vector signed short *B)
227 {
228   vector signed   short vx,ux,uvx;
229
230   Y = vec_mradds (Y, c->CY, c->OY);
231   U  = vec_sub (U,(vector signed short)
232                         vec_splat((vector signed short)AVV(128),0));
233   V  = vec_sub (V,(vector signed short)
234                         vec_splat((vector signed short)AVV(128),0));
235
236   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
237   ux = vec_sl (U, c->CSHIFT);
238   *B = vec_mradds (ux, c->CBU, Y);
239
240   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
241   vx = vec_sl (V, c->CSHIFT);
242   *R = vec_mradds (vx, c->CRV, Y);
243
244   // uvx = ((CGU*u) + (CGV*v))>>15;
245   uvx = vec_mradds (U, c->CGU, Y);
246   *G = vec_mradds (V, c->CGV, uvx);
247 }
248
249
250 /*
251   ------------------------------------------------------------------------------
252   CS converters
253   ------------------------------------------------------------------------------
254 */
255
256
/*
  DEFCSP420_CVT(name, out_pixels) defines

      static int altivec_<name> (SwsContext *c,
                                 unsigned char **in, int *instrides,
                                 int srcSliceY, int srcSliceH,
                                 unsigned char **oplanes, int *outstrides)

  which converts a planar 4:2:0 slice to packed output.

    c           supplies the width (srcW) and the conversion coefficients
    in          in[0] = luma, in[1] = U, in[2] = V; read from the pointers
                as given (srcSliceY is NOT applied to the input)
    instrides   byte strides of the three input planes
    srcSliceY   first output row; output starts at
                oplanes[0] + srcSliceY*outstrides[0]
    srcSliceH   number of rows to convert
    oplanes     only oplanes[0] is written
    outstrides  only outstrides[0] is used
    out_pixels  macro (R,G,B,ptr) that stores 16 pixels and advances ptr

  Returns srcSliceH.

  Rows are processed in pairs (srcSliceH/2 iterations) that share one chroma
  row, 16 pixels per inner step (srcW/16 steps); a trailing odd row or a
  trailing partial block of fewer than 16 pixels is not converted.  Luma is
  fetched with vec_ldl, so each luma row must be 16-byte aligned; chroma is
  fetched through an lvsl/perm pair and may be unaligned.

  Row stepping honours the plane strides: the second luma row and the odd
  output row are located one stride below the first, and all pointers move
  by two luma/output strides (one chroma stride) per row pair.  Earlier
  revisions assumed instrides[0] == srcW and outstrides[0] == bytes per
  output row, which is only true for tightly packed buffers.
*/
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int nblk = w/16;              /* 16-pixel blocks converted per row */    \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the context coefficients */                           \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i   = in[0];                                                    \
  /* the odd luma row lies one full stride (not w bytes) below */          \
  ubyte *y2i   = in[0]+instrides[0];                                       \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  /* start of the even output row of the current pair */                   \
  ubyte *orow  = oplanes[0]+srcSliceY*outstrides[0];                       \
  vector unsigned char *oute;                                              \
  vector unsigned char *outo;                                              \
                                                                           \
  /* per-row-pair pointer corrections: the inner loop has already moved */ \
  /* the luma pointers by 16*nblk and the chroma pointers by 8*nblk     */ \
  instrides_scl[0] = instrides[0]*2-nblk*16;                               \
  instrides_scl[1] = instrides[1]-nblk*8;                                  \
  instrides_scl[2] = instrides[2]-nblk*8;                                  \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    oute = (vector unsigned char *)orow;                                   \
    outo = (vector unsigned char *)(orow+outstrides[0]);                   \
                                                                           \
    /* data-stream hints for the two rows about to be written */           \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<nblk;j++) {                                                 \
                                                                           \
      /* 16 luma samples from each row of the pair */                      \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* unaligned chroma loads; only the first 8 bytes are used below */  \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* remove the chroma bias before sign-extending */                   \
      u  = (vector signed char)                                            \
                vec_sub (u,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)                                            \
                vec_sub (v,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
                                                                           \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
                                                                           \
        Y0 = vec_unh (y0);                                                 \
        Y1 = vec_unl (y0);                                                 \
        Y2 = vec_unh (y1);                                                 \
        Y3 = vec_unl (y1);                                                 \
                                                                           \
        Y0 = vec_mradds (Y0, lCY, lOY);                                    \
        Y1 = vec_mradds (Y1, lCY, lOY);                                    \
        Y2 = vec_mradds (Y2, lCY, lOY);                                    \
        Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                           \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
        ux = vec_sl (U, lCSHIFT);                                          \
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));                   \
        /* duplicate each chroma term for its two horizontal pixels */     \
        ux0  = vec_mergeh (ux,ux);                                         \
        ux1  = vec_mergel (ux,ux);                                         \
                                                                           \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
        vx = vec_sl (V, lCSHIFT);                                          \
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));                   \
        vx0  = vec_mergeh (vx,vx);                                         \
        vx1  = vec_mergel (vx,vx);                                         \
                                                                           \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));                   \
        uvx = vec_mradds (V, lCGV, uvx);                                   \
        uvx0 = vec_mergeh (uvx,uvx);                                       \
        uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                           \
        /* even row */                                                     \
        R0 = vec_add (Y0,vx0);                                             \
        G0 = vec_add (Y0,uvx0);                                            \
        B0 = vec_add (Y0,ux0);                                             \
        R1 = vec_add (Y1,vx1);                                             \
        G1 = vec_add (Y1,uvx1);                                            \
        B1 = vec_add (Y1,ux1);                                             \
                                                                           \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
        out_pixels(R,G,B,oute);                                            \
                                                                           \
        /* odd row reuses the same chroma terms */                         \
        R0 = vec_add (Y2,vx0);                                             \
        G0 = vec_add (Y2,uvx0);                                            \
        B0 = vec_add (Y2,ux0);                                             \
        R1 = vec_add (Y3,vx1);                                             \
        G1 = vec_add (Y3,uvx1);                                            \
        B1 = vec_add (Y3,ux1);                                             \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
                                                                           \
        out_pixels(R,G,B,outo);                                            \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    /* next pair of output rows, independent of bytes written per row */   \
    orow  += 2*outstrides[0];                                              \
                                                                           \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
410
411
/* Pixel writers passed to DEFCSP420_CVT as its out_pixels argument.  Each
   takes three packed channel vectors (the converter passes R, G, B in that
   order) plus an output pointer, and forwards them to one of the vec_mst*
   store macros in the channel order its name spells out.  The 32-bit
   variants put AVV(0) (presumably an all-zero vector) in the fourth byte
   of every pixel.

   NOTE(review): out_rgb24 and out_bgr24 both end up invoking
   vec_merge3 (r,g,b,...): the argument swap in out_bgr24 cancels the swap
   inside vec_mstbgr24, so the two currently emit the same byte order.
   Confirm which one is intended before relying on the 24-bit writers. */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(b,g,r,ptr)
418
419 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
420 #if 1
421 DEFCSP420_CVT (yuv2_bgra32, out_argb)
422 #else
423 static int altivec_yuv2_bgra32 (SwsContext *c,                                  
424                                 unsigned char **in, int *instrides,        
425                                 int srcSliceY,  int srcSliceH,             
426                                 unsigned char **oplanes, int *outstrides)  
427 {                                                                          
428   int w = c->srcW;                                                         
429   int h = srcSliceH;                                                       
430   int i,j;                                                                 
431   int instrides_scl[3];                                                    
432   vector unsigned char y0,y1;                                              
433                                                                            
434   vector signed char  u,v;                                                 
435                                                                            
436   vector signed short Y0,Y1,Y2,Y3;                                         
437   vector signed short U,V;                                                 
438   vector signed short vx,ux,uvx;                                           
439   vector signed short vx0,ux0,uvx0;                                        
440   vector signed short vx1,ux1,uvx1;                                        
441   vector signed short R0,G0,B0;                                            
442   vector signed short R1,G1,B1;                                            
443   vector unsigned char R,G,B;                                              
444                                                                            
445   vector unsigned char *uivP, *vivP;                                       
446   vector unsigned char align_perm;                                         
447                                                                            
448   vector signed short                                                      
449     lCY  = c->CY,                                                          
450     lOY  = c->OY,                                                          
451     lCRV = c->CRV,                                                         
452     lCBU = c->CBU,                                                         
453     lCGU = c->CGU,                                                         
454     lCGV = c->CGV;                                                         
455                                                                            
456   vector unsigned short lCSHIFT = c->CSHIFT;                               
457                                                                            
458   ubyte *y1i   = in[0];                                                    
459   ubyte *y2i   = in[0]+w;                                                  
460   ubyte *ui    = in[1];                                                    
461   ubyte *vi    = in[2];                                                    
462                                                                            
463   vector unsigned char *oute                                               
464     = (vector unsigned char *)                                             
465         (oplanes[0]+srcSliceY*outstrides[0]);                              
466   vector unsigned char *outo                                               
467     = (vector unsigned char *)                                             
468         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                
469                                                                            
470                                                                            
471   instrides_scl[0] = instrides[0];                                         
472   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     
473   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     
474                                                                            
475                                                                            
476   for (i=0;i<h/2;i++) {                                                    
477     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 
478     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 
479                                                                            
480     for (j=0;j<w/16;j++) {                                                 
481                                                                            
482       y0 = vec_ldl (0,y1i);                                                
483       y1 = vec_ldl (0,y2i);                                                
484       uivP = (vector unsigned char *)ui;                                   
485       vivP = (vector unsigned char *)vi;                                   
486                                                                            
487       align_perm = vec_lvsl (0, ui);                                       
488       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     
489                                                                            
490       align_perm = vec_lvsl (0, vi);                                       
491       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
492       u  = (vector signed char)
493                 vec_sub (u,(vector signed char)
494                                 vec_splat((vector signed char)AVV(128),0));
495       
496       v  = (vector signed char)
497                 vec_sub (v, (vector signed char)
498                                 vec_splat((vector signed char)AVV(128),0));
499       
500       U  = vec_unpackh (u);                                                
501       V  = vec_unpackh (v);                                                
502                                                                            
503                                                                            
504         Y0 = vec_unh (y0);                                                 
505         Y1 = vec_unl (y0);                                                 
506         Y2 = vec_unh (y1);                                                 
507         Y3 = vec_unl (y1);                                                 
508                                                                            
509         Y0 = vec_mradds (Y0, lCY, lOY);                                    
510         Y1 = vec_mradds (Y1, lCY, lOY);                                    
511         Y2 = vec_mradds (Y2, lCY, lOY);                                    
512         Y3 = vec_mradds (Y3, lCY, lOY);                                    
513                                                                            
514         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         
515         ux = vec_sl (U, lCSHIFT);                                          
516         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
517         ux0  = vec_mergeh (ux,ux);                                         
518         ux1  = vec_mergel (ux,ux);                                         
519                                                                            
520         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         
521         vx = vec_sl (V, lCSHIFT);                                          
522         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
523         vx0  = vec_mergeh (vx,vx);
524         vx1  = vec_mergel (vx,vx);
525         /* uvx = ((CGU*u) + (CGV*v))>>15 */
526         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
527         uvx = vec_mradds (V, lCGV, uvx);
528         uvx0 = vec_mergeh (uvx,uvx);
529         uvx1 = vec_mergel (uvx,uvx);
530         R0 = vec_add (Y0,vx0);
531         G0 = vec_add (Y0,uvx0);
532         B0 = vec_add (Y0,ux0);
533         R1 = vec_add (Y1,vx1);
534         G1 = vec_add (Y1,uvx1);
535         B1 = vec_add (Y1,ux1);
536         R  = vec_packclp (R0,R1);
537         G  = vec_packclp (G0,G1);
538         B  = vec_packclp (B0,B1);
539         
540         out_argb(R,G,B,oute);
541         R0 = vec_add (Y2,vx0);
542         G0 = vec_add (Y2,uvx0);
543         B0 = vec_add (Y2,ux0);
544         R1 = vec_add (Y3,vx1);
545         G1 = vec_add (Y3,uvx1);
546         B1 = vec_add (Y3,ux1);
547         R  = vec_packclp (R0,R1);
548         G  = vec_packclp (G0,G1);
549         B  = vec_packclp (B0,B1);
550         
551         out_argb(R,G,B,outo);
552         y1i  += 16;                                                        
553         y2i  += 16;                                                        
554         ui   += 8;
555         vi   += 8;                                                         
556                                                                            
557     }                                                                      
558                                                                            
559     outo += (outstrides[0])>>4;                                            
560     oute += (outstrides[0])>>4;                                            
561                                                                            
562     ui    += instrides_scl[1];                                             
563     vi    += instrides_scl[2];                                             
564     y1i   += instrides_scl[0];                                             
565     y2i   += instrides_scl[0];                                             
566   }                                                                        
567   return srcSliceH;                                                        
568 }
569
570 #endif
571
572
/*
 * Instantiate the 4:2:0 conversion core once per packed output layout.
 * The first argument names the converter, the second is the store macro
 * used to write the R,G,B vectors out.  DEFCSP420_CVT itself is defined
 * earlier in the file; presumably it emits a function called
 * altivec_<name>, since yuv2rgb_init_altivec returns altivec_yuv2_rgb24,
 * altivec_yuv2_bgr24 and altivec_yuv2_argb32 -- confirm against the macro.
 */
573 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
574 DEFCSP420_CVT (yuv2_argb32, out_argb)
575 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
576 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
577
578
579 // uyvy|uyvy|uyvy|uyvy
580 // 0123 4567 89ab cdef
581 static
582 const vector unsigned char
583   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
584                                    0x10,0x04,0x10,0x04,
585                                    0x10,0x08,0x10,0x08,
586                                    0x10,0x0c,0x10,0x0c),
587   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
588                                    0x10,0x06,0x10,0x06,
589                                    0x10,0x0A,0x10,0x0A,
590                                    0x10,0x0E,0x10,0x0E),
591   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
592                                    0x10,0x05,0x10,0x07,
593                                    0x10,0x09,0x10,0x0B,
594                                    0x10,0x0D,0x10,0x0F);
595
596 /*
597   this is so I can play live CCIR raw video
598 */
599 static int altivec_uyvy_rgb32 (SwsContext *c,
600                                unsigned char **in, int *instrides,
601                                int srcSliceY,   int srcSliceH,
602                                unsigned char **oplanes, int *outstrides)
603 {
604   int w = c->srcW;
605   int h = srcSliceH;
606   int i,j;
607   vector unsigned char uyvy;
608   vector signed   short Y,U,V;
609   vector signed   short vx,ux,uvx;
610   vector signed   short R0,G0,B0,R1,G1,B1;
611   vector unsigned char  R,G,B;
612   vector unsigned char *out;
613   ubyte *img;
614
615   img = in[0];
616   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
617
618   for (i=0;i<h;i++) {
619     for (j=0;j<w/16;j++) {
620       uyvy = vec_ld (0, img);
621       U = (vector signed short)
622         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
623
624       V = (vector signed short)
625         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
626
627       Y = (vector signed short)
628         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
629
630       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
631
632       uyvy = vec_ld (16, img);
633       U = (vector signed short)
634         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
635
636       V = (vector signed short)
637         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
638
639       Y = (vector signed short)
640         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
641
642       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
643
644       R  = vec_packclp (R0,R1);
645       G  = vec_packclp (G0,G1);
646       B  = vec_packclp (B0,B1);
647
648       //      vec_mstbgr24 (R,G,B, out);
649       out_rgba (R,G,B,out);
650
651       img += 32;
652     }
653   }
654   return srcSliceH;
655 }
656
657
658
659 /* Ok currently the acceleration routine only supports
660    inputs of widths a multiple of 16
661    and heights a multiple 2
662
663    So we just fall back to the C codes for this.
664 */
665 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
666 {
667   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
668     return NULL;
669
670   /*
671     and this seems not to matter too much I tried a bunch of 
672     videos with abnormal widths and mplayer crashes else where.
673     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
674     boom with X11 bad match.
675     
676   */
677   if ((c->srcW & 0xf) != 0)    return NULL;
678
679   switch (c->srcFormat) {
680   case IMGFMT_YVU9:
681   case IMGFMT_IF09:
682   case IMGFMT_YV12:
683   case IMGFMT_I420:
684   case IMGFMT_IYUV:
685   case IMGFMT_CLPL:
686   case IMGFMT_Y800:
687   case IMGFMT_Y8:
688   case IMGFMT_NV12:
689   case IMGFMT_NV21:
690     if ((c->srcH & 0x1) != 0)
691       return NULL;
692
693     switch(c->dstFormat){
694     case IMGFMT_RGB24:
695       MSG_WARN("ALTIVEC: Color Space RGB24\n");
696       return altivec_yuv2_rgb24;
697     case IMGFMT_BGR24:
698       MSG_WARN("ALTIVEC: Color Space BGR24\n");
699       return altivec_yuv2_bgr24;
700     case IMGFMT_RGB32:
701       MSG_WARN("ALTIVEC: Color Space ARGB32\n");
702       return altivec_yuv2_argb32;
703     case IMGFMT_BGR32:
704       MSG_WARN("ALTIVEC: Color Space BGRA32\n");
705       //      return profile_altivec_bgra32;
706
707       return altivec_yuv2_bgra32;
708     default: return NULL;
709     }
710     break;
711
712   case IMGFMT_UYVY:
713     switch(c->dstFormat){
714     case IMGFMT_RGB32:
715       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
716       return altivec_uyvy_rgb32;
717     default: return NULL;
718     }
719     break;
720
721   }
722   return NULL;
723 }
724
/*
 * Round a value carrying 16 fractional bits to the nearest integer and
 * saturate it to a signed 16-bit range.  The result is handed back as the
 * raw 16-bit pattern: anything below -0x7FFF yields 0x8000, anything
 * above 0x7FFF yields 0x7FFF.
 */
static uint16_t roundToInt16(int64_t f)
{
        const int rounded = (f + (1<<15))>>16;

        if (rounded > 0x7FFF)
                return 0x7FFF;
        if (rounded < -0x7FFF)
                return 0x8000;
        return rounded;
}
731
732 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
733 {
734   union {
735         signed short tmp[8] __attribute__ ((aligned(16)));
736         vector signed short vec;
737         } buf;
738
739   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
740   buf.tmp[1] =  -256*brightness;                                        //oy
741   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
742   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
743   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
744   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
745
746
747   c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);
748   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
749   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
750   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
751   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
752   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
753   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
754 #if 0
755 {
756 int i;
757 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
758 for (i=0; i<6;i++)
759   printf("%s %d ", v[i],buf.tmp[i] );
760   printf("\n");
761 }
762 #endif
763  return;
764 }
765
766
767 void
768 altivec_yuv2packedX (SwsContext *c,
769                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
770                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
771                        uint8_t *dest, int dstW, int dstY)
772 {
773   int i,j;
774   short tmp __attribute__((aligned (16)));
775   int16_t *p;
776   short *f;
777   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
778   vector signed short R0,G0,B0,R1,G1,B1;
779
780   vector unsigned char R,G,B,pels[3];
781   vector unsigned char *out,*nout;
782
783   vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0);
784   vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
785   unsigned long scratch[16] __attribute__ ((aligned (16)));
786
787   vector signed short *vYCoeffsBank, *vCCoeffsBank;
788
789   vector signed short *YCoeffs, *CCoeffs;
790
791   vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
792   vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
793
794   for (i=0;i<lumFilterSize*dstW;i++) {
795     tmp = c->vLumFilter[i];
796     p = &vYCoeffsBank[i];
797     for (j=0;j<8;j++)
798       p[j] = tmp;
799   }
800
801   for (i=0;i<chrFilterSize*dstW;i++) {
802     tmp = c->vChrFilter[i];
803     p = &vCCoeffsBank[i];
804     for (j=0;j<8;j++)
805       p[j] = tmp;
806   }
807
808   YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
809   CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
810
811   out = (vector unsigned char *)dest;
812
813   for(i=0; i<dstW; i+=16){
814     Y0 = RND;
815     Y1 = RND;
816     /* extract 16 coeffs from lumSrc */
817     for(j=0; j<lumFilterSize; j++) {
818       X0 = vec_ld (0,  &lumSrc[j][i]);
819       X1 = vec_ld (16, &lumSrc[j][i]);
820       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
821       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
822     }
823
824     U = RND;
825     V = RND;
826     /* extract 8 coeffs from U,V */
827     for(j=0; j<chrFilterSize; j++) {
828       X  = vec_ld (0, &chrSrc[j][i/2]);
829       U  = vec_mradds (X, CCoeffs[j], U);
830       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
831       V  = vec_mradds (X, CCoeffs[j], V);
832     }
833
834     /* scale and clip signals */
835     Y0 = vec_sra (Y0, SCL);
836     Y1 = vec_sra (Y1, SCL);
837     U  = vec_sra (U,  SCL);
838     V  = vec_sra (V,  SCL);
839
840     Y0 = vec_clip (Y0);
841     Y1 = vec_clip (Y1);
842     U  = vec_clip (U);
843     V  = vec_clip (V);
844
845     /* now we have
846       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
847       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
848
849       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
850       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
851       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
852     */
853
854     U0 = vec_mergeh (U,U);
855     V0 = vec_mergeh (V,V);
856
857     U1 = vec_mergel (U,U);
858     V1 = vec_mergel (V,V);
859
860     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
861     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
862
863     R  = vec_packclp (R0,R1);
864     G  = vec_packclp (G0,G1);
865     B  = vec_packclp (B0,B1);
866
867     out_rgba (R,G,B,out);
868   }
869
870   if (i < dstW) {
871     i -= 16;
872
873     Y0 = RND;
874     Y1 = RND;
875     /* extract 16 coeffs from lumSrc */
876     for(j=0; j<lumFilterSize; j++) {
877       X0 = vec_ld (0,  &lumSrc[j][i]);
878       X1 = vec_ld (16, &lumSrc[j][i]);
879       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
880       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
881     }
882
883     U = RND;
884     V = RND;
885     /* extract 8 coeffs from U,V */
886     for(j=0; j<chrFilterSize; j++) {
887       X  = vec_ld (0, &chrSrc[j][i/2]);
888       U  = vec_mradds (X, CCoeffs[j], U);
889       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
890       V  = vec_mradds (X, CCoeffs[j], V);
891     }
892
893     /* scale and clip signals */
894     Y0 = vec_sra (Y0, SCL);
895     Y1 = vec_sra (Y1, SCL);
896     U  = vec_sra (U,  SCL);
897     V  = vec_sra (V,  SCL);
898
899     Y0 = vec_clip (Y0);
900     Y1 = vec_clip (Y1);
901     U  = vec_clip (U);
902     V  = vec_clip (V);
903
904     /* now we have
905        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
906        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
907
908        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
909        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
910        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
911     */
912
913     U0 = vec_mergeh (U,U);
914     V0 = vec_mergeh (V,V);
915
916     U1 = vec_mergel (U,U);
917     V1 = vec_mergel (V,V);
918
919     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
920     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
921
922     R  = vec_packclp (R0,R1);
923     G  = vec_packclp (G0,G1);
924     B  = vec_packclp (B0,B1);
925
926     nout = (vector unsigned char *)scratch;
927     out_rgba (R,G,B,nout);
928
929     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
930   }
931
932   if (vYCoeffsBank) free (vYCoeffsBank);
933   if (vCCoeffsBank) free (vCCoeffsBank);
934
935 }