]> git.sesse.net Git - ffmpeg/blob - postproc/yuv2rgb_altivec.c
some fixes
[ffmpeg] / postproc / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issues, however i have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63
64 */
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <string.h>
68 #include <inttypes.h>
69 #include <assert.h>
70 #include "config.h"
71 #include "rgb2rgb.h"
72 #include "swscale.h"
73 #include "swscale_internal.h"
74 #include "../mangle.h"
75 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
76
/* Make sure these optional build switches are not defined for this file. */
#undef PROFILE_THE_BEAST
#undef INC_SCALING

/* Short names for raw 8-bit samples, unsigned and signed. */
typedef unsigned char ubyte;
typedef signed char   sbyte;
82
83
84 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
85    homogeneous vector registers x0,x1,x2 are interleaved with the
86    following technique:
87
88       o0 = vec_mergeh (x0,x1);
89       o1 = vec_perm (o0, x2, perm_rgb_0);
90       o2 = vec_perm (o0, x2, perm_rgb_1);
91       o3 = vec_mergel (x0,x1);
92       o4 = vec_perm (o3,o2,perm_rgb_2);
93       o5 = vec_perm (o3,o2,perm_rgb_3);
94
95   perm_rgb_0:   o0(RG).h v1(B) --> o1*
96               0   1  2   3   4
97              rgbr|gbrg|brgb|rgbr
98              0010 0100 1001 0010
99              0102 3145 2673 894A
100
101   perm_rgb_1:   o0(RG).h v1(B) --> o2
102               0   1  2   3   4
103              gbrg|brgb|bbbb|bbbb
104              0100 1001 1111 1111
105              B5CD 6EF7 89AB CDEF
106
107   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
108               0   1  2   3   4
109              gbrg|brgb|rgbr|gbrg
110              1111 1111 0010 0100
111              89AB CDEF 0182 3945
112
113   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
114               0   1  2   3   4
115              brgb|rgbr|gbrg|brgb
116              1001 0010 0100 1001
117              a67b 89cA BdCD eEFf
118
119 */
120 static
121 const vector unsigned char
122   perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124   perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126   perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128   perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
130
/* Interleave three 16-byte channel vectors into three packed vectors
   (48 bytes) written to y0, y1, y2.

   Mind the parameter naming: the FIRST macro argument is called x2 and is
   the channel that lands third in every output triplet; the third argument
   (x0) lands first.  y0..y2 must be assignable vector lvalues. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                     \
do {                                                      \
  typeof(x0) pairs_hi, carry, pairs_lo;                   \
  pairs_hi = vec_mergeh ((x0), (x1));                     \
  (y0)     = vec_perm (pairs_hi, (x2), perm_rgb_0);       \
  carry    = vec_perm (pairs_hi, (x2), perm_rgb_1);       \
  pairs_lo = vec_mergel ((x0), (x1));                     \
  (y1)     = vec_perm (pairs_lo, carry, perm_rgb_2);      \
  (y2)     = vec_perm (pairs_lo, carry, perm_rgb_3);      \
} while(0)
141
/* Interleave three 16-byte channel vectors with vec_merge3 (channels passed
   in the order x0,x1,x2) and store the resulting 48 bytes at ptr.

   ptr must be a modifiable pointer to a vector type; it is advanced by
   three vectors.  The macro expands to a single statement WITHOUT a trailing
   semicolon, so the caller supplies the ';' and it is safe inside if/else. */
#define vec_mstrgb24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, (ptr)++);               \
  vec_st (_1, 0, (ptr)++);               \
  vec_st (_2, 0, (ptr)++);               \
} while (0)
150
/* Same as the 24-bit RGB store, but the channels are handed to vec_merge3
   in reversed order (x2,x1,x0), producing the opposite channel order in
   memory.

   ptr must be a modifiable pointer to a vector type; it is advanced by
   three vectors.  The macro expands to a single statement WITHOUT a trailing
   semicolon, so the caller supplies the ';' and it is safe inside if/else. */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, (ptr)++);               \
  vec_st (_1, 0, (ptr)++);               \
  vec_st (_2, 0, (ptr)++);               \
} while (0)
159
/* Pack 16 pixels, four bytes per pixel, and store them (64 bytes) at ptr.

   T            vector type of the four channel arguments
   x0,x1,x2,x3  channel vectors; for every pixel the bytes are written in
                the order x0,x1,x2,x3 (x0 at the lowest address)
   ptr          modifiable pointer to a vector type; advanced by 4 vectors

   The bytes are first merged pairwise (x0/x1 and x2/x3) and the resulting
   16-bit pairs are merged again to form whole pixels.

   The macro expands to a single statement WITHOUT a trailing semicolon, so
   the caller supplies the ';' and it is safe inside if/else. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  _0 = vec_mergeh ((x0),(x1));                                                         \
  _1 = vec_mergeh ((x2),(x3));                                                         \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)(ptr));                                                       \
  vec_st (_3, 1*16, (T *)(ptr));                                                       \
  _0 = vec_mergel ((x0),(x1));                                                         \
  _1 = vec_mergel ((x2),(x3));                                                         \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)(ptr));                                                       \
  vec_st (_3, 3*16, (T *)(ptr));                                                       \
  (ptr) += 4;                                                                          \
} while (0)
181
182 /*
183
184   | 1     0       1.4021   | | Y |
185   | 1    -0.3441 -0.7142   |x| Cb|
186   | 1     1.7718  0        | | Cr|
187
188
189   Y:      [-128 127]
190   Cb/Cr : [-128 127]
191
192   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
193
194 */
195
196
197
198
/* Widen eight bytes of a 16-byte vector to eight 16-bit lanes.  Every
   output lane takes its first byte from a zero vector (index 0x10) and its
   second byte from the source, i.e. the samples are zero-extended.
   vec_unh widens source bytes 0-7, vec_unl widens source bytes 8-15. */
#define vec_unh(x)                                                          \
  ((vector signed short)                                                    \
     vec_perm ((x), (typeof(x))(0),                                         \
               (vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                      0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
#define vec_unl(x)                                                          \
  ((vector signed short)                                                    \
     vec_perm ((x), (typeof(x))(0),                                         \
               (vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                      0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
209
/* Clamp every lane of x to the range [0,255]. */
#define vec_clip(x) \
  (vec_max (vec_min ((x), (typeof(x))(255)), (typeof(x))(0)))

/* Clamp x and y with vec_clip, then narrow both into one byte vector. */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))

/* Cheaper variant of the above: negative lanes are raised to 0 with
   vec_max and the upper bound is left to vec_packs, which is applied to the
   lanes reinterpreted as unsigned shorts. */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) (0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) (0))))
220
221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
222
223
224 static inline void cvtyuvtoRGB (SwsContext *c,
225                            vector signed short Y, vector signed short U, vector signed short V,
226                            vector signed short *R, vector signed short *G, vector signed short *B)
227 {
228   vector signed   short vx,ux,uvx;
229
230   Y = vec_mradds (Y, c->CY, c->OY);
231
232   U = vec_sub (U,(vector signed short)(128));
233   V = vec_sub (V,(vector signed short)(128));
234
235   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
236   ux = vec_sl (U, c->CSHIFT);
237   *B = vec_mradds (ux, c->CBU, Y);
238
239   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
240   vx = vec_sl (V, c->CSHIFT);
241   *R = vec_mradds (vx, c->CRV, Y);
242
243   // uvx = ((CGU*u) + (CGV*v))>>15;
244   uvx = vec_mradds (U, c->CGU, Y);
245   *G = vec_mradds (V, c->CGV, uvx);
246 }
247
248
249 /*
250   ------------------------------------------------------------------------------
251   CS converters
252   ------------------------------------------------------------------------------
253 */
254
255
/*
  DEFCSP420_CVT(name, out_pixels) expands to

      static int altivec_<name> (SwsContext *c,
                                 unsigned char **in, int *instrides,
                                 int srcSliceY, int srcSliceH,
                                 unsigned char **oplanes, int *outstrides)

  The generated function converts srcSliceH rows of planar Y/U/V
  (in[0], in[1], in[2]; one chroma row is shared by two luma rows and one
  chroma sample by two horizontally adjacent pixels) to packed pixels in
  oplanes[0], starting at output row srcSliceY.  It returns srcSliceH.

  out_pixels(R,G,B,ptr) must store 16 pixels at ptr and advance ptr past
  them (see the out_* macros).

  Rows are handled in pairs and pixels in groups of 16: the loops run
  srcSliceH/2 and srcW/16 times, so any remainder is silently left
  unconverted.

  Row addressing honours the strides: the second luma row of a pair is
  in[0]+instrides[0], each pair advances luma by 2*instrides[0], and the
  two output row pointers are recomputed from outstrides[0] for every pair.
  (Earlier revisions assumed instrides[0] == srcW and a tightly packed
  output row, and produced garbage for padded buffers.)

  NOTE(review): luma is read with vec_ldl and pixels are written with
  vec_st, so the luma rows and output rows are presumably expected to be
  16-byte aligned; chroma is read through a vec_lvsl/vec_perm realignment
  and may be unaligned.
*/
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* keep the coefficients in locals so they can live in registers */      \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i   = in[0];                    /* even luma row of the pair */ \
  ubyte *y2i   = in[0]+instrides[0];       /* odd luma row of the pair  */ \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  vector unsigned char *oute;              /* even output row           */ \
  vector unsigned char *outo;              /* odd output row            */ \
                                                                           \
  /* the inner loop moves y1i/y2i by w and ui/vi by w/2; these are the */  \
  /* remaining steps to the next row pair / next chroma row            */  \
  instrides_scl[0] = instrides[0]*2-w;                                     \
  instrides_scl[1] = instrides[1]-w/2;                                     \
  instrides_scl[2] = instrides[2]-w/2;                                     \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    oute = (vector unsigned char *)                                        \
             (oplanes[0]+(srcSliceY+2*i)*outstrides[0]);                   \
    outo = (vector unsigned char *)                                        \
             (oplanes[0]+(srcSliceY+2*i+1)*outstrides[0]);                 \
                                                                           \
    /* data-stream-touch-for-store hints for the two output rows */        \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* realign the (possibly unaligned) chroma samples */                \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* remove the 128 bias, then sign-extend the first 8 samples */      \
      u  = (vector signed char)vec_sub (u, (vector signed char)(128));     \
      v  = (vector signed char)vec_sub (v, (vector signed char)(128));     \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
                                                                           \
        Y0 = vec_unh (y0);                                                 \
        Y1 = vec_unl (y0);                                                 \
        Y2 = vec_unh (y1);                                                 \
        Y3 = vec_unl (y1);                                                 \
                                                                           \
        Y0 = vec_mradds (Y0, lCY, lOY);                                    \
        Y1 = vec_mradds (Y1, lCY, lOY);                                    \
        Y2 = vec_mradds (Y2, lCY, lOY);                                    \
        Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                           \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
        ux = vec_sl (U, lCSHIFT);                                          \
        ux = vec_mradds (ux, lCBU, (vector signed short)(0));              \
        ux0  = vec_mergeh (ux,ux);                                         \
        ux1  = vec_mergel (ux,ux);                                         \
                                                                           \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
        vx = vec_sl (V, lCSHIFT);                                          \
        vx = vec_mradds (vx, lCRV, (vector signed short)(0));              \
        vx0  = vec_mergeh (vx,vx);                                         \
        vx1  = vec_mergel (vx,vx);                                         \
                                                                           \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
        uvx = vec_mradds (U, lCGU, (vector signed short)(0));              \
        uvx = vec_mradds (V, lCGV, uvx);                                   \
        uvx0 = vec_mergeh (uvx,uvx);                                       \
        uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                           \
        /* even row: the chroma terms are shared with the odd row */       \
        R0 = vec_add (Y0,vx0);                                             \
        G0 = vec_add (Y0,uvx0);                                            \
        B0 = vec_add (Y0,ux0);                                             \
        R1 = vec_add (Y1,vx1);                                             \
        G1 = vec_add (Y1,uvx1);                                            \
        B1 = vec_add (Y1,ux1);                                             \
                                                                           \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
        out_pixels(R,G,B,oute);                                            \
                                                                           \
        /* odd row */                                                      \
        R0 = vec_add (Y2,vx0);                                             \
        G0 = vec_add (Y2,uvx0);                                            \
        B0 = vec_add (Y2,ux0);                                             \
        R1 = vec_add (Y3,vx1);                                             \
        G1 = vec_add (Y3,uvx1);                                            \
        B1 = vec_add (Y3,ux1);                                             \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
                                                                           \
        out_pixels(R,G,B,outo);                                            \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
404
405
406 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
407 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
408 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
409 #define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
410 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
411 #define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
412
413 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
414 DEFCSP420_CVT (yuv2_bgra32, out_argb)
415 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
416 DEFCSP420_CVT (yuv2_argb32, out_argb)
417 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
418 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
419
420
421 // uyvy|uyvy|uyvy|uyvy
422 // 0123 4567 89ab cdef
423 static
424 const vector unsigned char
425   demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
426                                    0x10,0x04,0x10,0x04,
427                                    0x10,0x08,0x10,0x08,
428                                    0x10,0x0c,0x10,0x0c),
429   demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
430                                    0x10,0x06,0x10,0x06,
431                                    0x10,0x0A,0x10,0x0A,
432                                    0x10,0x0E,0x10,0x0E),
433   demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
434                                    0x10,0x05,0x10,0x07,
435                                    0x10,0x09,0x10,0x0B,
436                                    0x10,0x0D,0x10,0x0F);
437
438 /*
439   this is so I can play live CCIR raw video
440 */
441 static int altivec_uyvy_rgb32 (SwsContext *c,
442                                unsigned char **in, int *instrides,
443                                int srcSliceY,   int srcSliceH,
444                                unsigned char **oplanes, int *outstrides)
445 {
446   int w = c->srcW;
447   int h = srcSliceH;
448   int i,j;
449   vector unsigned char uyvy;
450   vector signed   short Y,U,V;
451   vector signed   short vx,ux,uvx;
452   vector signed   short R0,G0,B0,R1,G1,B1;
453   vector unsigned char  R,G,B;
454   vector unsigned char *out;
455   ubyte *img;
456
457   img = in[0];
458   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
459
460   for (i=0;i<h;i++) {
461     for (j=0;j<w/16;j++) {
462       uyvy = vec_ld (0, img);
463       U = (vector signed short)
464         vec_perm (uyvy, (vector unsigned char)(0), demux_u);
465
466       V = (vector signed short)
467         vec_perm (uyvy, (vector unsigned char)(0), demux_v);
468
469       Y = (vector signed short)
470         vec_perm (uyvy, (vector unsigned char)(0), demux_y);
471
472       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
473
474       uyvy = vec_ld (16, img);
475       U = (vector signed short)
476         vec_perm (uyvy, (vector unsigned char)(0), demux_u);
477
478       V = (vector signed short)
479         vec_perm (uyvy, (vector unsigned char)(0), demux_v);
480
481       Y = (vector signed short)
482         vec_perm (uyvy, (vector unsigned char)(0), demux_y);
483
484       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
485
486       R  = vec_packclp (R0,R1);
487       G  = vec_packclp (G0,G1);
488       B  = vec_packclp (B0,B1);
489
490       //      vec_mstbgr24 (R,G,B, out);
491       out_rgba (R,G,B,out);
492
493       img += 32;
494     }
495   }
496   return srcSliceH;
497 }
498
499
500
501 /* Ok currently the acceleration routine only supports
502    inputs of widths a multiple of 16
503    and heights a multiple 2
504
505    So we just fall back to the C codes for this.
506 */
507 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
508 {
509   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
510     return NULL;
511
512   /*
513     and this seems not to matter too much I tried a bunch of 
514     videos with abnormal widths and mplayer crashes else where.
515     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
516     boom with X11 bad match.
517     
518   */
519   if ((c->srcW & 0xf) != 0)    return NULL;
520
521   switch (c->srcFormat) {
522   case IMGFMT_YVU9:
523   case IMGFMT_IF09:
524   case IMGFMT_YV12:
525   case IMGFMT_I420:
526   case IMGFMT_IYUV:
527   case IMGFMT_CLPL:
528   case IMGFMT_Y800:
529   case IMGFMT_Y8:
530   case IMGFMT_NV12:
531   case IMGFMT_NV21:
532     if ((c->srcH & 0x1) != 0)
533       return NULL;
534
535     switch(c->dstFormat){
536     case IMGFMT_RGB24:
537       MSG_WARN("ALTIVEC: Color Space RGB24\n");
538       return altivec_yuv2_rgb24;
539     case IMGFMT_BGR24:
540       MSG_WARN("ALTIVEC: Color Space BGR24\n");
541       return altivec_yuv2_bgr24;
542     case IMGFMT_RGB32:
543       MSG_WARN("ALTIVEC: Color Space ARGB32\n");
544       return altivec_yuv2_argb32;
545     case IMGFMT_BGR32:
546       MSG_WARN("ALTIVEC: Color Space BGRA32\n");
547       //      return profile_altivec_bgra32;
548
549       return altivec_yuv2_bgra32;
550     default: return NULL;
551     }
552     break;
553
554   case IMGFMT_UYVY:
555     switch(c->dstFormat){
556     case IMGFMT_RGB32:
557       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
558       return altivec_uyvy_rgb32;
559     default: return NULL;
560     }
561     break;
562
563   }
564   return NULL;
565 }
566
567
568 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
569 {
570   vector signed short
571     CY  = (vector signed short)(0x7fff),
572     CRV = (vector signed short)(22972),
573     CBU = (vector signed short)(29029),
574     CGU = (vector signed short)(-11276),
575     CGV = (vector signed short)(-23400),
576     OY;
577
578   vector signed short Y0;
579   int brightness = c->brightness,  contrast = c->contrast,  saturation = c->saturation;
580   int64_t crv  __attribute__ ((aligned(16)));
581   int64_t cbu  __attribute__ ((aligned(16)));
582   int64_t cgu  __attribute__ ((aligned(16)));
583   int64_t cgv  __attribute__ ((aligned(16)));
584   short tmp __attribute__ ((aligned(16)));
585
586   int64_t cy  = (1<<16)-1;
587   int64_t oy  = 0;
588
589   if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
590     return;
591
592   crv = inv_table[0];
593   cbu = inv_table[1];
594   cgu = inv_table[2];
595   cgv = inv_table[3];
596
597 #if 0
598   printf ("crv: %hvx\n", CRV);
599   printf ("cbu: %hvx\n", CBU);
600   printf ("cgv: %hvx\n", CGV);
601   printf ("cgu: %hvx\n", CGU);
602
603   printf ("contrast: %d,   brightness: %d, saturation: %d\n", contrast, brightness, saturation);
604
605   printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
606 #endif
607
608   cy = (cy *contrast             )>>17;
609   crv= (crv*contrast * saturation)>>32;
610   cbu= (cbu*contrast * saturation)>>32;
611   cgu= (cgu*contrast * saturation)>>32;
612   cgv= (cgv*contrast * saturation)>>32;
613
614   oy -= 256*brightness;
615
616   //printf("%llx %llx %llx %llx %llx\n", cy, crv, cbu, cgu, cgv);
617
618   //  vector signed short CBU,CRV,CGU,CGY,CY;
619   tmp = cy;
620   CY = vec_lde (0, &tmp);
621   CY  = vec_splat (CY, 0);
622
623   tmp = oy;
624   OY = vec_lde (0, &tmp);
625   OY  = vec_splat (OY, 0);
626
627   tmp = crv>>3;
628   CRV = vec_lde (0, &tmp);
629   CRV  = vec_splat (CRV, 0);
630   tmp = cbu>>3;
631   CBU = vec_lde (0, &tmp);
632   CBU  = vec_splat (CBU, 0);
633
634   tmp = -(cgu>>1);
635   CGU = vec_lde (0, &tmp);
636   CGU  = vec_splat (CGU, 0);
637   tmp = -(cgv>>1);
638   CGV = vec_lde (0, &tmp);
639   CGV  = vec_splat (CGV, 0);
640
641   c->CSHIFT = (vector unsigned short)(2);
642   c->CY = CY;
643   c->OY = OY;
644   c->CRV = CRV;
645   c->CBU = CBU;
646   c->CGU = CGU;
647   c->CGV = CGV;
648
649 #if 0
650   printf ("cy:  %hvx\n", CY);
651   printf ("oy:  %hvx\n", OY);
652   printf ("crv: %hvx\n", CRV);
653   printf ("cbu: %hvx\n", CBU);
654   printf ("cgv: %hvx\n", CGV);
655   printf ("cgu: %hvx\n", CGU);
656 #endif
657
658  return;
659 }
660
661
662 void
663 altivec_yuv2packedX (SwsContext *c,
664                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
665                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
666                        uint8_t *dest, int dstW, int dstY)
667 {
668   int i,j;
669   short tmp __attribute__((aligned (16)));
670   short *p;
671   short *f;
672   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
673   vector signed short R0,G0,B0,R1,G1,B1;
674
675   vector unsigned char R,G,B,pels[3];
676   vector unsigned char *out,*nout;
677   vector signed short   RND = (vector signed short)(1<<3);
678   vector unsigned short SCL = (vector unsigned short)(4);
679   unsigned long scratch[16] __attribute__ ((aligned (16)));
680
681   vector signed short *vYCoeffsBank, *vCCoeffsBank;
682
683   vector signed short *YCoeffs, *CCoeffs;
684
685   vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
686   vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
687
688   for (i=0;i<lumFilterSize*dstW;i++) {
689     tmp = c->vLumFilter[i];
690     p = &vYCoeffsBank[i];
691     for (j=0;j<8;j++)
692       p[j] = tmp;
693   }
694
695   for (i=0;i<chrFilterSize*dstW;i++) {
696     tmp = c->vChrFilter[i];
697     p = &vCCoeffsBank[i];
698     for (j=0;j<8;j++)
699       p[j] = tmp;
700   }
701
702   YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
703   CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
704
705   out = (vector unsigned char *)dest;
706
707   for(i=0; i<dstW; i+=16){
708     Y0 = RND;
709     Y1 = RND;
710     /* extract 16 coeffs from lumSrc */
711     for(j=0; j<lumFilterSize; j++) {
712       X0 = vec_ld (0,  &lumSrc[j][i]);
713       X1 = vec_ld (16, &lumSrc[j][i]);
714       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
715       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
716     }
717
718     U = RND;
719     V = RND;
720     /* extract 8 coeffs from U,V */
721     for(j=0; j<chrFilterSize; j++) {
722       X  = vec_ld (0, &chrSrc[j][i/2]);
723       U  = vec_mradds (X, CCoeffs[j], U);
724       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
725       V  = vec_mradds (X, CCoeffs[j], V);
726     }
727
728     /* scale and clip signals */
729     Y0 = vec_sra (Y0, SCL);
730     Y1 = vec_sra (Y1, SCL);
731     U  = vec_sra (U,  SCL);
732     V  = vec_sra (V,  SCL);
733
734     Y0 = vec_clip (Y0);
735     Y1 = vec_clip (Y1);
736     U  = vec_clip (U);
737     V  = vec_clip (V);
738
739     /* now we have
740       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
741       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
742
743       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
744       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
745       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
746     */
747
748     U0 = vec_mergeh (U,U);
749     V0 = vec_mergeh (V,V);
750
751     U1 = vec_mergel (U,U);
752     V1 = vec_mergel (V,V);
753
754     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
755     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
756
757     R  = vec_packclp (R0,R1);
758     G  = vec_packclp (G0,G1);
759     B  = vec_packclp (B0,B1);
760
761     out_rgba (R,G,B,out);
762   }
763
764   if (i < dstW) {
765     i -= 16;
766
767     Y0 = RND;
768     Y1 = RND;
769     /* extract 16 coeffs from lumSrc */
770     for(j=0; j<lumFilterSize; j++) {
771       X0 = vec_ld (0,  &lumSrc[j][i]);
772       X1 = vec_ld (16, &lumSrc[j][i]);
773       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
774       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
775     }
776
777     U = RND;
778     V = RND;
779     /* extract 8 coeffs from U,V */
780     for(j=0; j<chrFilterSize; j++) {
781       X  = vec_ld (0, &chrSrc[j][i/2]);
782       U  = vec_mradds (X, CCoeffs[j], U);
783       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
784       V  = vec_mradds (X, CCoeffs[j], V);
785     }
786
787     /* scale and clip signals */
788     Y0 = vec_sra (Y0, SCL);
789     Y1 = vec_sra (Y1, SCL);
790     U  = vec_sra (U,  SCL);
791     V  = vec_sra (V,  SCL);
792
793     Y0 = vec_clip (Y0);
794     Y1 = vec_clip (Y1);
795     U  = vec_clip (U);
796     V  = vec_clip (V);
797
798     /* now we have
799        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
800        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
801
802        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
803        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
804        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
805     */
806
807     U0 = vec_mergeh (U,U);
808     V0 = vec_mergeh (V,V);
809
810     U1 = vec_mergel (U,U);
811     V1 = vec_mergel (V,V);
812
813     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
814     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
815
816     R  = vec_packclp (R0,R1);
817     G  = vec_packclp (G0,G1);
818     B  = vec_packclp (B0,B1);
819
820     nout = (vector unsigned char *)scratch;
821     out_rgba (R,G,B,nout);
822
823     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
824   }
825
826   if (vYCoeffsBank) free (vYCoeffsBank);
827   if (vCCoeffsBank) free (vCCoeffsBank);
828
829 }
830
831