]> git.sesse.net Git - ffmpeg/blob - postproc/yuv2rgb_altivec.c
altivec yuv->rgb converter
[ffmpeg] / postproc / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to something in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issues, however i have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63
64 */
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <inttypes.h>
68 #include <assert.h>
69 #include "config.h"
70 #include "rgb2rgb.h"
71 #include "swscale.h"
72 #include "swscale_internal.h"
73 #include "../mangle.h"
74 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
75
76 #undef PROFILE_THE_BEAST
77 #undef INC_SCALING
78
/* Short aliases for the 8-bit sample types used by the converters below. */
typedef unsigned char ubyte;
typedef signed char sbyte;
81
82
83 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
84    homogeneous vector registers x0,x1,x2 are interleaved with the
85    following technique:
86
87       o0 = vec_mergeh (x0,x1);
88       o1 = vec_perm (o0, x2, perm_rgb_0);
89       o2 = vec_perm (o0, x2, perm_rgb_1);
90       o3 = vec_mergel (x0,x1);
91       o4 = vec_perm (o3,o2,perm_rgb_2);
92       o5 = vec_perm (o3,o2,perm_rgb_3);
93
94   perm_rgb_0:   o0(RG).h v1(B) --> o1*
95               0   1  2   3   4
96              rgbr|gbrg|brgb|rgbr
97              0010 0100 1001 0010
98              0102 3145 2673 894A
99
100   perm_rgb_1:   o0(RG).h v1(B) --> o2
101               0   1  2   3   4
102              gbrg|brgb|bbbb|bbbb
103              0100 1001 1111 1111
104              B5CD 6EF7 89AB CDEF
105
106   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
107               0   1  2   3   4
108              gbrg|brgb|rgbr|gbrg
109              1111 1111 0010 0100
110              89AB CDEF 0182 3945
111
112   perm_rgb_2:   o3(RG).l o2(rgbB.l) ---> o5*
113               0   1  2   3   4
114              brgb|rgbr|gbrg|brgb
115              1001 0010 0100 1001
116              a67b 89cA BdCD eEFf
117
118 */
119 static
120 const vector unsigned char
121   perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
122                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
123   perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
124                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
125   perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
126                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
127   perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
128                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
129
/*
 * Interleave three planar byte vectors into three vectors of packed
 * triplets (y0, y1, y2 receive packed bytes 0..15, 16..31 and 32..47).
 *
 * Mind the parameter order: the THIRD argument (x0) supplies the first
 * byte of every triplet, the second argument (x1) the middle byte and
 * the FIRST argument (x2) the last byte.
 *
 * The statement order is significant if an output aliases an input, so
 * it is kept exactly as written.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                 \
do {                                                  \
  typeof(x0) m3_hi, m3_carry, m3_lo;                  \
  m3_hi    = vec_mergeh (x0,x1);                      \
  y0       = vec_perm (m3_hi, x2, perm_rgb_0);        \
  m3_carry = vec_perm (m3_hi, x2, perm_rgb_1);        \
  m3_lo    = vec_mergel (x0,x1);                      \
  y1       = vec_perm (m3_lo, m3_carry, perm_rgb_2);  \
  y2       = vec_perm (m3_lo, m3_carry, perm_rgb_3);  \
} while(0)
140
/*
 * Interleave x0,x1,x2 (16 samples each) with vec_merge3 and store the three
 * resulting vectors through ptr, post-incrementing ptr after each store
 * (ptr must be a modifiable vector pointer and is evaluated three times).
 *
 * The arguments are handed to vec_merge3 positionally, and vec_merge3 puts
 * its third argument first in memory, so each stored triplet is laid out as
 * x2, x1, x0.
 *
 * No trailing ';' after while (0): the caller supplies it, which keeps the
 * macro usable as a single statement (e.g. in an unbraced if/else).
 */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
}  while (0)
149
/*
 * Same as vec_mstrgb24 but with the first and third planes swapped before
 * the merge: vec_merge3 receives (x2,x1,x0), so each stored triplet is laid
 * out in memory as x0, x1, x2.  ptr is post-incremented three times.
 *
 * No trailing ';' after while (0): the caller supplies it, which keeps the
 * macro usable as a single statement (e.g. in an unbraced if/else).
 */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
}  while (0)
158
/*
 * Store 16 pixels as packed 4-byte groups.
 *
 * x0..x3 are four planar vectors of 16 bytes each (type T); pixel k is
 * written as the bytes x0[k] x1[k] x2[k] x3[k], in that memory order.
 * The byte merges pair up x0/x1 and x2/x3, the halfword merges then
 * combine the pairs, yielding four pixels per output vector.
 *
 * Four vectors are stored at byte offsets 0, 16, 32 and 48 from ptr, and
 * ptr is then advanced by 4 elements (64 bytes for the vector pointers
 * used in this file).
 *
 * No trailing ';' after while (0): the caller supplies it, which keeps the
 * macro usable as a single statement (e.g. in an unbraced if/else).
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  /* pixels 0..7: high halves of the planes */                                         \
  _0 = vec_mergeh (x0,x1);                                                             \
  _1 = vec_mergeh (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)ptr);                                                         \
  vec_st (_3, 1*16, (T *)ptr);                                                         \
  /* pixels 8..15: low halves of the planes */                                         \
  _0 = vec_mergel (x0,x1);                                                             \
  _1 = vec_mergel (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)ptr);                                                         \
  vec_st (_3, 3*16, (T *)ptr);                                                         \
  ptr += 4;                                                                            \
}  while (0)
180
181 /*
182
183   | 1     0       1.4021   | | Y |
184   | 1    -0.3441 -0.7142   |x| Cb|
185   | 1     1.7718  0        | | Cr|
186
187
188   Y:      [-128 127]
189   Cb/Cr : [-128 127]
190
191   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
192
193 */
194
195
196
197
/*
 * vec_unh / vec_unl: widen bytes 0..7 (vec_unh) or 8..15 (vec_unl) of x to
 * eight 16-bit lanes.  Every output halfword is built from a zero byte
 * (index 0x10 = byte 0 of the all-zero second operand) followed by one
 * byte of x, i.e. the samples are zero-extended, not sign-extended.
 */
#define vec_unh(x) \
  ((vector signed short) \
     vec_perm ((x), (typeof(x))(0), \
               (vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                      0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))

#define vec_unl(x) \
  ((vector signed short) \
     vec_perm ((x), (typeof(x))(0), \
               (vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                      0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))

/* Clamp every lane of x to the range [0, 255]. */
#define vec_clip(x) \
  (vec_max (vec_min ((x), (typeof(x))(255)), (typeof(x))(0)))

/* Clamp x and y with vec_clip, then narrow both into one byte vector. */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))

/*
 * Cheaper clamp-and-narrow: negative lanes are raised to 0 with vec_max,
 * the result is reinterpreted as unsigned halfwords and narrowed with
 * vec_packs, leaving the upper bound to the pack instead of a vec_min.
 */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) (0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) (0))))
219
220 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
221
222
223 static inline cvtyuvtoRGB (SwsContext *c,
224                            vector signed short Y, vector signed short U, vector signed short V,
225                            vector signed short *R, vector signed short *G, vector signed short *B)
226 {
227   vector signed   short vx,ux,uvx;
228
229   Y = vec_mradds (Y, c->CY, c->OY);
230
231   U = vec_sub (U,(vector signed short)(128));
232   V = vec_sub (V,(vector signed short)(128));
233
234   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
235   ux = vec_sl (U, c->CSHIFT);
236   *B = vec_mradds (ux, c->CBU, Y);
237
238   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
239   vx = vec_sl (V, c->CSHIFT);
240   *R = vec_mradds (vx, c->CRV, Y);
241
242   // uvx = ((CGU*u) + (CGV*v))>>15;
243   uvx = vec_mradds (U, c->CGU, Y);
244   *G = vec_mradds (V, c->CGV, uvx);
245 }
246
247
248 /*
249   ------------------------------------------------------------------------------
250   CS converters
251   ------------------------------------------------------------------------------
252 */
253
254
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name> (SwsContext *c,
 *                              unsigned char **in, int *instrides,
 *                              int srcSliceY, int srcSliceH,
 *                              unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar YUV (in[0] = luma, in[1]/in[2] = the two
 * chroma planes; one chroma row serves two luma rows and one chroma sample
 * serves two horizontally adjacent pixels) into the packed format produced
 * by out_pixels(R,G,B,ptr).
 *
 * Each inner iteration handles 16 pixels of two adjacent rows; c->srcW/16
 * blocks are processed per row and srcSliceH/2 row pairs per call, so any
 * remainder in either direction is not converted.  Output starts at
 * oplanes[0] + srcSliceY*outstrides[0].  Returns srcSliceH.
 *
 * Luma rows are fetched with vec_ldl and output with vector stores; both
 * are assumed to be 16-byte aligned (TODO confirm against callers).
 *
 * Row addressing honours the strides: the second luma row of a pair is
 * instrides[0] bytes after the first, each row pair advances luma by
 * 2*instrides[0], and output rows are addressed from outstrides[0].  For
 * tightly packed buffers (stride == bytes per row) this is identical to
 * stepping by the image width.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  int instrides_scl[3];                                                    \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the per-context conversion coefficients */            \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i   = in[0];                   /* even luma row of the pair */  \
  ubyte *y2i   = in[0]+instrides[0];      /* odd luma row of the pair  */  \
  ubyte *ui    = in[1];                                                    \
  ubyte *vi    = in[2];                                                    \
                                                                           \
  ubyte *dst   = oplanes[0]+srcSliceY*outstrides[0];                       \
  vector unsigned char *oute;             /* output for the even row   */  \
  vector unsigned char *outo;             /* output for the odd row    */  \
                                                                           \
  /* per row-pair pointer corrections, applied after the inner loop */     \
  instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */  \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */     \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */     \
                                                                           \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    oute = (vector unsigned char *)(dst + (2*i)  *outstrides[0]);          \
    outo = (vector unsigned char *)(dst + (2*i+1)*outstrides[0]);          \
                                                                           \
    /* cache hints for the two output rows about to be written */          \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* chroma advances 8 bytes per block, so it is loaded unaligned */   \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* remove the 128 bias, then sign-extend the first 8 samples */      \
      u  = (vector signed char)vec_sub (u, (vector signed char)(128));     \
      v  = (vector signed char)vec_sub (v, (vector signed char)(128));     \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
                                                                           \
        Y0 = vec_unh (y0);                                                 \
        Y1 = vec_unl (y0);                                                 \
        Y2 = vec_unh (y1);                                                 \
        Y3 = vec_unl (y1);                                                 \
                                                                           \
        Y0 = vec_mradds (Y0, lCY, lOY);                                    \
        Y1 = vec_mradds (Y1, lCY, lOY);                                    \
        Y2 = vec_mradds (Y2, lCY, lOY);                                    \
        Y3 = vec_mradds (Y3, lCY, lOY);                                    \
                                                                           \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                         \
        ux = vec_sl (U, lCSHIFT);                                          \
        ux = vec_mradds (ux, lCBU, (vector signed short)(0));              \
        ux0  = vec_mergeh (ux,ux);                                         \
        ux1  = vec_mergel (ux,ux);                                         \
                                                                           \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                         \
        vx = vec_sl (V, lCSHIFT);                                          \
        vx = vec_mradds (vx, lCRV, (vector signed short)(0));              \
        vx0  = vec_mergeh (vx,vx);                                         \
        vx1  = vec_mergel (vx,vx);                                         \
                                                                           \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                \
        uvx = vec_mradds (U, lCGU, (vector signed short)(0));              \
        uvx = vec_mradds (V, lCGV, uvx);                                   \
        uvx0 = vec_mergeh (uvx,uvx);                                       \
        uvx1 = vec_mergel (uvx,uvx);                                       \
                                                                           \
        /* even row: the same chroma terms serve both rows of the pair */  \
        R0 = vec_add (Y0,vx0);                                             \
        G0 = vec_add (Y0,uvx0);                                            \
        B0 = vec_add (Y0,ux0);                                             \
        R1 = vec_add (Y1,vx1);                                             \
        G1 = vec_add (Y1,uvx1);                                            \
        B1 = vec_add (Y1,ux1);                                             \
                                                                           \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
        out_pixels(R,G,B,oute);                                            \
                                                                           \
        /* odd row */                                                      \
        R0 = vec_add (Y2,vx0);                                             \
        G0 = vec_add (Y2,uvx0);                                            \
        B0 = vec_add (Y2,ux0);                                             \
        R1 = vec_add (Y3,vx1);                                             \
        G1 = vec_add (Y3,uvx1);                                            \
        B1 = vec_add (Y3,ux1);                                             \
        R  = vec_packclp (R0,R1);                                          \
        G  = vec_packclp (G0,G1);                                          \
        B  = vec_packclp (B0,B1);                                          \
                                                                           \
                                                                           \
        out_pixels(R,G,B,outo);                                            \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
                                                                           \
    ui    += instrides_scl[1];                                             \
    vi    += instrides_scl[2];                                             \
    y1i   += instrides_scl[0];                                             \
    y2i   += instrides_scl[0];                                             \
  }                                                                        \
  return srcSliceH;                                                        \
}
403
404
405 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
406 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
407 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
408 #define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
409 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
410 #define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
411
412 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
413 DEFCSP420_CVT (yuv2_bgra32, out_argb)
414 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
415 DEFCSP420_CVT (yuv2_argb32, out_argb)
416 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
417 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
418
419
420 // uyvy|uyvy|uyvy|uyvy
421 // 0123 4567 89ab cdef
422 static
423 const vector unsigned char
424   demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
425                                    0x10,0x04,0x10,0x04,
426                                    0x10,0x08,0x10,0x08,
427                                    0x10,0x0c,0x10,0x0c),
428   demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
429                                    0x10,0x06,0x10,0x06,
430                                    0x10,0x0A,0x10,0x0A,
431                                    0x10,0x0E,0x10,0x0E),
432   demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
433                                    0x10,0x05,0x10,0x07,
434                                    0x10,0x09,0x10,0x0B,
435                                    0x10,0x0D,0x10,0x0F);
436
437 /*
438   this is so I can play live CCIR raw video
439 */
440 static int altivec_uyvy_rgb32 (SwsContext *c,
441                                unsigned char **in, int *instrides,
442                                int srcSliceY,   int srcSliceH,
443                                unsigned char **oplanes, int *outstrides)
444 {
445   int w = c->srcW;
446   int h = srcSliceH;
447   int i,j;
448   vector unsigned char uyvy;
449   vector signed   short Y,U,V;
450   vector signed   short vx,ux,uvx;
451   vector signed   short R0,G0,B0,R1,G1,B1;
452   vector unsigned char  R,G,B;
453   vector unsigned char *out;
454   ubyte *img;
455
456   img = in[0];
457   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
458
459   for (i=0;i<h;i++) {
460     for (j=0;j<w/16;j++) {
461       uyvy = vec_ld (0, img);
462       U = (vector signed short)
463         vec_perm (uyvy, (vector unsigned char)(0), demux_u);
464
465       V = (vector signed short)
466         vec_perm (uyvy, (vector unsigned char)(0), demux_v);
467
468       Y = (vector signed short)
469         vec_perm (uyvy, (vector unsigned char)(0), demux_y);
470
471       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
472
473       uyvy = vec_ld (16, img);
474       U = (vector signed short)
475         vec_perm (uyvy, (vector unsigned char)(0), demux_u);
476
477       V = (vector signed short)
478         vec_perm (uyvy, (vector unsigned char)(0), demux_v);
479
480       Y = (vector signed short)
481         vec_perm (uyvy, (vector unsigned char)(0), demux_y);
482
483       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
484
485       R  = vec_packclp (R0,R1);
486       G  = vec_packclp (G0,G1);
487       B  = vec_packclp (B0,B1);
488
489       //      vec_mstbgr24 (R,G,B, out);
490       out_rgba (R,G,B,out);
491
492       img += 32;
493     }
494   }
495 }
496
497
498
499 /* Ok currently the acceleration routine only supports
500    inputs of widths a multiple of 16
501    and heights a multiple 2
502
503    So we just fall back to the C codes for this.
504 */
505 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
506 {
507   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
508     return NULL;
509
510   /*
511     and this seems not to matter too much I tried a bunch of 
512     videos with abnormal widths and mplayer crashes else where.
513     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
514     boom with X11 bad match.
515     
516   */
517   if ((c->srcW & 0xf) != 0)    return NULL;
518
519   switch (c->srcFormat) {
520   case IMGFMT_YVU9:
521   case IMGFMT_IF09:
522   case IMGFMT_YV12:
523   case IMGFMT_I420:
524   case IMGFMT_IYUV:
525   case IMGFMT_CLPL:
526   case IMGFMT_Y800:
527   case IMGFMT_Y8:
528   case IMGFMT_NV12:
529   case IMGFMT_NV21:
530     if ((c->srcH & 0x1) != 0)
531       return NULL;
532
533     switch(c->dstFormat){
534     case IMGFMT_RGB24:
535       MSG_WARN("ALTIVEC: Color Space RGB24\n");
536       return altivec_yuv2_rgb24;
537     case IMGFMT_BGR24:
538       MSG_WARN("ALTIVEC: Color Space BGR24\n");
539       return altivec_yuv2_bgr24;
540     case IMGFMT_RGB32:
541       MSG_WARN("ALTIVEC: Color Space ARGB32\n");
542       return altivec_yuv2_argb32;
543     case IMGFMT_BGR32:
544       MSG_WARN("ALTIVEC: Color Space BGRA32\n");
545       //      return profile_altivec_bgra32;
546
547       return altivec_yuv2_bgra32;
548     default: return NULL;
549     }
550     break;
551
552   case IMGFMT_UYVY:
553     switch(c->dstFormat){
554     case IMGFMT_RGB32:
555       MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
556       return altivec_uyvy_rgb32;
557     case IMGFMT_RGB24:
558     case IMGFMT_BGR32:
559
560     default: return NULL;
561     }
562     break;
563
564   }
565   return NULL;
566 }
567
568
569 int yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
570 {
571
572   vector signed short
573     CY  = (vector signed short)(0x7fff),
574     CRV = (vector signed short)(22972),
575     CBU = (vector signed short)(29029),
576     CGU = (vector signed short)(-11276),
577     CGV = (vector signed short)(-23400),
578     OY;
579
580   vector unsigned short CSHIFT = (vector unsigned short)(1);
581
582   vector signed short Y0;
583   int brightness = c->brightness,  contrast = c->contrast,  saturation = c->saturation;
584   int64_t crv  __attribute__ ((aligned(16)));
585   int64_t cbu  __attribute__ ((aligned(16)));
586   int64_t cgu  __attribute__ ((aligned(16)));
587   int64_t cgv  __attribute__ ((aligned(16)));
588   short tmp __attribute__ ((aligned(16)));
589
590   int64_t cy  = (1<<16)-1;
591   int64_t oy  = 0;
592
593   if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
594     return;
595
596   crv = inv_table[0];
597   cbu = inv_table[1];
598   cgu = inv_table[2];
599   cgv = inv_table[3];
600
601 #if 0
602   printf ("crv: %hvx\n", CRV);
603   printf ("cbu: %hvx\n", CBU);
604   printf ("cgv: %hvx\n", CGV);
605   printf ("cgu: %hvx\n", CGU);
606
607   printf ("contrast: %d,   brightness: %d, saturation: %d\n", contrast, brightness, saturation);
608
609   printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
610 #endif
611
612   cy = (cy *contrast             )>>17;
613   crv= (crv*contrast * saturation)>>32;
614   cbu= (cbu*contrast * saturation)>>32;
615   cgu= (cgu*contrast * saturation)>>32;
616   cgv= (cgv*contrast * saturation)>>32;
617
618   oy -= 256*brightness;
619
620
621   //printf("%llx %llx %llx %llx %llx\n", cy, crv, cbu, cgu, cgv);
622
623   //  vector signed short CBU,CRV,CGU,CGY,CY;
624   tmp = cy;
625   CY = vec_lde (0, &tmp);
626   CY  = vec_splat (CY, 0);
627
628   tmp = oy;
629   OY = vec_lde (0, &tmp);
630   OY  = vec_splat (OY, 0);
631
632   tmp = crv>>3;
633   CRV = vec_lde (0, &tmp);
634   CRV  = vec_splat (CRV, 0);
635   tmp = cbu>>3;
636   CBU = vec_lde (0, &tmp);
637   CBU  = vec_splat (CBU, 0);
638
639   tmp = -(cgu>>1);
640   CGU = vec_lde (0, &tmp);
641   CGU  = vec_splat (CGU, 0);
642   tmp = -(cgv>>1);
643   CGV = vec_lde (0, &tmp);
644   CGV  = vec_splat (CGV, 0);
645
646   CSHIFT = (vector unsigned short)(2);
647 #if 1
648   c->CSHIFT = CSHIFT;
649   c->CY = CY;
650   c->OY = OY;
651   c->CRV = CRV;
652   c->CBU = CBU;
653   c->CGU = CGU;
654   c->CGV = CGV;
655 #endif
656 #if 1
657   printf ("cy:  %hvx\n", CY);
658   printf ("oy:  %hvx\n", OY);
659   printf ("crv: %hvx\n", CRV);
660   printf ("cbu: %hvx\n", CBU);
661   printf ("cgv: %hvx\n", CGV);
662   printf ("cgu: %hvx\n", CGU);
663 #endif
664 }
665
666
667 void
668 altivec_yuv2packedX (SwsContext *c,
669                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
670                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
671                        uint8_t *dest, int dstW, int dstY)
672 {
673   int i,j;
674   short tmp __attribute__((aligned (16)));
675   short *p;
676   short *f;
677   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
678   vector signed short R0,G0,B0,R1,G1,B1;
679
680   vector unsigned char R,G,B,pels[3];
681   vector unsigned char *out,*nout;
682   vector signed short   RND = (vector signed short)(1<<3);
683   vector unsigned short SCL = (vector unsigned short)(4);
684   unsigned long scratch[16] __attribute__ ((aligned (16)));
685
686   vector signed short *vYCoeffsBank, *vCCoeffsBank;
687
688   vector signed short *YCoeffs, *CCoeffs;
689
690   vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
691   vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
692
693   for (i=0;i<lumFilterSize*dstW;i++) {
694     tmp = c->vLumFilter[i];
695     p = &vYCoeffsBank[i];
696     for (j=0;j<8;j++)
697       p[j] = tmp;
698   }
699
700   for (i=0;i<chrFilterSize*dstW;i++) {
701     tmp = c->vChrFilter[i];
702     p = &vCCoeffsBank[i];
703     for (j=0;j<8;j++)
704       p[j] = tmp;
705   }
706
707   YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
708   CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
709
710   out = (vector unsigned char *)dest;
711
712   for(i=0; i<dstW; i+=16){
713     Y0 = RND;
714     Y1 = RND;
715     /* extract 16 coeffs from lumSrc */
716     for(j=0; j<lumFilterSize; j++) {
717       X0 = vec_ld (0,  &lumSrc[j][i]);
718       X1 = vec_ld (16, &lumSrc[j][i]);
719       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
720       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
721     }
722
723     U = RND;
724     V = RND;
725     /* extract 8 coeffs from U,V */
726     for(j=0; j<chrFilterSize; j++) {
727       X  = vec_ld (0, &chrSrc[j][i/2]);
728       U  = vec_mradds (X, CCoeffs[j], U);
729       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
730       V  = vec_mradds (X, CCoeffs[j], V);
731     }
732
733     /* scale and clip signals */
734     Y0 = vec_sra (Y0, SCL);
735     Y1 = vec_sra (Y1, SCL);
736     U  = vec_sra (U,  SCL);
737     V  = vec_sra (V,  SCL);
738
739     Y0 = vec_clip (Y0);
740     Y1 = vec_clip (Y1);
741     U  = vec_clip (U);
742     V  = vec_clip (V);
743
744     /* now we have
745       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
746       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
747
748       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
749       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
750       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
751     */
752
753     U0 = vec_mergeh (U,U);
754     V0 = vec_mergeh (V,V);
755
756     U1 = vec_mergel (U,U);
757     V1 = vec_mergel (V,V);
758
759     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
760     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
761
762     R  = vec_packclp (R0,R1);
763     G  = vec_packclp (G0,G1);
764     B  = vec_packclp (B0,B1);
765
766     out_rgba (R,G,B,out);
767   }
768
769   if (i < dstW) {
770     i -= 16;
771
772     Y0 = RND;
773     Y1 = RND;
774     /* extract 16 coeffs from lumSrc */
775     for(j=0; j<lumFilterSize; j++) {
776       X0 = vec_ld (0,  &lumSrc[j][i]);
777       X1 = vec_ld (16, &lumSrc[j][i]);
778       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
779       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
780     }
781
782     U = RND;
783     V = RND;
784     /* extract 8 coeffs from U,V */
785     for(j=0; j<chrFilterSize; j++) {
786       X  = vec_ld (0, &chrSrc[j][i/2]);
787       U  = vec_mradds (X, CCoeffs[j], U);
788       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
789       V  = vec_mradds (X, CCoeffs[j], V);
790     }
791
792     /* scale and clip signals */
793     Y0 = vec_sra (Y0, SCL);
794     Y1 = vec_sra (Y1, SCL);
795     U  = vec_sra (U,  SCL);
796     V  = vec_sra (V,  SCL);
797
798     Y0 = vec_clip (Y0);
799     Y1 = vec_clip (Y1);
800     U  = vec_clip (U);
801     V  = vec_clip (V);
802
803     /* now we have
804        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
805        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
806
807        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
808        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
809        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
810     */
811
812     U0 = vec_mergeh (U,U);
813     V0 = vec_mergeh (V,V);
814
815     U1 = vec_mergel (U,U);
816     V1 = vec_mergel (V,V);
817
818     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
819     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
820
821     R  = vec_packclp (R0,R1);
822     G  = vec_packclp (G0,G1);
823     B  = vec_packclp (B0,B1);
824
825     nout = (vector unsigned char *)scratch;
826     out_rgba (R,G,B,nout);
827
828     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
829   }
830
831   if (vYCoeffsBank) free (vYCoeffsBank);
832   if (vCCoeffsBank) free (vCCoeffsBank);
833
834 }
835
836