]> git.sesse.net Git - ffmpeg/blob - libswscale/yuv2rgb_altivec.c
Factorize rgb/bgr15/16/32->Y by using the preprocessor.
[ffmpeg] / libswscale / yuv2rgb_altivec.c
1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /*
24 Convert I420 YV12 to RGB in various formats,
25   it rejects images that are not in 420 formats,
26   it rejects images that don't have widths of multiples of 16,
27   it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
29
30 Lots of optimizations to be done here.
31
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33    and adds, so we currently use max/min to clip.
34
35 2. The inefficient use of chroma loading needs a bit of brushing up.
36
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38    pipeline stalls.
39
40
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to some thing in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
46
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
48
49 March 27,2004
50 PERFORMANCE ANALYSIS
51
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
56
57 720 * 480 * 30  ~10MPS
58
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
61
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
64
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
71
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
74
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
79
80 GL2 libraries work now with patch for RGB32.
81
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
86 */
87
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #ifdef HAVE_MALLOC_H
95 #include <malloc.h>
96 #endif
97 #include "rgb2rgb.h"
98 #include "swscale.h"
99 #include "swscale_internal.h"
100
101 #undef PROFILE_THE_BEAST
102 #undef INC_SCALING
103
104 typedef unsigned char ubyte;
105 typedef signed char   sbyte;
106
107
108 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
109    homogeneous vector registers x0,x1,x2 are interleaved with the
110    following technique:
111
112       o0 = vec_mergeh (x0,x1);
113       o1 = vec_perm (o0, x2, perm_rgb_0);
114       o2 = vec_perm (o0, x2, perm_rgb_1);
115       o3 = vec_mergel (x0,x1);
116       o4 = vec_perm (o3,o2,perm_rgb_2);
117       o5 = vec_perm (o3,o2,perm_rgb_3);
118
119   perm_rgb_0:   o0(RG).h v1(B) --> o1*
120               0   1  2   3   4
121              rgbr|gbrg|brgb|rgbr
122              0010 0100 1001 0010
123              0102 3145 2673 894A
124
125   perm_rgb_1:   o0(RG).h v1(B) --> o2
126               0   1  2   3   4
127              gbrg|brgb|bbbb|bbbb
128              0100 1001 1111 1111
129              B5CD 6EF7 89AB CDEF
130
131   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
132               0   1  2   3   4
133              gbrg|brgb|rgbr|gbrg
134              1111 1111 0010 0100
135              89AB CDEF 0182 3945
136
137   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
138               0   1  2   3   4
139              brgb|rgbr|gbrg|brgb
140              1001 0010 0100 1001
141              a67b 89cA BdCD eEFf
142
143 */
144 static
145 const vector unsigned char
146   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
147                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
148   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
149                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
150   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
151                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
152   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
153                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
154
/* Interleave three planar channel vectors into three vectors of packed
   triples, written to y0, y1 and y2 in that order.  Note the parameter
   order: the list names the inputs x2, x1, x0, and the merge pairs x0 with
   x1 before weaving in x2 through the perm_rgb_* tables. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                     \
do {                                                      \
    typeof(x0) pair_hi_, carry_, pair_lo_;                \
    pair_hi_ = vec_mergeh (x0,x1);                        \
    y0       = vec_perm (pair_hi_, x2, perm_rgb_0);       \
    carry_   = vec_perm (pair_hi_, x2, perm_rgb_1);       \
    pair_lo_ = vec_mergel (x0,x1);                        \
    y1       = vec_perm (pair_lo_, carry_, perm_rgb_2);   \
    y2       = vec_perm (pair_lo_, carry_, perm_rgb_3);   \
} while(0)
165
/* Interleave the three channel vectors with vec_merge3 (arguments passed
   through in the order x0,x1,x2) and store the three resulting vectors
   through ptr, post-incrementing ptr after each store.
   The expansion ends WITHOUT a semicolon: the caller supplies it, so the
   macro acts as exactly one statement and is safe in an unbraced if/else. */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)
174
/* Same as vec_mstbgr24 but hands the channels to vec_merge3 in reversed
   order (x2,x1,x0), which swaps the first and last byte of every packed
   triple.  Stores three vectors through ptr, post-incrementing ptr after
   each store.
   The expansion ends WITHOUT a semicolon: the caller supplies it, so the
   macro acts as exactly one statement and is safe in an unbraced if/else. */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)
183
/* Interleave four planar byte-channel vectors into packed 4-byte pixels and
   store them.  Bytes are merged pairwise (x0 with x1, x2 with x3) and the
   pairs are then merged as 16-bit units, so every pixel is laid out in
   memory as x0,x1,x2,x3.  The high halves of the inputs are written at
   offsets 0 and 16, the low halves at offsets 32 and 48.
   T is the vector type used for the temporaries and the stores; ptr is
   advanced by 4 elements once all four vectors are written.
   The expansion ends WITHOUT a semicolon: the caller supplies it, so the
   macro acts as exactly one statement and is safe in an unbraced if/else. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
}  while (0)
205
206 /*
207
208   | 1     0       1.4021   | | Y |
209   | 1    -0.3441 -0.7142   |x| Cb|
210   | 1     1.7718  0        | | Cr|
211
212
213   Y:      [-128 127]
214   Cb/Cr : [-128 127]
215
216   Typical YUV conversion works on Y: 0-255; this version has been optimized for JPEG decode.
217
218 */
219
220
221
222
/* Zero-extend bytes 0..7 of x into eight 16-bit lanes: every output pair is
   a byte taken from the all-zero second operand (selector 0x10) followed by
   one byte of x. */
#define vec_unh(x)                                                    \
    ((vector signed short)                                            \
        vec_perm ((x), (typeof(x)){0},                                \
                  ((vector unsigned char){                            \
                      0x10,0x00, 0x10,0x01, 0x10,0x02, 0x10,0x03,     \
                      0x10,0x04, 0x10,0x05, 0x10,0x06, 0x10,0x07})))
/* Zero-extend bytes 8..15 of x into eight 16-bit lanes: every output pair is
   a byte taken from the all-zero second operand (selector 0x10) followed by
   one byte of x. */
#define vec_unl(x)                                                    \
    ((vector signed short)                                            \
        vec_perm ((x), (typeof(x)){0},                                \
                  ((vector unsigned char){                            \
                      0x10,0x08, 0x10,0x09, 0x10,0x0A, 0x10,0x0B,     \
                      0x10,0x0C, 0x10,0x0D, 0x10,0x0E, 0x10,0x0F})))
233
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                                          \
    vec_min (vec_max (x, ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16})), \
                         ((vector signed short){235,235,235,235,235,235,235,235}))
237
/* Pack two vectors of signed 16-bit lanes into one vector of unsigned bytes.
   Negative lanes are first raised to 0 with vec_max; the values are then
   reinterpreted as unsigned shorts and narrowed with vec_packs. */
#define vec_packclp(x,y)                                                     \
    (vector unsigned char)                                                   \
        vec_packs ((vector unsigned short)                                   \
                       vec_max (x, ((vector signed short) {0})),             \
                   (vector unsigned short)                                   \
                       vec_max (y, ((vector signed short) {0})))
242
243 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a)){0}),a,a,a,ptr)
244
245
246 static inline void cvtyuvtoRGB (SwsContext *c,
247                                 vector signed short Y, vector signed short U, vector signed short V,
248                                 vector signed short *R, vector signed short *G, vector signed short *B)
249 {
250     vector signed   short vx,ux,uvx;
251
252     Y = vec_mradds (Y, c->CY, c->OY);
253     U  = vec_sub (U,(vector signed short)
254                     vec_splat((vector signed short){128},0));
255     V  = vec_sub (V,(vector signed short)
256                     vec_splat((vector signed short){128},0));
257
258     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
259     ux = vec_sl (U, c->CSHIFT);
260     *B = vec_mradds (ux, c->CBU, Y);
261
262     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
263     vx = vec_sl (V, c->CSHIFT);
264     *R = vec_mradds (vx, c->CRV, Y);
265
266     // uvx = ((CGU*u) + (CGV*v))>>15;
267     uvx = vec_mradds (U, c->CGU, Y);
268     *G  = vec_mradds (V, c->CGV, uvx);
269 }
270
271
272 /*
273   ------------------------------------------------------------------------------
274   CS converters
275   ------------------------------------------------------------------------------
276 */
277
278
/*
  DEFCSP420_CVT(name, out_pixels) defines altivec_<name>(), a slice
  converter from planar YUV (in[0]=Y, in[1]=U, in[2]=V) to a packed format.

  Every outer iteration handles two luma rows that share one chroma row
  (srcSliceH/2 iterations); every inner iteration handles 16 pixels of both
  rows (srcW/16 iterations).  out_pixels(R,G,B,ptr) has to store those 16
  pixels and advance ptr past them.  After each row pair both output
  pointers are additionally moved by outstrides[0]/16 vectors.
  The function returns srcSliceH.
*/
#define DEFCSP420_CVT(name,out_pixels)                                       \
static int altivec_##name (SwsContext *c,                                    \
                           unsigned char **in, int *instrides,               \
                           int srcSliceY,        int srcSliceH,              \
                           unsigned char **oplanes, int *outstrides)         \
{                                                                            \
    const int w         = c->srcW;                                           \
    const int row_pairs = srcSliceH/2;                                       \
    const int groups    = w/16;                                              \
    /* the inner loop moves the luma pointers by w and chroma by w/2 */      \
    const int luma_skip = instrides[0]*2-w;                                  \
    const int cb_skip   = instrides[1]-w/2;                                  \
    const int cr_skip   = instrides[2]-w/2;                                  \
    /* control word handed to the vec_dstst store-stream hints */            \
    const int dst_ctl   = 0x02000002|(((w*3+32)/32)<<16);                    \
    int pair, grp;                                                           \
                                                                             \
    vector unsigned char *vp;                                                \
    vector unsigned char y0,y1;                                              \
    vector signed char   u,v;                                                \
    vector signed short  Y0,Y1,Y2,Y3;                                        \
    vector signed short  U,V;                                                \
    vector signed short  vx,ux,uvx;                                          \
    vector signed short  vx0,ux0,uvx0;                                       \
    vector signed short  vx1,ux1,uvx1;                                       \
    vector signed short  R0,G0,B0;                                           \
    vector signed short  R1,G1,B1;                                           \
    vector unsigned char R,G,B;                                              \
                                                                             \
    /* loop-invariant vectors */                                             \
    vector signed short  zero = (vector signed short){0};                    \
    vector signed char   bias = (vector signed char)                         \
                                vec_splat((vector signed char){128},0);      \
                                                                             \
    /* local copies of the conversion coefficients */                        \
    vector signed short  lCY  = c->CY;                                       \
    vector signed short  lOY  = c->OY;                                       \
    vector signed short  lCRV = c->CRV;                                      \
    vector signed short  lCBU = c->CBU;                                      \
    vector signed short  lCGU = c->CGU;                                      \
    vector signed short  lCGV = c->CGV;                                      \
    vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                             \
    /* even and odd luma row of the current pair, plus the chroma rows */    \
    ubyte *y1i = in[0];                                                      \
    ubyte *y2i = in[0]+instrides[0];                                         \
    ubyte *ui  = in[1];                                                      \
    ubyte *vi  = in[2];                                                      \
                                                                             \
    /* even and odd output row of the current pair */                        \
    vector unsigned char *oute = (vector unsigned char *)                    \
        (oplanes[0]+srcSliceY*outstrides[0]);                                \
    vector unsigned char *outo = (vector unsigned char *)                    \
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                  \
                                                                             \
    for (pair=0; pair<row_pairs; pair++) {                                   \
        vec_dstst (outo, dst_ctl, 0);                                        \
        vec_dstst (oute, dst_ctl, 1);                                        \
                                                                             \
        for (grp=0; grp<groups; grp++) {                                     \
            /* possibly unaligned loads: two aligned vectors are combined */ \
            /* with the permute mask produced by vec_lvsl                 */ \
            vp = (vector unsigned char *)y1i;                                \
            y0 = (vector unsigned char)                                      \
                 vec_perm (vp[0], vp[1], vec_lvsl (0, y1i));                 \
                                                                             \
            vp = (vector unsigned char *)y2i;                                \
            y1 = (vector unsigned char)                                      \
                 vec_perm (vp[0], vp[1], vec_lvsl (0, y2i));                 \
                                                                             \
            vp = (vector unsigned char *)ui;                                 \
            u  = (vector signed char)                                        \
                 vec_perm (vp[0], vp[1], vec_lvsl (0, ui));                  \
                                                                             \
            vp = (vector unsigned char *)vi;                                 \
            v  = (vector signed char)                                        \
                 vec_perm (vp[0], vp[1], vec_lvsl (0, vi));                  \
                                                                             \
            /* re-centre chroma, then widen the first 8 samples */           \
            u  = (vector signed char) vec_sub (u, bias);                     \
            v  = (vector signed char) vec_sub (v, bias);                     \
            U  = vec_unpackh (u);                                            \
            V  = vec_unpackh (v);                                            \
                                                                             \
            /* widen luma and apply Y' = Y*CY + OY */                        \
            Y0 = vec_mradds (vec_unh (y0), lCY, lOY);                        \
            Y1 = vec_mradds (vec_unl (y0), lCY, lOY);                        \
            Y2 = vec_mradds (vec_unh (y1), lCY, lOY);                        \
            Y3 = vec_mradds (vec_unl (y1), lCY, lOY);                        \
                                                                             \
            /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15, each value duplicated */   \
            /* so one chroma sample covers two neighbouring pixels      */   \
            ux  = vec_sl (U, lCSHIFT);                                       \
            ux  = vec_mradds (ux, lCBU, zero);                               \
            ux0 = vec_mergeh (ux,ux);                                        \
            ux1 = vec_mergel (ux,ux);                                        \
                                                                             \
            /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15 */                          \
            vx  = vec_sl (V, lCSHIFT);                                       \
            vx  = vec_mradds (vx, lCRV, zero);                               \
            vx0 = vec_mergeh (vx,vx);                                        \
            vx1 = vec_mergel (vx,vx);                                        \
                                                                             \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                              \
            uvx  = vec_mradds (U, lCGU, zero);                               \
            uvx  = vec_mradds (V, lCGV, uvx);                                \
            uvx0 = vec_mergeh (uvx,uvx);                                     \
            uvx1 = vec_mergel (uvx,uvx);                                     \
                                                                             \
            /* even row */                                                   \
            R0 = vec_add (Y0,vx0);                                           \
            G0 = vec_add (Y0,uvx0);                                          \
            B0 = vec_add (Y0,ux0);                                           \
            R1 = vec_add (Y1,vx1);                                           \
            G1 = vec_add (Y1,uvx1);                                          \
            B1 = vec_add (Y1,ux1);                                           \
                                                                             \
            R  = vec_packclp (R0,R1);                                        \
            G  = vec_packclp (G0,G1);                                        \
            B  = vec_packclp (B0,B1);                                        \
                                                                             \
            out_pixels(R,G,B,oute);                                          \
                                                                             \
            /* odd row, same chroma terms */                                 \
            R0 = vec_add (Y2,vx0);                                           \
            G0 = vec_add (Y2,uvx0);                                          \
            B0 = vec_add (Y2,ux0);                                           \
            R1 = vec_add (Y3,vx1);                                           \
            G1 = vec_add (Y3,uvx1);                                          \
            B1 = vec_add (Y3,ux1);                                           \
                                                                             \
            R  = vec_packclp (R0,R1);                                        \
            G  = vec_packclp (G0,G1);                                        \
            B  = vec_packclp (B0,B1);                                        \
                                                                             \
            out_pixels(R,G,B,outo);                                          \
                                                                             \
            y1i += 16;                                                       \
            y2i += 16;                                                       \
            ui  += 8;                                                        \
            vi  += 8;                                                        \
        }                                                                    \
                                                                             \
        outo += (outstrides[0])>>4;                                          \
        oute += (outstrides[0])>>4;                                          \
                                                                             \
        ui   += cb_skip;                                                     \
        vi   += cr_skip;                                                     \
        y1i  += luma_skip;                                                   \
        y2i  += luma_skip;                                                   \
    }                                                                        \
    return srcSliceH;                                                        \
}
442
443
/* Output adapters for DEFCSP420_CVT: each takes the three channel vectors
   (a,b,c) plus the destination pointer and forwards them to a store macro
   in the byte order its name spells out.  The 32-bit variants fill the
   fourth channel with an all-zero vector. */
#define out_abgr(a,b,c,ptr)  \
    vec_mstrgb32(typeof(a),((typeof (a)){0}),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  \
    vec_mstrgb32(typeof(a),c,b,a,((typeof (a)){0}),ptr)
#define out_rgba(a,b,c,ptr)  \
    vec_mstrgb32(typeof(a),a,b,c,((typeof (a)){0}),ptr)
#define out_argb(a,b,c,ptr)  \
    vec_mstrgb32(typeof(a),((typeof (a)){0}),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) \
    vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) \
    vec_mstbgr24(a,b,c,ptr)
450
451 DEFCSP420_CVT (yuv2_abgr, out_abgr)
452 #if 1
453 DEFCSP420_CVT (yuv2_bgra, out_bgra)
454 #else
455 static int altivec_yuv2_bgra32 (SwsContext *c,
456                                 unsigned char **in, int *instrides,
457                                 int srcSliceY,        int srcSliceH,
458                                 unsigned char **oplanes, int *outstrides)
459 {
460     int w = c->srcW;
461     int h = srcSliceH;
462     int i,j;
463     int instrides_scl[3];
464     vector unsigned char y0,y1;
465
466     vector signed char  u,v;
467
468     vector signed short Y0,Y1,Y2,Y3;
469     vector signed short U,V;
470     vector signed short vx,ux,uvx;
471     vector signed short vx0,ux0,uvx0;
472     vector signed short vx1,ux1,uvx1;
473     vector signed short R0,G0,B0;
474     vector signed short R1,G1,B1;
475     vector unsigned char R,G,B;
476
477     vector unsigned char *uivP, *vivP;
478     vector unsigned char align_perm;
479
480     vector signed short
481         lCY  = c->CY,
482         lOY  = c->OY,
483         lCRV = c->CRV,
484         lCBU = c->CBU,
485         lCGU = c->CGU,
486         lCGV = c->CGV;
487
488     vector unsigned short lCSHIFT = c->CSHIFT;
489
490     ubyte *y1i   = in[0];
491     ubyte *y2i   = in[0]+w;
492     ubyte *ui    = in[1];
493     ubyte *vi    = in[2];
494
495     vector unsigned char *oute
496         = (vector unsigned char *)
497           (oplanes[0]+srcSliceY*outstrides[0]);
498     vector unsigned char *outo
499         = (vector unsigned char *)
500           (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
501
502
503     instrides_scl[0] = instrides[0];
504     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
505     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
506
507
508     for (i=0;i<h/2;i++) {
509         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
510         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
511
512         for (j=0;j<w/16;j++) {
513
514             y0 = vec_ldl (0,y1i);
515             y1 = vec_ldl (0,y2i);
516             uivP = (vector unsigned char *)ui;
517             vivP = (vector unsigned char *)vi;
518
519             align_perm = vec_lvsl (0, ui);
520             u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
521
522             align_perm = vec_lvsl (0, vi);
523             v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
524             u  = (vector signed char)
525                  vec_sub (u,(vector signed char)
526                           vec_splat((vector signed char){128},0));
527
528             v  = (vector signed char)
529                  vec_sub (v, (vector signed char)
530                           vec_splat((vector signed char){128},0));
531
532             U  = vec_unpackh (u);
533             V  = vec_unpackh (v);
534
535
536             Y0 = vec_unh (y0);
537             Y1 = vec_unl (y0);
538             Y2 = vec_unh (y1);
539             Y3 = vec_unl (y1);
540
541             Y0 = vec_mradds (Y0, lCY, lOY);
542             Y1 = vec_mradds (Y1, lCY, lOY);
543             Y2 = vec_mradds (Y2, lCY, lOY);
544             Y3 = vec_mradds (Y3, lCY, lOY);
545
546             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
547             ux = vec_sl (U, lCSHIFT);
548             ux = vec_mradds (ux, lCBU, (vector signed short){0});
549             ux0  = vec_mergeh (ux,ux);
550             ux1  = vec_mergel (ux,ux);
551
552             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
553             vx = vec_sl (V, lCSHIFT);
554             vx = vec_mradds (vx, lCRV, (vector signed short){0});
555             vx0  = vec_mergeh (vx,vx);
556             vx1  = vec_mergel (vx,vx);
557             /* uvx = ((CGU*u) + (CGV*v))>>15 */
558             uvx = vec_mradds (U, lCGU, (vector signed short){0});
559             uvx = vec_mradds (V, lCGV, uvx);
560             uvx0 = vec_mergeh (uvx,uvx);
561             uvx1 = vec_mergel (uvx,uvx);
562             R0 = vec_add (Y0,vx0);
563             G0 = vec_add (Y0,uvx0);
564             B0 = vec_add (Y0,ux0);
565             R1 = vec_add (Y1,vx1);
566             G1 = vec_add (Y1,uvx1);
567             B1 = vec_add (Y1,ux1);
568             R  = vec_packclp (R0,R1);
569             G  = vec_packclp (G0,G1);
570             B  = vec_packclp (B0,B1);
571
572             out_argb(R,G,B,oute);
573             R0 = vec_add (Y2,vx0);
574             G0 = vec_add (Y2,uvx0);
575             B0 = vec_add (Y2,ux0);
576             R1 = vec_add (Y3,vx1);
577             G1 = vec_add (Y3,uvx1);
578             B1 = vec_add (Y3,ux1);
579             R  = vec_packclp (R0,R1);
580             G  = vec_packclp (G0,G1);
581             B  = vec_packclp (B0,B1);
582
583             out_argb(R,G,B,outo);
584             y1i  += 16;
585             y2i  += 16;
586             ui   += 8;
587             vi   += 8;
588
589         }
590
591         outo  += (outstrides[0])>>4;
592         oute  += (outstrides[0])>>4;
593
594         ui    += instrides_scl[1];
595         vi    += instrides_scl[2];
596         y1i   += instrides_scl[0];
597         y2i   += instrides_scl[0];
598     }
599     return srcSliceH;
600 }
601
602 #endif
603
604
605 DEFCSP420_CVT (yuv2_rgba, out_rgba)
606 DEFCSP420_CVT (yuv2_argb, out_argb)
607 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
608 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
609
610
611 // uyvy|uyvy|uyvy|uyvy
612 // 0123 4567 89ab cdef
613 static
614 const vector unsigned char
615     demux_u = {0x10,0x00,0x10,0x00,
616                0x10,0x04,0x10,0x04,
617                0x10,0x08,0x10,0x08,
618                0x10,0x0c,0x10,0x0c},
619     demux_v = {0x10,0x02,0x10,0x02,
620                0x10,0x06,0x10,0x06,
621                0x10,0x0A,0x10,0x0A,
622                0x10,0x0E,0x10,0x0E},
623     demux_y = {0x10,0x01,0x10,0x03,
624                0x10,0x05,0x10,0x07,
625                0x10,0x09,0x10,0x0B,
626                0x10,0x0D,0x10,0x0F};
627
628 /*
629   this is so I can play live CCIR raw video
630 */
631 static int altivec_uyvy_rgb32 (SwsContext *c,
632                                unsigned char **in, int *instrides,
633                                int srcSliceY,        int srcSliceH,
634                                unsigned char **oplanes, int *outstrides)
635 {
636     int w = c->srcW;
637     int h = srcSliceH;
638     int i,j;
639     vector unsigned char uyvy;
640     vector signed   short Y,U,V;
641     vector signed   short R0,G0,B0,R1,G1,B1;
642     vector unsigned char  R,G,B;
643     vector unsigned char *out;
644     ubyte *img;
645
646     img = in[0];
647     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
648
649     for (i=0;i<h;i++) {
650         for (j=0;j<w/16;j++) {
651             uyvy = vec_ld (0, img);
652             U = (vector signed short)
653                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
654
655             V = (vector signed short)
656                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
657
658             Y = (vector signed short)
659                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
660
661             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
662
663             uyvy = vec_ld (16, img);
664             U = (vector signed short)
665                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
666
667             V = (vector signed short)
668                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
669
670             Y = (vector signed short)
671                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
672
673             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
674
675             R  = vec_packclp (R0,R1);
676             G  = vec_packclp (G0,G1);
677             B  = vec_packclp (B0,B1);
678
679             //      vec_mstbgr24 (R,G,B, out);
680             out_rgba (R,G,B,out);
681
682             img += 32;
683         }
684     }
685     return srcSliceH;
686 }
687
688
689
690 /* Ok currently the acceleration routine only supports
691    inputs of widths a multiple of 16
692    and heights a multiple 2
693
694    So we just fall back to the C codes for this.
695 */
696 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
697 {
698     if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
699         return NULL;
700
701     /*
702       and this seems not to matter too much I tried a bunch of
703       videos with abnormal widths and MPlayer crashes elsewhere.
704       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
705       boom with X11 bad match.
706
707     */
708     if ((c->srcW & 0xf) != 0)    return NULL;
709
710     switch (c->srcFormat) {
711     case PIX_FMT_YUV410P:
712     case PIX_FMT_YUV420P:
713     /*case IMGFMT_CLPL:        ??? */
714     case PIX_FMT_GRAY8:
715     case PIX_FMT_NV12:
716     case PIX_FMT_NV21:
717         if ((c->srcH & 0x1) != 0)
718             return NULL;
719
720         switch(c->dstFormat){
721         case PIX_FMT_RGB24:
722             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
723             return altivec_yuv2_rgb24;
724         case PIX_FMT_BGR24:
725             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
726             return altivec_yuv2_bgr24;
727         case PIX_FMT_ARGB:
728             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
729             return altivec_yuv2_argb;
730         case PIX_FMT_ABGR:
731             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
732             return altivec_yuv2_abgr;
733         case PIX_FMT_RGBA:
734             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
735             return altivec_yuv2_rgba;
736         case PIX_FMT_BGRA:
737             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
738             return altivec_yuv2_bgra;
739         default: return NULL;
740         }
741         break;
742
743     case PIX_FMT_UYVY422:
744         switch(c->dstFormat){
745         case PIX_FMT_BGR32:
746             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
747             return altivec_uyvy_rgb32;
748         default: return NULL;
749         }
750         break;
751
752     }
753     return NULL;
754 }
755
756 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
757 {
758     union {
759         signed short tmp[8] __attribute__ ((aligned(16)));
760         vector signed short vec;
761     } buf;
762
763     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
764     buf.tmp[1] =  -256*brightness;                                      //oy
765     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
766     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
767     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
768     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
769
770
771     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
772     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
773     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
774     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
775     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
776     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
777     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
778 #if 0
779     {
780     int i;
781     char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
782     for (i=0; i<6; i++)
783         printf("%s %d ", v[i],buf.tmp[i] );
784         printf("\n");
785     }
786 #endif
787     return;
788 }
789
790
791 void
792 altivec_yuv2packedX (SwsContext *c,
793                      int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
794                      int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
795                      uint8_t *dest, int dstW, int dstY)
796 {
797     int i,j;
798     vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
799     vector signed short R0,G0,B0,R1,G1,B1;
800
801     vector unsigned char R,G,B;
802     vector unsigned char *out,*nout;
803
804     vector signed short   RND = vec_splat_s16(1<<3);
805     vector unsigned short SCL = vec_splat_u16(4);
806     unsigned long scratch[16] __attribute__ ((aligned (16)));
807
808     vector signed short *YCoeffs, *CCoeffs;
809
810     YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
811     CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
812
813     out = (vector unsigned char *)dest;
814
815     for (i=0; i<dstW; i+=16){
816         Y0 = RND;
817         Y1 = RND;
818         /* extract 16 coeffs from lumSrc */
819         for (j=0; j<lumFilterSize; j++) {
820             X0 = vec_ld (0,  &lumSrc[j][i]);
821             X1 = vec_ld (16, &lumSrc[j][i]);
822             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
823             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
824         }
825
826         U = RND;
827         V = RND;
828         /* extract 8 coeffs from U,V */
829         for (j=0; j<chrFilterSize; j++) {
830             X  = vec_ld (0, &chrSrc[j][i/2]);
831             U  = vec_mradds (X, CCoeffs[j], U);
832             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
833             V  = vec_mradds (X, CCoeffs[j], V);
834         }
835
836         /* scale and clip signals */
837         Y0 = vec_sra (Y0, SCL);
838         Y1 = vec_sra (Y1, SCL);
839         U  = vec_sra (U,  SCL);
840         V  = vec_sra (V,  SCL);
841
842         Y0 = vec_clip_s16 (Y0);
843         Y1 = vec_clip_s16 (Y1);
844         U  = vec_clip_s16 (U);
845         V  = vec_clip_s16 (V);
846
847         /* now we have
848           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
849           U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
850
851           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
852           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
853           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
854         */
855
856         U0 = vec_mergeh (U,U);
857         V0 = vec_mergeh (V,V);
858
859         U1 = vec_mergel (U,U);
860         V1 = vec_mergel (V,V);
861
862         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
863         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
864
865         R  = vec_packclp (R0,R1);
866         G  = vec_packclp (G0,G1);
867         B  = vec_packclp (B0,B1);
868
869         switch(c->dstFormat) {
870             case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
871             case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
872             case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
873             case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
874             case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
875             case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
876             default:
877             {
878                 /* If this is reached, the caller should have called yuv2packedXinC
879                    instead. */
880                 static int printed_error_message;
881                 if (!printed_error_message) {
882                     av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
883                            sws_format_name(c->dstFormat));
884                     printed_error_message=1;
885                 }
886                 return;
887             }
888         }
889     }
890
891     if (i < dstW) {
892         i -= 16;
893
894         Y0 = RND;
895         Y1 = RND;
896         /* extract 16 coeffs from lumSrc */
897         for (j=0; j<lumFilterSize; j++) {
898             X0 = vec_ld (0,  &lumSrc[j][i]);
899             X1 = vec_ld (16, &lumSrc[j][i]);
900             Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901             Y1 = vec_mradds (X1, YCoeffs[j], Y1);
902         }
903
904         U = RND;
905         V = RND;
906         /* extract 8 coeffs from U,V */
907         for (j=0; j<chrFilterSize; j++) {
908             X  = vec_ld (0, &chrSrc[j][i/2]);
909             U  = vec_mradds (X, CCoeffs[j], U);
910             X  = vec_ld (0, &chrSrc[j][i/2+2048]);
911             V  = vec_mradds (X, CCoeffs[j], V);
912         }
913
914         /* scale and clip signals */
915         Y0 = vec_sra (Y0, SCL);
916         Y1 = vec_sra (Y1, SCL);
917         U  = vec_sra (U,  SCL);
918         V  = vec_sra (V,  SCL);
919
920         Y0 = vec_clip_s16 (Y0);
921         Y1 = vec_clip_s16 (Y1);
922         U  = vec_clip_s16 (U);
923         V  = vec_clip_s16 (V);
924
925         /* now we have
926            Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
927            U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
928
929            Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
930            U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
931            V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
932         */
933
934         U0 = vec_mergeh (U,U);
935         V0 = vec_mergeh (V,V);
936
937         U1 = vec_mergel (U,U);
938         V1 = vec_mergel (V,V);
939
940         cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941         cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
942
943         R  = vec_packclp (R0,R1);
944         G  = vec_packclp (G0,G1);
945         B  = vec_packclp (B0,B1);
946
947         nout = (vector unsigned char *)scratch;
948         switch(c->dstFormat) {
949             case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
950             case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
951             case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
952             case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
953             case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
954             case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
955             default:
956                 /* Unreachable, I think. */
957                 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
958                        sws_format_name(c->dstFormat));
959                 return;
960         }
961
962         memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
963     }
964
965 }