1 /*
2  * AltiVec acceleration for colorspace conversion
3  *
4  * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /*
24 Convert I420 YV12 to RGB in various formats,
25   it rejects images that are not in 420 formats,
26   it rejects images that don't have widths of multiples of 16,
27   it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
29
30 Lots of optimizations to be done here.
31
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33    and adds, so we currently use max/min to clip.
34
35 2. The inefficient use of chroma loading needs a bit of brushing up.
36
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
38    pipeline stalls.
39
40
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to some thing in swscale.
CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
46
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
48
49 March 27,2004
50 PERFORMANCE ANALYSIS
51
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53 used as test.
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55 same sequence.
56
57 720 * 480 * 30  ~10MPS
58
59 so we have roughly 10 clocks per pixel. This is too high, something has
60 to be wrong.
61
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63 need for vec_min.
64
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
71
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
74
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
79
80 GL2 libraries work now with patch for RGB32.
81
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
85 adjustment.
86 */
87
88 #include <stdio.h>
89 #include <stdlib.h>
90 #include <string.h>
91 #include <inttypes.h>
92 #include <assert.h>
93 #include "config.h"
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/cpu.h"
98
99 #undef PROFILE_THE_BEAST
100 #undef INC_SCALING
101
102 typedef unsigned char ubyte;
103 typedef signed char   sbyte;
104
105
106 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
107    homogeneous vector registers x0,x1,x2 are interleaved with the
108    following technique:
109
110       o0 = vec_mergeh (x0,x1);
111       o1 = vec_perm (o0, x2, perm_rgb_0);
112       o2 = vec_perm (o0, x2, perm_rgb_1);
113       o3 = vec_mergel (x0,x1);
114       o4 = vec_perm (o3,o2,perm_rgb_2);
115       o5 = vec_perm (o3,o2,perm_rgb_3);
116
117   perm_rgb_0:   o0(RG).h v1(B) --> o1*
118               0   1  2   3   4
119              rgbr|gbrg|brgb|rgbr
120              0010 0100 1001 0010
121              0102 3145 2673 894A
122
123   perm_rgb_1:   o0(RG).h v1(B) --> o2
124               0   1  2   3   4
125              gbrg|brgb|bbbb|bbbb
126              0100 1001 1111 1111
127              B5CD 6EF7 89AB CDEF
128
129   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
130               0   1  2   3   4
131              gbrg|brgb|rgbr|gbrg
132              1111 1111 0010 0100
133              89AB CDEF 0182 3945
134
135   perm_rgb_2:   o3(RG).l o2(rgbB.l) ---> o5*
136               0   1  2   3   4
137              brgb|rgbr|gbrg|brgb
138              1001 0010 0100 1001
139              a67b 89cA BdCD eEFf
140
141 */
142 static
143 const vector unsigned char
144   perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
145                 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
146   perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
147                 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
148   perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
149                 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
150   perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
151                 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
152
153 #define vec_merge3(x2,x1,x0,y0,y1,y2)       \
154 do {                                        \
155     __typeof__(x0) o0,o2,o3;                \
156         o0 = vec_mergeh (x0,x1);            \
157         y0 = vec_perm (o0, x2, perm_rgb_0); \
158         o2 = vec_perm (o0, x2, perm_rgb_1); \
159         o3 = vec_mergel (x0,x1);            \
160         y1 = vec_perm (o3,o2,perm_rgb_2);   \
161         y2 = vec_perm (o3,o2,perm_rgb_3);   \
162 } while(0)
163
164 #define vec_mstbgr24(x0,x1,x2,ptr)      \
165 do {                                    \
166     __typeof__(x0) _0,_1,_2;            \
167     vec_merge3 (x0,x1,x2,_0,_1,_2);     \
168     vec_st (_0, 0, ptr++);              \
169     vec_st (_1, 0, ptr++);              \
170     vec_st (_2, 0, ptr++);              \
171 }  while (0)
172
173 #define vec_mstrgb24(x0,x1,x2,ptr)      \
174 do {                                    \
175     __typeof__(x0) _0,_1,_2;            \
176     vec_merge3 (x2,x1,x0,_0,_1,_2);     \
177     vec_st (_0, 0, ptr++);              \
178     vec_st (_1, 0, ptr++);              \
179     vec_st (_2, 0, ptr++);              \
180 }  while (0)
181
182 /* pack the pixels in rgb0 format
183    msb R
184    lsb 0
185 */
186 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
187 do {                                                                          \
188     T _0,_1,_2,_3;                                                            \
189     _0 = vec_mergeh (x0,x1);                                                  \
190     _1 = vec_mergeh (x2,x3);                                                  \
191     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
192     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
193     vec_st (_2, 0*16, (T *)ptr);                                              \
194     vec_st (_3, 1*16, (T *)ptr);                                              \
195     _0 = vec_mergel (x0,x1);                                                  \
196     _1 = vec_mergel (x2,x3);                                                  \
197     _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
198     _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
199     vec_st (_2, 2*16, (T *)ptr);                                              \
200     vec_st (_3, 3*16, (T *)ptr);                                              \
201     ptr += 4;                                                                 \
202 }  while (0)
203
204 /*
205
206   | 1     0       1.4021   | | Y |
207   | 1    -0.3441 -0.7142   |x| Cb|
208   | 1     1.7718  0        | | Cr|
209
210
211   Y:      [-128 127]
212   Cb/Cr : [-128 127]
213
214   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
215
216 */
217
218
219
220
221 #define vec_unh(x) \
222     (vector signed short) \
223         vec_perm(x,(__typeof__(x)){0}, \
224                  ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
225                                          0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
226 #define vec_unl(x) \
227     (vector signed short) \
228         vec_perm(x,(__typeof__(x)){0}, \
229                  ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
230                                          0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
231
232 #define vec_clip_s16(x) \
233     vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
234                          ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
235
236 #define vec_packclp(x,y) \
237     (vector unsigned char)vec_packs \
238         ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
239          (vector unsigned short)vec_max (y,((vector signed short) {0})))
240
241 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
242
243
244 static inline void cvtyuvtoRGB (SwsContext *c,
245                                 vector signed short Y, vector signed short U, vector signed short V,
246                                 vector signed short *R, vector signed short *G, vector signed short *B)
247 {
248     vector signed   short vx,ux,uvx;
249
250     Y = vec_mradds (Y, c->CY, c->OY);
251     U  = vec_sub (U,(vector signed short)
252                     vec_splat((vector signed short){128},0));
253     V  = vec_sub (V,(vector signed short)
254                     vec_splat((vector signed short){128},0));
255
256     //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
257     ux = vec_sl (U, c->CSHIFT);
258     *B = vec_mradds (ux, c->CBU, Y);
259
260     // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
261     vx = vec_sl (V, c->CSHIFT);
262     *R = vec_mradds (vx, c->CRV, Y);
263
264     // uvx = ((CGU*u) + (CGV*v))>>15;
265     uvx = vec_mradds (U, c->CGU, Y);
266     *G  = vec_mradds (V, c->CGV, uvx);
267 }
268
269
270 /*
271   ------------------------------------------------------------------------------
272   CS converters
273   ------------------------------------------------------------------------------
274 */
275
276
277 #define DEFCSP420_CVT(name,out_pixels)                                  \
278 static int altivec_##name (SwsContext *c,                               \
279                            const unsigned char **in, int *instrides,    \
280                            int srcSliceY,        int srcSliceH,         \
281                            unsigned char **oplanes, int *outstrides)    \
282 {                                                                       \
283     int w = c->srcW;                                                    \
284     int h = srcSliceH;                                                  \
285     int i,j;                                                            \
286     int instrides_scl[3];                                               \
287     vector unsigned char y0,y1;                                         \
288                                                                         \
289     vector signed char  u,v;                                            \
290                                                                         \
291     vector signed short Y0,Y1,Y2,Y3;                                    \
292     vector signed short U,V;                                            \
293     vector signed short vx,ux,uvx;                                      \
294     vector signed short vx0,ux0,uvx0;                                   \
295     vector signed short vx1,ux1,uvx1;                                   \
296     vector signed short R0,G0,B0;                                       \
297     vector signed short R1,G1,B1;                                       \
298     vector unsigned char R,G,B;                                         \
299                                                                         \
300     vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
301     vector unsigned char align_perm;                                    \
302                                                                         \
303     vector signed short                                                 \
304         lCY  = c->CY,                                                   \
305         lOY  = c->OY,                                                   \
306         lCRV = c->CRV,                                                  \
307         lCBU = c->CBU,                                                  \
308         lCGU = c->CGU,                                                  \
309         lCGV = c->CGV;                                                  \
310                                                                         \
311     vector unsigned short lCSHIFT = c->CSHIFT;                          \
312                                                                         \
313     const ubyte *y1i   = in[0];                                         \
314     const ubyte *y2i   = in[0]+instrides[0];                            \
315     const ubyte *ui    = in[1];                                         \
316     const ubyte *vi    = in[2];                                         \
317                                                                         \
318     vector unsigned char *oute                                          \
319         = (vector unsigned char *)                                      \
320             (oplanes[0]+srcSliceY*outstrides[0]);                       \
321     vector unsigned char *outo                                          \
322         = (vector unsigned char *)                                      \
323             (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
324                                                                         \
325                                                                         \
326     instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
327     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
328     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
329                                                                         \
330                                                                         \
331     for (i=0;i<h/2;i++) {                                               \
332         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
333         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
334                                                                         \
335         for (j=0;j<w/16;j++) {                                          \
336                                                                         \
337             y1ivP = (vector unsigned char *)y1i;                        \
338             y2ivP = (vector unsigned char *)y2i;                        \
339             uivP  = (vector unsigned char *)ui;                         \
340             vivP  = (vector unsigned char *)vi;                         \
341                                                                         \
342             align_perm = vec_lvsl (0, y1i);                             \
343             y0 = (vector unsigned char)                                 \
344                  vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
345                                                                         \
346             align_perm = vec_lvsl (0, y2i);                             \
347             y1 = (vector unsigned char)                                 \
348                  vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
349                                                                         \
350             align_perm = vec_lvsl (0, ui);                              \
351             u = (vector signed char)                                    \
352                 vec_perm (uivP[0], uivP[1], align_perm);                \
353                                                                         \
354             align_perm = vec_lvsl (0, vi);                              \
355             v = (vector signed char)                                    \
356                 vec_perm (vivP[0], vivP[1], align_perm);                \
357                                                                         \
358             u  = (vector signed char)                                   \
359                  vec_sub (u,(vector signed char)                        \
360                           vec_splat((vector signed char){128},0));      \
361             v  = (vector signed char)                                   \
362                  vec_sub (v,(vector signed char)                        \
363                           vec_splat((vector signed char){128},0));      \
364                                                                         \
365             U  = vec_unpackh (u);                                       \
366             V  = vec_unpackh (v);                                       \
367                                                                         \
368                                                                         \
369             Y0 = vec_unh (y0);                                          \
370             Y1 = vec_unl (y0);                                          \
371             Y2 = vec_unh (y1);                                          \
372             Y3 = vec_unl (y1);                                          \
373                                                                         \
374             Y0 = vec_mradds (Y0, lCY, lOY);                             \
375             Y1 = vec_mradds (Y1, lCY, lOY);                             \
376             Y2 = vec_mradds (Y2, lCY, lOY);                             \
377             Y3 = vec_mradds (Y3, lCY, lOY);                             \
378                                                                         \
379             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
380             ux = vec_sl (U, lCSHIFT);                                   \
381             ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
382             ux0  = vec_mergeh (ux,ux);                                  \
383             ux1  = vec_mergel (ux,ux);                                  \
384                                                                         \
385             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
386             vx = vec_sl (V, lCSHIFT);                                   \
387             vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
388             vx0  = vec_mergeh (vx,vx);                                  \
389             vx1  = vec_mergel (vx,vx);                                  \
390                                                                         \
391             /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
392             uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
393             uvx = vec_mradds (V, lCGV, uvx);                            \
394             uvx0 = vec_mergeh (uvx,uvx);                                \
395             uvx1 = vec_mergel (uvx,uvx);                                \
396                                                                         \
397             R0 = vec_add (Y0,vx0);                                      \
398             G0 = vec_add (Y0,uvx0);                                     \
399             B0 = vec_add (Y0,ux0);                                      \
400             R1 = vec_add (Y1,vx1);                                      \
401             G1 = vec_add (Y1,uvx1);                                     \
402             B1 = vec_add (Y1,ux1);                                      \
403                                                                         \
404             R  = vec_packclp (R0,R1);                                   \
405             G  = vec_packclp (G0,G1);                                   \
406             B  = vec_packclp (B0,B1);                                   \
407                                                                         \
408             out_pixels(R,G,B,oute);                                     \
409                                                                         \
410             R0 = vec_add (Y2,vx0);                                      \
411             G0 = vec_add (Y2,uvx0);                                     \
412             B0 = vec_add (Y2,ux0);                                      \
413             R1 = vec_add (Y3,vx1);                                      \
414             G1 = vec_add (Y3,uvx1);                                     \
415             B1 = vec_add (Y3,ux1);                                      \
416             R  = vec_packclp (R0,R1);                                   \
417             G  = vec_packclp (G0,G1);                                   \
418             B  = vec_packclp (B0,B1);                                   \
419                                                                         \
420                                                                         \
421             out_pixels(R,G,B,outo);                                     \
422                                                                         \
423             y1i  += 16;                                                 \
424             y2i  += 16;                                                 \
425             ui   += 8;                                                  \
426             vi   += 8;                                                  \
427                                                                         \
428         }                                                               \
429                                                                         \
430         outo  += (outstrides[0])>>4;                                    \
431         oute  += (outstrides[0])>>4;                                    \
432                                                                         \
433         ui    += instrides_scl[1];                                      \
434         vi    += instrides_scl[2];                                      \
435         y1i   += instrides_scl[0];                                      \
436         y2i   += instrides_scl[0];                                      \
437     }                                                                   \
438     return srcSliceH;                                                   \
439 }
440
441
442 #define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),c,b,a,ptr)
443 #define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a)){255}),ptr)
444 #define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a)){255}),ptr)
445 #define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,b,c,ptr)
446 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
447 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
448
449 DEFCSP420_CVT (yuv2_abgr, out_abgr)
450 #if 1
451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
452 #else
453 static int altivec_yuv2_bgra32 (SwsContext *c,
454                                 unsigned char **in, int *instrides,
455                                 int srcSliceY,        int srcSliceH,
456                                 unsigned char **oplanes, int *outstrides)
457 {
458     int w = c->srcW;
459     int h = srcSliceH;
460     int i,j;
461     int instrides_scl[3];
462     vector unsigned char y0,y1;
463
464     vector signed char  u,v;
465
466     vector signed short Y0,Y1,Y2,Y3;
467     vector signed short U,V;
468     vector signed short vx,ux,uvx;
469     vector signed short vx0,ux0,uvx0;
470     vector signed short vx1,ux1,uvx1;
471     vector signed short R0,G0,B0;
472     vector signed short R1,G1,B1;
473     vector unsigned char R,G,B;
474
475     vector unsigned char *uivP, *vivP;
476     vector unsigned char align_perm;
477
478     vector signed short
479         lCY  = c->CY,
480         lOY  = c->OY,
481         lCRV = c->CRV,
482         lCBU = c->CBU,
483         lCGU = c->CGU,
484         lCGV = c->CGV;
485
486     vector unsigned short lCSHIFT = c->CSHIFT;
487
488     ubyte *y1i   = in[0];
489     ubyte *y2i   = in[0]+w;
490     ubyte *ui    = in[1];
491     ubyte *vi    = in[2];
492
493     vector unsigned char *oute
494         = (vector unsigned char *)
495           (oplanes[0]+srcSliceY*outstrides[0]);
496     vector unsigned char *outo
497         = (vector unsigned char *)
498           (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
499
500
501     instrides_scl[0] = instrides[0];
502     instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
503     instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
504
505
506     for (i=0;i<h/2;i++) {
507         vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
508         vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
509
510         for (j=0;j<w/16;j++) {
511
512             y0 = vec_ldl (0,y1i);
513             y1 = vec_ldl (0,y2i);
514             uivP = (vector unsigned char *)ui;
515             vivP = (vector unsigned char *)vi;
516
517             align_perm = vec_lvsl (0, ui);
518             u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
519
520             align_perm = vec_lvsl (0, vi);
521             v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
522             u  = (vector signed char)
523                  vec_sub (u,(vector signed char)
524                           vec_splat((vector signed char){128},0));
525
526             v  = (vector signed char)
527                  vec_sub (v, (vector signed char)
528                           vec_splat((vector signed char){128},0));
529
530             U  = vec_unpackh (u);
531             V  = vec_unpackh (v);
532
533
534             Y0 = vec_unh (y0);
535             Y1 = vec_unl (y0);
536             Y2 = vec_unh (y1);
537             Y3 = vec_unl (y1);
538
539             Y0 = vec_mradds (Y0, lCY, lOY);
540             Y1 = vec_mradds (Y1, lCY, lOY);
541             Y2 = vec_mradds (Y2, lCY, lOY);
542             Y3 = vec_mradds (Y3, lCY, lOY);
543
544             /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
545             ux = vec_sl (U, lCSHIFT);
546             ux = vec_mradds (ux, lCBU, (vector signed short){0});
547             ux0  = vec_mergeh (ux,ux);
548             ux1  = vec_mergel (ux,ux);
549
550             /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
551             vx = vec_sl (V, lCSHIFT);
552             vx = vec_mradds (vx, lCRV, (vector signed short){0});
553             vx0  = vec_mergeh (vx,vx);
554             vx1  = vec_mergel (vx,vx);
555             /* uvx = ((CGU*u) + (CGV*v))>>15 */
556             uvx = vec_mradds (U, lCGU, (vector signed short){0});
557             uvx = vec_mradds (V, lCGV, uvx);
558             uvx0 = vec_mergeh (uvx,uvx);
559             uvx1 = vec_mergel (uvx,uvx);
560             R0 = vec_add (Y0,vx0);
561             G0 = vec_add (Y0,uvx0);
562             B0 = vec_add (Y0,ux0);
563             R1 = vec_add (Y1,vx1);
564             G1 = vec_add (Y1,uvx1);
565             B1 = vec_add (Y1,ux1);
566             R  = vec_packclp (R0,R1);
567             G  = vec_packclp (G0,G1);
568             B  = vec_packclp (B0,B1);
569
570             out_argb(R,G,B,oute);
571             R0 = vec_add (Y2,vx0);
572             G0 = vec_add (Y2,uvx0);
573             B0 = vec_add (Y2,ux0);
574             R1 = vec_add (Y3,vx1);
575             G1 = vec_add (Y3,uvx1);
576             B1 = vec_add (Y3,ux1);
577             R  = vec_packclp (R0,R1);
578             G  = vec_packclp (G0,G1);
579             B  = vec_packclp (B0,B1);
580
581             out_argb(R,G,B,outo);
582             y1i  += 16;
583             y2i  += 16;
584             ui   += 8;
585             vi   += 8;
586
587         }
588
589         outo  += (outstrides[0])>>4;
590         oute  += (outstrides[0])>>4;
591
592         ui    += instrides_scl[1];
593         vi    += instrides_scl[2];
594         y1i   += instrides_scl[0];
595         y2i   += instrides_scl[0];
596     }
597     return srcSliceH;
598 }
599
600 #endif
601
602
603 DEFCSP420_CVT (yuv2_rgba, out_rgba)
604 DEFCSP420_CVT (yuv2_argb, out_argb)
605 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
606 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
607
608
609 // uyvy|uyvy|uyvy|uyvy
610 // 0123 4567 89ab cdef
611 static
612 const vector unsigned char
613     demux_u = {0x10,0x00,0x10,0x00,
614                0x10,0x04,0x10,0x04,
615                0x10,0x08,0x10,0x08,
616                0x10,0x0c,0x10,0x0c},
617     demux_v = {0x10,0x02,0x10,0x02,
618                0x10,0x06,0x10,0x06,
619                0x10,0x0A,0x10,0x0A,
620                0x10,0x0E,0x10,0x0E},
621     demux_y = {0x10,0x01,0x10,0x03,
622                0x10,0x05,0x10,0x07,
623                0x10,0x09,0x10,0x0B,
624                0x10,0x0D,0x10,0x0F};
625
626 /*
627   this is so I can play live CCIR raw video
628 */
629 static int altivec_uyvy_rgb32 (SwsContext *c,
630                                const unsigned char **in, int *instrides,
631                                int srcSliceY,        int srcSliceH,
632                                unsigned char **oplanes, int *outstrides)
633 {
634     int w = c->srcW;
635     int h = srcSliceH;
636     int i,j;
637     vector unsigned char uyvy;
638     vector signed   short Y,U,V;
639     vector signed   short R0,G0,B0,R1,G1,B1;
640     vector unsigned char  R,G,B;
641     vector unsigned char *out;
642     const ubyte *img;
643
644     img = in[0];
645     out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
646
647     for (i=0;i<h;i++) {
648         for (j=0;j<w/16;j++) {
649             uyvy = vec_ld (0, img);
650             U = (vector signed short)
651                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
652
653             V = (vector signed short)
654                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
655
656             Y = (vector signed short)
657                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
658
659             cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
660
661             uyvy = vec_ld (16, img);
662             U = (vector signed short)
663                 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
664
665             V = (vector signed short)
666                 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
667
668             Y = (vector signed short)
669                 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
670
671             cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
672
673             R  = vec_packclp (R0,R1);
674             G  = vec_packclp (G0,G1);
675             B  = vec_packclp (B0,B1);
676
677             //      vec_mstbgr24 (R,G,B, out);
678             out_rgba (R,G,B,out);
679
680             img += 32;
681         }
682     }
683     return srcSliceH;
684 }
685
686
687
/* Currently the acceleration routine only supports
   inputs whose width is a multiple of 16
   and whose height is a multiple of 2.

   So we just fall back to the C code for anything else.
*/
694 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
695 {
696     if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
697         return NULL;
698
699     /*
700       and this seems not to matter too much I tried a bunch of
701       videos with abnormal widths and MPlayer crashes elsewhere.
702       mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
703       boom with X11 bad match.
704
705     */
706     if ((c->srcW & 0xf) != 0)    return NULL;
707
708     switch (c->srcFormat) {
709     case PIX_FMT_YUV410P:
710     case PIX_FMT_YUV420P:
711     /*case IMGFMT_CLPL:        ??? */
712     case PIX_FMT_GRAY8:
713     case PIX_FMT_NV12:
714     case PIX_FMT_NV21:
715         if ((c->srcH & 0x1) != 0)
716             return NULL;
717
718         switch(c->dstFormat) {
719         case PIX_FMT_RGB24:
720             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
721             return altivec_yuv2_rgb24;
722         case PIX_FMT_BGR24:
723             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
724             return altivec_yuv2_bgr24;
725         case PIX_FMT_ARGB:
726             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
727             return altivec_yuv2_argb;
728         case PIX_FMT_ABGR:
729             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
730             return altivec_yuv2_abgr;
731         case PIX_FMT_RGBA:
732             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
733             return altivec_yuv2_rgba;
734         case PIX_FMT_BGRA:
735             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
736             return altivec_yuv2_bgra;
737         default: return NULL;
738         }
739         break;
740
741     case PIX_FMT_UYVY422:
742         switch(c->dstFormat) {
743         case PIX_FMT_BGR32:
744             av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
745             return altivec_uyvy_rgb32;
746         default: return NULL;
747         }
748         break;
749
750     }
751     return NULL;
752 }
753
754 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
755 {
756     union {
757         DECLARE_ALIGNED(16, signed short, tmp)[8];
758         vector signed short vec;
759     } buf;
760
761     buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;                        //cy
762     buf.tmp[1] =  -256*brightness;                                      //oy
763     buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);   //crv
764     buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);   //cbu
765     buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));  //cgu
766     buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));  //cgv
767
768
769     c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
770     c->CY   = vec_splat ((vector signed short)buf.vec, 0);
771     c->OY   = vec_splat ((vector signed short)buf.vec, 1);
772     c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
773     c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
774     c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
775     c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
776     return;
777 }
778
779
/* Vertical-scale filter + packed RGB output, 16 pixels per iteration.
 *
 * Applies lumFilter/chrFilter (coefficients pre-vectorized in
 * c->vYCoeffsBank / c->vCCoeffsBank, indexed by dstY) over lumFilterSize /
 * chrFilterSize source lines, then converts the filtered YUV to the packed
 * RGB format selected by c->dstFormat and stores it into dest.
 * Unsupported dstFormats log an error once and return early; the caller
 * is expected to have routed those to yuv2packedXinC instead. */
void
ff_yuv2packedX_altivec(SwsContext *c,
                       const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                     uint8_t *dest, int dstW, int dstY)
{
    int i,j;
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
    vector signed short R0,G0,B0,R1,G1,B1;

    vector unsigned char R,G,B;
    vector unsigned char *out,*nout;

    /* RND = rounding bias added before the >>4 scale below (SCL). */
    vector signed short   RND = vec_splat_s16(1<<3);
    vector unsigned short SCL = vec_splat_u16(4);
    /* aligned staging buffer for the final partial group of pixels */
    DECLARE_ALIGNED(16, unsigned long, scratch)[16];

    vector signed short *YCoeffs, *CCoeffs;

    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;

    out = (vector unsigned char *)dest;

    for (i=0; i<dstW; i+=16) {
        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            /* vec_mradds: saturated ((X*coeff + 0x4000) >> 15) + acc */
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        /* chroma is horizontally subsampled (i/2); V plane sits VOFW
           elements after U in the same buffer */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+VOFW]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7

          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        /* duplicate each chroma sample to pair it with two luma samples */
        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        /* the out_* macros store 16 pixels and advance 'out' */
        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
        default:
            {
                /* If this is reached, the caller should have called yuv2packedXinC
                   instead. */
                static int printed_error_message;
                if (!printed_error_message) {
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                           sws_format_name(c->dstFormat));
                    printed_error_message=1;
                }
                return;
            }
        }
    }

    /* Tail: redo the last 16-pixel group into 'scratch' and copy only the
       valid remainder, so a dstW that is not a multiple of 16 does not
       write past the end of dest.
       NOTE(review): this branch looks unreachable — the loop above only
       exits once i >= dstW, so `i < dstW` can never hold here.  The guard
       was presumably meant to detect a leftover partial group (dstW & 15);
       verify against the loop condition. */
    if (i < dstW) {
        i -= 16;

        Y0 = RND;
        Y1 = RND;
        /* extract 16 coeffs from lumSrc */
        for (j=0; j<lumFilterSize; j++) {
            X0 = vec_ld (0,  &lumSrc[j][i]);
            X1 = vec_ld (16, &lumSrc[j][i]);
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
        }

        U = RND;
        V = RND;
        /* extract 8 coeffs from U,V */
        for (j=0; j<chrFilterSize; j++) {
            X  = vec_ld (0, &chrSrc[j][i/2]);
            U  = vec_mradds (X, CCoeffs[j], U);
            X  = vec_ld (0, &chrSrc[j][i/2+VOFW]);
            V  = vec_mradds (X, CCoeffs[j], V);
        }

        /* scale and clip signals */
        Y0 = vec_sra (Y0, SCL);
        Y1 = vec_sra (Y1, SCL);
        U  = vec_sra (U,  SCL);
        V  = vec_sra (V,  SCL);

        Y0 = vec_clip_s16 (Y0);
        Y1 = vec_clip_s16 (Y1);
        U  = vec_clip_s16 (U);
        V  = vec_clip_s16 (V);

        /* now we have
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7

           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
        */

        U0 = vec_mergeh (U,U);
        V0 = vec_mergeh (V,V);

        U1 = vec_mergel (U,U);
        V1 = vec_mergel (V,V);

        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);

        R  = vec_packclp (R0,R1);
        G  = vec_packclp (G0,G1);
        B  = vec_packclp (B0,B1);

        nout = (vector unsigned char *)scratch;
        switch(c->dstFormat) {
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
        default:
            /* Unreachable, I think. */
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
                   sws_format_name(c->dstFormat));
            return;
        }

        /* NOTE(review): the byte count `(dstW-i)/4` looks wrong for the
           4-byte-per-pixel formats — (dstW-i) remaining pixels should be
           (dstW-i)*4 bytes — and the uint32_t indexing of dest does not fit
           the 3-byte rgb24/bgr24 cases either.  Confirm intent before use. */
        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
    }

}