]> git.sesse.net Git - ffmpeg/blob - libswscale/yuv2rgb_altivec.c
cosmetics: Remove trailing whitespace.
[ffmpeg] / libswscale / yuv2rgb_altivec.c
1 /*
2   marc.hoffman@analog.com    March 8, 2004
3
4   Altivec Acceleration for Color Space Conversion revision 0.2
5
6   convert I420 YV12 to RGB in various formats,
7     it rejects images that are not in 420 formats
8     it rejects images that don't have widths of multiples of 16
9     it rejects images that don't have heights of multiples of 2
10   reject defers to C simulation codes.
11
12   lots of optimizations to be done here
13
14   1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15      so we currently use max min to clip
16
17   2. the inefficient use of chroma loading needs a bit of brushing up
18
19   3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20
21
22   MODIFIED to calculate coeffs from currently selected color space.
23   MODIFIED core to be a macro which you spec the output format.
24   ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25   CORRECTED algorithm selection to be strict on input formats.
26   ADDED runtime detection of altivec.
27
28   ADDED altivec_yuv2packedX vertical scl + RGB converter
29
30   March 27,2004
31   PERFORMANCE ANALYSIS
32
33   The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34   The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35
36   720*480*30  ~10MPS
37
38   so we have roughly 10clocks per pixel this is too high something has to be wrong.
39
40   OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41
42   OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43   guaranteed to have the input video frame it was just decompressed so
44   it probably resides in L1 caches.  However we are creating the
45   output video stream this needs to use the DSTST instruction to
46   optimize for the cache.  We couple this with the fact that we are
47   not going to be visiting the input buffer again so we mark it Least
48   Recently Used.  This shaves 25% of the processor cycles off.
49
50   Now MEMCPY is the largest mips consumer in the system, probably due
51   to the inefficient X11 stuff.
52
53   GL libraries seem to be very slow on this machine 1.33Ghz PB running
54   Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55   a versioning issues, however i have libGL.1.2.dylib for both
56   machines. ((We need to figure this out now))
57
58   GL2 libraries work now with patch for RGB32
59
60   NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61
62   Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
63 */
64
65 /*
66  * This file is part of FFmpeg.
67  *
68  * FFmpeg is free software; you can redistribute it and/or modify
69  * it under the terms of the GNU General Public License as published by
70  * the Free Software Foundation; either version 2 of the License, or
71  * (at your option) any later version.
72  *
73  * FFmpeg is distributed in the hope that it will be useful,
74  * but WITHOUT ANY WARRANTY; without even the implied warranty of
75  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
76  * GNU General Public License for more details.
77  *
78  * You should have received a copy of the GNU General Public License
79  * along with FFmpeg; if not, write to the Free Software
80  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
81  */
82
83 #include <stdio.h>
84 #include <stdlib.h>
85 #include <string.h>
86 #include <inttypes.h>
87 #include <assert.h>
88 #include "config.h"
89 #ifdef HAVE_MALLOC_H
90 #include <malloc.h>
91 #endif
92 #include "rgb2rgb.h"
93 #include "swscale.h"
94 #include "swscale_internal.h"
95
96 #undef PROFILE_THE_BEAST
97 #undef INC_SCALING
98
99 typedef unsigned char ubyte;
100 typedef signed char   sbyte;
101
102
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104    homogeneous vector registers x0,x1,x2 are interleaved with the
105    following technique:
106
107       o0 = vec_mergeh (x0,x1);
108       o1 = vec_perm (o0, x2, perm_rgb_0);
109       o2 = vec_perm (o0, x2, perm_rgb_1);
110       o3 = vec_mergel (x0,x1);
111       o4 = vec_perm (o3,o2,perm_rgb_2);
112       o5 = vec_perm (o3,o2,perm_rgb_3);
113
114   perm_rgb_0:   o0(RG).h v1(B) --> o1*
115               0   1  2   3   4
116              rgbr|gbrg|brgb|rgbr
117              0010 0100 1001 0010
118              0102 3145 2673 894A
119
120   perm_rgb_1:   o0(RG).h v1(B) --> o2
121               0   1  2   3   4
122              gbrg|brgb|bbbb|bbbb
123              0100 1001 1111 1111
124              B5CD 6EF7 89AB CDEF
125
126   perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
127               0   1  2   3   4
128              gbrg|brgb|rgbr|gbrg
129              1111 1111 0010 0100
130              89AB CDEF 0182 3945
131
132   perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
133               0   1  2   3   4
134              brgb|rgbr|gbrg|brgb
135              1001 0010 0100 1001
136              a67b 89cA BdCD eEFf
137
138 */
139 static
140 const vector unsigned char
141   perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142                                       0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143   perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144                                       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145   perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146                                       0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147   perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148                                       0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
149
/*
 * vec_merge3(x2,x1,x0, y0,y1,y2)
 *
 * Interleave three 16-byte channel vectors into 48 bytes of packed triplets.
 * The resulting byte sequence is x0[0],x1[0],x2[0], x0[1],x1[1],x2[1], ...
 * (note that x0 is the THIRD macro argument and x2 the first); the first,
 * second and third 16 output bytes are assigned to y0, y1 and y2.
 *
 * All three inputs must have the same vector type; y0..y2 must be lvalues of
 * that type.  Uses the file-level tables perm_rgb_0..perm_rgb_3.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                  \
do {                                                   \
  typeof(x0) m3_pair_hi, m3_pair_lo, m3_rest;          \
  /* x0/x1 interleaved: elements 0-7, then 8-15 */     \
  m3_pair_hi = vec_mergeh (x0,x1);                     \
  m3_pair_lo = vec_mergel (x0,x1);                     \
  /* weave the third channel in */                     \
  y0      = vec_perm (m3_pair_hi, x2, perm_rgb_0);     \
  m3_rest = vec_perm (m3_pair_hi, x2, perm_rgb_1);     \
  y1      = vec_perm (m3_pair_lo, m3_rest, perm_rgb_2);\
  y2      = vec_perm (m3_pair_lo, m3_rest, perm_rgb_3);\
} while(0)
160
/*
 * vec_mstbgr24(x0,x1,x2,ptr)
 *
 * Interleave the three 16-byte channel vectors x0,x1,x2 with vec_merge3 and
 * store the resulting 48 bytes through ptr with three vec_st calls.  In memory
 * each triplet is ordered x2,x1,x0, so (R,G,B) arguments yield B,G,R bytes.
 *
 * ptr must be a modifiable pointer to a vector type; it is post-incremented
 * three times (i.e. advanced by 48 bytes).
 *
 * The expansion deliberately does NOT end in ';' so that the macro behaves
 * like a single statement (safe in un-braced if/else); callers supply the ';'.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x0,x1,x2,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
} while (0)
169
/*
 * vec_mstrgb24(x0,x1,x2,ptr)
 *
 * Same as the BGR variant but with the channel order reversed before
 * interleaving: each stored triplet is ordered x0,x1,x2, so (R,G,B) arguments
 * yield R,G,B bytes.  48 bytes are written with three vec_st calls.
 *
 * ptr must be a modifiable pointer to a vector type; it is post-incremented
 * three times (i.e. advanced by 48 bytes).
 *
 * The expansion deliberately does NOT end in ';' so that the macro behaves
 * like a single statement (safe in un-braced if/else); callers supply the ';'.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
} while (0)
178
/*
 * vec_mstrgb32(T,x0,x1,x2,x3,ptr)
 *
 * Pack four 16-byte channel vectors into 16 four-byte pixels and store them.
 * Each pixel is laid out in memory as the bytes x0,x1,x2,x3 (e.g. the
 * "rgb0" format: first byte R, last byte 0, when called with R,G,B,zero).
 *
 *   T    vector type of x0..x3 (used for the temporaries and the stores)
 *   ptr  modifiable pointer to a 16-byte vector type; 64 bytes are written
 *        at ptr and ptr is then advanced by 4 elements
 *
 * Implementation: bytes are first interleaved pairwise (x0/x1 and x2/x3),
 * then the pairs are interleaved as 16-bit units.
 *
 * The expansion deliberately does NOT end in ';' so that the macro behaves
 * like a single statement (safe in un-braced if/else); callers supply the ';'.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                                \
do {                                                                                   \
  T _0,_1,_2,_3;                                                                       \
  /* pixels 0-7 */                                                                     \
  _0 = vec_mergeh (x0,x1);                                                             \
  _1 = vec_mergeh (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)ptr);                                                         \
  vec_st (_3, 1*16, (T *)ptr);                                                         \
  /* pixels 8-15 */                                                                    \
  _0 = vec_mergel (x0,x1);                                                             \
  _1 = vec_mergel (x2,x3);                                                             \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 2*16, (T *)ptr);                                                         \
  vec_st (_3, 3*16, (T *)ptr);                                                         \
  ptr += 4;                                                                            \
} while (0)
200
201 /*
202
203   | 1     0       1.4021   | | Y |
204   | 1    -0.3441 -0.7142   |x| Cb|
205   | 1     1.7718  0        | | Cr|
206
207
208   Y:      [-128 127]
209   Cb/Cr : [-128 127]
210
211   typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
212
213 */
214
215
216
217
/*
 * vec_unh(x): zero-extend bytes 0..7 of the 16-byte vector x to eight 16-bit
 * elements.  Each output element is built from a zero byte (permute index
 * 0x10 = byte 0 of the all-zero second operand) followed by one data byte.
 * The argument and the whole expansion are parenthesized so the macro can be
 * used inside larger expressions.
 */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                    0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/*
 * vec_unl(x): zero-extend bytes 8..15 of the 16-byte vector x to eight 16-bit
 * elements.  Each output element is built from a zero byte (permute index
 * 0x10 = byte 0 of the all-zero second operand) followed by one data byte.
 * The argument and the whole expansion are parenthesized so the macro can be
 * used inside larger expressions.
 */
#define vec_unl(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                    0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
228
/*
 * vec_clip_s16(x): clamp every signed 16-bit element of x to [16, 235]
 * (min against 235 first, then max against 16).
 * NOTE(review): no use of this macro is visible in this part of the file.
 */
#define vec_clip_s16(x) \
  (vec_max (vec_min ((x), (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
                          (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16 )))
232
/*
 * vec_packclp(x,y): clip two vectors of signed 16-bit values to 0..255 and
 * pack them into one vector of 16 unsigned bytes (x supplies bytes 0-7,
 * y bytes 8-15).  Negative values are raised to 0 with vec_max; the upper
 * bound comes from the saturating unsigned pack (vec_packs), so no vec_min
 * is needed.
 */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) AVV(0))))
237
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
239
240
241 static inline void cvtyuvtoRGB (SwsContext *c,
242                            vector signed short Y, vector signed short U, vector signed short V,
243                            vector signed short *R, vector signed short *G, vector signed short *B)
244 {
245   vector signed   short vx,ux,uvx;
246
247   Y = vec_mradds (Y, c->CY, c->OY);
248   U  = vec_sub (U,(vector signed short)
249                         vec_splat((vector signed short)AVV(128),0));
250   V  = vec_sub (V,(vector signed short)
251                         vec_splat((vector signed short)AVV(128),0));
252
253   //   ux  = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254   ux = vec_sl (U, c->CSHIFT);
255   *B = vec_mradds (ux, c->CBU, Y);
256
257   // vx  = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258   vx = vec_sl (V, c->CSHIFT);
259   *R = vec_mradds (vx, c->CRV, Y);
260
261   // uvx = ((CGU*u) + (CGV*v))>>15;
262   uvx = vec_mradds (U, c->CGU, Y);
263   *G = vec_mradds (V, c->CGV, uvx);
264 }
265
266
267 /*
268   ------------------------------------------------------------------------------
269   CS converters
270   ------------------------------------------------------------------------------
271 */
272
273
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name>(SwsContext *c,
 *                             unsigned char **in, int *instrides,
 *                             int srcSliceY, int srcSliceH,
 *                             unsigned char **oplanes, int *outstrides);
 *
 * which converts a slice of planar YUV (in[0] = luma, in[1] = U, in[2] = V,
 * one chroma row per two luma rows, one chroma sample per two luma columns)
 * to packed RGB in oplanes[0], starting at output row srcSliceY.
 *
 *   out_pixels(R,G,B,ptr)  must store 16 pixels built from the three byte
 *                          vectors at ptr AND advance ptr past them.
 *
 * Work is done on 16x2 pixel tiles: srcSliceH/2 row pairs times c->srcW/16
 * column groups; any remaining row/columns are not converted.  Input rows are
 * loaded with the lvsl/perm idiom, so they need not be 16-byte aligned (note
 * that this reads the aligned vector following each 16-byte group as well).
 * The output rows are written by out_pixels.
 * NOTE(review): the out_* macros in this file use vec_st, so oplanes[0] and
 * outstrides[0] presumably have to be 16-byte aligned -- confirm with callers.
 *
 * Every row pair is addressed directly from the plane pointers and strides,
 * so both input and output strides may be larger than the packed row width.
 *
 * Returns srcSliceH.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                                unsigned char **in, int *instrides,        \
                                int srcSliceY,  int srcSliceH,             \
                                unsigned char **oplanes, int *outstrides)  \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the conversion coefficients */                        \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i, *y2i, *ui, *vi;                                              \
  ubyte *outrow;                                                           \
  vector unsigned char *oute, *outo;                                       \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    /* even/odd luma rows of this pair and their shared chroma row */      \
    y1i = in[0] + (2*i)*instrides[0];                                      \
    y2i = y1i + instrides[0];                                              \
    ui  = in[1] + i*instrides[1];                                          \
    vi  = in[2] + i*instrides[2];                                          \
                                                                           \
    /* matching even/odd output rows */                                    \
    outrow = oplanes[0] + (srcSliceY + 2*i)*outstrides[0];                 \
    oute   = (vector unsigned char *)outrow;                               \
    outo   = (vector unsigned char *)(outrow + outstrides[0]);             \
                                                                           \
    /* data-stream store hints for the two output rows */                  \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      y1ivP = (vector unsigned char *)y1i;                                 \
      y2ivP = (vector unsigned char *)y2i;                                 \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* unaligned loads: two aligned vectors + permute */                 \
      align_perm = vec_lvsl (0, y1i);                                      \
      y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
                                                                           \
      align_perm = vec_lvsl (0, y2i);                                      \
      y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
                                                                           \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* re-centre chroma on zero (modular byte arithmetic) */             \
      u  = (vector signed char)                                            \
                vec_sub (u,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)                                            \
                vec_sub (v,(vector signed char)                            \
                                vec_splat((vector signed char)AVV(128),0));\
                                                                           \
      /* only the first 8 chroma samples belong to these 16 pixels */      \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
      Y0 = vec_unh (y0);                                                   \
      Y1 = vec_unl (y0);                                                   \
      Y2 = vec_unh (y1);                                                   \
      Y3 = vec_unl (y1);                                                   \
                                                                           \
      Y0 = vec_mradds (Y0, lCY, lOY);                                      \
      Y1 = vec_mradds (Y1, lCY, lOY);                                      \
      Y2 = vec_mradds (Y2, lCY, lOY);                                      \
      Y3 = vec_mradds (Y3, lCY, lOY);                                      \
                                                                           \
      /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                           \
      ux = vec_sl (U, lCSHIFT);                                            \
      ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));             \
      /* duplicate each chroma term for two adjacent pixels */             \
      ux0  = vec_mergeh (ux,ux);                                           \
      ux1  = vec_mergel (ux,ux);                                           \
                                                                           \
      /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */                           \
      vx = vec_sl (V, lCSHIFT);                                            \
      vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));             \
      vx0  = vec_mergeh (vx,vx);                                           \
      vx1  = vec_mergel (vx,vx);                                           \
                                                                           \
      /* uvx = ((CGU*u) + (CGV*v))>>15 */                                  \
      uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));             \
      uvx = vec_mradds (V, lCGV, uvx);                                     \
      uvx0 = vec_mergeh (uvx,uvx);                                         \
      uvx1 = vec_mergel (uvx,uvx);                                         \
                                                                           \
      /* even row */                                                       \
      R0 = vec_add (Y0,vx0);                                               \
      G0 = vec_add (Y0,uvx0);                                              \
      B0 = vec_add (Y0,ux0);                                               \
      R1 = vec_add (Y1,vx1);                                               \
      G1 = vec_add (Y1,uvx1);                                              \
      B1 = vec_add (Y1,ux1);                                               \
                                                                           \
      R  = vec_packclp (R0,R1);                                            \
      G  = vec_packclp (G0,G1);                                            \
      B  = vec_packclp (B0,B1);                                            \
                                                                           \
      out_pixels(R,G,B,oute);                                              \
                                                                           \
      /* odd row, same chroma terms */                                     \
      R0 = vec_add (Y2,vx0);                                               \
      G0 = vec_add (Y2,uvx0);                                              \
      B0 = vec_add (Y2,ux0);                                               \
      R1 = vec_add (Y3,vx1);                                               \
      G1 = vec_add (Y3,uvx1);                                              \
      B1 = vec_add (Y3,ux1);                                               \
      R  = vec_packclp (R0,R1);                                            \
      G  = vec_packclp (G0,G1);                                            \
      B  = vec_packclp (B0,B1);                                            \
                                                                           \
      out_pixels(R,G,B,outo);                                              \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
                                                                           \
    }                                                                      \
  }                                                                        \
  return srcSliceH;                                                        \
}
433
434
/*
 * Pixel-store adaptors for DEFCSP420_CVT: each takes the packed R, G and B
 * byte vectors plus an output pointer and forwards them to the matching
 * store macro.  The name gives the byte order written to memory; the "a"
 * byte of the 32-bit variants is always written as zero.
 */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
441
442 DEFCSP420_CVT (yuv2_abgr, out_abgr)
443 #if 1
444 DEFCSP420_CVT (yuv2_bgra, out_bgra)
445 #else
446 static int altivec_yuv2_bgra32 (SwsContext *c,
447                                 unsigned char **in, int *instrides,
448                                 int srcSliceY,  int srcSliceH,
449                                 unsigned char **oplanes, int *outstrides)
450 {
451   int w = c->srcW;
452   int h = srcSliceH;
453   int i,j;
454   int instrides_scl[3];
455   vector unsigned char y0,y1;
456
457   vector signed char  u,v;
458
459   vector signed short Y0,Y1,Y2,Y3;
460   vector signed short U,V;
461   vector signed short vx,ux,uvx;
462   vector signed short vx0,ux0,uvx0;
463   vector signed short vx1,ux1,uvx1;
464   vector signed short R0,G0,B0;
465   vector signed short R1,G1,B1;
466   vector unsigned char R,G,B;
467
468   vector unsigned char *uivP, *vivP;
469   vector unsigned char align_perm;
470
471   vector signed short
472     lCY  = c->CY,
473     lOY  = c->OY,
474     lCRV = c->CRV,
475     lCBU = c->CBU,
476     lCGU = c->CGU,
477     lCGV = c->CGV;
478
479   vector unsigned short lCSHIFT = c->CSHIFT;
480
481   ubyte *y1i   = in[0];
482   ubyte *y2i   = in[0]+w;
483   ubyte *ui    = in[1];
484   ubyte *vi    = in[2];
485
486   vector unsigned char *oute
487     = (vector unsigned char *)
488         (oplanes[0]+srcSliceY*outstrides[0]);
489   vector unsigned char *outo
490     = (vector unsigned char *)
491         (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
492
493
494   instrides_scl[0] = instrides[0];
495   instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
496   instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
497
498
499   for (i=0;i<h/2;i++) {
500     vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
501     vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
502
503     for (j=0;j<w/16;j++) {
504
505       y0 = vec_ldl (0,y1i);
506       y1 = vec_ldl (0,y2i);
507       uivP = (vector unsigned char *)ui;
508       vivP = (vector unsigned char *)vi;
509
510       align_perm = vec_lvsl (0, ui);
511       u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
512
513       align_perm = vec_lvsl (0, vi);
514       v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
515       u  = (vector signed char)
516                 vec_sub (u,(vector signed char)
517                                 vec_splat((vector signed char)AVV(128),0));
518
519       v  = (vector signed char)
520                 vec_sub (v, (vector signed char)
521                                 vec_splat((vector signed char)AVV(128),0));
522
523       U  = vec_unpackh (u);
524       V  = vec_unpackh (v);
525
526
527         Y0 = vec_unh (y0);
528         Y1 = vec_unl (y0);
529         Y2 = vec_unh (y1);
530         Y3 = vec_unl (y1);
531
532         Y0 = vec_mradds (Y0, lCY, lOY);
533         Y1 = vec_mradds (Y1, lCY, lOY);
534         Y2 = vec_mradds (Y2, lCY, lOY);
535         Y3 = vec_mradds (Y3, lCY, lOY);
536
537         /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
538         ux = vec_sl (U, lCSHIFT);
539         ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
540         ux0  = vec_mergeh (ux,ux);
541         ux1  = vec_mergel (ux,ux);
542
543         /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;  */
544         vx = vec_sl (V, lCSHIFT);
545         vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
546         vx0  = vec_mergeh (vx,vx);
547         vx1  = vec_mergel (vx,vx);
548         /* uvx = ((CGU*u) + (CGV*v))>>15 */
549         uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
550         uvx = vec_mradds (V, lCGV, uvx);
551         uvx0 = vec_mergeh (uvx,uvx);
552         uvx1 = vec_mergel (uvx,uvx);
553         R0 = vec_add (Y0,vx0);
554         G0 = vec_add (Y0,uvx0);
555         B0 = vec_add (Y0,ux0);
556         R1 = vec_add (Y1,vx1);
557         G1 = vec_add (Y1,uvx1);
558         B1 = vec_add (Y1,ux1);
559         R  = vec_packclp (R0,R1);
560         G  = vec_packclp (G0,G1);
561         B  = vec_packclp (B0,B1);
562
563         out_argb(R,G,B,oute);
564         R0 = vec_add (Y2,vx0);
565         G0 = vec_add (Y2,uvx0);
566         B0 = vec_add (Y2,ux0);
567         R1 = vec_add (Y3,vx1);
568         G1 = vec_add (Y3,uvx1);
569         B1 = vec_add (Y3,ux1);
570         R  = vec_packclp (R0,R1);
571         G  = vec_packclp (G0,G1);
572         B  = vec_packclp (B0,B1);
573
574         out_argb(R,G,B,outo);
575         y1i  += 16;
576         y2i  += 16;
577         ui   += 8;
578         vi   += 8;
579
580     }
581
582     outo += (outstrides[0])>>4;
583     oute += (outstrides[0])>>4;
584
585     ui    += instrides_scl[1];
586     vi    += instrides_scl[2];
587     y1i   += instrides_scl[0];
588     y2i   += instrides_scl[0];
589   }
590   return srcSliceH;
591 }
592
593 #endif
594
595
596 DEFCSP420_CVT (yuv2_rgba, out_rgba)
597 DEFCSP420_CVT (yuv2_argb, out_argb)
598 DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
599 DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
600
601
602 // uyvy|uyvy|uyvy|uyvy
603 // 0123 4567 89ab cdef
604 static
605 const vector unsigned char
606   demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
607                                    0x10,0x04,0x10,0x04,
608                                    0x10,0x08,0x10,0x08,
609                                    0x10,0x0c,0x10,0x0c),
610   demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
611                                    0x10,0x06,0x10,0x06,
612                                    0x10,0x0A,0x10,0x0A,
613                                    0x10,0x0E,0x10,0x0E),
614   demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
615                                    0x10,0x05,0x10,0x07,
616                                    0x10,0x09,0x10,0x0B,
617                                    0x10,0x0D,0x10,0x0F);
618
619 /*
620   this is so I can play live CCIR raw video
621 */
622 static int altivec_uyvy_rgb32 (SwsContext *c,
623                                unsigned char **in, int *instrides,
624                                int srcSliceY,   int srcSliceH,
625                                unsigned char **oplanes, int *outstrides)
626 {
627   int w = c->srcW;
628   int h = srcSliceH;
629   int i,j;
630   vector unsigned char uyvy;
631   vector signed   short Y,U,V;
632   vector signed   short R0,G0,B0,R1,G1,B1;
633   vector unsigned char  R,G,B;
634   vector unsigned char *out;
635   ubyte *img;
636
637   img = in[0];
638   out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
639
640   for (i=0;i<h;i++) {
641     for (j=0;j<w/16;j++) {
642       uyvy = vec_ld (0, img);
643       U = (vector signed short)
644         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
645
646       V = (vector signed short)
647         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
648
649       Y = (vector signed short)
650         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
651
652       cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
653
654       uyvy = vec_ld (16, img);
655       U = (vector signed short)
656         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
657
658       V = (vector signed short)
659         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
660
661       Y = (vector signed short)
662         vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
663
664       cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
665
666       R  = vec_packclp (R0,R1);
667       G  = vec_packclp (G0,G1);
668       B  = vec_packclp (B0,B1);
669
670       //      vec_mstbgr24 (R,G,B, out);
671       out_rgba (R,G,B,out);
672
673       img += 32;
674     }
675   }
676   return srcSliceH;
677 }
678
679
680
681 /* Ok currently the acceleration routine only supports
682    inputs of widths a multiple of 16
683    and heights a multiple 2
684
685    So we just fall back to the C codes for this.
686 */
687 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
688 {
689   if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
690     return NULL;
691
692   /*
693     and this seems not to matter too much I tried a bunch of
694     videos with abnormal widths and mplayer crashes else where.
695     mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
696     boom with X11 bad match.
697
698   */
699   if ((c->srcW & 0xf) != 0)    return NULL;
700
701   switch (c->srcFormat) {
702   case PIX_FMT_YUV410P:
703   case PIX_FMT_YUV420P:
704   /*case IMGFMT_CLPL:   ??? */
705   case PIX_FMT_GRAY8:
706   case PIX_FMT_NV12:
707   case PIX_FMT_NV21:
708     if ((c->srcH & 0x1) != 0)
709       return NULL;
710
711     switch(c->dstFormat){
712     case PIX_FMT_RGB24:
713       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
714       return altivec_yuv2_rgb24;
715     case PIX_FMT_BGR24:
716       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
717       return altivec_yuv2_bgr24;
718     case PIX_FMT_ARGB:
719       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
720       return altivec_yuv2_argb;
721     case PIX_FMT_ABGR:
722       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
723       return altivec_yuv2_abgr;
724     case PIX_FMT_RGBA:
725       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
726       return altivec_yuv2_rgba;
727     case PIX_FMT_BGRA:
728       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
729       return altivec_yuv2_bgra;
730     default: return NULL;
731     }
732     break;
733
734   case PIX_FMT_UYVY422:
735     switch(c->dstFormat){
736     case PIX_FMT_BGR32:
737       av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
738       return altivec_uyvy_rgb32;
739     default: return NULL;
740     }
741     break;
742
743   }
744   return NULL;
745 }
746
/*
  Convert a 48.16 fixed-point value to a rounded, saturated 16-bit integer.

  f is divided by 65536 and rounded to nearest (halves round towards
  +infinity), then clamped: results below -0x7FFF yield 0x8000 and results
  above 0x7FFF yield 0x7FFF.  The value is returned as the uint16_t bit
  pattern of the signed result.

  The intermediate is kept in 64 bits so that large magnitudes saturate
  instead of wrapping when narrowed, and the rounding is done without
  adding to f so that values close to INT64_MAX cannot overflow.
*/
static uint16_t roundToInt16(int64_t f){
        /* same value as (f + (1<<15))>>16: bit 15 is the rounding carry */
        int64_t r= (f>>16) + ((f>>15)&1);
             if(r<-0x7FFF) return 0x8000;
        else if(r> 0x7FFF) return 0x7FFF;
        else               return (uint16_t)r;
}
753
754 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
755 {
756   union {
757         signed short tmp[8] __attribute__ ((aligned(16)));
758         vector signed short vec;
759         } buf;
760
761   buf.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                        //cy
762   buf.tmp[1] =  -256*brightness;                                        //oy
763   buf.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);     //crv
764   buf.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);     //cbu
765   buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));    //cgu
766   buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));    //cgv
767
768
769   c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
770   c->CY  = vec_splat ((vector signed short)buf.vec, 0);
771   c->OY  = vec_splat ((vector signed short)buf.vec, 1);
772   c->CRV  = vec_splat ((vector signed short)buf.vec, 2);
773   c->CBU  = vec_splat ((vector signed short)buf.vec, 3);
774   c->CGU  = vec_splat ((vector signed short)buf.vec, 4);
775   c->CGV  = vec_splat ((vector signed short)buf.vec, 5);
776 #if 0
777 {
778 int i;
779 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
780 for (i=0; i<6;i++)
781   printf("%s %d ", v[i],buf.tmp[i] );
782   printf("\n");
783 }
784 #endif
785  return;
786 }
787
788
789 void
790 altivec_yuv2packedX (SwsContext *c,
791                        int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
792                        int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
793                        uint8_t *dest, int dstW, int dstY)
794 {
795   int i,j;
796   vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
797   vector signed short R0,G0,B0,R1,G1,B1;
798
799   vector unsigned char R,G,B;
800   vector unsigned char *out,*nout;
801
802   vector signed short   RND = vec_splat_s16(1<<3);
803   vector unsigned short SCL = vec_splat_u16(4);
804   unsigned long scratch[16] __attribute__ ((aligned (16)));
805
806   vector signed short *YCoeffs, *CCoeffs;
807
808   YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
809   CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
810
811   out = (vector unsigned char *)dest;
812
813   for(i=0; i<dstW; i+=16){
814     Y0 = RND;
815     Y1 = RND;
816     /* extract 16 coeffs from lumSrc */
817     for(j=0; j<lumFilterSize; j++) {
818       X0 = vec_ld (0,  &lumSrc[j][i]);
819       X1 = vec_ld (16, &lumSrc[j][i]);
820       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
821       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
822     }
823
824     U = RND;
825     V = RND;
826     /* extract 8 coeffs from U,V */
827     for(j=0; j<chrFilterSize; j++) {
828       X  = vec_ld (0, &chrSrc[j][i/2]);
829       U  = vec_mradds (X, CCoeffs[j], U);
830       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
831       V  = vec_mradds (X, CCoeffs[j], V);
832     }
833
834     /* scale and clip signals */
835     Y0 = vec_sra (Y0, SCL);
836     Y1 = vec_sra (Y1, SCL);
837     U  = vec_sra (U,  SCL);
838     V  = vec_sra (V,  SCL);
839
840     Y0 = vec_clip_s16 (Y0);
841     Y1 = vec_clip_s16 (Y1);
842     U  = vec_clip_s16 (U);
843     V  = vec_clip_s16 (V);
844
845     /* now we have
846       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
847       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
848
849       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
850       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
851       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
852     */
853
854     U0 = vec_mergeh (U,U);
855     V0 = vec_mergeh (V,V);
856
857     U1 = vec_mergel (U,U);
858     V1 = vec_mergel (V,V);
859
860     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
861     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
862
863     R  = vec_packclp (R0,R1);
864     G  = vec_packclp (G0,G1);
865     B  = vec_packclp (B0,B1);
866
867     switch(c->dstFormat) {
868       case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
869       case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
870       case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
871       case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
872       case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
873       case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
874       default:
875         {
876           /* If this is reached, the caller should have called yuv2packedXinC
877              instead. */
878           static int printed_error_message;
879           if(!printed_error_message) {
880             av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
881                     sws_format_name(c->dstFormat));
882             printed_error_message=1;
883           }
884           return;
885         }
886     }
887   }
888
889   if (i < dstW) {
890     i -= 16;
891
892     Y0 = RND;
893     Y1 = RND;
894     /* extract 16 coeffs from lumSrc */
895     for(j=0; j<lumFilterSize; j++) {
896       X0 = vec_ld (0,  &lumSrc[j][i]);
897       X1 = vec_ld (16, &lumSrc[j][i]);
898       Y0 = vec_mradds (X0, YCoeffs[j], Y0);
899       Y1 = vec_mradds (X1, YCoeffs[j], Y1);
900     }
901
902     U = RND;
903     V = RND;
904     /* extract 8 coeffs from U,V */
905     for(j=0; j<chrFilterSize; j++) {
906       X  = vec_ld (0, &chrSrc[j][i/2]);
907       U  = vec_mradds (X, CCoeffs[j], U);
908       X  = vec_ld (0, &chrSrc[j][i/2+2048]);
909       V  = vec_mradds (X, CCoeffs[j], V);
910     }
911
912     /* scale and clip signals */
913     Y0 = vec_sra (Y0, SCL);
914     Y1 = vec_sra (Y1, SCL);
915     U  = vec_sra (U,  SCL);
916     V  = vec_sra (V,  SCL);
917
918     Y0 = vec_clip_s16 (Y0);
919     Y1 = vec_clip_s16 (Y1);
920     U  = vec_clip_s16 (U);
921     V  = vec_clip_s16 (V);
922
923     /* now we have
924        Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
925        U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
926
927        Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
928        U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
929        V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
930     */
931
932     U0 = vec_mergeh (U,U);
933     V0 = vec_mergeh (V,V);
934
935     U1 = vec_mergel (U,U);
936     V1 = vec_mergel (V,V);
937
938     cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
939     cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
940
941     R  = vec_packclp (R0,R1);
942     G  = vec_packclp (G0,G1);
943     B  = vec_packclp (B0,B1);
944
945     nout = (vector unsigned char *)scratch;
946     switch(c->dstFormat) {
947       case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
948       case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
949       case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
950       case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
951       case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
952       case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
953       default:
954         /* Unreachable, I think. */
955         av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
956                 sws_format_name(c->dstFormat));
957         return;
958     }
959
960     memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
961   }
962
963 }