2 marc.hoffman@analog.com March 8, 2004
4 AltiVec acceleration for colorspace conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 reject defers to C simulation codes.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issue, however I have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
66 * This file is part of FFmpeg.
68 * FFmpeg is free software; you can redistribute it and/or modify
69 * it under the terms of the GNU General Public License as published by
70 * the Free Software Foundation; either version 2 of the License, or
71 * (at your option) any later version.
73 * FFmpeg is distributed in the hope that it will be useful,
74 * but WITHOUT ANY WARRANTY; without even the implied warranty of
75 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
76 * GNU General Public License for more details.
78 * You should have received a copy of the GNU General Public License
79 * along with FFmpeg; if not, write to the Free Software
80 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
94 #include "swscale_internal.h"
96 #undef PROFILE_THE_BEAST
99 typedef unsigned char ubyte;
100 typedef signed char sbyte;
103 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
104 homogeneous vector registers x0,x1,x2 are interleaved with the
107 o0 = vec_mergeh (x0,x1);
108 o1 = vec_perm (o0, x2, perm_rgb_0);
109 o2 = vec_perm (o0, x2, perm_rgb_1);
110 o3 = vec_mergel (x0,x1);
111 o4 = vec_perm (o3,o2,perm_rgb_2);
112 o5 = vec_perm (o3,o2,perm_rgb_3);
114 perm_rgb_0: o0(RG).h v1(B) --> o1*
120 perm_rgb_1: o0(RG).h v1(B) --> o2
126 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
132 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
/* Byte-permute control vectors used by vec_merge3. With vec_perm, indices
   0x00-0x0f select bytes of the first operand and 0x10-0x1f bytes of the
   second, so these tables weave a third channel into the byte pairs
   produced by vec_mergeh/vec_mergel to form packed 3-byte groups. */
140 const vector unsigned char
141 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), /* five full groups plus the first byte of the sixth */
143 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), /* rest of groups 5-7, then bytes 8-15 of operand 2 passed through */
145 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), /* first 8 bytes of operand 2, then pairs of operand 1 with one byte of operand 2 */
147 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); /* remaining bytes of both operands, ending on the last byte of operand 2 */
/* vec_merge3(x2,x1,x0,y0,y1,y2): interleave three byte vectors into three
   output vectors holding packed 3-byte groups. Note the reversed parameter
   naming: every group is emitted as (x0,x1,x2), so the THIRD macro argument
   lands first in each group. y0, y1 and y2 are assigned in output order and
   must be modifiable lvalues. */
150 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
152 typeof(x0) o0,o2,o3; \
153 o0 = vec_mergeh (x0,x1); /* x0/x1 byte pairs for elements 0-7 */ \
154 y0 = vec_perm (o0, x2, perm_rgb_0); /* first 16 output bytes */ \
155 o2 = vec_perm (o0, x2, perm_rgb_1); /* leftover of the first half plus x2 bytes 8-15 */ \
156 o3 = vec_mergel (x0,x1); /* x0/x1 byte pairs for elements 8-15 */ \
157 y1 = vec_perm (o3,o2,perm_rgb_2); /* middle 16 output bytes */ \
158 y2 = vec_perm (o3,o2,perm_rgb_3); /* last 16 output bytes */ \
/* vec_mstbgr24(x0,x1,x2,ptr): interleave the three channels with vec_merge3
   and store the three resulting vectors through ptr. The arguments are
   forwarded in the same order, so each stored 3-byte group is (x2,x1,x0).
   ptr is post-incremented once per store, so it must be a modifiable
   pointer and is left three elements further on. */
161 #define vec_mstbgr24(x0,x1,x2,ptr) \
163 typeof(x0) _0,_1,_2; \
164 vec_merge3 (x0,x1,x2,_0,_1,_2); \
165 vec_st (_0, 0, ptr++); \
166 vec_st (_1, 0, ptr++); \
167 vec_st (_2, 0, ptr++); \
/* vec_mstrgb24(x0,x1,x2,ptr): same as vec_mstbgr24 but the channels are
   handed to vec_merge3 in reverse, so each stored 3-byte group is
   (x0,x1,x2). ptr is post-incremented once per store and therefore ends up
   three elements further on. */
170 #define vec_mstrgb24(x0,x1,x2,ptr) \
172 typeof(x0) _0,_1,_2; \
173 vec_merge3 (x2,x1,x0,_0,_1,_2); \
174 vec_st (_0, 0, ptr++); \
175 vec_st (_1, 0, ptr++); \
176 vec_st (_2, 0, ptr++); \
179 /* pack the pixels in rgb0 format
/* vec_mstrgb32(T,x0,x1,x2,x3,ptr): interleave four byte vectors into packed
   4-byte groups (x0,x1,x2,x3) and store them as four vectors of type T at
   byte offsets 0, 16, 32 and 48 from ptr. The byte pairs x0/x1 and x2/x3 are
   merged first, then merged again as 16-bit units to form the groups.
   NOTE(review): the temporaries _0.._3 are declared outside the lines shown
   here - presumably of type T; confirm. */
183 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
186 _0 = vec_mergeh (x0,x1); /* elements 0-7: x0/x1 pairs */ \
187 _1 = vec_mergeh (x2,x3); /* elements 0-7: x2/x3 pairs */ \
188 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
189 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
190 vec_st (_2, 0*16, (T *)ptr); \
191 vec_st (_3, 1*16, (T *)ptr); \
192 _0 = vec_mergel (x0,x1); /* elements 8-15: x0/x1 pairs */ \
193 _1 = vec_mergel (x2,x3); /* elements 8-15: x2/x3 pairs */ \
194 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
195 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
196 vec_st (_2, 2*16, (T *)ptr); \
197 vec_st (_3, 3*16, (T *)ptr); \
204 | 1 -0.3441 -0.7142 |x| Cb|
211 typical yuv conversion works on Y: 0-255; this version has been optimized for jpeg decode.
219 (vector signed short) \
220 vec_perm(x,(typeof(x))AVV(0),\
221 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
222 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
224 (vector signed short) \
225 vec_perm(x,(typeof(x))AVV(0),\
226 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
227 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* vec_clip_s16(x): clamp every signed 16-bit element of x to the range
   16..235 (upper bound applied first with vec_min, then the lower bound
   with vec_max). */
229 #define vec_clip_s16(x) \
230 vec_max (vec_min (x, (vector signed short)AVV(235,235,235,235,235,235,235,235)),\
231 (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
/* vec_packclp(x,y): narrow two vectors of signed 16-bit values into one
   vector of unsigned bytes. Negative elements are first raised to 0 with
   vec_max; the upper bound is left to the saturating pack (vec_packs) on
   the values reinterpreted as unsigned shorts. x supplies the first eight
   result bytes and y the last eight. */
233 #define vec_packclp(x,y) \
234 (vector unsigned char)vec_packs \
235 ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
236 (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
238 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
/* Convert one vector of widened 16-bit Y, U and V samples to R, G and B
   using the coefficient vectors held in the SwsContext (CY, OY, CBU, CRV,
   CGU, CGV, CSHIFT). The results are written through R, G and B as signed
   16-bit elements; narrowing to bytes is left to the caller. */
241 static inline void cvtyuvtoRGB (SwsContext *c,
242 vector signed short Y, vector signed short U, vector signed short V,
243 vector signed short *R, vector signed short *G, vector signed short *B)
245 vector signed short vx,ux,uvx;
/* luma: multiply by CY and add the offset OY in one multiply-round-add */
247 Y = vec_mradds (Y, c->CY, c->OY);
/* re-centre both chroma channels by subtracting 128 */
248 U = vec_sub (U,(vector signed short)
249 vec_splat((vector signed short)AVV(128),0));
250 V = vec_sub (V,(vector signed short)
251 vec_splat((vector signed short)AVV(128),0));
253 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
254 ux = vec_sl (U, c->CSHIFT);
/* blue = luma + CBU term */
255 *B = vec_mradds (ux, c->CBU, Y);
257 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
258 vx = vec_sl (V, c->CSHIFT);
/* red = luma + CRV term */
259 *R = vec_mradds (vx, c->CRV, Y);
261 // uvx = ((CGU*u) + (CGV*v))>>15;
/* green = luma + CGU term + CGV term; chroma is not pre-shifted here */
262 uvx = vec_mradds (U, c->CGU, Y);
263 *G = vec_mradds (V, c->CGV, uvx);
268 ------------------------------------------------------------------------------
270 ------------------------------------------------------------------------------
/* DEFCSP420_CVT(name,out_pixels): generate a static converter named
   altivec_<name> that turns planar input with half-width chroma into packed
   RGB. out_pixels is one of the out_* store macros and selects the packed
   byte layout. Every outer pass handles two luma rows (y1i -> oute,
   y2i -> outo) that share one set of chroma terms; every inner pass
   handles 16 pixels of each row. */
274 #define DEFCSP420_CVT(name,out_pixels) \
275 static int altivec_##name (SwsContext *c, \
276 unsigned char **in, int *instrides, \
277 int srcSliceY, int srcSliceH, \
278 unsigned char **oplanes, int *outstrides) \
283 int instrides_scl[3]; \
284 vector unsigned char y0,y1; \
286 vector signed char u,v; \
288 vector signed short Y0,Y1,Y2,Y3; \
289 vector signed short U,V; \
290 vector signed short vx,ux,uvx; \
291 vector signed short vx0,ux0,uvx0; \
292 vector signed short vx1,ux1,uvx1; \
293 vector signed short R0,G0,B0; \
294 vector signed short R1,G1,B1; \
295 vector unsigned char R,G,B; \
297 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
298 vector unsigned char align_perm; \
300 vector signed short \
308 vector unsigned short lCSHIFT = c->CSHIFT; \
310 ubyte *y1i = in[0]; /* first luma row of the pair */ \
311 ubyte *y2i = in[0]+instrides[0]; /* luma row directly below it */ \
315 vector unsigned char *oute \
316 = (vector unsigned char *) \
317 (oplanes[0]+srcSliceY*outstrides[0]); /* output row for y1i */ \
318 vector unsigned char *outo \
319 = (vector unsigned char *) \
320 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); /* output row for y2i */ \
323 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
324 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
325 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
328 for (i=0;i<h/2;i++) { /* two luma rows per pass */ \
329 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); /* store-stream cache hint, channel 0 */ \
330 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); /* store-stream cache hint, channel 1 */ \
332 for (j=0;j<w/16;j++) { /* 16 pixels of each row per pass */ \
334 y1ivP = (vector unsigned char *)y1i; \
335 y2ivP = (vector unsigned char *)y2i; \
336 uivP = (vector unsigned char *)ui; \
337 vivP = (vector unsigned char *)vi; \
339 align_perm = vec_lvsl (0, y1i); /* lvsl + perm over two adjacent vectors: load from a possibly unaligned address */ \
340 y0 = (vector unsigned char) \
341 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
343 align_perm = vec_lvsl (0, y2i); \
344 y1 = (vector unsigned char) \
345 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
347 align_perm = vec_lvsl (0, ui); \
348 u = (vector signed char) \
349 vec_perm (uivP[0], uivP[1], align_perm); \
351 align_perm = vec_lvsl (0, vi); \
352 v = (vector signed char) \
353 vec_perm (vivP[0], vivP[1], align_perm); \
355 u = (vector signed char) \
356 vec_sub (u,(vector signed char) \
357 vec_splat((vector signed char)AVV(128),0)); /* re-centre chroma by subtracting 128 */ \
358 v = (vector signed char) \
359 vec_sub (v,(vector signed char) \
360 vec_splat((vector signed char)AVV(128),0)); \
362 U = vec_unpackh (u); /* widen the first eight chroma bytes to 16 bit */ \
363 V = vec_unpackh (v); \
371 Y0 = vec_mradds (Y0, lCY, lOY); /* luma scale and offset */ \
372 Y1 = vec_mradds (Y1, lCY, lOY); \
373 Y2 = vec_mradds (Y2, lCY, lOY); \
374 Y3 = vec_mradds (Y3, lCY, lOY); \
376 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
377 ux = vec_sl (U, lCSHIFT); \
378 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
379 ux0 = vec_mergeh (ux,ux); /* duplicate each chroma term for two adjacent pixels */ \
380 ux1 = vec_mergel (ux,ux); \
382 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
383 vx = vec_sl (V, lCSHIFT); \
384 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
385 vx0 = vec_mergeh (vx,vx); \
386 vx1 = vec_mergel (vx,vx); \
388 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
389 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
390 uvx = vec_mradds (V, lCGV, uvx); \
391 uvx0 = vec_mergeh (uvx,uvx); \
392 uvx1 = vec_mergel (uvx,uvx); \
394 R0 = vec_add (Y0,vx0); /* first row: luma plus chroma terms */ \
395 G0 = vec_add (Y0,uvx0); \
396 B0 = vec_add (Y0,ux0); \
397 R1 = vec_add (Y1,vx1); \
398 G1 = vec_add (Y1,uvx1); \
399 B1 = vec_add (Y1,ux1); \
401 R = vec_packclp (R0,R1); \
402 G = vec_packclp (G0,G1); \
403 B = vec_packclp (B0,B1); \
405 out_pixels(R,G,B,oute); \
407 R0 = vec_add (Y2,vx0); /* second row reuses the same chroma terms */ \
408 G0 = vec_add (Y2,uvx0); \
409 B0 = vec_add (Y2,ux0); \
410 R1 = vec_add (Y3,vx1); \
411 G1 = vec_add (Y3,uvx1); \
412 B1 = vec_add (Y3,ux1); \
413 R = vec_packclp (R0,R1); \
414 G = vec_packclp (G0,G1); \
415 B = vec_packclp (B0,B1); \
418 out_pixels(R,G,B,outo); \
427 outo += (outstrides[0])>>4; /* one output stride, counted in 16-byte vectors */ \
428 oute += (outstrides[0])>>4; \
430 ui += instrides_scl[1]; \
431 vi += instrides_scl[2]; \
432 y1i += instrides_scl[0]; \
433 y2i += instrides_scl[0]; \
/* Store macros handed to DEFCSP420_CVT and used by altivec_yuv2packedX.
   Each takes the packed byte vectors (a,b,c) in the order the callers pass
   them (R,G,B) and forwards them in the byte order named by the macro. The
   32-bit variants put an all-zero vector in the fourth channel. */
439 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
440 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
441 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
442 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
443 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
444 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
/* instantiate altivec_yuv2_abgr and altivec_yuv2_bgra */
446 DEFCSP420_CVT (yuv2_abgr, out_abgr)
448 DEFCSP420_CVT (yuv2_bgra, out_bgra)
/* Hand-expanded variant of the converter body generated by DEFCSP420_CVT.
   It loads luma with vec_ldl instead of the lvsl/perm sequence and uses the
   same chroma and colour arithmetic.
   NOTE(review): despite the _bgra32 name, the visible stores go through
   out_argb, and the second luma row is taken at in[0]+w rather than
   in[0]+instrides[0] - confirm whether this function is still meant to be
   used and whether those choices are intentional. */
450 static int altivec_yuv2_bgra32 (SwsContext *c,
451 unsigned char **in, int *instrides,
452 int srcSliceY, int srcSliceH,
453 unsigned char **oplanes, int *outstrides)
458 int instrides_scl[3];
459 vector unsigned char y0,y1;
461 vector signed char u,v;
463 vector signed short Y0,Y1,Y2,Y3;
464 vector signed short U,V;
465 vector signed short vx,ux,uvx;
466 vector signed short vx0,ux0,uvx0;
467 vector signed short vx1,ux1,uvx1;
468 vector signed short R0,G0,B0;
469 vector signed short R1,G1,B1;
470 vector unsigned char R,G,B;
472 vector unsigned char *uivP, *vivP;
473 vector unsigned char align_perm;
483 vector unsigned short lCSHIFT = c->CSHIFT;
486 ubyte *y2i = in[0]+w;
/* oute/outo: the two output rows written per outer pass */
490 vector unsigned char *oute
491 = (vector unsigned char *)
492 (oplanes[0]+srcSliceY*outstrides[0]);
493 vector unsigned char *outo
494 = (vector unsigned char *)
495 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
498 instrides_scl[0] = instrides[0];
499 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
500 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
/* two luma rows per outer pass, with store-stream cache hints for both output rows */
503 for (i=0;i<h/2;i++) {
504 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
505 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
507 for (j=0;j<w/16;j++) {
/* luma is loaded directly with vec_ldl; chroma uses the lvsl/perm unaligned-load sequence */
509 y0 = vec_ldl (0,y1i);
510 y1 = vec_ldl (0,y2i);
511 uivP = (vector unsigned char *)ui;
512 vivP = (vector unsigned char *)vi;
514 align_perm = vec_lvsl (0, ui);
515 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
517 align_perm = vec_lvsl (0, vi);
518 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
/* re-centre chroma by subtracting 128 */
519 u = (vector signed char)
520 vec_sub (u,(vector signed char)
521 vec_splat((vector signed char)AVV(128),0));
523 v = (vector signed char)
524 vec_sub (v, (vector signed char)
525 vec_splat((vector signed char)AVV(128),0));
/* luma scale and offset */
536 Y0 = vec_mradds (Y0, lCY, lOY);
537 Y1 = vec_mradds (Y1, lCY, lOY);
538 Y2 = vec_mradds (Y2, lCY, lOY);
539 Y3 = vec_mradds (Y3, lCY, lOY);
541 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
542 ux = vec_sl (U, lCSHIFT);
543 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
544 ux0 = vec_mergeh (ux,ux);
545 ux1 = vec_mergel (ux,ux);
547 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
548 vx = vec_sl (V, lCSHIFT);
549 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
550 vx0 = vec_mergeh (vx,vx);
551 vx1 = vec_mergel (vx,vx);
552 /* uvx = ((CGU*u) + (CGV*v))>>15 */
553 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
554 uvx = vec_mradds (V, lCGV, uvx);
555 uvx0 = vec_mergeh (uvx,uvx);
556 uvx1 = vec_mergel (uvx,uvx);
/* first row: add chroma terms to luma, pack with clipping, store */
557 R0 = vec_add (Y0,vx0);
558 G0 = vec_add (Y0,uvx0);
559 B0 = vec_add (Y0,ux0);
560 R1 = vec_add (Y1,vx1);
561 G1 = vec_add (Y1,uvx1);
562 B1 = vec_add (Y1,ux1);
563 R = vec_packclp (R0,R1);
564 G = vec_packclp (G0,G1);
565 B = vec_packclp (B0,B1);
567 out_argb(R,G,B,oute);
/* second row reuses the same chroma terms */
568 R0 = vec_add (Y2,vx0);
569 G0 = vec_add (Y2,uvx0);
570 B0 = vec_add (Y2,ux0);
571 R1 = vec_add (Y3,vx1);
572 G1 = vec_add (Y3,uvx1);
573 B1 = vec_add (Y3,ux1);
574 R = vec_packclp (R0,R1);
575 G = vec_packclp (G0,G1);
576 B = vec_packclp (B0,B1);
578 out_argb(R,G,B,outo);
/* advance the output pointers by one stride (in 16-byte vectors) and the inputs by their adjusted strides */
586 outo += (outstrides[0])>>4;
587 oute += (outstrides[0])>>4;
589 ui += instrides_scl[1];
590 vi += instrides_scl[2];
591 y1i += instrides_scl[0];
592 y2i += instrides_scl[0];
/* Instantiate altivec_yuv2_rgba, altivec_yuv2_argb, altivec_yuv2_rgb24 and
   altivec_yuv2_bgr24, each paired with the matching out_* store macro. */
600 DEFCSP420_CVT (yuv2_rgba, out_rgba)
601 DEFCSP420_CVT (yuv2_argb, out_argb)
602 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
603 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
606 // uyvy|uyvy|uyvy|uyvy
607 // 0123 4567 89ab cdef
/* vec_perm control vectors used by altivec_uyvy_rgb32 to split one packed
   UYVY vector into 16-bit U, V and Y elements. Index 0x10 selects a byte of
   the second vec_perm operand (an all-zero vector at the call sites), and
   the low indices select source bytes. In the rows visible here each U and
   V byte is used for two consecutive elements. */
609 const vector unsigned char
610 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
613 0x10,0x0c,0x10,0x0c),
614 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
617 0x10,0x0E,0x10,0x0E),
618 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
621 0x10,0x0D,0x10,0x0F);
624 this is so I can play live CCIR raw video
/* Convert packed UYVY input to 32-bit packed output, stored with out_rgba.
   Each inner pass loads two 16-byte vectors from img, splits each into
   Y, U and V with the demux_* tables, converts both halves with
   cvtyuvtoRGB and packs them into one group of 16 output pixels. */
626 static int altivec_uyvy_rgb32 (SwsContext *c,
627 unsigned char **in, int *instrides,
628 int srcSliceY, int srcSliceH,
629 unsigned char **oplanes, int *outstrides)
634 vector unsigned char uyvy;
635 vector signed short Y,U,V;
636 vector signed short R0,G0,B0,R1,G1,B1;
637 vector unsigned char R,G,B;
638 vector unsigned char *out;
/* output starts at row srcSliceY of the first destination plane */
642 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
645 for (j=0;j<w/16;j++) {
/* first 16 input bytes: split into U, V, Y and convert */
646 uyvy = vec_ld (0, img);
647 U = (vector signed short)
648 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
650 V = (vector signed short)
651 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
653 Y = (vector signed short)
654 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
656 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
/* next 16 input bytes: same treatment */
658 uyvy = vec_ld (16, img);
659 U = (vector signed short)
660 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
662 V = (vector signed short)
663 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
665 Y = (vector signed short)
666 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
668 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
/* narrow both halves to bytes with clipping, then store */
670 R = vec_packclp (R0,R1);
671 G = vec_packclp (G0,G1);
672 B = vec_packclp (B0,B1);
674 // vec_mstbgr24 (R,G,B, out);
675 out_rgba (R,G,B,out);
685 /* Ok currently the acceleration routine only supports
686 inputs of widths a multiple of 16
687 and heights a multiple of 2
689 So we just fall back to the C codes for this.
/* Pick an AltiVec conversion routine for the given context. The choice
   depends on c->flags (SWS_CPU_CAPS_ALTIVEC must be set), c->srcW (must be
   a multiple of 16), c->srcFormat and c->dstFormat. NULL is returned when
   the width check fails or when no routine matches the destination format
   (the default cases). A warning-level log line names the routine chosen. */
691 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
693 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
697 and this seems not to matter too much I tried a bunch of
698 videos with abnormal widths and MPlayer crashes elsewhere.
699 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
700 boom with X11 bad match.
/* widths that are not a multiple of 16 are rejected */
703 if ((c->srcW & 0xf) != 0) return NULL;
705 switch (c->srcFormat) {
706 case PIX_FMT_YUV410P:
707 case PIX_FMT_YUV420P:
708 /*case IMGFMT_CLPL: ??? */
/* planar sources: the height is additionally tested for oddness */
712 if ((c->srcH & 0x1) != 0)
715 switch(c->dstFormat){
717 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
718 return altivec_yuv2_rgb24;
720 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
721 return altivec_yuv2_bgr24;
723 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
724 return altivec_yuv2_argb;
726 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
727 return altivec_yuv2_abgr;
729 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
730 return altivec_yuv2_rgba;
732 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
733 return altivec_yuv2_bgra;
734 default: return NULL;
/* packed UYVY source: only the 32-bit routine is offered */
738 case PIX_FMT_UYVY422:
739 switch(c->dstFormat){
741 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
742 return altivec_uyvy_rgb32;
743 default: return NULL;
/* Derive the 16-bit conversion coefficients from inv_table and the
   brightness, contrast and saturation settings, then splat each one across
   a vector stored in the SwsContext (CY, OY, CRV, CBU, CGU, CGV). CSHIFT is
   set to a splat of 2. The six scalar values are staged in an aligned
   array that is read back as a vector. */
751 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
754 signed short tmp[8] __attribute__ ((aligned(16)));
755 vector signed short vec;
/* slots 0-5 hold cy, oy, crv, cbu, cgu and cgv in that order */
758 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
759 buf.tmp[1] = -256*brightness; //oy
760 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
761 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
762 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
763 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
/* broadcast each staged coefficient to all elements of its context vector */
766 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
767 c->CY = vec_splat ((vector signed short)buf.vec, 0);
768 c->OY = vec_splat ((vector signed short)buf.vec, 1);
769 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
770 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
771 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
772 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
/* diagnostic dump of the six coefficients by name */
776 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
778 printf("%s %d ", v[i],buf.tmp[i] );
/* Vertical scaler fused with the YUV to RGB conversion. For every group of
   16 output pixels the luma rows in lumSrc and the chroma rows in chrSrc
   are accumulated with the per-line coefficient vectors (c->vYCoeffsBank,
   c->vCCoeffsBank, indexed by dstY), shifted right by 4, clipped to
   16..235, converted with cvtyuvtoRGB and stored in the layout selected by
   c->dstFormat. The V samples are read 2048 elements after the U samples
   of the same chrSrc row.
   The same pipeline appears a second time further down, writing into the
   aligned scratch buffer instead of dest; its result is copied out with
   memcpy at the end. */
787 altivec_yuv2packedX (SwsContext *c,
788 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
789 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
790 uint8_t *dest, int dstW, int dstY)
793 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
794 vector signed short R0,G0,B0,R1,G1,B1;
796 vector unsigned char R,G,B;
797 vector unsigned char *out,*nout;
799 vector signed short RND = vec_splat_s16(1<<3);
800 vector unsigned short SCL = vec_splat_u16(4);
801 unsigned long scratch[16] __attribute__ ((aligned (16)));
803 vector signed short *YCoeffs, *CCoeffs;
/* coefficient rows for this output line */
805 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
806 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
808 out = (vector unsigned char *)dest;
/* main loop: 16 output pixels per pass, stored straight to dest */
810 for (i=0; i<dstW; i+=16){
813 /* extract 16 coeffs from lumSrc */
814 for (j=0; j<lumFilterSize; j++) {
815 X0 = vec_ld (0, &lumSrc[j][i]);
816 X1 = vec_ld (16, &lumSrc[j][i]);
817 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
818 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
823 /* extract 8 coeffs from U,V */
824 for (j=0; j<chrFilterSize; j++) {
825 X = vec_ld (0, &chrSrc[j][i/2]);
826 U = vec_mradds (X, CCoeffs[j], U);
827 X = vec_ld (0, &chrSrc[j][i/2+2048]);
828 V = vec_mradds (X, CCoeffs[j], V);
831 /* scale and clip signals */
832 Y0 = vec_sra (Y0, SCL);
833 Y1 = vec_sra (Y1, SCL);
834 U = vec_sra (U, SCL);
835 V = vec_sra (V, SCL);
837 Y0 = vec_clip_s16 (Y0);
838 Y1 = vec_clip_s16 (Y1);
839 U = vec_clip_s16 (U);
840 V = vec_clip_s16 (V);
843 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
844 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
846 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
847 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
848 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
/* duplicate every chroma sample so it lines up with two luma samples */
851 U0 = vec_mergeh (U,U);
852 V0 = vec_mergeh (V,V);
854 U1 = vec_mergel (U,U);
855 V1 = vec_mergel (V,V);
857 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
858 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
860 R = vec_packclp (R0,R1);
861 G = vec_packclp (G0,G1);
862 B = vec_packclp (B0,B1);
/* store in the byte layout of the destination format */
864 switch(c->dstFormat) {
865 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
866 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
867 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
868 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
869 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
870 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
873 /* If this is reached, the caller should have called yuv2packedXinC
875 static int printed_error_message;
876 if (!printed_error_message) {
877 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
878 sws_format_name(c->dstFormat));
879 printed_error_message=1;
891 /* extract 16 coeffs from lumSrc */
892 for (j=0; j<lumFilterSize; j++) {
893 X0 = vec_ld (0, &lumSrc[j][i]);
894 X1 = vec_ld (16, &lumSrc[j][i]);
895 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
896 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
901 /* extract 8 coeffs from U,V */
902 for (j=0; j<chrFilterSize; j++) {
903 X = vec_ld (0, &chrSrc[j][i/2]);
904 U = vec_mradds (X, CCoeffs[j], U);
905 X = vec_ld (0, &chrSrc[j][i/2+2048]);
906 V = vec_mradds (X, CCoeffs[j], V);
909 /* scale and clip signals */
910 Y0 = vec_sra (Y0, SCL);
911 Y1 = vec_sra (Y1, SCL);
912 U = vec_sra (U, SCL);
913 V = vec_sra (V, SCL);
915 Y0 = vec_clip_s16 (Y0);
916 Y1 = vec_clip_s16 (Y1);
917 U = vec_clip_s16 (U);
918 V = vec_clip_s16 (V);
921 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
922 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
924 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
925 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
926 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
929 U0 = vec_mergeh (U,U);
930 V0 = vec_mergeh (V,V);
932 U1 = vec_mergel (U,U);
933 V1 = vec_mergel (V,V);
935 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
936 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
938 R = vec_packclp (R0,R1);
939 G = vec_packclp (G0,G1);
940 B = vec_packclp (B0,B1);
/* this copy of the pipeline stores into the aligned scratch buffer */
942 nout = (vector unsigned char *)scratch;
943 switch(c->dstFormat) {
944 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
945 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
946 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
947 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
948 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
949 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
951 /* Unreachable, I think. */
952 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
953 sws_format_name(c->dstFormat));
/* NOTE(review): the copy below looks suspicious. For a tail of (dstW-i)
   pixels a 32-bit format would need (dstW-i)*4 bytes, not (dstW-i)/4, and
   the uint32_t indexing of dest does not match the 3-byte pixels of the
   RGB24 and BGR24 cases. Confirm against the callers before relying on it. */
957 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);