2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 rejected images fall back to the C (non-AltiVec) code paths.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
76 #include "swscale_internal.h"
78 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff
80 #undef PROFILE_THE_BEAST
/* Shorthand names for unsigned and signed char; ubyte is the type of the
   luma row pointers used by the converters in this file. */
83 typedef unsigned char ubyte;
84 typedef signed char sbyte;
87 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
88 homogeneous vector registers x0,x1,x2 are interleaved with the
91 o0 = vec_mergeh (x0,x1);
92 o1 = vec_perm (o0, x2, perm_rgb_0);
93 o2 = vec_perm (o0, x2, perm_rgb_1);
94 o3 = vec_mergel (x0,x1);
95 o4 = vec_perm (o3,o2,perm_rgb_2);
96 o5 = vec_perm (o3,o2,perm_rgb_3);
98 perm_rgb_0: o0(RG).h v1(B) --> o1*
104 perm_rgb_1: o0(RG).h v1(B) --> o2
110 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
116 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
/* Permute-control vectors consumed by vec_merge3. Each byte is a vec_perm
   selector: 0x00-0x0f pick from the first operand, 0x10-0x1f from the second.
   perm_rgb_0 and perm_rgb_1 weave a third channel into the merged high pairs;
   perm_rgb_1 additionally carries bytes 8-15 of that third channel through in
   its upper half so that perm_rgb_2 and perm_rgb_3 can weave them into the
   merged low pairs. */
124 const vector unsigned char
125 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
126 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
127 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
128 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
129 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
130 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
131 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
132 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/* Interleave three 16-byte channel vectors into 48 bytes of packed triples,
   assigned in order to y0, y1 and y2.
   Mind the parameter naming: the macro's THIRD argument (x0) supplies the
   first byte of every triple, the second argument (x1) the middle byte and
   the FIRST argument (x2) the last byte.
   Uses the file-scope tables perm_rgb_0..perm_rgb_3; the temporaries take
   their type from x0 via typeof. */
134 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
136 typeof(x0) o0,o2,o3; \
137 o0 = vec_mergeh (x0,x1); \
138 y0 = vec_perm (o0, x2, perm_rgb_0);\
139 o2 = vec_perm (o0, x2, perm_rgb_1);\
140 o3 = vec_mergel (x0,x1); \
141 y1 = vec_perm (o3,o2,perm_rgb_2); \
142 y2 = vec_perm (o3,o2,perm_rgb_3); \
/* Interleave three channel vectors with vec_merge3 (arguments forwarded in
   the order x0,x1,x2) and store the three resulting vectors through ptr.
   ptr must be a modifiable vector pointer: it is post-incremented after each
   store, so it ends up three vectors (48 bytes) further on. */
145 #define vec_mstrgb24(x0,x1,x2,ptr) \
147 typeof(x0) _0,_1,_2; \
148 vec_merge3 (x0,x1,x2,_0,_1,_2); \
149 vec_st (_0, 0, ptr++); \
150 vec_st (_1, 0, ptr++); \
151 vec_st (_2, 0, ptr++); \
/* Same as vec_mstrgb24 except that the channel arguments are forwarded to
   vec_merge3 in reversed order (x2,x1,x0). Stores three vectors through ptr
   and post-increments ptr after each store (48 bytes in total). */
154 #define vec_mstbgr24(x0,x1,x2,ptr) \
156 typeof(x0) _0,_1,_2; \
157 vec_merge3 (x2,x1,x0,_0,_1,_2); \
158 vec_st (_0, 0, ptr++); \
159 vec_st (_1, 0, ptr++); \
160 vec_st (_2, 0, ptr++); \
163 /* pack the pixels in rgb0 format
/* Interleave four 16-byte channel vectors into sixteen 4-byte pixels and
   store them as four vectors at byte offsets 0, 16, 32 and 48 from ptr.
   Within each pixel the bytes appear in argument order x0,x1,x2,x3.
   T is the vector type used for the casts and the stores.
   The lines shown here do not advance ptr.
   NOTE(review): the declaration of the temporaries _0.._3 is not among the
   lines shown — confirm against the full macro. */
167 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
170 _0 = vec_mergeh (x0,x1); \
171 _1 = vec_mergeh (x2,x3); \
172 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
173 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
174 vec_st (_2, 0*16, (T *)ptr); \
175 vec_st (_3, 1*16, (T *)ptr); \
176 _0 = vec_mergel (x0,x1); \
177 _1 = vec_mergel (x2,x3); \
178 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
179 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
180 vec_st (_2, 2*16, (T *)ptr); \
181 vec_st (_3, 3*16, (T *)ptr); \
188 | 1 -0.3441 -0.7142 |x| Cb|
195 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
203 (vector signed short) \
204 vec_perm(x,(typeof(x))AVV(0),\
205 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
206 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
208 (vector signed short) \
209 vec_perm(x,(typeof(x))AVV(0),\
210 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
211 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* Clamp every element of x to the range [16, 235]. */
213 #define vec_clip(x) \
214 vec_max (vec_min (x, (typeof(x))AVV(235)), (typeof(x))AVV(16))
/* Clamp x and y with vec_clip, then pack the two vectors into one vector of
   unsigned bytes with vec_pack. */
216 #define vec_packclp_a(x,y) \
217 (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
/* Pack two signed-short vectors into unsigned bytes: negative elements are
   first raised to 0 with vec_max, the result is reinterpreted as unsigned
   short and narrowed with vec_packs, which supplies the upper clamp, so no
   vec_min is needed. */
219 #define vec_packclp(x,y) \
220 (vector unsigned char)vec_packs \
221 ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
222 (vector unsigned short)vec_max (y,(vector signed short) AVV(0)))
224 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
227 static inline void cvtyuvtoRGB (SwsContext *c,
228 vector signed short Y, vector signed short U, vector signed short V,
229 vector signed short *R, vector signed short *G, vector signed short *B)
231 vector signed short vx,ux,uvx;
233 Y = vec_mradds (Y, c->CY, c->OY);
234 U = vec_sub (U,(vector signed short)
235 vec_splat((vector signed short)AVV(128),0));
236 V = vec_sub (V,(vector signed short)
237 vec_splat((vector signed short)AVV(128),0));
239 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
240 ux = vec_sl (U, c->CSHIFT);
241 *B = vec_mradds (ux, c->CBU, Y);
243 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
244 vx = vec_sl (V, c->CSHIFT);
245 *R = vec_mradds (vx, c->CRV, Y);
247 // uvx = ((CGU*u) + (CGV*v))>>15;
248 uvx = vec_mradds (U, c->CGU, Y);
249 *G = vec_mradds (V, c->CGV, uvx);
254 ------------------------------------------------------------------------------
256 ------------------------------------------------------------------------------
/* DEFCSP420_CVT(name, out_pixels)
   Expands to the definition of `static int altivec_<name>(...)`, a planar
   4:2:0 to packed-RGB slice converter.
     name        suffix of the generated function
     out_pixels  store macro invoked as out_pixels(R,G,B,ptr) for 16 pixels
   What the generated function does, as far as the lines shown establish:
     - the outer loop runs h/2 times and handles two rows per pass; oute and
       outo are the output pointers for the first and the following row;
     - the inner loop runs w/16 times, i.e. 16 pixels per pass;
     - vec_dstst issues store-stream hints for both output rows;
     - luma is loaded with vec_ldl from y1i/y2i; chroma is loaded unaligned
       (vec_lvsl + vec_perm over two adjacent vectors), 128 is subtracted and
       the high eight bytes are widened with vec_unpackh;
     - the chroma terms ux/vx/uvx are computed once, duplicated across pixel
       pairs with vec_mergeh/vec_mergel and reused for both rows;
     - results are narrowed with vec_packclp before out_pixels stores them.
   NOTE(review): w, h, i, j, ui, vi, lCY, lOY, lCRV, lCBU, lCGU, lCGV and the
   derivation of Y0..Y3 from y0/y1 are referenced but not defined in the
   lines shown here — confirm against the full macro.
   NOTE(review): y2i starts at in[0]+w while rows are later advanced by
   instrides[0]; this looks like it assumes the luma stride equals w —
   confirm. */
260 #define DEFCSP420_CVT(name,out_pixels) \
261 static int altivec_##name (SwsContext *c, \
262 unsigned char **in, int *instrides, \
263 int srcSliceY, int srcSliceH, \
264 unsigned char **oplanes, int *outstrides) \
269 int instrides_scl[3]; \
270 vector unsigned char y0,y1; \
272 vector signed char u,v; \
274 vector signed short Y0,Y1,Y2,Y3; \
275 vector signed short U,V; \
276 vector signed short vx,ux,uvx; \
277 vector signed short vx0,ux0,uvx0; \
278 vector signed short vx1,ux1,uvx1; \
279 vector signed short R0,G0,B0; \
280 vector signed short R1,G1,B1; \
281 vector unsigned char R,G,B; \
283 vector unsigned char *uivP, *vivP; \
284 vector unsigned char align_perm; \
286 vector signed short \
294 vector unsigned short lCSHIFT = c->CSHIFT; \
296 ubyte *y1i = in[0]; \
297 ubyte *y2i = in[0]+w; \
301 vector unsigned char *oute \
302 = (vector unsigned char *) \
303 (oplanes[0]+srcSliceY*outstrides[0]); \
304 vector unsigned char *outo \
305 = (vector unsigned char *) \
306 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
309 instrides_scl[0] = instrides[0]; \
310 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
311 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
314 for (i=0;i<h/2;i++) { \
315 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
316 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
318 for (j=0;j<w/16;j++) { \
320 y0 = vec_ldl (0,y1i); \
321 y1 = vec_ldl (0,y2i); \
322 uivP = (vector unsigned char *)ui; \
323 vivP = (vector unsigned char *)vi; \
325 align_perm = vec_lvsl (0, ui); \
326 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
328 align_perm = vec_lvsl (0, vi); \
329 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
331 u = (vector signed char) \
332 vec_sub (u,(vector signed char) \
333 vec_splat((vector signed char)AVV(128),0));\
334 v = (vector signed char) \
335 vec_sub (v,(vector signed char) \
336 vec_splat((vector signed char)AVV(128),0));\
338 U = vec_unpackh (u); \
339 V = vec_unpackh (v); \
347 Y0 = vec_mradds (Y0, lCY, lOY); \
348 Y1 = vec_mradds (Y1, lCY, lOY); \
349 Y2 = vec_mradds (Y2, lCY, lOY); \
350 Y3 = vec_mradds (Y3, lCY, lOY); \
352 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
353 ux = vec_sl (U, lCSHIFT); \
354 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
355 ux0 = vec_mergeh (ux,ux); \
356 ux1 = vec_mergel (ux,ux); \
358 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
359 vx = vec_sl (V, lCSHIFT); \
360 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
361 vx0 = vec_mergeh (vx,vx); \
362 vx1 = vec_mergel (vx,vx); \
364 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
365 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
366 uvx = vec_mradds (V, lCGV, uvx); \
367 uvx0 = vec_mergeh (uvx,uvx); \
368 uvx1 = vec_mergel (uvx,uvx); \
370 R0 = vec_add (Y0,vx0); \
371 G0 = vec_add (Y0,uvx0); \
372 B0 = vec_add (Y0,ux0); \
373 R1 = vec_add (Y1,vx1); \
374 G1 = vec_add (Y1,uvx1); \
375 B1 = vec_add (Y1,ux1); \
377 R = vec_packclp (R0,R1); \
378 G = vec_packclp (G0,G1); \
379 B = vec_packclp (B0,B1); \
381 out_pixels(R,G,B,oute); \
383 R0 = vec_add (Y2,vx0); \
384 G0 = vec_add (Y2,uvx0); \
385 B0 = vec_add (Y2,ux0); \
386 R1 = vec_add (Y3,vx1); \
387 G1 = vec_add (Y3,uvx1); \
388 B1 = vec_add (Y3,ux1); \
389 R = vec_packclp (R0,R1); \
390 G = vec_packclp (G0,G1); \
391 B = vec_packclp (B0,B1); \
394 out_pixels(R,G,B,outo); \
403 outo += (outstrides[0])>>4; \
404 oute += (outstrides[0])>>4; \
406 ui += instrides_scl[1]; \
407 vi += instrides_scl[2]; \
408 y1i += instrides_scl[0]; \
409 y2i += instrides_scl[0]; \
/* Store macros with the common shape out_xxx(a,b,c,ptr), where a,b,c are the
   packed R,G,B byte vectors. The four 32-bit variants call vec_mstrgb32 with
   an all-zero vector in the alpha position; the macro name spells the byte
   order written for each pixel. The two 24-bit variants call the
   vec_mstrgb24 / vec_mstbgr24 store macros.
   NOTE(review): out_bgr24 reverses its arguments and vec_mstbgr24 reverses
   them again, so out_rgb24(a,b,c,ptr) and out_bgr24(a,b,c,ptr) both end up
   as vec_merge3(a,b,c,...) and write the same byte order — confirm which of
   the two is intended to differ. */
415 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
416 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
417 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
418 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
419 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
420 #define out_bgr24(a,b,c,ptr) vec_mstbgr24(c,b,a,ptr)
/* Instantiate altivec_yuv2_abgr32 and altivec_yuv2_bgra32.
   NOTE(review): yuv2_bgra32 is generated with out_argb rather than out_bgra —
   confirm this is intentional. */
422 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
424 DEFCSP420_CVT (yuv2_bgra32, out_argb)
/* Hand-expanded copy of the DEFCSP420_CVT body with out_argb as the store
   macro: converts a planar 4:2:0 slice two rows at a time (h/2 outer passes,
   w/16 inner passes of 16 pixels) and writes 32-bit pixels to oplanes[0].
   Chroma is loaded unaligned, biased by 128, scaled by the context
   coefficients and shared between the two rows; results are narrowed with
   vec_packclp.
   NOTE(review): a function of the same name is also generated by
   DEFCSP420_CVT (yuv2_bgra32, out_argb); presumably a preprocessor
   conditional that is not visible here selects one of the two — confirm.
   NOTE(review): w, h, i, j, ui, vi, y1i, U, V setup, lCY..lCGV and the
   Y0..Y3 derivation are referenced but not defined in the lines shown. */
426 static int altivec_yuv2_bgra32 (SwsContext *c,
427 unsigned char **in, int *instrides,
428 int srcSliceY, int srcSliceH,
429 unsigned char **oplanes, int *outstrides)
434 int instrides_scl[3];
435 vector unsigned char y0,y1;
437 vector signed char u,v;
439 vector signed short Y0,Y1,Y2,Y3;
440 vector signed short U,V;
441 vector signed short vx,ux,uvx;
442 vector signed short vx0,ux0,uvx0;
443 vector signed short vx1,ux1,uvx1;
444 vector signed short R0,G0,B0;
445 vector signed short R1,G1,B1;
446 vector unsigned char R,G,B;
448 vector unsigned char *uivP, *vivP;
449 vector unsigned char align_perm;
459 vector unsigned short lCSHIFT = c->CSHIFT;
462 ubyte *y2i = in[0]+w;
466 vector unsigned char *oute
467 = (vector unsigned char *)
468 (oplanes[0]+srcSliceY*outstrides[0]);
469 vector unsigned char *outo
470 = (vector unsigned char *)
471 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
474 instrides_scl[0] = instrides[0];
475 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
476 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
479 for (i=0;i<h/2;i++) {
480 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
481 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
483 for (j=0;j<w/16;j++) {
485 y0 = vec_ldl (0,y1i);
486 y1 = vec_ldl (0,y2i);
487 uivP = (vector unsigned char *)ui;
488 vivP = (vector unsigned char *)vi;
490 align_perm = vec_lvsl (0, ui);
491 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
493 align_perm = vec_lvsl (0, vi);
494 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
495 u = (vector signed char)
496 vec_sub (u,(vector signed char)
497 vec_splat((vector signed char)AVV(128),0));
499 v = (vector signed char)
500 vec_sub (v, (vector signed char)
501 vec_splat((vector signed char)AVV(128),0));
512 Y0 = vec_mradds (Y0, lCY, lOY);
513 Y1 = vec_mradds (Y1, lCY, lOY);
514 Y2 = vec_mradds (Y2, lCY, lOY);
515 Y3 = vec_mradds (Y3, lCY, lOY);
517 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
518 ux = vec_sl (U, lCSHIFT);
519 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
520 ux0 = vec_mergeh (ux,ux);
521 ux1 = vec_mergel (ux,ux);
523 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
524 vx = vec_sl (V, lCSHIFT);
525 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
526 vx0 = vec_mergeh (vx,vx);
527 vx1 = vec_mergel (vx,vx);
528 /* uvx = ((CGU*u) + (CGV*v))>>15 */
529 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
530 uvx = vec_mradds (V, lCGV, uvx);
531 uvx0 = vec_mergeh (uvx,uvx);
532 uvx1 = vec_mergel (uvx,uvx);
/* first row of the pair */
533 R0 = vec_add (Y0,vx0);
534 G0 = vec_add (Y0,uvx0);
535 B0 = vec_add (Y0,ux0);
536 R1 = vec_add (Y1,vx1);
537 G1 = vec_add (Y1,uvx1);
538 B1 = vec_add (Y1,ux1);
539 R = vec_packclp (R0,R1);
540 G = vec_packclp (G0,G1);
541 B = vec_packclp (B0,B1);
543 out_argb(R,G,B,oute);
/* second row of the pair, reusing the same chroma terms */
544 R0 = vec_add (Y2,vx0);
545 G0 = vec_add (Y2,uvx0);
546 B0 = vec_add (Y2,ux0);
547 R1 = vec_add (Y3,vx1);
548 G1 = vec_add (Y3,uvx1);
549 B1 = vec_add (Y3,ux1);
550 R = vec_packclp (R0,R1);
551 G = vec_packclp (G0,G1);
552 B = vec_packclp (B0,B1);
554 out_argb(R,G,B,outo);
562 outo += (outstrides[0])>>4;
563 oute += (outstrides[0])>>4;
565 ui += instrides_scl[1];
566 vi += instrides_scl[2];
567 y1i += instrides_scl[0];
568 y2i += instrides_scl[0];
/* Instantiate the remaining 4:2:0 converters: altivec_yuv2_rgba32,
   altivec_yuv2_argb32, altivec_yuv2_rgb24 and altivec_yuv2_bgr24, each with
   the store macro of the matching name. */
576 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
577 DEFCSP420_CVT (yuv2_argb32, out_argb)
578 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
579 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
582 // uyvy|uyvy|uyvy|uyvy
583 // 0123 4567 89ab cdef
/* vec_perm selectors that pull the U, V and Y bytes out of a packed UYVY
   vector into 16-bit lanes. Selector 0x10 picks byte 0 of the second
   vec_perm operand, which is an all-zero vector at the call sites, so each
   sample is zero-extended. In the rows shown, demux_u and demux_v repeat
   each chroma index twice (one chroma sample per pixel pair) while demux_y
   takes the odd byte positions.
   NOTE(review): the middle rows of each table are not among the lines shown
   here. */
585 const vector unsigned char
586 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
589 0x10,0x0c,0x10,0x0c),
590 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
593 0x10,0x0E,0x10,0x0E),
594 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
597 0x10,0x0D,0x10,0x0F);
600 this is so I can play live CCIR raw video
/* Convert packed UYVY input to 32-bit pixels written to oplanes[0], starting
   at row srcSliceY.
   Each inner-loop pass (w/16 passes) handles 16 pixels: two 16-byte loads
   from img, each split into Y/U/V 16-bit lanes with demux_y/demux_u/demux_v,
   converted by cvtyuvtoRGB, then narrowed with vec_packclp and stored with
   out_rgba (zero in the fourth byte of every pixel).
   NOTE(review): w, j, img, the enclosing row loop and the advance of img and
   out between passes are not among the lines shown — confirm against the
   full function. */
602 static int altivec_uyvy_rgb32 (SwsContext *c,
603 unsigned char **in, int *instrides,
604 int srcSliceY, int srcSliceH,
605 unsigned char **oplanes, int *outstrides)
610 vector unsigned char uyvy;
611 vector signed short Y,U,V;
612 vector signed short vx,ux,uvx;
613 vector signed short R0,G0,B0,R1,G1,B1;
614 vector unsigned char R,G,B;
615 vector unsigned char *out;
619 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
622 for (j=0;j<w/16;j++) {
/* first 8 pixels */
623 uyvy = vec_ld (0, img);
624 U = (vector signed short)
625 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
627 V = (vector signed short)
628 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
630 Y = (vector signed short)
631 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
633 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
/* next 8 pixels */
635 uyvy = vec_ld (16, img);
636 U = (vector signed short)
637 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
639 V = (vector signed short)
640 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
642 Y = (vector signed short)
643 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
645 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
647 R = vec_packclp (R0,R1);
648 G = vec_packclp (G0,G1);
649 B = vec_packclp (B0,B1);
651 // vec_mstbgr24 (R,G,B, out);
652 out_rgba (R,G,B,out);
662 /* Ok currently the acceleration routine only supports
663 inputs of widths a multiple of 16
664 and heights a multiple of 2
666 So we just fall back to the C codes for this.
/* Pick an AltiVec conversion routine for the given context. As far as the
   lines shown here establish:
     - the c->flags & SWS_CPU_CAPS_ALTIVEC test gates the whole selection;
     - a source width that is not a multiple of 16 yields NULL;
     - the routine is chosen by c->srcFormat and then by c->dstFormat, and
       an unmatched destination format yields NULL;
     - every successful choice is announced through MSG_WARN.
   NOTE(review): the case labels and the statements that follow the flags
   test and the odd-height test are not visible here — confirm which formats
   map to which routine and what those two paths return. */
668 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
670 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
674 and this seems not to matter too much I tried a bunch of
675 videos with abnormal widths and mplayer crashes else where.
676 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
677 boom with X11 bad match.
680 if ((c->srcW & 0xf) != 0) return NULL;
682 switch (c->srcFormat) {
693 if ((c->srcH & 0x1) != 0)
696 switch(c->dstFormat){
698 MSG_WARN("ALTIVEC: Color Space RGB24\n");
699 return altivec_yuv2_rgb24;
701 MSG_WARN("ALTIVEC: Color Space BGR24\n");
702 return altivec_yuv2_bgr24;
704 MSG_WARN("ALTIVEC: Color Space ARGB32\n");
705 return altivec_yuv2_argb32;
707 MSG_WARN("ALTIVEC: Color Space BGRA32\n");
708 // return profile_altivec_bgra32;
710 return altivec_yuv2_bgra32;
711 default: return NULL;
716 switch(c->dstFormat){
718 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
719 return altivec_uyvy_rgb32;
720 default: return NULL;
/*
 * Divide f by 65536 with rounding to nearest (add 1<<15, shift right by 16)
 * and clamp the result to the signed 16-bit range. The value is returned as
 * its 16-bit two's-complement bit pattern:
 *   - results above  0x7FFF return 0x7FFF
 *   - results below -0x7FFF return 0x8000
 *   - anything else returns the low 16 bits of the result
 *
 * The intermediate is kept in 64 bits so the clamp sees the real magnitude;
 * narrowing to int first would let large inputs wrap around instead of
 * saturating.
 */
static uint16_t roundToInt16(int64_t f){
	int64_t r;

	/* Adding the rounding term to a value this large would overflow. */
	if (f > INT64_MAX - (1<<15)) return 0x7FFF;

	r = (f + (1<<15)) >> 16;
	if (r < -0x7FFF) return 0x8000;
	if (r >  0x7FFF) return 0x7FFF;
	return (uint16_t)r;
}
/* Compute the AltiVec conversion coefficients for a context.
     c           context whose CSHIFT, CY, OY, CRV, CBU, CGU and CGV vectors
                 are written
     inv_table   four colour-space coefficients; [0] feeds crv, [1] cbu,
                 [2] cgu and [3] cgv
     brightness, contrast, saturation
                 adjustment values folded into the coefficients
   Six 16-bit values are computed into buf.tmp[0..5] and each one is then
   broadcast to every lane of the matching context vector with vec_splat.
   CSHIFT is set to 2 in every lane. The cgu and cgv terms are negated.
   The trailing lines print the six values by name.
   NOTE(review): the declaration that makes tmp and vec members of buf, and
   whatever guards the printf loop, are not among the lines shown. */
735 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
738 signed short tmp[8] __attribute__ ((aligned(16)));
739 vector signed short vec;
742 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy
743 buf.tmp[1] = -256*brightness; //oy
744 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
745 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
746 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
747 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
750 c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);
751 c->CY = vec_splat ((vector signed short)buf.vec, 0);
752 c->OY = vec_splat ((vector signed short)buf.vec, 1);
753 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
754 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
755 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
756 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
760 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
762 printf("%s %d ", v[i],buf.tmp[i] );
/* Vertical scaling combined with YUV to packed-RGB conversion for one output
   line.
     c                         context; supplies vLumFilter, vChrFilter,
                               dstH, dstFormat and the colour coefficients
     lumFilter, chrFilter      not referenced in the lines shown
     lumSrc, lumFilterSize     luma source rows and number of filter taps
     chrSrc, chrFilterSize     chroma source rows and taps; V samples are
                               read 2048 elements after the U samples
     dest, dstW, dstY          output line, its width, and its row index
                               (used to select the coefficient rows)
   Outline, as far as the lines shown establish:
     1. two coefficient banks are allocated and filled from c->vLumFilter and
        c->vChrFilter;
     2. for every 16 output pixels the taps are accumulated with vec_mradds,
        shifted right by SCL, the chroma is duplicated across pixel pairs,
        cvtyuvtoRGB converts, vec_packclp narrows, and an out_* macro chosen
        by c->dstFormat stores to dest;
     3. a second copy of the same pipeline writes into the local scratch
        buffer, which is then copied to dest with memcpy;
     4. the banks are freed.
   NOTE(review): the return type, the declarations of i, j and p, the
   accumulator initialisation and the condition under which the scratch path
   runs are not among the lines shown — confirm against the full function. */
771 altivec_yuv2packedX (SwsContext *c,
772 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
773 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
774 uint8_t *dest, int dstW, int dstY)
777 short tmp __attribute__((aligned (16)));
780 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
781 vector signed short R0,G0,B0,R1,G1,B1;
783 vector unsigned char R,G,B,pels[3];
784 vector unsigned char *out,*nout;
786 vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0);
787 vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
788 unsigned long scratch[16] __attribute__ ((aligned (16)));
790 vector signed short *vYCoeffsBank, *vCCoeffsBank;
792 vector signed short *YCoeffs, *CCoeffs;
/* The banks hold one vector per filter tap per destination row and are
   allocated on every call.
   NOTE(review): no NULL check on the memalign results is visible before the
   fill loops — confirm. */
794 vYCoeffsBank = memalign (16, sizeof (vector signed short)*lumFilterSize*c->dstH);
795 vCCoeffsBank = memalign (16, sizeof (vector signed short)*chrFilterSize*c->dstH);
797 for (i=0;i<lumFilterSize*c->dstH;i++) {
798 tmp = c->vLumFilter[i];
799 p = &vYCoeffsBank[i];
804 for (i=0;i<chrFilterSize*c->dstH;i++) {
805 tmp = c->vChrFilter[i];
806 p = &vCCoeffsBank[i];
/* select the coefficient rows that belong to this output line */
811 YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
812 CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
814 out = (vector unsigned char *)dest;
816 for(i=0; i<dstW; i+=16){
819 /* extract 16 coeffs from lumSrc */
820 for(j=0; j<lumFilterSize; j++) {
821 X0 = vec_ld (0, &lumSrc[j][i]);
822 X1 = vec_ld (16, &lumSrc[j][i]);
823 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
824 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
829 /* extract 8 coeffs from U,V */
830 for(j=0; j<chrFilterSize; j++) {
831 X = vec_ld (0, &chrSrc[j][i/2]);
832 U = vec_mradds (X, CCoeffs[j], U);
833 X = vec_ld (0, &chrSrc[j][i/2+2048]);
834 V = vec_mradds (X, CCoeffs[j], V);
837 /* scale and clip signals */
838 Y0 = vec_sra (Y0, SCL);
839 Y1 = vec_sra (Y1, SCL);
840 U = vec_sra (U, SCL);
841 V = vec_sra (V, SCL);
849 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
850 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
852 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
853 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
854 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
857 U0 = vec_mergeh (U,U);
858 V0 = vec_mergeh (V,V);
860 U1 = vec_mergel (U,U);
861 V1 = vec_mergel (V,V);
863 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
864 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
866 R = vec_packclp (R0,R1);
867 G = vec_packclp (G0,G1);
868 B = vec_packclp (B0,B1);
870 switch(c->dstFormat) {
871 case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
872 case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
873 case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
874 case IMGFMT_ARGB: out_argb (R,G,B,out); break;
875 case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
876 case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
879 /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
/* the static flag limits the error message to one print per process */
880 static int printed_error_message;
881 if(!printed_error_message) {
882 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
883 vo_format_name(c->dstFormat));
884 printed_error_message=1;
/* Second copy of the pipeline: same accumulation and conversion, but the
   pixels are written to the local scratch buffer through nout. */
896 /* extract 16 coeffs from lumSrc */
897 for(j=0; j<lumFilterSize; j++) {
898 X0 = vec_ld (0, &lumSrc[j][i]);
899 X1 = vec_ld (16, &lumSrc[j][i]);
900 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
906 /* extract 8 coeffs from U,V */
907 for(j=0; j<chrFilterSize; j++) {
908 X = vec_ld (0, &chrSrc[j][i/2]);
909 U = vec_mradds (X, CCoeffs[j], U);
910 X = vec_ld (0, &chrSrc[j][i/2+2048]);
911 V = vec_mradds (X, CCoeffs[j], V);
914 /* scale and clip signals */
915 Y0 = vec_sra (Y0, SCL);
916 Y1 = vec_sra (Y1, SCL);
917 U = vec_sra (U, SCL);
918 V = vec_sra (V, SCL);
926 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
927 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
929 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
930 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
931 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
934 U0 = vec_mergeh (U,U);
935 V0 = vec_mergeh (V,V);
937 U1 = vec_mergel (U,U);
938 V1 = vec_mergel (V,V);
940 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
943 R = vec_packclp (R0,R1);
944 G = vec_packclp (G0,G1);
945 B = vec_packclp (B0,B1);
947 nout = (vector unsigned char *)scratch;
948 switch(c->dstFormat) {
949 case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
950 case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
951 case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
952 case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
953 case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
954 case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
956 /* Unreachable, I think. */
957 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
958 vo_format_name(c->dstFormat));
/* NOTE(review): dest is indexed in uint32_t units here, yet the byte count
   passed to memcpy is (dstW-i)/4; (dstW-i)*4 looks intended for the 32-bit
   formats, and the uint32_t indexing does not match the 24-bit formats —
   confirm. */
962 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
/* NOTE(review): free(NULL) is a no-op, so the guards below are redundant. */
965 if (vYCoeffsBank) free (vYCoeffsBank);
966 if (vCCoeffsBank) free (vCCoeffsBank);