/*
  marc.hoffman@analog.com    March 8, 2004

  Altivec Acceleration for Color Space Conversion, revision 0.2

  Converts I420/YV12 to RGB in various formats:
    it rejects images that are not in 420 formats
    it rejects images whose widths are not multiples of 16
    it rejects images whose heights are not multiples of 2
  Rejected images are deferred to the C simulation code.

  Lots of optimizations remain to be done here:

  1. The saturation code needs fixing; I just couldn't get it to fly with
     packs and adds, so we currently use max/min to clip.

  2. The inefficient use of chroma loading needs a bit of brushing up.

  3. Analysis of pipeline stalls needs to be done; use Shark to identify
     pipeline stalls.

  MODIFIED to calculate coefficients from the currently selected color space.
  MODIFIED core to be a macro for which you specify the output format.
  ADDED UYVY conversion, which is never called due to something in SWSCALE.
  CORRECTED algorithm selection to be strict on input formats.
  ADDED runtime detection of Altivec.

  ADDED altivec_yuv2packedX vertical scale + RGB converter.

  The C version uses 25% of the processor, or ~250 MIPS, for D1 video
  (rawvideo used as test).
  The ALTIVEC version uses 10% of the processor, or ~100 MIPS, for D1 video
  (same sequence).

  So we have roughly 10 clocks per pixel; this is too high, something has to
  be wrong.

  OPTIMIZED clip codes to utilize vec_max and vec_packs, removing the need
  for vec_min.

  OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to
  have the input video frame, since it was just decompressed, so it probably
  resides in the L1 caches. However, we are creating the output video stream,
  and this needs to use the DSTST instruction to optimize for the cache. We
  couple this with the fact that we are not going to be visiting the input
  buffer again, so we mark it Least Recently Used. This shaves 25% of the
  processor cycles off.

  Now MEMCPY is the largest MIPS consumer in the system, probably due to the
  inefficient X11 stuff.

  GL libraries seem to be very slow on this machine (1.33 GHz PB running
  Jaguar); this is not the case for my 1 GHz PB. I thought it might be a
  versioning issue; however, I have libGL.1.2.dylib on both machines.
  ((We need to figure this out now))

  GL2 libraries work now with the patch for RGB32.

  NOTE: the quartz vo driver's ARGB32_to_RGB24 consumes 30% of the processor.

  Integrated luma prescaling adjustment for saturation/contrast/brightness
  adjustment.
*/
73 #include "swscale_internal.h"
75 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff
77 #undef PROFILE_THE_BEAST
/* Shorthand 8-bit types; ubyte is used for the raw plane pointers below. */
typedef unsigned char ubyte;
typedef signed char   sbyte;
/* RGB interleaver: 16 planar pels, 8-bit samples per channel, held in
   homogeneous vector registers x0,x1,x2, are interleaved with the
   following technique:

      o0 = vec_mergeh (x0,x1);
      o1 = vec_perm (o0, x2, perm_rgb_0);
      o2 = vec_perm (o0, x2, perm_rgb_1);
      o3 = vec_mergel (x0,x1);
      o4 = vec_perm (o3,o2,perm_rgb_2);
      o5 = vec_perm (o3,o2,perm_rgb_3);

   (outputs marked with * are the three result vectors)

  perm_rgb_0:   o0(RG).h v1(B) --> o1*
  perm_rgb_1:   o0(RG).h v1(B) --> o2
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
  perm_rgb_3:   o3(RG).l o2(rgbB.l) --> o5*
*/
121 const vector unsigned char
122 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/*
 * Interleave three 16-byte planes into 48 packed bytes (y0,y1,y2).
 *
 * Mind the parameter order: the parameters are named x2,x1,x0, and the
 * body pairs x0 with x1 first and then inserts x2.  Each output triple is
 * therefore (third argument, second argument, first argument): calling
 * vec_merge3 (A,B,C,...) yields C0 B0 A0 C1 B1 A1 ...
 *
 * Wrapped in do { } while (0) so it expands to a single statement.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)    \
do {                                     \
  typeof(x0) o0,o2,o3;                   \
  o0 = vec_mergeh (x0,x1);               \
  y0 = vec_perm (o0, x2, perm_rgb_0);    \
  o2 = vec_perm (o0, x2, perm_rgb_1);    \
  o3 = vec_mergel (x0,x1);               \
  y1 = vec_perm (o3,o2,perm_rgb_2);      \
  y2 = vec_perm (o3,o2,perm_rgb_3);      \
} while (0)
/*
 * Store 16 pixels as 48 packed bytes, three bytes per pixel, in the order
 * x0,x1,x2, and advance ptr (a vector pointer) by three vectors.
 *
 * vec_merge3 emits its arguments in reverse order, so they are passed
 * reversed here.  Previously they were passed straight through, which made
 * out_rgb24 produce exactly the same bytes as out_bgr24 (out_bgr24 reverses
 * its arguments before calling vec_mstbgr24, which reverses them again):
 * both wrote B,G,R.  With the reversal below out_rgb24 (R,G,B,ptr) writes
 * R,G,B.
 *
 * NOTE(review): this assumes the "rgb24" destination expects R at the
 * lowest address -- confirm against the destination format definition.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
} while (0)
/*
 * Store 16 pixels as 48 packed bytes, three bytes per pixel, and advance
 * ptr (a vector pointer) by three vectors.
 *
 * The arguments are handed to vec_merge3 reversed, and vec_merge3 itself
 * emits its arguments in reverse, so the bytes land in memory in the order
 * x0,x1,x2.  The only caller in this file, out_bgr24, passes (B,G,R).
 */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {                                     \
  typeof(x0) _0,_1,_2;                   \
  vec_merge3 (x2,x1,x0,_0,_1,_2);        \
  vec_st (_0, 0, ptr++);                 \
  vec_st (_1, 0, ptr++);                 \
  vec_st (_2, 0, ptr++);                 \
} while (0)
/*
 * Pack 16 pixels into four bytes each and store them as 64 bytes at ptr.
 *
 * T is the vector type of the four planes x0..x3.  Bytes are first merged
 * pairwise (x0 with x1, x2 with x3) and the pairs are then merged as
 * 16-bit units, so every pixel is laid out in memory as x0,x1,x2,x3.
 * The high halves of the planes give pixels 0..7 (offsets 0 and 16), the
 * low halves pixels 8..15 (offsets 32 and 48).
 *
 * NOTE(review): the temporaries, the do/while wrapper and the final
 * "ptr += 4" were not present in the damaged listing and are restored
 * here.  The pointer advance is required by the callers, which pass their
 * running output pointer and never advance it themselves.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                        \
do {                                                                           \
  T _0,_1,_2,_3;                                                               \
  _0 = vec_mergeh (x0,x1);                                                     \
  _1 = vec_mergeh (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 0*16, (T *)ptr);                                                 \
  vec_st (_3, 1*16, (T *)ptr);                                                 \
  _0 = vec_mergel (x0,x1);                                                     \
  _1 = vec_mergel (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 2*16, (T *)ptr);                                                 \
  vec_st (_3, 3*16, (T *)ptr);                                                 \
  ptr += 4;                                                                    \
} while (0)
/*
  YCbCr -> RGB conversion matrix:

  | R |   | 1    0        1.4020 |   | Y  |
  | G | = | 1   -0.3441  -0.7142 | x | Cb |
  | B |   | 1    1.7720   0      |   | Cr |

  Typical YUV conversion works on Y: 0-255; this version has been
  optimized for JPEG decode.
*/
/*
 * Zero-extend eight unsigned bytes of x to eight 16-bit lanes.
 *
 * vec_perm is given x and an all-zero vector: control byte 0x10 selects a
 * zero byte (the high byte of each big-endian 16-bit lane) and 0x00..0x0f
 * select the data byte.  vec_unh widens bytes 0..7, vec_unl bytes 8..15.
 *
 * NOTE(review): the "#define" lines naming these two macros were not
 * present in the damaged listing; the names vec_unh / vec_unl are restored
 * and must be confirmed against the code that uses them.
 */
#define vec_unh(x) \
  (vector signed short) \
    vec_perm(x,(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                       0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
#define vec_unl(x) \
  (vector signed short) \
    vec_perm(x,(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                       0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* Clamp every lane of x to the range [16, 235]. */
#define vec_clip(x) \
  vec_max (vec_min ((x), (typeof(x))AVV(235)), (typeof(x))AVV(16))

/* Clamp x and y to [16, 235], then narrow both into one byte vector
   (modulo pack; the values already fit after the clamp). */
#define vec_packclp_a(x,y) \
  (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))

/* Clamp two signed 16-bit vectors to [0, 255] and narrow them into one
   byte vector: vec_max removes negatives, and the saturating pack on the
   unsigned view limits the top end, so no vec_min is needed. */
#define vec_packclp(x,y) \
  (vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max ((x),(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max ((y),(vector signed short) AVV(0)))
224 static inline void cvtyuvtoRGB (SwsContext *c,
225 vector signed short Y, vector signed short U, vector signed short V,
226 vector signed short *R, vector signed short *G, vector signed short *B)
228 vector signed short vx,ux,uvx;
230 Y = vec_mradds (Y, c->CY, c->OY);
231 U = vec_sub (U,(vector signed short)
232 vec_splat((vector signed short)AVV(128),0));
233 V = vec_sub (V,(vector signed short)
234 vec_splat((vector signed short)AVV(128),0));
236 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
237 ux = vec_sl (U, c->CSHIFT);
238 *B = vec_mradds (ux, c->CBU, Y);
240 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
241 vx = vec_sl (V, c->CSHIFT);
242 *R = vec_mradds (vx, c->CRV, Y);
244 // uvx = ((CGU*u) + (CGV*v))>>15;
245 uvx = vec_mradds (U, c->CGU, Y);
246 *G = vec_mradds (V, c->CGV, uvx);
/*
  ------------------------------------------------------------------------------
  Planar 4:2:0 -> packed RGB converters
  ------------------------------------------------------------------------------
*/

/*
 * DEFCSP420_CVT (name, out_pixels) defines
 *
 *   static int altivec_<name> (SwsContext *c, unsigned char **in,
 *                              int *instrides, int srcSliceY, int srcSliceH,
 *                              unsigned char **oplanes, int *outstrides)
 *
 * which converts srcSliceH rows of planar Y/U/V (in[0], in[1], in[2]) to
 * packed pixels in oplanes[0], starting at output row srcSliceY, and
 * returns srcSliceH.  out_pixels (R,G,B,ptr) must store 16 pixels at ptr
 * and advance ptr past them.
 *
 * Two luma rows are processed per pass so each chroma sample is converted
 * once and shared by a 2x2 block of pixels.  Each inner iteration handles
 * 16 pixels of both rows; c->srcW is expected to be a multiple of 16 and
 * srcSliceH a multiple of 2.
 *
 * Row pointers are derived from the strides on every pass.  The previous
 * code started the odd row at in[0]+w and stepped rows by
 * "bytes consumed + stride", which is only right when each stride equals
 * the row width; results are unchanged for such tightly packed buffers.
 *
 * NOTE(review): the declarations of w, h, i, j, ui, vi and the lC*
 * coefficient copies, the widening of y0/y1 into Y0..Y3 (via vec_unh /
 * vec_unl), the per-iteration input pointer increments and the final
 * return were not present in the damaged listing and are restored here --
 * confirm against the upstream file.
 */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
                           unsigned char **in, int *instrides,             \
                           int srcSliceY, int srcSliceH,                   \
                           unsigned char **oplanes, int *outstrides)       \
{                                                                          \
  int w = c->srcW;                                                         \
  int h = srcSliceH;                                                       \
  int i,j;                                                                 \
  vector unsigned char y0,y1;                                              \
                                                                           \
  vector signed char  u,v;                                                 \
                                                                           \
  vector signed short Y0,Y1,Y2,Y3;                                         \
  vector signed short U,V;                                                 \
  vector signed short vx,ux,uvx;                                           \
  vector signed short vx0,ux0,uvx0;                                        \
  vector signed short vx1,ux1,uvx1;                                        \
  vector signed short R0,G0,B0;                                            \
  vector signed short R1,G1,B1;                                            \
  vector unsigned char R,G,B;                                              \
                                                                           \
  vector unsigned char *uivP, *vivP;                                       \
  vector unsigned char align_perm;                                         \
                                                                           \
  /* local copies of the conversion coefficients */                        \
  vector signed short                                                      \
    lCY  = c->CY,                                                          \
    lOY  = c->OY,                                                          \
    lCRV = c->CRV,                                                         \
    lCBU = c->CBU,                                                         \
    lCGU = c->CGU,                                                         \
    lCGV = c->CGV;                                                         \
                                                                           \
  vector unsigned short lCSHIFT = c->CSHIFT;                               \
                                                                           \
  ubyte *y1i, *y2i, *ui, *vi;                                              \
  vector unsigned char *oute, *outo;                                       \
                                                                           \
  for (i=0;i<h/2;i++) {                                                    \
    /* even/odd luma rows, their shared chroma row, and the two */         \
    /* output rows of this pass */                                         \
    y1i  = in[0] + (2*i)*instrides[0];                                     \
    y2i  = y1i + instrides[0];                                             \
    ui   = in[1] + i*instrides[1];                                         \
    vi   = in[2] + i*instrides[2];                                         \
    oute = (vector unsigned char *)                                        \
             (oplanes[0]+(srcSliceY+2*i)*outstrides[0]);                   \
    outo = (vector unsigned char *)                                        \
             (oplanes[0]+(srcSliceY+2*i+1)*outstrides[0]);                 \
                                                                           \
    /* data-stream touch-for-store hints for the two output rows */        \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
                                                                           \
    for (j=0;j<w/16;j++) {                                                 \
                                                                           \
      /* vec_ldl: load and mark the line least recently used */            \
      y0 = vec_ldl (0,y1i);                                                \
      y1 = vec_ldl (0,y2i);                                                \
      uivP = (vector unsigned char *)ui;                                   \
      vivP = (vector unsigned char *)vi;                                   \
                                                                           \
      /* unaligned chroma load: two aligned loads + permute */             \
      align_perm = vec_lvsl (0, ui);                                       \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);     \
                                                                           \
      align_perm = vec_lvsl (0, vi);                                       \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);     \
                                                                           \
      /* centre chroma around zero */                                      \
      u  = (vector signed char)                                            \
                 vec_sub (u,(vector signed char)                           \
                                vec_splat((vector signed char)AVV(128),0));\
      v  = (vector signed char)                                            \
                 vec_sub (v,(vector signed char)                           \
                                vec_splat((vector signed char)AVV(128),0));\
                                                                           \
      /* only the first 8 chroma samples are needed for 16 pixels */       \
      U  = vec_unpackh (u);                                                \
      V  = vec_unpackh (v);                                                \
                                                                           \
      /* widen luma: Y0/Y1 = even row, Y2/Y3 = odd row */                  \
      Y0 = vec_unh (y0);                                                   \
      Y1 = vec_unl (y0);                                                   \
      Y2 = vec_unh (y1);                                                   \
      Y3 = vec_unl (y1);                                                   \
                                                                           \
      Y0 = vec_mradds (Y0, lCY, lOY);                                      \
      Y1 = vec_mradds (Y1, lCY, lOY);                                      \
      Y2 = vec_mradds (Y2, lCY, lOY);                                      \
      Y3 = vec_mradds (Y3, lCY, lOY);                                      \
                                                                           \
      /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                           \
      ux = vec_sl (U, lCSHIFT);                                            \
      ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));             \
      ux0  = vec_mergeh (ux,ux);                                           \
      ux1  = vec_mergel (ux,ux);                                           \
                                                                           \
      /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */                     \
      vx = vec_sl (V, lCSHIFT);                                            \
      vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));             \
      vx0  = vec_mergeh (vx,vx);                                           \
      vx1  = vec_mergel (vx,vx);                                           \
                                                                           \
      /* uvx = ((CGU*u) + (CGV*v))>>15 */                                  \
      uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));             \
      uvx = vec_mradds (V, lCGV, uvx);                                     \
      uvx0 = vec_mergeh (uvx,uvx);                                         \
      uvx1 = vec_mergel (uvx,uvx);                                         \
                                                                           \
      /* even row */                                                       \
      R0 = vec_add (Y0,vx0);                                               \
      G0 = vec_add (Y0,uvx0);                                              \
      B0 = vec_add (Y0,ux0);                                               \
      R1 = vec_add (Y1,vx1);                                               \
      G1 = vec_add (Y1,uvx1);                                              \
      B1 = vec_add (Y1,ux1);                                               \
                                                                           \
      R  = vec_packclp (R0,R1);                                            \
      G  = vec_packclp (G0,G1);                                            \
      B  = vec_packclp (B0,B1);                                            \
                                                                           \
      out_pixels(R,G,B,oute);                                              \
                                                                           \
      /* odd row, same chroma terms */                                     \
      R0 = vec_add (Y2,vx0);                                               \
      G0 = vec_add (Y2,uvx0);                                              \
      B0 = vec_add (Y2,ux0);                                               \
      R1 = vec_add (Y3,vx1);                                               \
      G1 = vec_add (Y3,uvx1);                                              \
      B1 = vec_add (Y3,ux1);                                               \
      R  = vec_packclp (R0,R1);                                            \
      G  = vec_packclp (G0,G1);                                            \
      B  = vec_packclp (B0,B1);                                            \
                                                                           \
      out_pixels(R,G,B,outo);                                              \
                                                                           \
      y1i  += 16;                                                          \
      y2i  += 16;                                                          \
      ui   += 8;                                                           \
      vi   += 8;                                                           \
    }                                                                      \
  }                                                                        \
  return srcSliceH;                                                        \
}
/*
 * Output adaptors for DEFCSP420_CVT: each takes (R,G,B,ptr), stores 16
 * pixels at ptr and advances ptr.
 *
 * For the 32-bit variants the macro name gives the byte order in memory
 * (vec_mstrgb32 stores its four planes in argument order); the "a" byte is
 * always written as zero.  The 24-bit variants forward to the packed
 * three-byte store macros; note that out_bgr24 reverses its arguments.
 */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(c,b,a,ptr)
419 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
421 DEFCSP420_CVT (yuv2_bgra32, out_argb)
423 static int altivec_yuv2_bgra32 (SwsContext *c,
424 unsigned char **in, int *instrides,
425 int srcSliceY, int srcSliceH,
426 unsigned char **oplanes, int *outstrides)
431 int instrides_scl[3];
432 vector unsigned char y0,y1;
434 vector signed char u,v;
436 vector signed short Y0,Y1,Y2,Y3;
437 vector signed short U,V;
438 vector signed short vx,ux,uvx;
439 vector signed short vx0,ux0,uvx0;
440 vector signed short vx1,ux1,uvx1;
441 vector signed short R0,G0,B0;
442 vector signed short R1,G1,B1;
443 vector unsigned char R,G,B;
445 vector unsigned char *uivP, *vivP;
446 vector unsigned char align_perm;
456 vector unsigned short lCSHIFT = c->CSHIFT;
459 ubyte *y2i = in[0]+w;
463 vector unsigned char *oute
464 = (vector unsigned char *)
465 (oplanes[0]+srcSliceY*outstrides[0]);
466 vector unsigned char *outo
467 = (vector unsigned char *)
468 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
471 instrides_scl[0] = instrides[0];
472 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
473 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
476 for (i=0;i<h/2;i++) {
477 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
478 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
480 for (j=0;j<w/16;j++) {
482 y0 = vec_ldl (0,y1i);
483 y1 = vec_ldl (0,y2i);
484 uivP = (vector unsigned char *)ui;
485 vivP = (vector unsigned char *)vi;
487 align_perm = vec_lvsl (0, ui);
488 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
490 align_perm = vec_lvsl (0, vi);
491 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
492 u = (vector signed char)
493 vec_sub (u,(vector signed char)
494 vec_splat((vector signed char)AVV(128),0));
496 v = (vector signed char)
497 vec_sub (v, (vector signed char)
498 vec_splat((vector signed char)AVV(128),0));
509 Y0 = vec_mradds (Y0, lCY, lOY);
510 Y1 = vec_mradds (Y1, lCY, lOY);
511 Y2 = vec_mradds (Y2, lCY, lOY);
512 Y3 = vec_mradds (Y3, lCY, lOY);
514 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
515 ux = vec_sl (U, lCSHIFT);
516 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
517 ux0 = vec_mergeh (ux,ux);
518 ux1 = vec_mergel (ux,ux);
520 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
521 vx = vec_sl (V, lCSHIFT);
522 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
523 vx0 = vec_mergeh (vx,vx);
524 vx1 = vec_mergel (vx,vx);
525 /* uvx = ((CGU*u) + (CGV*v))>>15 */
526 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
527 uvx = vec_mradds (V, lCGV, uvx);
528 uvx0 = vec_mergeh (uvx,uvx);
529 uvx1 = vec_mergel (uvx,uvx);
530 R0 = vec_add (Y0,vx0);
531 G0 = vec_add (Y0,uvx0);
532 B0 = vec_add (Y0,ux0);
533 R1 = vec_add (Y1,vx1);
534 G1 = vec_add (Y1,uvx1);
535 B1 = vec_add (Y1,ux1);
536 R = vec_packclp (R0,R1);
537 G = vec_packclp (G0,G1);
538 B = vec_packclp (B0,B1);
540 out_argb(R,G,B,oute);
541 R0 = vec_add (Y2,vx0);
542 G0 = vec_add (Y2,uvx0);
543 B0 = vec_add (Y2,ux0);
544 R1 = vec_add (Y3,vx1);
545 G1 = vec_add (Y3,uvx1);
546 B1 = vec_add (Y3,ux1);
547 R = vec_packclp (R0,R1);
548 G = vec_packclp (G0,G1);
549 B = vec_packclp (B0,B1);
551 out_argb(R,G,B,outo);
559 outo += (outstrides[0])>>4;
560 oute += (outstrides[0])>>4;
562 ui += instrides_scl[1];
563 vi += instrides_scl[2];
564 y1i += instrides_scl[0];
565 y2i += instrides_scl[0];
573 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
574 DEFCSP420_CVT (yuv2_argb32, out_argb)
575 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
576 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
579 // uyvy|uyvy|uyvy|uyvy
580 // 0123 4567 89ab cdef
582 const vector unsigned char
583 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
586 0x10,0x0c,0x10,0x0c),
587 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
590 0x10,0x0E,0x10,0x0E),
591 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
594 0x10,0x0D,0x10,0x0F);
597 this is so I can play live CCIR raw video
599 static int altivec_uyvy_rgb32 (SwsContext *c,
600 unsigned char **in, int *instrides,
601 int srcSliceY, int srcSliceH,
602 unsigned char **oplanes, int *outstrides)
607 vector unsigned char uyvy;
608 vector signed short Y,U,V;
609 vector signed short vx,ux,uvx;
610 vector signed short R0,G0,B0,R1,G1,B1;
611 vector unsigned char R,G,B;
612 vector unsigned char *out;
616 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
619 for (j=0;j<w/16;j++) {
620 uyvy = vec_ld (0, img);
621 U = (vector signed short)
622 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
624 V = (vector signed short)
625 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
627 Y = (vector signed short)
628 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
630 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
632 uyvy = vec_ld (16, img);
633 U = (vector signed short)
634 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
636 V = (vector signed short)
637 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
639 Y = (vector signed short)
640 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
642 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
644 R = vec_packclp (R0,R1);
645 G = vec_packclp (G0,G1);
646 B = vec_packclp (B0,B1);
648 // vec_mstbgr24 (R,G,B, out);
649 out_rgba (R,G,B,out);
// yuv2rgb_init_altivec: choose an AltiVec converter for the context c, based
// on c->flags, c->srcW, c->srcH, c->srcFormat and c->dstFormat.
// NOTE(review): this listing is damaged -- the braces, the "case" labels of
// both switch statements and some return statements are not visible.  The
// code lines below are left exactly as found; only these notes are added.
659 /* Ok currently the acceleration routine only supports
660 inputs of widths a multiple of 16
661 and heights a multiple 2
663 So we just fall back to the C codes for this.
665 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
// Runtime capability check.  The statement guarded by this test is not
// visible; presumably it returns NULL so the caller uses the C code.
667 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
671 and this seems not to matter too much I tried a bunch of
672 videos with abnormal widths and mplayer crashes else where.
673 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
674 boom with X11 bad match.
// Widths that are not a multiple of 16 are rejected.
677 if ((c->srcW & 0xf) != 0) return NULL;
// Dispatch on the source format; the accepted source-format labels are not
// visible in this listing.
679 switch (c->srcFormat) {
// Odd heights are tested here; the guarded statement is not visible
// (presumably "return NULL").
690 if ((c->srcH & 0x1) != 0)
// Destination dispatch for the first group of source formats.  Each
// destination logs which converter was chosen and returns it; the logged
// names suggest the missing labels are the RGB24, BGR24 and two 32-bit
// destination formats (TODO confirm).
693 switch(c->dstFormat){
695 MSG_WARN("ALTIVEC: Color Space RGB24\n");
696 return altivec_yuv2_rgb24;
698 MSG_WARN("ALTIVEC: Color Space BGR24\n");
699 return altivec_yuv2_bgr24;
701 MSG_WARN("ALTIVEC: Color Space ARGB32\n");
702 return altivec_yuv2_argb32;
704 MSG_WARN("ALTIVEC: Color Space BGRA32\n");
705 // return profile_altivec_bgra32;
707 return altivec_yuv2_bgra32;
// Any other destination format is not accelerated.
708 default: return NULL;
// Second destination dispatch; judging by the message it belongs to a UYVY
// source format (label not visible).
713 switch(c->dstFormat){
715 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
716 return altivec_uyvy_rgb32;
717 default: return NULL;
/*
 * Round a 16.16 fixed-point value to the nearest integer and clamp it to
 * the signed 16-bit range, returning the result as a 16-bit pattern.
 *
 * Values below -0x7FFF return 0x8000, values above 0x7FFF return 0x7FFF;
 * everything else returns the two's-complement bit pattern of the rounded
 * value (so -1 comes back as 0xFFFF).
 *
 * The rounded value stays 64 bits wide until after the clamp.  Narrowing
 * it to int first, as before, let large inputs wrap (2^48 came back as 0
 * instead of 0x7FFF).  Inputs so close to INT64_MAX that adding the
 * rounding bias would overflow are clamped up front.
 */
static uint16_t roundToInt16(int64_t f){
    int64_t r;

    if (f > INT64_MAX - (1<<15))
        return 0x7FFF;

    r = (f + (1<<15)) >> 16;
    if (r < -0x7FFF)
        return 0x8000;
    if (r >  0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}
732 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
735 signed short tmp[8] __attribute__ ((aligned(16)));
736 vector signed short vec;
739 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy
740 buf.tmp[1] = -256*brightness; //oy
741 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
742 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
743 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
744 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
747 c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);
748 c->CY = vec_splat ((vector signed short)buf.vec, 0);
749 c->OY = vec_splat ((vector signed short)buf.vec, 1);
750 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
751 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
752 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
753 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
757 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
759 printf("%s %d ", v[i],buf.tmp[i] );
768 altivec_yuv2packedX (SwsContext *c,
769 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
770 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
771 uint8_t *dest, int dstW, int dstY)
774 short tmp __attribute__((aligned (16)));
777 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
778 vector signed short R0,G0,B0,R1,G1,B1;
780 vector unsigned char R,G,B,pels[3];
781 vector unsigned char *out,*nout;
783 vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0);
784 vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
785 unsigned long scratch[16] __attribute__ ((aligned (16)));
787 vector signed short *vYCoeffsBank, *vCCoeffsBank;
789 vector signed short *YCoeffs, *CCoeffs;
791 vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
792 vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
794 for (i=0;i<lumFilterSize*dstW;i++) {
795 tmp = c->vLumFilter[i];
796 p = &vYCoeffsBank[i];
801 for (i=0;i<chrFilterSize*dstW;i++) {
802 tmp = c->vChrFilter[i];
803 p = &vCCoeffsBank[i];
808 YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
809 CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
811 out = (vector unsigned char *)dest;
813 for(i=0; i<dstW; i+=16){
816 /* extract 16 coeffs from lumSrc */
817 for(j=0; j<lumFilterSize; j++) {
818 X0 = vec_ld (0, &lumSrc[j][i]);
819 X1 = vec_ld (16, &lumSrc[j][i]);
820 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
821 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
826 /* extract 8 coeffs from U,V */
827 for(j=0; j<chrFilterSize; j++) {
828 X = vec_ld (0, &chrSrc[j][i/2]);
829 U = vec_mradds (X, CCoeffs[j], U);
830 X = vec_ld (0, &chrSrc[j][i/2+2048]);
831 V = vec_mradds (X, CCoeffs[j], V);
834 /* scale and clip signals */
835 Y0 = vec_sra (Y0, SCL);
836 Y1 = vec_sra (Y1, SCL);
837 U = vec_sra (U, SCL);
838 V = vec_sra (V, SCL);
846 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
847 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
849 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
850 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
851 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
854 U0 = vec_mergeh (U,U);
855 V0 = vec_mergeh (V,V);
857 U1 = vec_mergel (U,U);
858 V1 = vec_mergel (V,V);
860 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
861 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
863 R = vec_packclp (R0,R1);
864 G = vec_packclp (G0,G1);
865 B = vec_packclp (B0,B1);
867 out_rgba (R,G,B,out);
875 /* extract 16 coeffs from lumSrc */
876 for(j=0; j<lumFilterSize; j++) {
877 X0 = vec_ld (0, &lumSrc[j][i]);
878 X1 = vec_ld (16, &lumSrc[j][i]);
879 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
880 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
885 /* extract 8 coeffs from U,V */
886 for(j=0; j<chrFilterSize; j++) {
887 X = vec_ld (0, &chrSrc[j][i/2]);
888 U = vec_mradds (X, CCoeffs[j], U);
889 X = vec_ld (0, &chrSrc[j][i/2+2048]);
890 V = vec_mradds (X, CCoeffs[j], V);
893 /* scale and clip signals */
894 Y0 = vec_sra (Y0, SCL);
895 Y1 = vec_sra (Y1, SCL);
896 U = vec_sra (U, SCL);
897 V = vec_sra (V, SCL);
905 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
906 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
908 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
909 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
910 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
913 U0 = vec_mergeh (U,U);
914 V0 = vec_mergeh (V,V);
916 U1 = vec_mergel (U,U);
917 V1 = vec_mergel (V,V);
919 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
920 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
922 R = vec_packclp (R0,R1);
923 G = vec_packclp (G0,G1);
924 B = vec_packclp (B0,B1);
926 nout = (vector unsigned char *)scratch;
927 out_rgba (R,G,B,nout);
929 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
932 if (vYCoeffsBank) free (vYCoeffsBank);
933 if (vCCoeffsBank) free (vCCoeffsBank);