2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 rejected images are deferred to the C simulation code.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion, which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version uses 25% of the processor or ~250Mips for D1 video; rawvideo was used as the test.
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video, same sequence.
38 So we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. We are pretty much
43 guaranteed to have the input video frame: it was just decompressed, so
44 it probably resides in the L1 caches. However, we are creating the
45 output video stream, so this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again, so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine (1.33GHz PB running
54 Jaguar); this is not the case for my 1GHz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib on both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
76 #include "swscale_internal.h"
78 #include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff
80 #undef PROFILE_THE_BEAST
// Short aliases for the unsigned and signed char types.
83 typedef unsigned char ubyte;
84 typedef signed char sbyte;
87 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
88 homogeneous vector registers x0,x1,x2 are interleaved with the
91 o0 = vec_mergeh (x0,x1);
92 o1 = vec_perm (o0, x2, perm_rgb_0);
93 o2 = vec_perm (o0, x2, perm_rgb_1);
94 o3 = vec_mergel (x0,x1);
95 o4 = vec_perm (o3,o2,perm_rgb_2);
96 o5 = vec_perm (o3,o2,perm_rgb_3);
98 perm_rgb_0: o0(RG).h x2(B) --> o1*
104 perm_rgb_1: o0(RG).h x2(B) --> o2
110 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
116 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
// Byte-permute control vectors used by vec_merge3 to weave a third channel
// into two already pair-merged channels. In a vec_perm control, indices
// 0x00-0x0f select a byte of the first operand and 0x10-0x1f a byte of the
// second operand.
//   perm_rgb_0: two bytes of the first operand, then one byte of the second,
//               repeated (first operand bytes 0x00-0x0a, second bytes 0-4).
//   perm_rgb_1: continues that pattern (first operand bytes 0x0b-0x0f, second
//               bytes 5-7), then passes bytes 8-15 of the second operand
//               through unchanged in the upper half.
//   perm_rgb_2: copies bytes 0-7 of the second operand, then resumes the
//               two-plus-one pattern with first operand bytes 0x00-0x05 and
//               second operand bytes 8-9.
//   perm_rgb_3: finishes the pattern with first operand bytes 0x06-0x0f and
//               second operand bytes 0x0a-0x0f.
124 const vector unsigned char
125 perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
126 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
127 perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
128 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
129 perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
130 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
131 perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
132 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
// vec_merge3(x2,x1,x0, y0,y1,y2): interleave three channel vectors into the
// three output vectors y0, y1, y2. Note the reversed input parameter order
// (x2,x1,x0). x0 and x1 are merged pairwise (vec_mergeh for the first half,
// vec_mergel for the second half) and x2 is then woven in as every third byte
// via the perm_rgb_* controls. The intermediate o2 carries the tail of the
// first half in its low bytes and bytes 8-15 of x2 in its high bytes, which
// is why it is reused as the second operand for y1 and y2.
// NOTE(review): the lines that open and close the multi-statement body of
// this macro are not visible in this listing; confirm against the full file.
134 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
136 typeof(x0) o0,o2,o3; \
137 o0 = vec_mergeh (x0,x1); \
138 y0 = vec_perm (o0, x2, perm_rgb_0);\
139 o2 = vec_perm (o0, x2, perm_rgb_1);\
140 o3 = vec_mergel (x0,x1); \
141 y1 = vec_perm (o3,o2,perm_rgb_2); \
142 y2 = vec_perm (o3,o2,perm_rgb_3); \
// vec_mstbgr24(x0,x1,x2,ptr): interleave the three channel vectors with
// vec_merge3 (arguments passed in the order given) and store the three
// resulting vectors through ptr. ptr is post-incremented after every store,
// so it must be a modifiable lvalue and ends up three elements further on.
// NOTE(review): the lines that open and close the macro body are not visible
// in this listing.
145 #define vec_mstbgr24(x0,x1,x2,ptr) \
147 typeof(x0) _0,_1,_2; \
148 vec_merge3 (x0,x1,x2,_0,_1,_2); \
149 vec_st (_0, 0, ptr++); \
150 vec_st (_1, 0, ptr++); \
151 vec_st (_2, 0, ptr++); \
// vec_mstrgb24(x0,x1,x2,ptr): same as vec_mstbgr24 except that the channel
// vectors are handed to vec_merge3 in reverse order (x2,x1,x0), which swaps
// the first and third byte of every packed pixel. ptr is post-incremented
// after each of the three stores, so it must be a modifiable lvalue.
// NOTE(review): the lines that open and close the macro body are not visible
// in this listing.
154 #define vec_mstrgb24(x0,x1,x2,ptr) \
156 typeof(x0) _0,_1,_2; \
157 vec_merge3 (x2,x1,x0,_0,_1,_2); \
158 vec_st (_0, 0, ptr++); \
159 vec_st (_1, 0, ptr++); \
160 vec_st (_2, 0, ptr++); \
163 /* pack the pixels in rgb0 format
// vec_mstrgb32(T,x0,x1,x2,x3,ptr): interleave four channel vectors of type T
// into 4-byte pixels, with bytes in x0,x1,x2,x3 order inside each pixel, and
// store the four resulting vectors at byte offsets 0, 16, 32 and 48 from ptr.
// The byte-wise merges build (x0,x1) and (x2,x3) pairs; the merges done on the
// unsigned-short view then alternate those pairs to form whole pixels.
// NOTE(review): the declarations of the temporaries _0.._3 and the end of the
// macro are not visible in this listing, so whether ptr is advanced after the
// stores cannot be determined from here.
167 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
170 _0 = vec_mergeh (x0,x1); \
171 _1 = vec_mergeh (x2,x3); \
172 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
173 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
174 vec_st (_2, 0*16, (T *)ptr); \
175 vec_st (_3, 1*16, (T *)ptr); \
176 _0 = vec_mergel (x0,x1); \
177 _1 = vec_mergel (x2,x3); \
178 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
179 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
180 vec_st (_2, 2*16, (T *)ptr); \
181 vec_st (_3, 3*16, (T *)ptr); \
188 | 1 -0.3441 -0.7142 |x| Cb|
195 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
203 (vector signed short) \
204 vec_perm(x,(typeof(x))AVV(0),\
205 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
206 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
208 (vector signed short) \
209 vec_perm(x,(typeof(x))AVV(0),\
210 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
211 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* Clamp every lane of a vector of signed shorts to the range [16, 235]:
   raise anything below 16 first, then cap anything above 235. */
#define vec_clip_s16(x) \
  (vec_min (vec_max ((x), (vector signed short)AVV(16, 16, 16, 16, 16, 16, 16, 16)), \
            (vector signed short)AVV(235,235,235,235,235,235,235,235)))
/* Pack two vectors of signed shorts into one vector of unsigned bytes.
   Negative lanes are raised to zero with vec_max; the result is then viewed
   as unsigned shorts and narrowed with the saturating vec_packs. */
#define vec_packclp(x,y) \
  ((vector unsigned char) \
   vec_packs ((vector unsigned short)vec_max ((x), (vector signed short)AVV(0)), \
              (vector unsigned short)vec_max ((y), (vector signed short)AVV(0))))
222 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
// Convert one vector of Y samples and matching vectors of U and V samples
// (all as signed shorts) to R, G and B, using the coefficient vectors stored
// in the context: CY, OY, CSHIFT, CBU, CRV, CGU and CGV.
//   c        context supplying the coefficient vectors
//   Y, U, V  input samples, passed by value
//   R, G, B  output pointers; each receives one vector of signed shorts
// The callers in this file narrow the results to bytes with vec_packclp.
// NOTE(review): the brace lines of this function are not visible in this
// listing.
225 static inline void cvtyuvtoRGB (SwsContext *c,
226 vector signed short Y, vector signed short U, vector signed short V,
227 vector signed short *R, vector signed short *G, vector signed short *B)
229 vector signed short vx,ux,uvx;
// Scale luma by CY and add the offset OY (multiply-round-add).
231 Y = vec_mradds (Y, c->CY, c->OY);
// Re-centre both chroma channels around zero by subtracting 128.
232 U = vec_sub (U,(vector signed short)
233 vec_splat((vector signed short)AVV(128),0));
234 V = vec_sub (V,(vector signed short)
235 vec_splat((vector signed short)AVV(128),0));
237 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
238 ux = vec_sl (U, c->CSHIFT);
// Blue = scaled luma + CBU term of the shifted U.
239 *B = vec_mradds (ux, c->CBU, Y);
241 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
242 vx = vec_sl (V, c->CSHIFT);
// Red = scaled luma + CRV term of the shifted V.
243 *R = vec_mradds (vx, c->CRV, Y);
245 // uvx = ((CGU*u) + (CGV*v))>>15;
// Green accumulates both chroma terms on top of the scaled luma; the
// unshifted U and V are used here.
246 uvx = vec_mradds (U, c->CGU, Y);
247 *G = vec_mradds (V, c->CGV, uvx);
252 ------------------------------------------------------------------------------
254 ------------------------------------------------------------------------------
// DEFCSP420_CVT(name, out_pixels) defines a static converter function named
// altivec_<name>. The generated function walks planar Y/U/V input and emits
// packed pixels through the out_pixels(R,G,B,ptr) store macro, writing two
// output rows (oute and outo, one output stride apart) per outer iteration.
//
// For each group of 16 pixels the visible code:
//   - loads 16 luma bytes from each of two input rows (y1i, y2i) and the
//     chroma bytes from ui/vi, using the vec_lvsl + vec_perm idiom on two
//     adjacent vectors to realign the data;
//   - subtracts 128 from the chroma bytes and widens them with vec_unpackh;
//   - scales luma with lCY/lOY and derives the blue (ux), red (vx) and green
//     (uvx) chroma contributions from lCBU, lCRV, lCGU and lCGV;
//   - duplicates every chroma contribution for two neighbouring pixels by
//     merging each vector with itself (vec_mergeh / vec_mergel);
//   - adds the same chroma contributions to Y0/Y1 (stored via oute) and to
//     Y2/Y3 (stored via outo), then clamps and packs with vec_packclp.
//
// NOTE(review): many lines of this macro are absent from this listing, e.g.
// the declarations of w, h, i, j, ui, vi and the lCY/lOY/lC* locals, the code
// that fills Y0..Y3 from y0/y1, the per-step pointer advances and the return
// statement. The description above covers only what is visible here.
258 #define DEFCSP420_CVT(name,out_pixels) \
259 static int altivec_##name (SwsContext *c, \
260 unsigned char **in, int *instrides, \
261 int srcSliceY, int srcSliceH, \
262 unsigned char **oplanes, int *outstrides) \
267 int instrides_scl[3]; \
268 vector unsigned char y0,y1; \
270 vector signed char u,v; \
272 vector signed short Y0,Y1,Y2,Y3; \
273 vector signed short U,V; \
274 vector signed short vx,ux,uvx; \
275 vector signed short vx0,ux0,uvx0; \
276 vector signed short vx1,ux1,uvx1; \
277 vector signed short R0,G0,B0; \
278 vector signed short R1,G1,B1; \
279 vector unsigned char R,G,B; \
281 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
282 vector unsigned char align_perm; \
284 vector signed short \
292 vector unsigned short lCSHIFT = c->CSHIFT; \
294 ubyte *y1i = in[0]; /* first luma row of the pair */ \
295 ubyte *y2i = in[0]+instrides[0]; /* luma row one stride below */ \
299 vector unsigned char *oute \
300 = (vector unsigned char *) \
301 (oplanes[0]+srcSliceY*outstrides[0]); /* output row srcSliceY */ \
302 vector unsigned char *outo \
303 = (vector unsigned char *) \
304 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); /* the output row after it */ \
307 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
308 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
309 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
312 for (i=0;i<h/2;i++) { /* one pass per pair of rows */ \
313 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); /* store-stream hint for the outo row */ \
314 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); /* store-stream hint for the oute row */ \
316 for (j=0;j<w/16;j++) { /* 16 pixels per step */ \
318 y1ivP = (vector unsigned char *)y1i; \
319 y2ivP = (vector unsigned char *)y2i; \
320 uivP = (vector unsigned char *)ui; \
321 vivP = (vector unsigned char *)vi; \
323 align_perm = vec_lvsl (0, y1i); \
324 y0 = (vector unsigned char)vec_perm (y1ivP[0], y1ivP[1], align_perm);\
326 align_perm = vec_lvsl (0, y2i); \
327 y1 = (vector unsigned char)vec_perm (y2ivP[0], y2ivP[1], align_perm);\
329 align_perm = vec_lvsl (0, ui); \
330 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
332 align_perm = vec_lvsl (0, vi); \
333 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
335 u = (vector signed char) \
336 vec_sub (u,(vector signed char) \
337 vec_splat((vector signed char)AVV(128),0));\
338 v = (vector signed char) \
339 vec_sub (v,(vector signed char) \
340 vec_splat((vector signed char)AVV(128),0));\
342 U = vec_unpackh (u); \
343 V = vec_unpackh (v); \
351 Y0 = vec_mradds (Y0, lCY, lOY); \
352 Y1 = vec_mradds (Y1, lCY, lOY); \
353 Y2 = vec_mradds (Y2, lCY, lOY); \
354 Y3 = vec_mradds (Y3, lCY, lOY); \
356 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
357 ux = vec_sl (U, lCSHIFT); \
358 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); \
359 ux0 = vec_mergeh (ux,ux); \
360 ux1 = vec_mergel (ux,ux); \
362 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
363 vx = vec_sl (V, lCSHIFT); \
364 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); \
365 vx0 = vec_mergeh (vx,vx); \
366 vx1 = vec_mergel (vx,vx); \
368 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
369 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); \
370 uvx = vec_mradds (V, lCGV, uvx); \
371 uvx0 = vec_mergeh (uvx,uvx); \
372 uvx1 = vec_mergel (uvx,uvx); \
374 R0 = vec_add (Y0,vx0); \
375 G0 = vec_add (Y0,uvx0); \
376 B0 = vec_add (Y0,ux0); \
377 R1 = vec_add (Y1,vx1); \
378 G1 = vec_add (Y1,uvx1); \
379 B1 = vec_add (Y1,ux1); \
381 R = vec_packclp (R0,R1); /* clamp at zero and narrow to bytes */ \
382 G = vec_packclp (G0,G1); \
383 B = vec_packclp (B0,B1); \
385 out_pixels(R,G,B,oute); \
387 R0 = vec_add (Y2,vx0); /* same chroma terms reused with Y2/Y3 */ \
388 G0 = vec_add (Y2,uvx0); \
389 B0 = vec_add (Y2,ux0); \
390 R1 = vec_add (Y3,vx1); \
391 G1 = vec_add (Y3,uvx1); \
392 B1 = vec_add (Y3,ux1); \
393 R = vec_packclp (R0,R1); \
394 G = vec_packclp (G0,G1); \
395 B = vec_packclp (B0,B1); \
398 out_pixels(R,G,B,outo); \
407 outo += (outstrides[0])>>4; \
408 oute += (outstrides[0])>>4; \
410 ui += instrides_scl[1]; \
411 vi += instrides_scl[2]; \
412 y1i += instrides_scl[0]; \
413 y2i += instrides_scl[0]; \
/* Store adapters handed to the converters as their out_pixels argument.
   Each takes the packed red, green and blue byte vectors plus the output
   pointer. The 32-bit variants forward to vec_mstrgb32 with the channels in
   the byte order named by the macro and an all-zero vector in the alpha
   position; the 24-bit variants forward to the matching 3-byte store macro. */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
// Instantiate the converters altivec_yuv2_abgr and altivec_yuv2_bgra from the
// DEFCSP420_CVT template, each with its matching out_* store macro.
426 DEFCSP420_CVT (yuv2_abgr, out_abgr)
428 DEFCSP420_CVT (yuv2_bgra, out_bgra)
// Hand-written converter with the same structure as the DEFCSP420_CVT body.
// In the visible lines it differs from the macro-generated converters in
// these ways:
//   - luma is fetched directly with vec_ldl, with no vec_lvsl/vec_perm
//     realignment step;
//   - the second luma row pointer starts at in[0]+w instead of
//     in[0]+instrides[0];
//   - instrides_scl[0] is the unmodified luma stride;
//   - pixels are stored with out_argb.
// No visible line in this listing refers to this function.
// NOTE(review): the name says bgra32 but the stores use out_argb; confirm
// which byte order is intended.
// NOTE(review): y2i = in[0]+w only addresses the second row when the luma
// stride equals w; confirm.
// NOTE(review): many lines are absent from this listing (braces, the
// declarations of w, h, i, j, y1i, ui, vi and the lCY/lOY/lC* locals, the
// code that fills U, V and Y0..Y3, per-step pointer advances, the return
// statement), so this summary covers only what is visible.
430 static int altivec_yuv2_bgra32 (SwsContext *c,
431 unsigned char **in, int *instrides,
432 int srcSliceY, int srcSliceH,
433 unsigned char **oplanes, int *outstrides)
438 int instrides_scl[3];
439 vector unsigned char y0,y1;
441 vector signed char u,v;
443 vector signed short Y0,Y1,Y2,Y3;
444 vector signed short U,V;
445 vector signed short vx,ux,uvx;
446 vector signed short vx0,ux0,uvx0;
447 vector signed short vx1,ux1,uvx1;
448 vector signed short R0,G0,B0;
449 vector signed short R1,G1,B1;
450 vector unsigned char R,G,B;
452 vector unsigned char *uivP, *vivP;
453 vector unsigned char align_perm;
463 vector unsigned short lCSHIFT = c->CSHIFT;
466 ubyte *y2i = in[0]+w;
// oute/outo address two consecutive output rows starting at row srcSliceY.
470 vector unsigned char *oute
471 = (vector unsigned char *)
472 (oplanes[0]+srcSliceY*outstrides[0]);
473 vector unsigned char *outo
474 = (vector unsigned char *)
475 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
478 instrides_scl[0] = instrides[0];
479 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
480 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
483 for (i=0;i<h/2;i++) {
// Store-stream prefetch hints for both output rows.
484 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
485 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
487 for (j=0;j<w/16;j++) {
489 y0 = vec_ldl (0,y1i);
490 y1 = vec_ldl (0,y2i);
491 uivP = (vector unsigned char *)ui;
492 vivP = (vector unsigned char *)vi;
// Chroma is realigned from two adjacent vectors with vec_lvsl + vec_perm.
494 align_perm = vec_lvsl (0, ui);
495 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
497 align_perm = vec_lvsl (0, vi);
498 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
// Re-centre chroma by subtracting 128 from every byte.
499 u = (vector signed char)
500 vec_sub (u,(vector signed char)
501 vec_splat((vector signed char)AVV(128),0));
503 v = (vector signed char)
504 vec_sub (v, (vector signed char)
505 vec_splat((vector signed char)AVV(128),0));
516 Y0 = vec_mradds (Y0, lCY, lOY);
517 Y1 = vec_mradds (Y1, lCY, lOY);
518 Y2 = vec_mradds (Y2, lCY, lOY);
519 Y3 = vec_mradds (Y3, lCY, lOY);
521 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
522 ux = vec_sl (U, lCSHIFT);
523 ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
524 ux0 = vec_mergeh (ux,ux);
525 ux1 = vec_mergel (ux,ux);
527 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
528 vx = vec_sl (V, lCSHIFT);
529 vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
530 vx0 = vec_mergeh (vx,vx);
531 vx1 = vec_mergel (vx,vx);
532 /* uvx = ((CGU*u) + (CGV*v))>>15 */
533 uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
534 uvx = vec_mradds (V, lCGV, uvx);
535 uvx0 = vec_mergeh (uvx,uvx);
536 uvx1 = vec_mergel (uvx,uvx);
// Pixels built from Y0/Y1 go to oute.
537 R0 = vec_add (Y0,vx0);
538 G0 = vec_add (Y0,uvx0);
539 B0 = vec_add (Y0,ux0);
540 R1 = vec_add (Y1,vx1);
541 G1 = vec_add (Y1,uvx1);
542 B1 = vec_add (Y1,ux1);
543 R = vec_packclp (R0,R1);
544 G = vec_packclp (G0,G1);
545 B = vec_packclp (B0,B1);
547 out_argb(R,G,B,oute);
// Pixels built from Y2/Y3 reuse the same chroma terms and go to outo.
548 R0 = vec_add (Y2,vx0);
549 G0 = vec_add (Y2,uvx0);
550 B0 = vec_add (Y2,ux0);
551 R1 = vec_add (Y3,vx1);
552 G1 = vec_add (Y3,uvx1);
553 B1 = vec_add (Y3,ux1);
554 R = vec_packclp (R0,R1);
555 G = vec_packclp (G0,G1);
556 B = vec_packclp (B0,B1);
558 out_argb(R,G,B,outo);
566 outo += (outstrides[0])>>4;
567 oute += (outstrides[0])>>4;
569 ui += instrides_scl[1];
570 vi += instrides_scl[2];
571 y1i += instrides_scl[0];
572 y2i += instrides_scl[0];
// Instantiate the remaining converters from the DEFCSP420_CVT template:
// altivec_yuv2_rgba, altivec_yuv2_argb, altivec_yuv2_rgb24 and
// altivec_yuv2_bgr24, each with its matching out_* store macro.
580 DEFCSP420_CVT (yuv2_rgba, out_rgba)
581 DEFCSP420_CVT (yuv2_argb, out_argb)
582 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
583 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
// Permute controls that pull the U, V and Y bytes out of one packed uyvy
// vector and widen each to a 16-bit lane. Index 0x10 selects a byte of the
// second vec_perm operand, which altivec_uyvy_rgb32 passes as an all-zero
// vector, so it supplies a zero byte in front of every sample byte.
// In the visible rows demux_u and demux_v name each source byte twice in a
// row, while demux_y names a different source byte for every lane.
// NOTE(review): the middle rows of all three initializers are not visible in
// this listing.
586 // uyvy|uyvy|uyvy|uyvy
587 // 0123 4567 89ab cdef
589 const vector unsigned char
590 demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
593 0x10,0x0c,0x10,0x0c),
594 demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
597 0x10,0x0E,0x10,0x0E),
598 demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
601 0x10,0x0D,0x10,0x0F);
604 this is so I can play live CCIR raw video
// Convert packed UYVY input to 32-bit packed output.
// Per step, two 16-byte uyvy vectors are loaded from img (offsets 0 and 16).
// Each is split into widened U, V and Y vectors with the demux_* permute
// controls and converted by cvtyuvtoRGB. The two sets of results are clamped
// and packed to bytes with vec_packclp and stored with out_rgba.
// Output starts at row srcSliceY of oplanes[0].
// NOTE(review): several lines are absent from this listing (braces, the
// declarations of w, h, i, j and img, the outer row loop, pointer advances,
// the return statement), so this summary covers only what is visible.
606 static int altivec_uyvy_rgb32 (SwsContext *c,
607 unsigned char **in, int *instrides,
608 int srcSliceY, int srcSliceH,
609 unsigned char **oplanes, int *outstrides)
614 vector unsigned char uyvy;
615 vector signed short Y,U,V;
616 vector signed short vx,ux,uvx;
617 vector signed short R0,G0,B0,R1,G1,B1;
618 vector unsigned char R,G,B;
619 vector unsigned char *out;
623 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
626 for (j=0;j<w/16;j++) {
// First 16 input bytes.
627 uyvy = vec_ld (0, img);
628 U = (vector signed short)
629 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
631 V = (vector signed short)
632 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
634 Y = (vector signed short)
635 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
637 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
// Next 16 input bytes.
639 uyvy = vec_ld (16, img);
640 U = (vector signed short)
641 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
643 V = (vector signed short)
644 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
646 Y = (vector signed short)
647 vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
649 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
// Clamp at zero and narrow both halves into one byte vector per channel.
651 R = vec_packclp (R0,R1);
652 G = vec_packclp (G0,G1);
653 B = vec_packclp (B0,B1);
655 // vec_mstbgr24 (R,G,B, out);
656 out_rgba (R,G,B,out);
666 /* Ok currently the acceleration routine only supports
667 inputs of widths a multiple of 16
668 and heights a multiple of 2
670 So we just fall back to the C codes for this.
// Select an AltiVec conversion routine for the source/destination formats
// stored in the context, or return NULL when none applies.
// Visible checks: the SWS_CPU_CAPS_ALTIVEC flag is tested first; a source
// width that is not a multiple of 16 returns NULL; the source height is
// tested for oddness before the first destination-format switch.
// The first destination-format switch returns one of the six
// DEFCSP420_CVT-generated converters; the second returns altivec_uyvy_rgb32.
// Both switches return NULL by default.
// NOTE(review): the case labels of all three switches, the statements
// controlled by the flag test and the height test, and the brace lines are
// not visible in this listing.
// NOTE(review): the selection messages are emitted through MSG_WARN although
// they read as informational; confirm the intended log level.
672 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
674 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
678 and this seems not to matter too much I tried a bunch of
679 videos with abnormal widths and mplayer crashes else where.
680 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
681 boom with X11 bad match.
// Widths must be a multiple of 16.
684 if ((c->srcW & 0xf) != 0) return NULL;
686 switch (c->srcFormat) {
697 if ((c->srcH & 0x1) != 0)
700 switch(c->dstFormat){
702 MSG_WARN("ALTIVEC: Color Space RGB24\n");
703 return altivec_yuv2_rgb24;
705 MSG_WARN("ALTIVEC: Color Space BGR24\n");
706 return altivec_yuv2_bgr24;
708 MSG_WARN("ALTIVEC: Color Space ARGB\n");
709 return altivec_yuv2_argb;
711 MSG_WARN("ALTIVEC: Color Space ABGR\n");
712 return altivec_yuv2_abgr;
714 MSG_WARN("ALTIVEC: Color Space RGBA\n");
715 return altivec_yuv2_rgba;
717 MSG_WARN("ALTIVEC: Color Space BGRA\n");
718 return altivec_yuv2_bgra;
719 default: return NULL;
724 switch(c->dstFormat){
726 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
727 return altivec_uyvy_rgb32;
728 default: return NULL;
/**
 * Divide a 64-bit value by 65536 with round-half-up and saturate the result
 * to the signed 16-bit range.
 *
 * @param f value to scale down
 * @return the rounded quotient, delivered as the 16-bit two's-complement bit
 *         pattern in a uint16_t: quotients above 0x7FFF give 0x7FFF and
 *         quotients below -0x7FFF give 0x8000.
 */
static uint16_t roundToInt16(int64_t f)
{
    /* Equivalent to (f + (1<<15)) >> 16 but cannot overflow: the quotient is
       taken first and bit 15 of f supplies the round-up. The value is kept in
       64 bits so that the range checks below see the real magnitude instead
       of a value already truncated to int. */
    const int64_t r = (f >> 16) + ((f >> 15) & 1);

    if (r < -0x7FFF)
        return 0x8000;
    if (r > 0x7FFF)
        return 0x7FFF;
    return (uint16_t)r;
}
// Compute the conversion coefficients from the inverse colour table and the
// brightness/contrast/saturation settings, and store each one, replicated
// across a whole vector, in the context.
//   c           context receiving CSHIFT, CY, OY, CRV, CBU, CGU and CGV
//   inv_table   four colour-space coefficients; entries 0 and 1 feed crv and
//               cbu, entries 2 and 3 feed cgu and cgv (negated)
//   brightness  offset applied to luma (oy = -256*brightness)
//   contrast, saturation  the chroma formulas use each shifted right by 16
// The six values are written to an aligned array of shorts that shares
// storage with a vector (buf.tmp / buf.vec) and are then broadcast with
// vec_splat. CSHIFT is fixed at 2.
// NOTE(review): the results are assigned to signed short without an explicit
// range check; confirm that the inputs cannot overflow 16 bits.
// NOTE(review): the lines declaring the buf wrapper, the brace lines and the
// lines around the trailing printf are not visible in this listing.
743 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
746 signed short tmp[8] __attribute__ ((aligned(16)));
747 vector signed short vec;
750 buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy
751 buf.tmp[1] = -256*brightness; //oy
752 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
753 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
754 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
755 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
// Broadcast each coefficient into its own vector in the context.
758 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
759 c->CY = vec_splat ((vector signed short)buf.vec, 0);
760 c->OY = vec_splat ((vector signed short)buf.vec, 1);
761 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
762 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
763 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
764 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
// Dump of the six coefficients by name.
768 char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
770 printf("%s %d ", v[i],buf.tmp[i] );
// Vertical scaler combined with colour conversion to a packed output format.
//   lumFilter, chrFilter  not used by any visible line; the vector
//                         coefficient banks in the context are used instead
//   lumSrc, lumFilterSize luma source rows and their count
//   chrSrc, chrFilterSize chroma source rows and their count; V samples are
//                         read 2048 elements after the U samples in each row
//   dest, dstW, dstY      output row, its width in pixels, and the row index
//                         used to pick the coefficients
// For every group of 16 output pixels the visible code accumulates the luma
// rows weighted by YCoeffs and the chroma rows weighted by CCoeffs
// (vec_mradds), shifts the sums right by SCL, clamps them with vec_clip_s16,
// duplicates every chroma sample for two neighbouring pixels, converts through
// cvtyuvtoRGB, packs with vec_packclp and stores in the format selected by
// c->dstFormat.
// A second, near-identical copy of that pipeline writes into the aligned
// scratch buffer, which is then copied to dest with memcpy.
// NOTE(review): many lines are absent from this listing (return type, braces,
// the declarations of i and j, the initialisation of Y0, Y1, U and V, the
// default labels of both switches, the condition guarding the scratch path),
// so this summary covers only what is visible. RND is declared here but no
// visible line uses it.
779 altivec_yuv2packedX (SwsContext *c,
780 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
781 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
782 uint8_t *dest, int dstW, int dstY)
786 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
787 vector signed short R0,G0,B0,R1,G1,B1;
789 vector unsigned char R,G,B,pels[3];
790 vector unsigned char *out,*nout;
792 vector signed short RND = vec_splat_s16(1<<3);
793 vector unsigned short SCL = vec_splat_u16(4);
794 unsigned long scratch[16] __attribute__ ((aligned (16)));
796 vector signed short *YCoeffs, *CCoeffs;
// Coefficient vectors for this output row, one per filter tap.
798 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
799 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
801 out = (vector unsigned char *)dest;
803 for(i=0; i<dstW; i+=16){
806 /* extract 16 coeffs from lumSrc */
807 for(j=0; j<lumFilterSize; j++) {
808 X0 = vec_ld (0, &lumSrc[j][i]);
809 X1 = vec_ld (16, &lumSrc[j][i]);
810 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
811 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
816 /* extract 8 coeffs from U,V */
817 for(j=0; j<chrFilterSize; j++) {
818 X = vec_ld (0, &chrSrc[j][i/2]);
819 U = vec_mradds (X, CCoeffs[j], U);
820 X = vec_ld (0, &chrSrc[j][i/2+2048]);
821 V = vec_mradds (X, CCoeffs[j], V);
824 /* scale and clip signals */
825 Y0 = vec_sra (Y0, SCL);
826 Y1 = vec_sra (Y1, SCL);
827 U = vec_sra (U, SCL);
828 V = vec_sra (V, SCL);
830 Y0 = vec_clip_s16 (Y0);
831 Y1 = vec_clip_s16 (Y1);
832 U = vec_clip_s16 (U);
833 V = vec_clip_s16 (V);
836 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
837 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
839 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
840 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
841 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
844 U0 = vec_mergeh (U,U);
845 V0 = vec_mergeh (V,V);
847 U1 = vec_mergel (U,U);
848 V1 = vec_mergel (V,V);
850 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
851 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
853 R = vec_packclp (R0,R1);
854 G = vec_packclp (G0,G1);
855 B = vec_packclp (B0,B1);
// Store straight into the destination row in the requested format.
857 switch(c->dstFormat) {
858 case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
859 case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
860 case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
861 case IMGFMT_ARGB: out_argb (R,G,B,out); break;
862 case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
863 case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
866 /* If this is reached, the caller should have called yuv2packedXinC
868 static int printed_error_message;
869 if(!printed_error_message) {
870 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
871 vo_format_name(c->dstFormat));
872 printed_error_message=1;
884 /* extract 16 coeffs from lumSrc */
885 for(j=0; j<lumFilterSize; j++) {
886 X0 = vec_ld (0, &lumSrc[j][i]);
887 X1 = vec_ld (16, &lumSrc[j][i]);
888 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
889 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
894 /* extract 8 coeffs from U,V */
895 for(j=0; j<chrFilterSize; j++) {
896 X = vec_ld (0, &chrSrc[j][i/2]);
897 U = vec_mradds (X, CCoeffs[j], U);
898 X = vec_ld (0, &chrSrc[j][i/2+2048]);
899 V = vec_mradds (X, CCoeffs[j], V);
902 /* scale and clip signals */
903 Y0 = vec_sra (Y0, SCL);
904 Y1 = vec_sra (Y1, SCL);
905 U = vec_sra (U, SCL);
906 V = vec_sra (V, SCL);
908 Y0 = vec_clip_s16 (Y0);
909 Y1 = vec_clip_s16 (Y1);
910 U = vec_clip_s16 (U);
911 V = vec_clip_s16 (V);
914 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
915 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
917 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
918 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
919 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
922 U0 = vec_mergeh (U,U);
923 V0 = vec_mergeh (V,V);
925 U1 = vec_mergel (U,U);
926 V1 = vec_mergel (V,V);
928 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
929 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
931 R = vec_packclp (R0,R1);
932 G = vec_packclp (G0,G1);
933 B = vec_packclp (B0,B1);
// This copy of the pipeline stores into the aligned scratch buffer instead
// of the destination row.
935 nout = (vector unsigned char *)scratch;
936 switch(c->dstFormat) {
937 case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
938 case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
939 case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
940 case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
941 case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
942 case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
944 /* Unreachable, I think. */
945 MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
946 vo_format_name(c->dstFormat));
// NOTE(review): memcpy takes a byte count. The destination is indexed in
// 32-bit units, so (dstW-i)/4 looks as if (dstW-i)*4 may have been meant;
// confirm. The 32-bit indexing also assumes 4 bytes per pixel, although
// RGB24 and BGR24 are among the formats handled above.
950 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);