2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 rejected images are deferred to the C simulation code.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10clocks per pixel this is too high something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
72 #include "swscale_internal.h"
73 #include "../mangle.h"
74 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
76 #undef PROFILE_THE_BEAST
/* Short aliases for the explicitly unsigned / signed character types. */
typedef unsigned char ubyte;
typedef signed char   sbyte;
83 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
84 homogeneous vector registers x0,x1,x2 are interleaved with the
87 o0 = vec_mergeh (x0,x1);
88 o1 = vec_perm (o0, x2, perm_rgb_0);
89 o2 = vec_perm (o0, x2, perm_rgb_1);
90 o3 = vec_mergel (x0,x1);
91 o4 = vec_perm (o3,o2,perm_rgb_2);
92 o5 = vec_perm (o3,o2,perm_rgb_3);
94 perm_rgb_0: o0(RG).h v1(B) --> o1*
100 perm_rgb_1: o0(RG).h v1(B) --> o2
106 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
112 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
120 const vector unsigned char
121 perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
122 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
123 perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
124 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
125 perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
126 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
127 perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
128 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/*
 * Interleave three 16-byte planar vectors into three vectors of packed
 * 3-byte groups (48 bytes in total), written to y0, y1 and y2 in that order.
 *
 * Note the parameter list: the FIRST argument is bound to x2 and the THIRD
 * to x0.  Each output group holds one byte of x0, then x1, then x2.
 *
 * The body is a single statement (do { } while (0)) so the temporaries stay
 * local to the expansion and the macro can be followed by a semicolon.
 * y0, y1, y2 must be assignable lvalues of the same vector type as x0.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                                  \
do {                                                                   \
    typeof(x0) pairs_hi, carry, pairs_lo;                              \
    /* (x0,x1) byte pairs taken from the high halves */                \
    pairs_hi = vec_mergeh (x0, x1);                                    \
    y0       = vec_perm (pairs_hi, x2, perm_rgb_0);                    \
    /* leftover pairs + the still unused bytes of x2 */                \
    carry    = vec_perm (pairs_hi, x2, perm_rgb_1);                    \
    /* (x0,x1) byte pairs taken from the low halves */                 \
    pairs_lo = vec_mergel (x0, x1);                                    \
    y1       = vec_perm (pairs_lo, carry, perm_rgb_2);                 \
    y2       = vec_perm (pairs_lo, carry, perm_rgb_3);                 \
} while (0)
/*
 * Interleave three planar byte vectors with vec_merge3 and store the
 * resulting three vectors (48 bytes) through ptr.
 *
 * ptr must be a modifiable pointer to vector; it is post-incremented once
 * per store, so it ends up three vectors further on.  vec_st requires the
 * destination to be 16-byte aligned.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one statement.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)                                     \
do {                                                                   \
    typeof(x0) s0, s1, s2;                                             \
    vec_merge3 (x0, x1, x2, s0, s1, s2);                               \
    vec_st (s0, 0, ptr++);                                             \
    vec_st (s1, 0, ptr++);                                             \
    vec_st (s2, 0, ptr++);                                             \
} while (0)
/*
 * Same as vec_mstrgb24 but hands the three planes to vec_merge3 in
 * reversed order (x2, x1, x0), so the packed byte order is mirrored.
 *
 * ptr must be a modifiable pointer to vector; it is advanced by three
 * vectors (48 bytes).  Wrapped in do { } while (0) so that it expands to
 * exactly one statement.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)                                     \
do {                                                                   \
    typeof(x0) s0, s1, s2;                                             \
    vec_merge3 (x2, x1, x0, s0, s1, s2);                               \
    vec_st (s0, 0, ptr++);                                             \
    vec_st (s1, 0, ptr++);                                             \
    vec_st (s2, 0, ptr++);                                             \
} while (0)
159 /* pack the pixels in rgb0 format
/*
 * Interleave four planar byte vectors into 16 packed 4-byte pixels and
 * store them (64 bytes) through ptr.  Every pixel is written as the bytes
 * x0, x1, x2, x3 in that order.
 *
 *   T    vector type of x0..x3 and of the temporaries
 *   ptr  modifiable, 16-byte aligned destination pointer.  It is advanced
 *        by 4 elements after the stores; the callers in this file pass
 *        pointers to vector, so that equals the 64 bytes just written and
 *        matches the 24-bit store macros, which also advance their pointer.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one statement.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                         \
do {                                                                            \
    T _0, _1, _2, _3;                                                           \
    /* pixels 0..7: byte pairs (x0,x1) and (x2,x3) merged as 16-bit units */    \
    _0 = vec_mergeh (x0, x1);                                                   \
    _1 = vec_mergeh (x2, x3);                                                   \
    _2 = (T)vec_mergeh ((vector unsigned short)_0, (vector unsigned short)_1);  \
    _3 = (T)vec_mergel ((vector unsigned short)_0, (vector unsigned short)_1);  \
    vec_st (_2, 0*16, (T *)ptr);                                                \
    vec_st (_3, 1*16, (T *)ptr);                                                \
    /* pixels 8..15 */                                                          \
    _0 = vec_mergel (x0, x1);                                                   \
    _1 = vec_mergel (x2, x3);                                                   \
    _2 = (T)vec_mergeh ((vector unsigned short)_0, (vector unsigned short)_1);  \
    _3 = (T)vec_mergel ((vector unsigned short)_0, (vector unsigned short)_1);  \
    vec_st (_2, 2*16, (T *)ptr);                                                \
    vec_st (_3, 3*16, (T *)ptr);                                                \
    ptr += 4;                                                                   \
} while (0)
184 | 1 -0.3441 -0.7142 |x| Cb|
191 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
199 (vector signed short) \
200 vec_perm(x,(typeof(x))(0),\
201 (vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
202 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
204 (vector signed short) \
205 vec_perm(x,(typeof(x))(0),\
206 (vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
207 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* Clamp every element of v to the range [0, 255] with an explicit min/max. */
#define vec_clip(v) \
    vec_max (vec_min (v, (typeof(v))(255)), (typeof(v))(0))

/* Clamp both inputs with vec_clip, then narrow them into one vector of
   unsigned bytes with the (modulo) vec_pack. */
#define vec_packclp_a(lo,hi) \
    (vector unsigned char)vec_pack (vec_clip (lo), vec_clip (hi))
/*
 * Clamp two vectors of signed shorts to [0, 255] and pack them into one
 * vector of unsigned bytes.  Only the lower bound needs an explicit
 * vec_max: the upper bound comes from the saturating pack (vec_packs) of
 * the values reinterpreted as unsigned shorts.
 */
#define vec_packclp(lo,hi)                                              \
    (vector unsigned char)vec_packs                                     \
        ((vector unsigned short)vec_max (lo, (vector signed short) (0)), \
         (vector unsigned short)vec_max (hi, (vector signed short) (0)))
220 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
223 static inline cvtyuvtoRGB (SwsContext *c,
224 vector signed short Y, vector signed short U, vector signed short V,
225 vector signed short *R, vector signed short *G, vector signed short *B)
227 vector signed short vx,ux,uvx;
229 Y = vec_mradds (Y, c->CY, c->OY);
231 U = vec_sub (U,(vector signed short)(128));
232 V = vec_sub (V,(vector signed short)(128));
234 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
235 ux = vec_sl (U, c->CSHIFT);
236 *B = vec_mradds (ux, c->CBU, Y);
238 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
239 vx = vec_sl (V, c->CSHIFT);
240 *R = vec_mradds (vx, c->CRV, Y);
242 // uvx = ((CGU*u) + (CGV*v))>>15;
243 uvx = vec_mradds (U, c->CGU, Y);
244 *G = vec_mradds (V, c->CGV, uvx);
249 ------------------------------------------------------------------------------
251 ------------------------------------------------------------------------------
// DEFCSP420_CVT(name, out_pixels) expands to a static converter
//   int altivec_<name>(c, in, instrides, srcSliceY, srcSliceH, oplanes, outstrides)
// that reads planar Y/U/V from in[0..2] and writes packed pixels to oplanes[0],
// starting at row srcSliceY, through the store macro given as out_pixels.
//
// Visible structure:
//  - outer loop: h/2 iterations, each producing an even (oute) and an odd (outo)
//    output row from two luma rows (y1i, y2i) that share one row of chroma;
//  - vec_dstst issues store-stream hints for both output rows;
//  - inner loop: w/16 iterations; luma is loaded with vec_ldl, chroma with the
//    vec_lvsl + vec_perm idiom for unaligned data; 128 is subtracted from the
//    chroma bytes and the high 8 are sign-extended with vec_unpackh;
//  - each chroma term is computed once and duplicated with vec_mergeh/vec_mergel
//    so that two horizontally adjacent pixels share it, then added to the four
//    luma vectors Y0..Y3; results are clamped by vec_packclp and stored;
//  - after each row pair the output pointers advance by outstrides[0]>>4 vectors
//    and the input pointers by the instrides_scl[] values.
//
// NOTE(review): this listing is incomplete. The declarations of w, h, i, j, ui, vi
// and lCY/lOY/lCRV/lCBU/lCGU/lCGV, the unpacking of y0/y1 into Y0..Y3, the
// per-iteration input pointer increments and the return statement are not visible.
// NOTE(review): y2i is derived from w (in[0]+w) rather than instrides[0], and both
// luma pointers step by instrides[0] per row pair - presumably this relies on the
// luma stride being equal to the width; verify against callers.
// NOTE(review): oute/outo advance by outstrides[0]>>4 per row pair in addition to
// whatever out_pixels advanced them by - confirm the intended row stepping.
255 #define DEFCSP420_CVT(name,out_pixels) \
256 static int altivec_##name (SwsContext *c, \
257 unsigned char **in, int *instrides, \
258 int srcSliceY, int srcSliceH, \
259 unsigned char **oplanes, int *outstrides) \
264 int instrides_scl[3]; \
265 vector unsigned char y0,y1; \
267 vector signed char u,v; \
269 vector signed short Y0,Y1,Y2,Y3; \
270 vector signed short U,V; \
271 vector signed short vx,ux,uvx; \
272 vector signed short vx0,ux0,uvx0; \
273 vector signed short vx1,ux1,uvx1; \
274 vector signed short R0,G0,B0; \
275 vector signed short R1,G1,B1; \
276 vector unsigned char R,G,B; \
278 vector unsigned char *uivP, *vivP; \
279 vector unsigned char align_perm; \
281 vector signed short \
289 vector unsigned short lCSHIFT = c->CSHIFT; \
291 ubyte *y1i = in[0]; \
292 ubyte *y2i = in[0]+w; \
296 vector unsigned char *oute \
297 = (vector unsigned char *) \
298 (oplanes[0]+srcSliceY*outstrides[0]); \
299 vector unsigned char *outo \
300 = (vector unsigned char *) \
301 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
304 instrides_scl[0] = instrides[0]; \
305 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
306 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
309 for (i=0;i<h/2;i++) { \
310 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
311 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
313 for (j=0;j<w/16;j++) { \
315 y0 = vec_ldl (0,y1i); \
316 y1 = vec_ldl (0,y2i); \
317 uivP = (vector unsigned char *)ui; \
318 vivP = (vector unsigned char *)vi; \
320 align_perm = vec_lvsl (0, ui); \
321 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
323 align_perm = vec_lvsl (0, vi); \
324 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
326 u = (vector signed char)vec_sub (u, (vector signed char)(128)); \
327 v = (vector signed char)vec_sub (v, (vector signed char)(128)); \
328 U = vec_unpackh (u); \
329 V = vec_unpackh (v); \
337 Y0 = vec_mradds (Y0, lCY, lOY); \
338 Y1 = vec_mradds (Y1, lCY, lOY); \
339 Y2 = vec_mradds (Y2, lCY, lOY); \
340 Y3 = vec_mradds (Y3, lCY, lOY); \
342 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
343 ux = vec_sl (U, lCSHIFT); \
344 ux = vec_mradds (ux, lCBU, (vector signed short)(0)); \
345 ux0 = vec_mergeh (ux,ux); \
346 ux1 = vec_mergel (ux,ux); \
348 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
349 vx = vec_sl (V, lCSHIFT); \
350 vx = vec_mradds (vx, lCRV, (vector signed short)(0)); \
351 vx0 = vec_mergeh (vx,vx); \
352 vx1 = vec_mergel (vx,vx); \
354 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
355 uvx = vec_mradds (U, lCGU, (vector signed short)(0)); \
356 uvx = vec_mradds (V, lCGV, uvx); \
357 uvx0 = vec_mergeh (uvx,uvx); \
358 uvx1 = vec_mergel (uvx,uvx); \
360 R0 = vec_add (Y0,vx0); \
361 G0 = vec_add (Y0,uvx0); \
362 B0 = vec_add (Y0,ux0); \
363 R1 = vec_add (Y1,vx1); \
364 G1 = vec_add (Y1,uvx1); \
365 B1 = vec_add (Y1,ux1); \
367 R = vec_packclp (R0,R1); \
368 G = vec_packclp (G0,G1); \
369 B = vec_packclp (B0,B1); \
371 out_pixels(R,G,B,oute); \
373 R0 = vec_add (Y2,vx0); \
374 G0 = vec_add (Y2,uvx0); \
375 B0 = vec_add (Y2,ux0); \
376 R1 = vec_add (Y3,vx1); \
377 G1 = vec_add (Y3,uvx1); \
378 B1 = vec_add (Y3,ux1); \
379 R = vec_packclp (R0,R1); \
380 G = vec_packclp (G0,G1); \
381 B = vec_packclp (B0,B1); \
384 out_pixels(R,G,B,outo); \
393 outo += (outstrides[0])>>4; \
394 oute += (outstrides[0])>>4; \
396 ui += instrides_scl[1]; \
397 vi += instrides_scl[2]; \
398 y1i += instrides_scl[0]; \
399 y2i += instrides_scl[0]; \
/*
 * Store adaptors usable as the out_pixels argument of DEFCSP420_CVT.
 * Each takes the packed channel vectors in the order the callers pass them
 * (r, g, b) plus the destination pointer and forwards them to a store macro.
 *
 * For the 32-bit variants vec_mstrgb32 writes its four vector arguments as
 * consecutive bytes of every pixel; (typeof(r))(0) supplies a zero byte for
 * the fourth channel.
 */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))(0)),b,g,r,ptr) /* bytes 0,b,g,r */
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))(0)),ptr) /* bytes b,g,r,0 */
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))(0)),ptr) /* bytes r,g,b,0 */
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))(0)),r,g,b,ptr) /* bytes 0,r,g,b */

/* 24-bit variants: both use vec_mstrgb24, differing only in argument order. */
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstrgb24(b,g,r,ptr)
// Instantiate one converter per packed output layout; each line defines a
// static function named altivec_<first argument>.
412 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
// NOTE(review): yuv2_bgra32 is generated with out_argb, not out_bgra, which makes it
// identical to yuv2_argb32 and leaves out_bgra unused in the visible code. This may
// be a copy/paste slip or a deliberate byte-order choice - confirm before changing.
413 DEFCSP420_CVT (yuv2_bgra32, out_argb)
414 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
415 DEFCSP420_CVT (yuv2_argb32, out_argb)
416 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
417 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
420 // uyvy|uyvy|uyvy|uyvy
421 // 0123 4567 89ab cdef
// vec_perm control vectors that split a 16-byte group of packed UYVY samples
// into 16-bit lanes. They are used with a zero vector as second vec_perm operand,
// so selector 0x10 yields a zero high byte and the other selector picks the
// sample byte. In the visible rows demux_u and demux_v repeat each chroma byte
// for two adjacent lanes, while demux_y picks consecutive odd (luma) bytes.
// NOTE(review): the middle rows of all three initializers are missing from this
// listing; only the first and last four selectors of each table are visible.
423 const vector unsigned char
424 demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
427 0x10,0x0c,0x10,0x0c),
428 demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
431 0x10,0x0E,0x10,0x0E),
432 demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
435 0x10,0x0D,0x10,0x0F);
438 this is so I can play live CCIR raw video
// Convert packed UYVY input to 32-bit packed pixels written to oplanes[0],
// starting at row srcSliceY. Same signature as the converters generated by
// DEFCSP420_CVT.
// NOTE(review): the declarations of w, j and img, any loop over rows, the
// advance of img and the return statement are not visible in this listing.
// NOTE(review): the name says rgb32 but the pixels are stored with out_rgba.
440 static int altivec_uyvy_rgb32 (SwsContext *c,
441 unsigned char **in, int *instrides,
442 int srcSliceY, int srcSliceH,
443 unsigned char **oplanes, int *outstrides)
448 vector unsigned char uyvy;
449 vector signed short Y,U,V;
450 vector signed short vx,ux,uvx;
451 vector signed short R0,G0,B0,R1,G1,B1;
452 vector unsigned char R,G,B;
453 vector unsigned char *out;
457 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
// Each iteration handles 16 pixels: two 16-byte loads of 8 pixels each.
460 for (j=0;j<w/16;j++) {
// First 8 pixels: split into 16-bit U, V and Y lanes, then convert.
// vec_ld needs img to be 16-byte aligned (it ignores the low address bits).
461 uyvy = vec_ld (0, img);
462 U = (vector signed short)
463 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
465 V = (vector signed short)
466 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
468 Y = (vector signed short)
469 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
471 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
// Next 8 pixels, taken from the following 16 bytes.
473 uyvy = vec_ld (16, img);
474 U = (vector signed short)
475 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
477 V = (vector signed short)
478 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
480 Y = (vector signed short)
481 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
483 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
// Clamp to 0..255 and narrow both halves into one byte vector per channel.
485 R = vec_packclp (R0,R1);
486 G = vec_packclp (G0,G1);
487 B = vec_packclp (B0,B1);
489 // vec_mstbgr24 (R,G,B, out);
490 out_rgba (R,G,B,out);
499 /* Ok currently the acceleration routine only supports
500 inputs of widths a multiple of 16
501 and heights a multiple of 2
503 So we just fall back to the C codes for this.
// Select the AltiVec converter matching the context's source and destination
// formats. Returns NULL when no accelerated routine applies (visible cases:
// source width not a multiple of 16, or an unsupported destination format).
// NOTE(review): the case labels, the statements guarded by the flag test and by
// the srcH test, and the closing braces are not visible in this listing, so the
// format-to-function mapping cannot be confirmed from here.
// NOTE(review): the selection is reported through MSG_WARN although it looks
// purely informational - confirm the intended log level.
505 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
// AltiVec must have been detected/enabled in the context flags.
507 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
511 and this seems not to matter too much I tried a bunch of
512 videos with abnormal widths and mplayer crashes else where.
513 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
514 boom with X11 bad match.
// The converters process 16 pixels per step.
517 if ((c->srcW & 0xf) != 0) return NULL;
519 switch (c->srcFormat) {
// The planar converters process two rows per step, hence the odd-height test.
530 if ((c->srcH & 0x1) != 0)
533 switch(c->dstFormat){
535 MSG_WARN("ALTIVEC: Color Space RGB24\n");
536 return altivec_yuv2_rgb24;
538 MSG_WARN("ALTIVEC: Color Space BGR24\n");
539 return altivec_yuv2_bgr24;
541 MSG_WARN("ALTIVEC: Color Space ARGB32\n");
542 return altivec_yuv2_argb32;
544 MSG_WARN("ALTIVEC: Color Space BGRA32\n");
545 // return profile_altivec_bgra32;
547 return altivec_yuv2_bgra32;
548 default: return NULL;
553 switch(c->dstFormat){
555 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
556 return altivec_uyvy_rgb32;
560 default: return NULL;
// Derive the vector conversion coefficients (CY, OY, CRV, CBU, CGU, CGV, CSHIFT)
// from the context's brightness / contrast / saturation settings.
// NOTE(review): inv_table is presumably the source of crv/cbu/cgu/cgv, but the
// assignments that read it, the assignments to tmp before each vec_lde, the
// declaration of oy, the stores of the results into the context and the return
// statements are not visible in this listing - confirm against the full file.
// NOTE(review): Y0 is declared but not used in the visible code.
569 int yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
573 CY = (vector signed short)(0x7fff),
574 CRV = (vector signed short)(22972),
575 CBU = (vector signed short)(29029),
576 CGU = (vector signed short)(-11276),
577 CGV = (vector signed short)(-23400),
580 vector unsigned short CSHIFT = (vector unsigned short)(1);
582 vector signed short Y0;
583 int brightness = c->brightness, contrast = c->contrast, saturation = c->saturation;
584 int64_t crv __attribute__ ((aligned(16)));
585 int64_t cbu __attribute__ ((aligned(16)));
586 int64_t cgu __attribute__ ((aligned(16)));
587 int64_t cgv __attribute__ ((aligned(16)));
// tmp is 16-byte aligned so that vec_lde (0, &tmp) places it in element 0.
588 short tmp __attribute__ ((aligned(16)));
590 int64_t cy = (1<<16)-1;
593 if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
// Debug output; %hvx is a vector-aware printf conversion, presumably a compiler/libc extension.
602 printf ("crv: %hvx\n", CRV);
603 printf ("cbu: %hvx\n", CBU);
604 printf ("cgv: %hvx\n", CGV);
605 printf ("cgu: %hvx\n", CGU);
607 printf ("contrast: %d, brightness: %d, saturation: %d\n", contrast, brightness, saturation);
609 printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
// Fold contrast into the luma gain and contrast*saturation into the chroma gains.
612 cy = (cy *contrast )>>17;
613 crv= (crv*contrast * saturation)>>32;
614 cbu= (cbu*contrast * saturation)>>32;
615 cgu= (cgu*contrast * saturation)>>32;
616 cgv= (cgv*contrast * saturation)>>32;
// Brightness becomes a luma offset.
618 oy -= 256*brightness;
621 //printf("%llx %llx %llx %llx %llx\n", cy, crv, cbu, cgu, cgv);
623 // vector signed short CBU,CRV,CGU,CGY,CY;
// Each pair below loads the scalar currently held in tmp into element 0 and
// replicates it across all eight lanes with vec_splat.
625 CY = vec_lde (0, &tmp);
626 CY = vec_splat (CY, 0);
629 OY = vec_lde (0, &tmp);
630 OY = vec_splat (OY, 0);
633 CRV = vec_lde (0, &tmp);
634 CRV = vec_splat (CRV, 0);
636 CBU = vec_lde (0, &tmp);
637 CBU = vec_splat (CBU, 0);
640 CGU = vec_lde (0, &tmp);
641 CGU = vec_splat (CGU, 0);
643 CGV = vec_lde (0, &tmp);
644 CGV = vec_splat (CGV, 0);
646 CSHIFT = (vector unsigned short)(2);
657 printf ("cy: %hvx\n", CY);
658 printf ("oy: %hvx\n", OY);
659 printf ("crv: %hvx\n", CRV);
660 printf ("cbu: %hvx\n", CBU);
661 printf ("cgv: %hvx\n", CGV);
662 printf ("cgu: %hvx\n", CGU);
// Vertical scaling combined with YUV->RGB conversion: for one output row (dstY)
// the lumFilterSize luma lines and chrFilterSize chroma lines are blended with the
// per-row filter coefficients, converted with cvtyuvtoRGB and stored as 32-bit
// pixels with out_rgba, 16 pixels per iteration.
// NOTE(review): this listing is incomplete. The return type, the declarations of
// i, j and p, the code that fills each coefficient vector from tmp, the
// initialisation of the accumulators Y0/Y1/U/V (RND is declared but not used in
// the visible code), the condition guarding the second copy of the loop body and
// all closing braces are not visible.
// NOTE(review): pels is declared but unused in the visible code.
668 altivec_yuv2packedX (SwsContext *c,
669 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
670 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
671 uint8_t *dest, int dstW, int dstY)
674 short tmp __attribute__((aligned (16)));
677 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
678 vector signed short R0,G0,B0,R1,G1,B1;
680 vector unsigned char R,G,B,pels[3];
681 vector unsigned char *out,*nout;
682 vector signed short RND = (vector signed short)(1<<3);
683 vector unsigned short SCL = (vector unsigned short)(4);
684 unsigned long scratch[16] __attribute__ ((aligned (16)));
686 vector signed short *vYCoeffsBank, *vCCoeffsBank;
688 vector signed short *YCoeffs, *CCoeffs;
// NOTE(review): both banks are allocated and rebuilt on every call, the malloc
// results are used without a NULL check, and malloc does not promise the 16-byte
// alignment that vector loads/stores need on every platform.
// NOTE(review): the banks are sized with dstW but indexed below with
// dstY*FilterSize - presumably the size should follow the destination height; verify.
690 vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
691 vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
// One coefficient vector per scalar luma filter coefficient.
693 for (i=0;i<lumFilterSize*dstW;i++) {
694 tmp = c->vLumFilter[i];
695 p = &vYCoeffsBank[i];
// Same for the chroma filter coefficients.
700 for (i=0;i<chrFilterSize*dstW;i++) {
701 tmp = c->vChrFilter[i];
702 p = &vCCoeffsBank[i];
// Coefficients that belong to output row dstY.
707 YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
708 CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
710 out = (vector unsigned char *)dest;
// NOTE(review): as written this loop also runs for a final partial group of
// fewer than 16 pixels and stores a full 64 bytes for it - confirm that dest has
// room, or that the loop bound is meant to stop before the partial group.
712 for(i=0; i<dstW; i+=16){
715 /* extract 16 coeffs from lumSrc */
716 for(j=0; j<lumFilterSize; j++) {
717 X0 = vec_ld (0, &lumSrc[j][i]);
718 X1 = vec_ld (16, &lumSrc[j][i]);
719 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
720 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
725 /* extract 8 coeffs from U,V */
726 for(j=0; j<chrFilterSize; j++) {
727 X = vec_ld (0, &chrSrc[j][i/2]);
728 U = vec_mradds (X, CCoeffs[j], U);
// V samples are read 2048 elements after the U samples of the same line.
729 X = vec_ld (0, &chrSrc[j][i/2+2048]);
730 V = vec_mradds (X, CCoeffs[j], V);
733 /* scale and clip signals */
734 Y0 = vec_sra (Y0, SCL);
735 Y1 = vec_sra (Y1, SCL);
736 U = vec_sra (U, SCL);
737 V = vec_sra (V, SCL);
745 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
746 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
748 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
749 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
750 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
// Duplicate every chroma sample so that two adjacent pixels share it.
753 U0 = vec_mergeh (U,U);
754 V0 = vec_mergeh (V,V);
756 U1 = vec_mergel (U,U);
757 V1 = vec_mergel (V,V);
759 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
760 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
762 R = vec_packclp (R0,R1);
763 G = vec_packclp (G0,G1);
764 B = vec_packclp (B0,B1);
766 out_rgba (R,G,B,out);
// Second copy of the loop body: same computation for one group, but stored to
// the aligned scratch buffer and then copied to dest (handling of the last,
// partial group of pixels, as far as the visible code shows).
774 /* extract 16 coeffs from lumSrc */
775 for(j=0; j<lumFilterSize; j++) {
776 X0 = vec_ld (0, &lumSrc[j][i]);
777 X1 = vec_ld (16, &lumSrc[j][i]);
778 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
779 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
784 /* extract 8 coeffs from U,V */
785 for(j=0; j<chrFilterSize; j++) {
786 X = vec_ld (0, &chrSrc[j][i/2]);
787 U = vec_mradds (X, CCoeffs[j], U);
788 X = vec_ld (0, &chrSrc[j][i/2+2048]);
789 V = vec_mradds (X, CCoeffs[j], V);
792 /* scale and clip signals */
793 Y0 = vec_sra (Y0, SCL);
794 Y1 = vec_sra (Y1, SCL);
795 U = vec_sra (U, SCL);
796 V = vec_sra (V, SCL);
804 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
805 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
807 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
808 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
809 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
812 U0 = vec_mergeh (U,U);
813 V0 = vec_mergeh (V,V);
815 U1 = vec_mergel (U,U);
816 V1 = vec_mergel (V,V);
818 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
819 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
821 R = vec_packclp (R0,R1);
822 G = vec_packclp (G0,G1);
823 B = vec_packclp (B0,B1);
825 nout = (vector unsigned char *)scratch;
826 out_rgba (R,G,B,nout);
// NOTE(review): dest is indexed as 32-bit pixels, so the dstW-i remaining pixels
// occupy (dstW-i)*4 bytes; the length passed here is (dstW-i)/4, which copies
// only a sixteenth of that. Looks like a bug - confirm and fix.
828 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
// free(NULL) is a no-op, so these NULL guards are redundant.
831 if (vYCoeffsBank) free (vYCoeffsBank);
832 if (vCCoeffsBank) free (vCCoeffsBank);