2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 rejected images are deferred to the C implementation.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion, which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. We are pretty much
43 guaranteed to have the input video frame, since it was just decompressed, so
44 it probably resides in the L1 caches. However, we are creating the
45 output video stream, so this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again, so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine, a 1.33GHz PB running
54 Jaguar; this is not the case for my 1GHz PB. I thought it might be
55 a versioning issue, however I have libGL.1.2.dylib on both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
73 #include "swscale_internal.h"
74 #include "../mangle.h"
75 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
77 #undef PROFILE_THE_BEAST
80 typedef unsigned char ubyte;
81 typedef signed char sbyte;
84 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
85 homogeneous vector registers x0,x1,x2 are interleaved with the
88 o0 = vec_mergeh (x0,x1);
89 o1 = vec_perm (o0, x2, perm_rgb_0);
90 o2 = vec_perm (o0, x2, perm_rgb_1);
91 o3 = vec_mergel (x0,x1);
92 o4 = vec_perm (o3,o2,perm_rgb_2);
93 o5 = vec_perm (o3,o2,perm_rgb_3);
95 perm_rgb_0: o0(RG).h v1(B) --> o1*
101 perm_rgb_1: o0(RG).h v1(B) --> o2
107 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
113 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
/* Byte-permute control vectors consumed by vec_merge3.
 * In each table an index 0x00-0x0f selects a byte of vec_perm's first
 * operand and 0x10-0x1f selects a byte of its second operand.
 *   perm_rgb_0: two bytes from operand 1, then one from operand 2, repeated.
 *   perm_rgb_1: continues that pattern for 8 bytes, then passes bytes
 *               0x18-0x1f of operand 2 through unchanged.
 *   perm_rgb_2: first 8 bytes are bytes 0-7 of operand 2, then the 2+1
 *               interleave resumes.
 *   perm_rgb_3: remainder of the 2+1 interleave. */
121 const vector unsigned char
122 perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124 perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126 perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128 perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/* vec_merge3: interleave three 16-byte planes into three packed output
 * vectors y0,y1,y2 using the perm_rgb_* tables.
 * The parameter list is declared as (x2,x1,x0,...), so the FIRST macro
 * argument binds to x2 and the THIRD binds to x0; x0 and x1 are the pair
 * merged together first and x2 is woven in as every third byte.
 * NOTE(review): the embedded listing numbers jump 131->133 and 139->142, so
 * the statement wrapper around this body is presumably not shown here. */
131 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
133 typeof(x0) o0,o2,o3; \
134 o0 = vec_mergeh (x0,x1); \
135 y0 = vec_perm (o0, x2, perm_rgb_0);\
136 o2 = vec_perm (o0, x2, perm_rgb_1);\
137 o3 = vec_mergel (x0,x1); \
138 y1 = vec_perm (o3,o2,perm_rgb_2); \
139 y2 = vec_perm (o3,o2,perm_rgb_3); \
/* vec_mstrgb24: interleave three byte planes with vec_merge3 (arguments
 * forwarded in the order x0,x1,x2) and store the three resulting vectors.
 * ptr is post-incremented after every store, so it must be a modifiable
 * vector pointer at the call site and ends up advanced by three vectors. */
142 #define vec_mstrgb24(x0,x1,x2,ptr) \
144 typeof(x0) _0,_1,_2; \
145 vec_merge3 (x0,x1,x2,_0,_1,_2); \
146 vec_st (_0, 0, ptr++); \
147 vec_st (_1, 0, ptr++); \
148 vec_st (_2, 0, ptr++); \
/* vec_mstbgr24: same as the rgb24 store, except the three planes are
 * forwarded to vec_merge3 in reversed order (x2,x1,x0), which swaps the
 * first and third channel in the packed output.
 * ptr is post-incremented after each of the three stores. */
151 #define vec_mstbgr24(x0,x1,x2,ptr) \
153 typeof(x0) _0,_1,_2; \
154 vec_merge3 (x2,x1,x0,_0,_1,_2); \
155 vec_st (_0, 0, ptr++); \
156 vec_st (_1, 0, ptr++); \
157 vec_st (_2, 0, ptr++); \
160 /* pack the pixels in rgb0 format
/* vec_mstrgb32: interleave four byte planes x0..x3 into 4-byte pixels and
 * store 16 pixels (four vectors) at byte offsets 0, 16, 32 and 48 from ptr.
 * Bytes are first paired (x0,x1) and (x2,x3) with vec_mergeh/vec_mergel, then
 * the pairs are merged as 16-bit units so each pixel is x0,x1,x2,x3.
 * T is the vector type used for the temporaries and the store pointer cast.
 * NOTE(review): the declaration of _0.._3 and whatever follows the last
 * store are not visible in this listing (numbers jump 164->167, 178->185). */
164 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
167 _0 = vec_mergeh (x0,x1); \
168 _1 = vec_mergeh (x2,x3); \
169 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
170 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
171 vec_st (_2, 0*16, (T *)ptr); \
172 vec_st (_3, 1*16, (T *)ptr); \
173 _0 = vec_mergel (x0,x1); \
174 _1 = vec_mergel (x2,x3); \
175 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
176 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
177 vec_st (_2, 2*16, (T *)ptr); \
178 vec_st (_3, 3*16, (T *)ptr); \
185 | 1 -0.3441 -0.7142 |x| Cb|
192 Typical YUV conversion works on Y: 0-255; this version has been optimized for JPEG decode.
/* NOTE(review): the #define line that owns the next expression is not
 * visible in this listing.  The expression zero-extends bytes 0-7 of x to
 * 16 bits: each result halfword is (byte 0x10 of the zero vector, byte k of
 * x) for k = 0..7. */
200 (vector signed short) \
201 vec_perm(x,(typeof(x))(0),\
202 (vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
203 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
/* Same zero-extension for bytes 8-15 of x (owning #define also not shown). */
205 (vector signed short) \
206 vec_perm(x,(typeof(x))(0),\
207 (vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
208 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* vec_clip: clamp every element of x to the range [0,255] (min, then max). */
210 #define vec_clip(x) \
211 vec_max (vec_min (x, (typeof(x))(255)), (typeof(x))(0))
/* vec_packclp_a: clamp both inputs with vec_clip, then narrow with vec_pack. */
213 #define vec_packclp_a(x,y) \
214 (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
/* vec_packclp: clamp the lower bound with vec_max(.,0) only and narrow with
 * the saturating vec_packs on the values viewed as unsigned shorts, which
 * takes care of the upper bound without a vec_min. */
216 #define vec_packclp(x,y) \
217 (vector unsigned char)vec_packs \
218 ((vector unsigned short)vec_max (x,(vector signed short) (0)), \
219 (vector unsigned short)vec_max (y,(vector signed short) (0)))
221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
/* Convert eight 16-bit Y/U/V samples to 16-bit R, G and B (not yet clipped).
 *   c      context supplying the coefficient vectors CY, OY, CBU, CRV, CGU,
 *          CGV and the shift count CSHIFT
 *   Y,U,V  input samples, passed by value and modified locally
 *   R,G,B  output vectors, written through the pointers
 * Luma is scaled and offset first (Y*CY + OY via vec_mradds); 128 is removed
 * from both chroma inputs; each colour is then a vec_mradds of chroma with
 * its coefficient, using the scaled luma as the addend.  Green chains two
 * vec_mradds so both chroma terms and luma are summed. */
224 static inline void cvtyuvtoRGB (SwsContext *c,
225 vector signed short Y, vector signed short U, vector signed short V,
226 vector signed short *R, vector signed short *G, vector signed short *B)
228 vector signed short vx,ux,uvx;
230 Y = vec_mradds (Y, c->CY, c->OY);
232 U = vec_sub (U,(vector signed short)(128));
233 V = vec_sub (V,(vector signed short)(128));
235 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
236 ux = vec_sl (U, c->CSHIFT);
237 *B = vec_mradds (ux, c->CBU, Y);
239 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
240 vx = vec_sl (V, c->CSHIFT);
241 *R = vec_mradds (vx, c->CRV, Y);
243 // uvx = ((CGU*u) + (CGV*v))>>15;
244 uvx = vec_mradds (U, c->CGU, Y);
245 *G = vec_mradds (V, c->CGV, uvx);
250 ------------------------------------------------------------------------------
252 ------------------------------------------------------------------------------
/* DEFCSP420_CVT(name, out_pixels): generate `static int altivec_<name>()`,
 * a planar-YUV to packed-RGB slice converter whose pixel store is the
 * out_pixels(R,G,B,ptr) macro supplied by the instantiation.
 *
 * Visible structure of the generated function:
 *  - oute/outo point at output row srcSliceY and the row after it; y1i/y2i
 *    are the matching pair of luma input rows.
 *  - The outer loop runs h/2 times and issues vec_dstst hints for both
 *    output rows; the inner loop runs w/16 times.
 *  - Luma is fetched with vec_ldl.  Chroma is fetched as an unaligned load
 *    (vec_lvsl + vec_perm over two adjacent vectors), has 128 subtracted and
 *    is widened with vec_unpackh.
 *  - Y0..Y3 are scaled/offset with lCY/lOY; the chroma terms ux, vx and uvx
 *    are computed once, duplicated with vec_mergeh/vec_mergel, and added to
 *    both luma rows.
 *  - Results are clipped and packed with vec_packclp and emitted through
 *    out_pixels, first to oute and then to outo.
 *  - outo/oute then advance by outstrides[0]>>4 vectors and the input
 *    pointers by instrides_scl[].
 *
 * NOTE(review): the embedded listing numbers have gaps (e.g. 260->265,
 * 282->290, 330->338, 385->394), so the declarations of w, h, i, j, ui, vi
 * and the l* coefficient copies, the unpacking of y0/y1 into Y0..Y3, and
 * the per-iteration pointer steps are not visible here.
 * NOTE(review): y2i starts at in[0]+w and the luma step is instrides[0];
 * this looks like it assumes the luma stride equals the width - confirm. */
256 #define DEFCSP420_CVT(name,out_pixels) \
257 static int altivec_##name (SwsContext *c, \
258 unsigned char **in, int *instrides, \
259 int srcSliceY, int srcSliceH, \
260 unsigned char **oplanes, int *outstrides) \
265 int instrides_scl[3]; \
266 vector unsigned char y0,y1; \
268 vector signed char u,v; \
270 vector signed short Y0,Y1,Y2,Y3; \
271 vector signed short U,V; \
272 vector signed short vx,ux,uvx; \
273 vector signed short vx0,ux0,uvx0; \
274 vector signed short vx1,ux1,uvx1; \
275 vector signed short R0,G0,B0; \
276 vector signed short R1,G1,B1; \
277 vector unsigned char R,G,B; \
279 vector unsigned char *uivP, *vivP; \
280 vector unsigned char align_perm; \
282 vector signed short \
290 vector unsigned short lCSHIFT = c->CSHIFT; \
292 ubyte *y1i = in[0]; \
293 ubyte *y2i = in[0]+w; \
297 vector unsigned char *oute \
298 = (vector unsigned char *) \
299 (oplanes[0]+srcSliceY*outstrides[0]); \
300 vector unsigned char *outo \
301 = (vector unsigned char *) \
302 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
305 instrides_scl[0] = instrides[0]; \
306 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
307 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
310 for (i=0;i<h/2;i++) { \
311 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
312 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
314 for (j=0;j<w/16;j++) { \
316 y0 = vec_ldl (0,y1i); \
317 y1 = vec_ldl (0,y2i); \
318 uivP = (vector unsigned char *)ui; \
319 vivP = (vector unsigned char *)vi; \
321 align_perm = vec_lvsl (0, ui); \
322 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
324 align_perm = vec_lvsl (0, vi); \
325 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
327 u = (vector signed char)vec_sub (u, (vector signed char)(128)); \
328 v = (vector signed char)vec_sub (v, (vector signed char)(128)); \
329 U = vec_unpackh (u); \
330 V = vec_unpackh (v); \
338 Y0 = vec_mradds (Y0, lCY, lOY); \
339 Y1 = vec_mradds (Y1, lCY, lOY); \
340 Y2 = vec_mradds (Y2, lCY, lOY); \
341 Y3 = vec_mradds (Y3, lCY, lOY); \
343 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
344 ux = vec_sl (U, lCSHIFT); \
345 ux = vec_mradds (ux, lCBU, (vector signed short)(0)); \
346 ux0 = vec_mergeh (ux,ux); \
347 ux1 = vec_mergel (ux,ux); \
349 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
350 vx = vec_sl (V, lCSHIFT); \
351 vx = vec_mradds (vx, lCRV, (vector signed short)(0)); \
352 vx0 = vec_mergeh (vx,vx); \
353 vx1 = vec_mergel (vx,vx); \
355 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
356 uvx = vec_mradds (U, lCGU, (vector signed short)(0)); \
357 uvx = vec_mradds (V, lCGV, uvx); \
358 uvx0 = vec_mergeh (uvx,uvx); \
359 uvx1 = vec_mergel (uvx,uvx); \
361 R0 = vec_add (Y0,vx0); \
362 G0 = vec_add (Y0,uvx0); \
363 B0 = vec_add (Y0,ux0); \
364 R1 = vec_add (Y1,vx1); \
365 G1 = vec_add (Y1,uvx1); \
366 B1 = vec_add (Y1,ux1); \
368 R = vec_packclp (R0,R1); \
369 G = vec_packclp (G0,G1); \
370 B = vec_packclp (B0,B1); \
372 out_pixels(R,G,B,oute); \
374 R0 = vec_add (Y2,vx0); \
375 G0 = vec_add (Y2,uvx0); \
376 B0 = vec_add (Y2,ux0); \
377 R1 = vec_add (Y3,vx1); \
378 G1 = vec_add (Y3,uvx1); \
379 B1 = vec_add (Y3,ux1); \
380 R = vec_packclp (R0,R1); \
381 G = vec_packclp (G0,G1); \
382 B = vec_packclp (B0,B1); \
385 out_pixels(R,G,B,outo); \
394 outo += (outstrides[0])>>4; \
395 oute += (outstrides[0])>>4; \
397 ui += instrides_scl[1]; \
398 vi += instrides_scl[2]; \
399 y1i += instrides_scl[0]; \
400 y2i += instrides_scl[0]; \
/* Pixel-store adapters: each takes the packed planes (a,b,c) = (R,G,B) and
 * a destination pointer and forwards them to a store macro in the byte
 * order named by the adapter.  The 32-bit variants fill the fourth channel
 * with a zero vector. */
406 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr)
407 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr)
408 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr)
409 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr)
410 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
411 #define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr)
/* Instantiate one converter per output format; each line defines
 * altivec_<name>.
 * NOTE(review): yuv2_bgra32 is instantiated with out_argb, exactly like
 * yuv2_argb32, and out_bgra is not used by any instantiation here.  This
 * may be deliberate for the target's byte order - confirm before changing. */
413 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
414 DEFCSP420_CVT (yuv2_bgra32, out_argb)
415 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
416 DEFCSP420_CVT (yuv2_argb32, out_argb)
417 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
418 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
421 // uyvy|uyvy|uyvy|uyvy
422 // 0123 4567 89ab cdef
/* vec_perm control vectors that pull U, V and Y bytes out of a packed UYVY
 * vector and widen them to 16 bits: every output halfword is (byte 0x10 of
 * the second operand, one source byte).  The visible rows show that demux_u
 * and demux_v use each chroma byte twice while demux_y takes consecutive
 * odd bytes.
 * NOTE(review): the middle rows of each table are not visible in this
 * listing (numbers jump 425->428, 429->432, 433->436). */
424 const vector unsigned char
425 demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
428 0x10,0x0c,0x10,0x0c),
429 demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
432 0x10,0x0E,0x10,0x0E),
433 demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
436 0x10,0x0D,0x10,0x0F);
439 this is so I can play live CCIR raw video
/* Convert packed UYVY input to 32-bit packed RGB using the shared
 * cvtyuvtoRGB kernel.
 * Each inner iteration reads two 16-byte UYVY vectors (byte offsets 0 and
 * 16 from img), splits each into 16-bit U, V and Y with the demux_* tables,
 * converts both halves, clips/packs them with vec_packclp and stores the
 * pixels through out_rgba.  Output starts at row srcSliceY of oplanes[0].
 * NOTE(review): the declarations of w, h, i, j and img, the outer loop, the
 * pointer advances and the return statement are not visible in this
 * listing. */
441 static int altivec_uyvy_rgb32 (SwsContext *c,
442 unsigned char **in, int *instrides,
443 int srcSliceY, int srcSliceH,
444 unsigned char **oplanes, int *outstrides)
449 vector unsigned char uyvy;
450 vector signed short Y,U,V;
451 vector signed short vx,ux,uvx;
452 vector signed short R0,G0,B0,R1,G1,B1;
453 vector unsigned char R,G,B;
454 vector unsigned char *out;
458 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
461 for (j=0;j<w/16;j++) {
462 uyvy = vec_ld (0, img);
463 U = (vector signed short)
464 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
466 V = (vector signed short)
467 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
469 Y = (vector signed short)
470 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
472 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
474 uyvy = vec_ld (16, img);
475 U = (vector signed short)
476 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
478 V = (vector signed short)
479 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
481 Y = (vector signed short)
482 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
484 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
486 R = vec_packclp (R0,R1);
487 G = vec_packclp (G0,G1);
488 B = vec_packclp (B0,B1);
490 // vec_mstbgr24 (R,G,B, out);
491 out_rgba (R,G,B,out);
501 /* Ok currently the acceleration routine only supports
502 inputs of widths a multiple of 16
503 and heights a multiple of 2
505 So we just fall back to the C codes for this.
/* Select an AltiVec conversion routine for the context, or return NULL so
 * the caller can use another implementation.
 * Visible checks: the SWS_CPU_CAPS_ALTIVEC bit in c->flags is tested, the
 * source width must be a multiple of 16, and in one srcFormat branch the
 * source height is tested for being even.  The routine is then chosen by
 * c->dstFormat; unsupported destinations return NULL.  A second
 * srcFormat branch maps to the UYVY converter.
 * NOTE(review): the case labels and the statements guarded by the flag and
 * height tests are not visible in this listing, so which format constant
 * maps to which routine cannot be confirmed from here. */
507 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
509 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
513 and this seems not to matter too much I tried a bunch of
514 videos with abnormal widths and mplayer crashes else where.
515 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
516 boom with X11 bad match.
519 if ((c->srcW & 0xf) != 0) return NULL;
521 switch (c->srcFormat) {
532 if ((c->srcH & 0x1) != 0)
535 switch(c->dstFormat){
537 MSG_WARN("ALTIVEC: Color Space RGB24\n");
538 return altivec_yuv2_rgb24;
540 MSG_WARN("ALTIVEC: Color Space BGR24\n");
541 return altivec_yuv2_bgr24;
543 MSG_WARN("ALTIVEC: Color Space ARGB32\n");
544 return altivec_yuv2_argb32;
546 MSG_WARN("ALTIVEC: Color Space BGRA32\n");
547 // return profile_altivec_bgra32;
549 return altivec_yuv2_bgra32;
550 default: return NULL;
555 switch(c->dstFormat){
557 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
558 return altivec_uyvy_rgb32;
559 default: return NULL;
/* Compute the fixed-point conversion coefficients used by the AltiVec
 * converters from the context's brightness, contrast and saturation.
 *   c          context; its flags are tested for SWS_CPU_CAPS_ALTIVEC and
 *              its CSHIFT member is set to a vector of 2
 *   inv_table  colour-space table (presumably the source of crv/cbu/cgu/cgv;
 *              the lines that read it are not visible - TODO confirm)
 * The scalar coefficients are held as 64-bit integers: cy is scaled by
 * contrast (>>17), the four chroma coefficients by contrast*saturation
 * (>>32), and oy is reduced by 256*brightness.  Each value is then turned
 * into a vector by loading the 16-byte-aligned short `tmp` with vec_lde and
 * replicating element 0 with vec_splat.
 * NOTE(review): the assignments to tmp and the stores of the resulting
 * vectors into the context are not visible in this listing; the printf
 * calls look like debug output whose guards are also not shown. */
568 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
571 CY = (vector signed short)(0x7fff),
572 CRV = (vector signed short)(22972),
573 CBU = (vector signed short)(29029),
574 CGU = (vector signed short)(-11276),
575 CGV = (vector signed short)(-23400),
578 vector signed short Y0;
579 int brightness = c->brightness, contrast = c->contrast, saturation = c->saturation;
580 int64_t crv __attribute__ ((aligned(16)));
581 int64_t cbu __attribute__ ((aligned(16)));
582 int64_t cgu __attribute__ ((aligned(16)));
583 int64_t cgv __attribute__ ((aligned(16)));
584 short tmp __attribute__ ((aligned(16)));
586 int64_t cy = (1<<16)-1;
589 if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
598 printf ("crv: %hvx\n", CRV);
599 printf ("cbu: %hvx\n", CBU);
600 printf ("cgv: %hvx\n", CGV);
601 printf ("cgu: %hvx\n", CGU);
603 printf ("contrast: %d, brightness: %d, saturation: %d\n", contrast, brightness, saturation);
605 printf("%lld %lld %lld %lld %lld\n", cy, crv, cbu, cgu, cgv);
608 cy = (cy *contrast )>>17;
609 crv= (crv*contrast * saturation)>>32;
610 cbu= (cbu*contrast * saturation)>>32;
611 cgu= (cgu*contrast * saturation)>>32;
612 cgv= (cgv*contrast * saturation)>>32;
614 oy -= 256*brightness;
616 //printf("%llx %llx %llx %llx %llx\n", cy, crv, cbu, cgu, cgv);
618 // vector signed short CBU,CRV,CGU,CGY,CY;
620 CY = vec_lde (0, &tmp);
621 CY = vec_splat (CY, 0);
624 OY = vec_lde (0, &tmp);
625 OY = vec_splat (OY, 0);
628 CRV = vec_lde (0, &tmp);
629 CRV = vec_splat (CRV, 0);
631 CBU = vec_lde (0, &tmp);
632 CBU = vec_splat (CBU, 0);
635 CGU = vec_lde (0, &tmp);
636 CGU = vec_splat (CGU, 0);
638 CGV = vec_lde (0, &tmp);
639 CGV = vec_splat (CGV, 0);
641 c->CSHIFT = (vector unsigned short)(2);
650 printf ("cy: %hvx\n", CY);
651 printf ("oy: %hvx\n", OY);
652 printf ("crv: %hvx\n", CRV);
653 printf ("cbu: %hvx\n", CBU);
654 printf ("cgv: %hvx\n", CGV);
655 printf ("cgu: %hvx\n", CGU);
/* Vertical scaling combined with YUV->RGB conversion for one output line.
 *   c              context (vLumFilter/vChrFilter coefficients and the
 *                  colour coefficients used by cvtyuvtoRGB)
 *   lumFilter, chrFilter   not referenced in the visible lines
 *   lumSrc, chrSrc         arrays of input line pointers, one per filter tap
 *   lumFilterSize, chrFilterSize   number of vertical taps
 *   dest           output line, written as 32-bit pixels
 *   dstW           output width in pixels
 *   dstY           output line index, used to select the coefficient rows
 *
 * The filter coefficients are expanded into two heap-allocated vector
 * banks on every call and released before returning.  The main loop handles
 * 16 pixels per iteration: luma and chroma taps are accumulated with
 * vec_mradds, shifted right by SCL, chroma is duplicated to luma
 * resolution, and the result is converted, clipped and stored via out_rgba.
 * V samples are read 2048 entries after the U samples of the same chrSrc
 * row.  The second, duplicated section renders one more 16-pixel group into
 * the aligned `scratch` buffer and copies the remaining pixels to dest.
 *
 * NOTE(review): the embedded listing numbers have gaps, so the accumulator
 * initialisation, the coefficient splat loops, the guard around the tail
 * section and the closing braces are not visible here; the malloc results
 * are not checked in the visible lines. */
663 altivec_yuv2packedX (SwsContext *c,
664 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
665 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
666 uint8_t *dest, int dstW, int dstY)
669 short tmp __attribute__((aligned (16)));
672 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
673 vector signed short R0,G0,B0,R1,G1,B1;
675 vector unsigned char R,G,B,pels[3];
676 vector unsigned char *out,*nout;
677 vector signed short RND = (vector signed short)(1<<3);
678 vector unsigned short SCL = (vector unsigned short)(4);
679 unsigned long scratch[16] __attribute__ ((aligned (16)));
681 vector signed short *vYCoeffsBank, *vCCoeffsBank;
683 vector signed short *YCoeffs, *CCoeffs;
685 vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
686 vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
688 for (i=0;i<lumFilterSize*dstW;i++) {
689 tmp = c->vLumFilter[i];
690 p = &vYCoeffsBank[i];
695 for (i=0;i<chrFilterSize*dstW;i++) {
696 tmp = c->vChrFilter[i];
697 p = &vCCoeffsBank[i];
702 YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
703 CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
705 out = (vector unsigned char *)dest;
707 for(i=0; i<dstW; i+=16){
710 /* extract 16 coeffs from lumSrc */
711 for(j=0; j<lumFilterSize; j++) {
712 X0 = vec_ld (0, &lumSrc[j][i]);
713 X1 = vec_ld (16, &lumSrc[j][i]);
714 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
715 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
720 /* extract 8 coeffs from U,V */
721 for(j=0; j<chrFilterSize; j++) {
722 X = vec_ld (0, &chrSrc[j][i/2]);
723 U = vec_mradds (X, CCoeffs[j], U);
724 X = vec_ld (0, &chrSrc[j][i/2+2048]);
725 V = vec_mradds (X, CCoeffs[j], V);
728 /* scale and clip signals */
729 Y0 = vec_sra (Y0, SCL);
730 Y1 = vec_sra (Y1, SCL);
731 U = vec_sra (U, SCL);
732 V = vec_sra (V, SCL);
740 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
741 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
743 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
744 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
745 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
748 U0 = vec_mergeh (U,U);
749 V0 = vec_mergeh (V,V);
751 U1 = vec_mergel (U,U);
752 V1 = vec_mergel (V,V);
754 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
755 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
757 R = vec_packclp (R0,R1);
758 G = vec_packclp (G0,G1);
759 B = vec_packclp (B0,B1);
761 out_rgba (R,G,B,out);
769 /* extract 16 coeffs from lumSrc */
770 for(j=0; j<lumFilterSize; j++) {
771 X0 = vec_ld (0, &lumSrc[j][i]);
772 X1 = vec_ld (16, &lumSrc[j][i]);
773 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
774 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
779 /* extract 8 coeffs from U,V */
780 for(j=0; j<chrFilterSize; j++) {
781 X = vec_ld (0, &chrSrc[j][i/2]);
782 U = vec_mradds (X, CCoeffs[j], U);
783 X = vec_ld (0, &chrSrc[j][i/2+2048]);
784 V = vec_mradds (X, CCoeffs[j], V);
787 /* scale and clip signals */
788 Y0 = vec_sra (Y0, SCL);
789 Y1 = vec_sra (Y1, SCL);
790 U = vec_sra (U, SCL);
791 V = vec_sra (V, SCL);
799 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
800 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
802 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
803 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
804 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
807 U0 = vec_mergeh (U,U);
808 V0 = vec_mergeh (V,V);
810 U1 = vec_mergel (U,U);
811 V1 = vec_mergel (V,V);
813 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
814 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
816 R = vec_packclp (R0,R1);
817 G = vec_packclp (G0,G1);
818 B = vec_packclp (B0,B1);
820 nout = (vector unsigned char *)scratch;
821 out_rgba (R,G,B,nout);
/* memcpy takes a byte count: the remaining (dstW-i) pixels are 4 bytes
 * each (dest is indexed as uint32_t), so multiply by 4.  The previous
 * (dstW-i)/4 copied only a sixteenth of the tail. */
823 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)*4);
/* free(NULL) is a no-op, so no guard is needed. */
826 free (vYCoeffsBank);
827 free (vCCoeffsBank);