2 marc.hoffman@analog.com March 8, 2004
4 Altivec Acceleration for Color Space Conversion revision 0.2
6 convert I420 YV12 to RGB in various formats,
7 it rejects images that are not in 420 formats
8 it rejects images that don't have widths of multiples of 16
9 it rejects images that don't have heights of multiples of 2
10 reject defers to C simulation codes.
12 lots of optimizations to be done here
14 1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15 so we currently use max min to clip
17 2. the inefficient use of chroma loading needs a bit of brushing up
19 3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
22 MODIFIED to calculate coeffs from currently selected color space.
23 MODIFIED core to be a macro which you spec the output format.
24 ADDED UYVY conversion which is never called due to something in SWSCALE.
25 CORRECTED algorithm selection to be strict on input formats.
26 ADDED runtime detection of altivec.
28 ADDED altivec_yuv2packedX vertical scl + RGB converter
33 The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34 The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
38 so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
40 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
42 OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43 guaranteed to have the input video frame it was just decompressed so
44 it probably resides in L1 caches. However we are creating the
45 output video stream this needs to use the DSTST instruction to
46 optimize for the cache. We couple this with the fact that we are
47 not going to be visiting the input buffer again so we mark it Least
48 Recently Used. This shaves 25% of the processor cycles off.
50 Now MEMCPY is the largest mips consumer in the system, probably due
51 to the inefficient X11 stuff.
53 GL libraries seem to be very slow on this machine 1.33Ghz PB running
54 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
55 a versioning issue; however, I have libGL.1.2.dylib for both
56 machines. ((We need to figure this out now))
58 GL2 libraries work now with patch for RGB32
60 NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
62 Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
73 #include "swscale_internal.h"
74 #include "../mangle.h"
75 #include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
77 #undef PROFILE_THE_BEAST
80 typedef unsigned char ubyte;
81 typedef signed char sbyte;
84 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
85 homogeneous vector registers x0,x1,x2 are interleaved with the
88 o0 = vec_mergeh (x0,x1);
89 o1 = vec_perm (o0, x2, perm_rgb_0);
90 o2 = vec_perm (o0, x2, perm_rgb_1);
91 o3 = vec_mergel (x0,x1);
92 o4 = vec_perm (o3,o2,perm_rgb_2);
93 o5 = vec_perm (o3,o2,perm_rgb_3);
95 perm_rgb_0: o0(RG).h v1(B) --> o1*
101 perm_rgb_1: o0(RG).h v1(B) --> o2
107 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
113 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
121 const vector unsigned char /* vec_perm selectors used by vec_merge3 to build packed 24-bit triplets.
   A selector byte 0x00-0x0f picks from the first vec_perm operand,
   0x10-0x1f from the second.
   perm_rgb_0: pairs 0..0x0a of operand 1, each pair followed by one byte
               (0x10..0x14) of operand 2 -> first 16 output bytes.
   perm_rgb_1: remaining bytes 0x0b..0x0f of operand 1 interleaved with
               0x15..0x17 of operand 2; the upper half passes operand 2's
               bytes 8..15 through unchanged for the later stages.
   perm_rgb_2: lower half copies bytes 0..7 of operand 2 (the tail produced
               by perm_rgb_1); upper half starts interleaving operand 1
               pairs with operand 2 bytes 0x18,0x19.
   perm_rgb_3: continues that interleave with operand 2 bytes 0x1a..0x1f. */
122 perm_rgb_0 = (vector unsigned char)(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124 perm_rgb_1 = (vector unsigned char)(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126 perm_rgb_2 = (vector unsigned char)(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128 perm_rgb_3 = (vector unsigned char)(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
/*
 * vec_merge3: interleave three 16-byte planar vectors into 48 bytes of packed
 * triplets, returned through y0, y1, y2 (which must be assignable).
 *
 * Mind the formal parameter order (x2,x1,x0,...): the FIRST macro argument is
 * bound to x2 and the THIRD to x0.  vec_mergeh/vec_mergel pair up x0 and x1,
 * and the perm_rgb_* selectors insert one x2 byte after each pair, so every
 * output triplet is (x0,x1,x2) by parameter name -- that is, third, second,
 * first argument as written by the caller.
 *
 * Requires perm_rgb_0..perm_rgb_3 to be in scope.
 */
131 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
133 typeof(x0) o0,o2,o3; \
134 o0 = vec_mergeh (x0,x1); \
135 y0 = vec_perm (o0, x2, perm_rgb_0);\
136 o2 = vec_perm (o0, x2, perm_rgb_1);\
137 o3 = vec_mergel (x0,x1); \
138 y1 = vec_perm (o3,o2,perm_rgb_2); \
139 y2 = vec_perm (o3,o2,perm_rgb_3); \
/*
 * vec_mstrgb24: interleave x0,x1,x2 with vec_merge3 and store the three
 * resulting vectors through ptr with vec_st.  ptr is post-incremented after
 * each store, so the caller's pointer ends up three vectors further on.
 *
 * vec_merge3 declares its inputs as (x2,x1,x0), so passing (x0,x1,x2) here
 * makes the bytes written per pixel x2,x1,x0.
 * NOTE(review): with (R,G,B) passed in, that is B,G,R in memory -- confirm
 * this is the layout the "rgb24" name is meant to describe.
 */
142 #define vec_mstrgb24(x0,x1,x2,ptr) \
144 typeof(x0) _0,_1,_2; \
145 vec_merge3 (x0,x1,x2,_0,_1,_2); \
146 vec_st (_0, 0, ptr++); \
147 vec_st (_1, 0, ptr++); \
148 vec_st (_2, 0, ptr++); \
/*
 * vec_mstbgr24: same as vec_mstrgb24 but hands the inputs to vec_merge3 in
 * the order (x2,x1,x0).  Since vec_merge3 itself declares its inputs as
 * (x2,x1,x0), the bytes written per pixel come out as x0,x1,x2.
 * ptr is post-incremented three times (once per vec_st).
 */
151 #define vec_mstbgr24(x0,x1,x2,ptr) \
153 typeof(x0) _0,_1,_2; \
154 vec_merge3 (x2,x1,x0,_0,_1,_2); \
155 vec_st (_0, 0, ptr++); \
156 vec_st (_1, 0, ptr++); \
157 vec_st (_2, 0, ptr++); \
160 /* pack the pixels in rgb0 format
164 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
167 _0 = vec_mergeh (x0,x1); \
168 _1 = vec_mergeh (x2,x3); \
169 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
170 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
171 vec_st (_2, 0*16, (T *)ptr); \
172 vec_st (_3, 1*16, (T *)ptr); \
173 _0 = vec_mergel (x0,x1); \
174 _1 = vec_mergel (x2,x3); \
175 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
176 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
177 vec_st (_2, 2*16, (T *)ptr); \
178 vec_st (_3, 3*16, (T *)ptr); \
185 | 1 -0.3441 -0.7142 |x| Cb|
192 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
200 (vector signed short) \
201 vec_perm(x,(typeof(x))(0),\
202 (vector unsigned char)(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
203 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07))
205 (vector signed short) \
206 vec_perm(x,(typeof(x))(0),\
207 (vector unsigned char)(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
208 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F))
/* vec_clip: clamp every lane of x to the range 0..255 (vec_min against 255,
   then vec_max against 0).  The result keeps the type of x. */
210 #define vec_clip(x) \
211 vec_max (vec_min (x, (typeof(x))(255)), (typeof(x))(0))
/* vec_packclp_a: clamp x and y to 0..255 with vec_clip, then narrow both into
   one vector of unsigned bytes with vec_pack.  The explicit clamp is what
   makes the non-saturating pack safe. */
213 #define vec_packclp_a(x,y) \
214 (vector unsigned char)vec_pack (vec_clip (x), vec_clip (y))
/* vec_packclp: cheaper variant of vec_packclp_a.  Negative lanes are raised
   to 0 with vec_max, the vectors are reinterpreted as unsigned shorts, and
   the saturating pack (vec_packs) provides the upper clamp to 255, so no
   vec_min is needed.  x and y are expected to be vector signed short. */
216 #define vec_packclp(x,y) \
217 (vector unsigned char)vec_packs \
218 ((vector unsigned short)vec_max (x,(vector signed short) (0)), \
219 (vector unsigned short)vec_max (y,(vector signed short) (0)))
221 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,a,a,ptr)
/*
 * Convert one vector of Y/U/V samples (signed 16-bit lanes) to R, G and B
 * using the coefficient vectors held in the SwsContext (CY, OY, CBU, CRV,
 * CGU, CGV, CSHIFT).
 *
 * vec_mradds (a, b, acc) is a rounded Q15 multiply-accumulate per lane:
 * sat (((a*b + 0x4000) >> 15) + acc).
 *
 *   c        context supplying the coefficient vectors
 *   Y, U, V  input samples; U and V still carry the +128 bias, removed here
 *   R, G, B  outputs, written as signed shorts that are NOT yet limited to
 *            0..255 -- the callers in this file clamp with vec_packclp
 */
224 static inline void cvtyuvtoRGB (SwsContext *c,
225 vector signed short Y, vector signed short U, vector signed short V,
226 vector signed short *R, vector signed short *G, vector signed short *B)
228 vector signed short vx,ux,uvx;
/* luma: scale by CY and add the offset OY */
230 Y = vec_mradds (Y, c->CY, c->OY);
/* centre the chroma samples around zero */
232 U = vec_sub (U,(vector signed short)(128));
233 V = vec_sub (V,(vector signed short)(128));
/* blue: the scaled luma Y is the accumulator, so *B = Y + CBU*(U<<CSHIFT) */
235 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
236 ux = vec_sl (U, c->CSHIFT);
237 *B = vec_mradds (ux, c->CBU, Y);
/* red: *R = Y + CRV*(V<<CSHIFT) */
239 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
240 vx = vec_sl (V, c->CSHIFT);
241 *R = vec_mradds (vx, c->CRV, Y);
/* green: unlike the formula in the comment below, Y is folded in by the first
   multiply-accumulate, so *G = Y + CGU*U + CGV*V (no CSHIFT on this path) */
243 // uvx = ((CGU*u) + (CGV*v))>>15;
244 uvx = vec_mradds (U, c->CGU, Y);
245 *G = vec_mradds (V, c->CGV, uvx);
250 ------------------------------------------------------------------------------
252 ------------------------------------------------------------------------------
256 #define DEFCSP420_CVT(name,out_pixels) \
257 static int altivec_##name (SwsContext *c, \
258 unsigned char **in, int *instrides, \
259 int srcSliceY, int srcSliceH, \
260 unsigned char **oplanes, int *outstrides) \
265 int instrides_scl[3]; \
266 vector unsigned char y0,y1; \
268 vector signed char u,v; \
270 vector signed short Y0,Y1,Y2,Y3; \
271 vector signed short U,V; \
272 vector signed short vx,ux,uvx; \
273 vector signed short vx0,ux0,uvx0; \
274 vector signed short vx1,ux1,uvx1; \
275 vector signed short R0,G0,B0; \
276 vector signed short R1,G1,B1; \
277 vector unsigned char R,G,B; \
279 vector unsigned char *uivP, *vivP; \
280 vector unsigned char align_perm; \
282 vector signed short \
290 vector unsigned short lCSHIFT = c->CSHIFT; \
292 ubyte *y1i = in[0]; \
293 ubyte *y2i = in[0]+w; \
297 vector unsigned char *oute \
298 = (vector unsigned char *) \
299 (oplanes[0]+srcSliceY*outstrides[0]); \
300 vector unsigned char *outo \
301 = (vector unsigned char *) \
302 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
305 instrides_scl[0] = instrides[0]; \
306 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
307 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
310 for (i=0;i<h/2;i++) { \
311 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
312 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
314 for (j=0;j<w/16;j++) { \
316 y0 = vec_ldl (0,y1i); \
317 y1 = vec_ldl (0,y2i); \
318 uivP = (vector unsigned char *)ui; \
319 vivP = (vector unsigned char *)vi; \
321 align_perm = vec_lvsl (0, ui); \
322 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); \
324 align_perm = vec_lvsl (0, vi); \
325 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); \
327 u = (vector signed char)vec_sub (u, (vector signed char)(128)); \
328 v = (vector signed char)vec_sub (v, (vector signed char)(128)); \
329 U = vec_unpackh (u); \
330 V = vec_unpackh (v); \
338 Y0 = vec_mradds (Y0, lCY, lOY); \
339 Y1 = vec_mradds (Y1, lCY, lOY); \
340 Y2 = vec_mradds (Y2, lCY, lOY); \
341 Y3 = vec_mradds (Y3, lCY, lOY); \
343 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
344 ux = vec_sl (U, lCSHIFT); \
345 ux = vec_mradds (ux, lCBU, (vector signed short)(0)); \
346 ux0 = vec_mergeh (ux,ux); \
347 ux1 = vec_mergel (ux,ux); \
349 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
350 vx = vec_sl (V, lCSHIFT); \
351 vx = vec_mradds (vx, lCRV, (vector signed short)(0)); \
352 vx0 = vec_mergeh (vx,vx); \
353 vx1 = vec_mergel (vx,vx); \
355 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
356 uvx = vec_mradds (U, lCGU, (vector signed short)(0)); \
357 uvx = vec_mradds (V, lCGV, uvx); \
358 uvx0 = vec_mergeh (uvx,uvx); \
359 uvx1 = vec_mergel (uvx,uvx); \
361 R0 = vec_add (Y0,vx0); \
362 G0 = vec_add (Y0,uvx0); \
363 B0 = vec_add (Y0,ux0); \
364 R1 = vec_add (Y1,vx1); \
365 G1 = vec_add (Y1,uvx1); \
366 B1 = vec_add (Y1,ux1); \
368 R = vec_packclp (R0,R1); \
369 G = vec_packclp (G0,G1); \
370 B = vec_packclp (B0,B1); \
372 out_pixels(R,G,B,oute); \
374 R0 = vec_add (Y2,vx0); \
375 G0 = vec_add (Y2,uvx0); \
376 B0 = vec_add (Y2,ux0); \
377 R1 = vec_add (Y3,vx1); \
378 G1 = vec_add (Y3,uvx1); \
379 B1 = vec_add (Y3,ux1); \
380 R = vec_packclp (R0,R1); \
381 G = vec_packclp (G0,G1); \
382 B = vec_packclp (B0,B1); \
385 out_pixels(R,G,B,outo); \
394 outo += (outstrides[0])>>4; \
395 oute += (outstrides[0])>>4; \
397 ui += instrides_scl[1]; \
398 vi += instrides_scl[2]; \
399 y1i += instrides_scl[0]; \
400 y2i += instrides_scl[0]; \
406 #define out_abgr(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),c,b,a,ptr) /* bytes per pixel: 0,c,b,a */
407 #define out_bgra(a,b,c,ptr) vec_mstrgb32(typeof(a),c,b,a,((typeof (a))(0)),ptr) /* bytes per pixel: c,b,a,0 */
408 #define out_rgba(a,b,c,ptr) vec_mstrgb32(typeof(a),a,b,c,((typeof (a))(0)),ptr) /* bytes per pixel: a,b,c,0 */
409 #define out_argb(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))(0)),a,b,c,ptr) /* bytes per pixel: 0,a,b,c */
410 #define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr) /* packed 24-bit store, advances ptr */
411 #define out_bgr24(a,b,c,ptr) vec_mstrgb24(c,b,a,ptr) /* same store with a and c swapped */
/*
 * Pixel writers handed to DEFCSP420_CVT as its out_pixels argument.  Every
 * caller in this file passes (R,G,B,ptr), so a,b,c are the red, green and
 * blue byte vectors.  The 32-bit variants fill the fourth channel with a
 * zero vector and rely on vec_mstrgb32 storing its four inputs in argument
 * order.
 * NOTE(review): vec_mstrgb24 forwards to vec_merge3, whose parameters are
 * declared (x2,x1,x0); the in-memory byte order of out_rgb24/out_bgr24 is
 * therefore the reverse of the argument order -- confirm it matches the
 * names.
 */
/*
 * Instantiate one planar-4:2:0 -> packed converter per output layout.
 * DEFCSP420_CVT (name, writer) defines altivec_<name>, which stores each
 * group of 16 converted pixels through the given out_* writer.
 *
 * altivec_yuv2_bgra32 used to be built with out_argb, which made it
 * byte-for-byte identical to altivec_yuv2_argb32 and left out_bgra unused.
 * It now uses out_bgra so that the function emits the layout its name says.
 * NOTE(review): the case labels that select these functions are not visible
 * here; confirm the destination format routed to altivec_yuv2_bgra32 really
 * expects B,G,R,0 byte order.
 */
413 DEFCSP420_CVT (yuv2_abgr32, out_abgr)
414 DEFCSP420_CVT (yuv2_bgra32, out_bgra)
415 DEFCSP420_CVT (yuv2_rgba32, out_rgba)
416 DEFCSP420_CVT (yuv2_argb32, out_argb)
417 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
418 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
421 // uyvy|uyvy|uyvy|uyvy
422 // 0123 4567 89ab cdef
424 const vector unsigned char
425 demux_u = (vector unsigned char)(0x10,0x00,0x10,0x00,
428 0x10,0x0c,0x10,0x0c),
429 demux_v = (vector unsigned char)(0x10,0x02,0x10,0x02,
432 0x10,0x0E,0x10,0x0E),
433 demux_y = (vector unsigned char)(0x10,0x01,0x10,0x03,
436 0x10,0x0D,0x10,0x0F);
439 this is so I can play live CCIR raw video
441 static int altivec_uyvy_rgb32 (SwsContext *c,
442 unsigned char **in, int *instrides,
443 int srcSliceY, int srcSliceH,
444 unsigned char **oplanes, int *outstrides)
449 vector unsigned char uyvy;
450 vector signed short Y,U,V;
451 vector signed short vx,ux,uvx;
452 vector signed short R0,G0,B0,R1,G1,B1;
453 vector unsigned char R,G,B;
454 vector unsigned char *out;
458 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
461 for (j=0;j<w/16;j++) {
462 uyvy = vec_ld (0, img);
463 U = (vector signed short)
464 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
466 V = (vector signed short)
467 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
469 Y = (vector signed short)
470 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
472 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
474 uyvy = vec_ld (16, img);
475 U = (vector signed short)
476 vec_perm (uyvy, (vector unsigned char)(0), demux_u);
478 V = (vector signed short)
479 vec_perm (uyvy, (vector unsigned char)(0), demux_v);
481 Y = (vector signed short)
482 vec_perm (uyvy, (vector unsigned char)(0), demux_y);
484 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
486 R = vec_packclp (R0,R1);
487 G = vec_packclp (G0,G1);
488 B = vec_packclp (B0,B1);
490 // vec_mstbgr24 (R,G,B, out);
491 out_rgba (R,G,B,out);
501 /* Ok currently the acceleration routine only supports
502 inputs of widths a multiple of 16
503 and heights a multiple of 2
505 So we just fall back to the C codes for this.
507 SwsFunc yuv2rgb_init_altivec (SwsContext *c)
509 if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
513 and this seems not to matter too much I tried a bunch of
514 videos with abnormal widths and mplayer crashes elsewhere.
515 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
516 boom with X11 bad match.
519 if ((c->srcW & 0xf) != 0) return NULL;
521 switch (c->srcFormat) {
532 if ((c->srcH & 0x1) != 0)
535 switch(c->dstFormat){
537 MSG_WARN("ALTIVEC: Color Space RGB24\n");
538 return altivec_yuv2_rgb24;
540 MSG_WARN("ALTIVEC: Color Space BGR24\n");
541 return altivec_yuv2_bgr24;
543 MSG_WARN("ALTIVEC: Color Space ARGB32\n");
544 return altivec_yuv2_argb32;
546 MSG_WARN("ALTIVEC: Color Space BGRA32\n");
547 // return profile_altivec_bgra32;
549 return altivec_yuv2_bgra32;
550 default: return NULL;
555 switch(c->dstFormat){
557 MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
558 return altivec_uyvy_rgb32;
559 default: return NULL;
567 void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4])
569 vector signed short CY, CRV, CBU, CGU, CGV, OY, Y0;
570 int64_t crv __attribute__ ((aligned(16))) = inv_table[0];
571 int64_t cbu __attribute__ ((aligned(16))) = inv_table[1];
572 int64_t cgu __attribute__ ((aligned(16))) = inv_table[2];
573 int64_t cgv __attribute__ ((aligned(16))) = inv_table[3];
574 int64_t cy = (1<<16)-1;
576 short tmp __attribute__ ((aligned(16)));
578 if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
581 cy = (cy *c->contrast )>>17;
582 crv= (crv*c->contrast * c->saturation)>>32;
583 cbu= (cbu*c->contrast * c->saturation)>>32;
584 cgu= (cgu*c->contrast * c->saturation)>>32;
585 cgv= (cgv*c->contrast * c->saturation)>>32;
587 oy -= 256*c->brightness;
590 CY = vec_lde (0, &tmp);
591 CY = vec_splat (CY, 0);
594 OY = vec_lde (0, &tmp);
595 OY = vec_splat (OY, 0);
598 CRV = vec_lde (0, &tmp);
599 CRV = vec_splat (CRV, 0);
601 CBU = vec_lde (0, &tmp);
602 CBU = vec_splat (CBU, 0);
605 CGU = vec_lde (0, &tmp);
606 CGU = vec_splat (CGU, 0);
608 CGV = vec_lde (0, &tmp);
609 CGV = vec_splat (CGV, 0);
611 c->CSHIFT = (vector unsigned short)(2);
620 printf ("cy: %hvx\n", CY);
621 printf ("oy: %hvx\n", OY);
622 printf ("crv: %hvx\n", CRV);
623 printf ("cbu: %hvx\n", CBU);
624 printf ("cgv: %hvx\n", CGV);
625 printf ("cgu: %hvx\n", CGU);
/*
 * altivec_yuv2packedX: vertical scaling plus YUV -> RGB conversion for one
 * output row.
 *
 *   c              scaler context; supplies vLumFilter / vChrFilter and the
 *                  colour coefficients read by cvtyuvtoRGB
 *   lumSrc         lumFilterSize input rows of 16-bit luma samples
 *   chrSrc         chrFilterSize input rows of 16-bit chroma samples; the
 *                  second chroma plane is read 2048 elements after the first
 *   dest           output row, written as 32-bit pixels through out_rgba
 *   dstW, dstY     output width in pixels and index of the output row
 *   lumFilter, chrFilter are not referenced by the lines shown here.
 *
 * Per group of 16 pixels: accumulate each source row times its vertical
 * coefficient with vec_mradds, shift right by SCL (4), duplicate each chroma
 * sample for two pixels, convert with cvtyuvtoRGB, clamp with vec_packclp
 * and store.  The second copy of that sequence renders into the aligned
 * `scratch` buffer and copies only the remaining pixels to dest.
 *
 * Fixed: the tail memcpy copied (dstW-i)/4 bytes.  memcpy counts bytes and
 * every remaining pixel is a 4-byte uint32_t, so the length is (dstW-i)*4.
 * Also dropped the NULL guards in front of free (), which accepts NULL.
 *
 * NOTE(review): both malloc results are used unchecked.
 * NOTE(review): the banks are sized with dstW but indexed with dstY below;
 * if rows can outnumber columns this reads past the allocation -- verify.
 */
633 altivec_yuv2packedX (SwsContext *c,
634 int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
635 int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
636 uint8_t *dest, int dstW, int dstY)
639 short tmp __attribute__((aligned (16)));
642 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
643 vector signed short R0,G0,B0,R1,G1,B1;
645 vector unsigned char R,G,B,pels[3];
646 vector unsigned char *out,*nout;
647 vector signed short RND = (vector signed short)(1<<3);
648 vector unsigned short SCL = (vector unsigned short)(4);
649 unsigned long scratch[16] __attribute__ ((aligned (16)));
651 vector signed short *vYCoeffsBank, *vCCoeffsBank;
653 vector signed short *YCoeffs, *CCoeffs;
/* per-call banks holding one vector per scalar filter coefficient */
655 vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
656 vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
/* stage each coefficient in the aligned scalar `tmp`; `p` addresses its slot
   in the bank (the lines that store through `p` are not part of this listing) */
658 for (i=0;i<lumFilterSize*dstW;i++) {
659 tmp = c->vLumFilter[i];
660 p = &vYCoeffsBank[i];
665 for (i=0;i<chrFilterSize*dstW;i++) {
666 tmp = c->vChrFilter[i];
667 p = &vCCoeffsBank[i];
/* select the coefficient set belonging to output row dstY */
672 YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
673 CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
675 out = (vector unsigned char *)dest;
677 for(i=0; i<dstW; i+=16){
680 /* extract 16 coeffs from lumSrc */
681 for(j=0; j<lumFilterSize; j++) {
682 X0 = vec_ld (0, &lumSrc[j][i]);
683 X1 = vec_ld (16, &lumSrc[j][i]);
684 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
685 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
690 /* extract 8 coeffs from U,V */
691 for(j=0; j<chrFilterSize; j++) {
692 X = vec_ld (0, &chrSrc[j][i/2]);
693 U = vec_mradds (X, CCoeffs[j], U);
694 X = vec_ld (0, &chrSrc[j][i/2+2048]);
695 V = vec_mradds (X, CCoeffs[j], V);
698 /* scale and clip signals */
699 Y0 = vec_sra (Y0, SCL);
700 Y1 = vec_sra (Y1, SCL);
701 U = vec_sra (U, SCL);
702 V = vec_sra (V, SCL);
710 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
711 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
713 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
714 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
715 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
718 U0 = vec_mergeh (U,U);
719 V0 = vec_mergeh (V,V);
721 U1 = vec_mergel (U,U);
722 V1 = vec_mergel (V,V);
724 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
725 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
727 R = vec_packclp (R0,R1);
728 G = vec_packclp (G0,G1);
729 B = vec_packclp (B0,B1);
731 out_rgba (R,G,B,out);
739 /* extract 16 coeffs from lumSrc */
740 for(j=0; j<lumFilterSize; j++) {
741 X0 = vec_ld (0, &lumSrc[j][i]);
742 X1 = vec_ld (16, &lumSrc[j][i]);
743 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
744 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
749 /* extract 8 coeffs from U,V */
750 for(j=0; j<chrFilterSize; j++) {
751 X = vec_ld (0, &chrSrc[j][i/2]);
752 U = vec_mradds (X, CCoeffs[j], U);
753 X = vec_ld (0, &chrSrc[j][i/2+2048]);
754 V = vec_mradds (X, CCoeffs[j], V);
757 /* scale and clip signals */
758 Y0 = vec_sra (Y0, SCL);
759 Y1 = vec_sra (Y1, SCL);
760 U = vec_sra (U, SCL);
761 V = vec_sra (V, SCL);
769 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
770 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
772 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
773 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
774 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
777 U0 = vec_mergeh (U,U);
778 V0 = vec_mergeh (V,V);
780 U1 = vec_mergel (U,U);
781 V1 = vec_mergel (V,V);
783 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
784 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
786 R = vec_packclp (R0,R1);
787 G = vec_packclp (G0,G1);
788 B = vec_packclp (B0,B1);
/* render the partial group into the aligned scratch buffer ... */
790 nout = (vector unsigned char *)scratch;
791 out_rgba (R,G,B,nout);
/* ... then copy only the pixels that are left: (dstW-i) pixels of 4 bytes each */
793 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)*4);
796 free (vYCoeffsBank);
797 free (vCCoeffsBank);