2 * AltiVec acceleration for colorspace conversion
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
6 * This file is part of Libav.
8 * Libav is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
30 Lots of optimizations to be done here.
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
35 2. The inefficient use of chroma loading needs a bit of brushing up.
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to something in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
59 so we have roughly 10 clocks per pixel. This is too high, something has
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
80 GL2 libraries work now with patch for RGB32.
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/cpu.h"
98 #include "yuv2rgb_altivec.h"
100 #undef PROFILE_THE_BEAST
/* Short aliases for the two 8-bit character types used in this file. */
103 typedef unsigned char ubyte;
104 typedef signed char sbyte;
107 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
108 homogeneous vector registers x0,x1,x2 are interleaved with the
111 o0 = vec_mergeh (x0,x1);
112 o1 = vec_perm (o0, x2, perm_rgb_0);
113 o2 = vec_perm (o0, x2, perm_rgb_1);
114 o3 = vec_mergel (x0,x1);
115 o4 = vec_perm (o3,o2,perm_rgb_2);
116 o5 = vec_perm (o3,o2,perm_rgb_3);
118 perm_rgb_0: o0(RG).h v1(B) --> o1*
124 perm_rgb_1: o0(RG).h v1(B) --> o2
130 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
136 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
/*
 * vec_perm selector tables used by vec_merge3 to weave three byte planes
 * into packed 3-byte pixels. In each table, indices 0x00-0x0f select bytes
 * from the first vec_perm operand and 0x10-0x1f from the second.
 */
144 const vector unsigned char
145 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
146 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
147 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
148 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
149 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
150 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
151 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
152 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
/*
 * vec_merge3: interleave three input vectors into the three output vectors
 * y0, y1 and y2 using the perm_rgb_* selector tables.
 *
 * Note that the formal parameters are listed as (x2, x1, x0): the first
 * macro argument is the one named x2 and the third is the one named x0.
 * x0 and x1 are merged pairwise first (vec_mergeh / vec_mergel), then x2 is
 * woven in by the permutes. y0, y1 and y2 are assigned to, so they must be
 * modifiable lvalues of the same vector type as the inputs.
 */
154 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
156 __typeof__(x0) o0,o2,o3; \
157 o0 = vec_mergeh (x0,x1); \
158 y0 = vec_perm (o0, x2, perm_rgb_0); \
159 o2 = vec_perm (o0, x2, perm_rgb_1); \
160 o3 = vec_mergel (x0,x1); \
161 y1 = vec_perm (o3,o2,perm_rgb_2); \
162 y2 = vec_perm (o3,o2,perm_rgb_3); \
/*
 * vec_mstbgr24: interleave x0, x1, x2 through vec_merge3 (arguments passed
 * in the order given) and store the three resulting vectors through ptr.
 * ptr is post-incremented after every store, so the caller's pointer ends up
 * three vector elements further on; it must be a modifiable lvalue.
 */
165 #define vec_mstbgr24(x0,x1,x2,ptr) \
167 __typeof__(x0) _0,_1,_2; \
168 vec_merge3 (x0,x1,x2,_0,_1,_2); \
169 vec_st (_0, 0, ptr++); \
170 vec_st (_1, 0, ptr++); \
171 vec_st (_2, 0, ptr++); \
/*
 * vec_mstrgb24: same as vec_mstbgr24 except that the three planes are handed
 * to vec_merge3 in reverse order (x2, x1, x0), which swaps the first and
 * third channel in the stored pixels. ptr is post-incremented after each of
 * the three stores and must be a modifiable lvalue.
 */
174 #define vec_mstrgb24(x0,x1,x2,ptr) \
176 __typeof__(x0) _0,_1,_2; \
177 vec_merge3 (x2,x1,x0,_0,_1,_2); \
178 vec_st (_0, 0, ptr++); \
179 vec_st (_1, 0, ptr++); \
180 vec_st (_2, 0, ptr++); \
183 /* pack the pixels in rgb0 format
/*
 * vec_mstrgb32: interleave four byte planes x0..x3 into 4-byte pixels and
 * store them as four vectors of type T at byte offsets 0, 16, 32 and 48
 * from ptr.
 *
 * The planes are first merged pairwise as bytes (x0 with x1, x2 with x3);
 * the two results are then merged as 16-bit units, giving x0,x1,x2,x3 byte
 * order per pixel. The high halves of the inputs produce the first two
 * stores, the low halves the last two.
 */
187 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
190 _0 = vec_mergeh (x0,x1); \
191 _1 = vec_mergeh (x2,x3); \
192 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
193 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
194 vec_st (_2, 0*16, (T *)ptr); \
195 vec_st (_3, 1*16, (T *)ptr); \
196 _0 = vec_mergel (x0,x1); \
197 _1 = vec_mergel (x2,x3); \
198 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
199 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
200 vec_st (_2, 2*16, (T *)ptr); \
201 vec_st (_3, 3*16, (T *)ptr); \
208 | 1 -0.3441 -0.7142 |x| Cb|
215 Typical YUV conversion works on Y: 0-255; this version has been optimized for JPEG decode.
223 (vector signed short) \
224 vec_perm(x,(__typeof__(x)){0}, \
225 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
226 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
228 (vector signed short) \
229 vec_perm(x,(__typeof__(x)){0}, \
230 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
231 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
/*
 * vec_clip_s16: clamp every signed 16-bit element of x to the range
 * [16, 235] (vec_min against 235, then vec_max against 16).
 */
233 #define vec_clip_s16(x) \
234 vec_max (vec_min (x, ((vector signed short){235,235,235,235,235,235,235,235})), \
235 ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
/*
 * vec_packclp: raise negative elements of x and y to zero (vec_max against
 * a zero vector), then pack both 16-bit vectors into one vector of unsigned
 * bytes with vec_packs; x supplies the first half of the result and y the
 * second.
 */
237 #define vec_packclp(x,y) \
238 (vector unsigned char)vec_packs \
239 ((vector unsigned short)vec_max (x,((vector signed short) {0})), \
240 (vector unsigned short)vec_max (y,((vector signed short) {0})))
242 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
/*
 * Convert one vector of 16-bit Y, U and V samples to R, G and B using the
 * coefficient vectors held in the SwsContext (CY, OY, CSHIFT, CBU, CRV,
 * CGU, CGV).
 *
 * c        context supplying the coefficient vectors read below
 * Y, U, V  input samples; passed by value, so the adjustments made here
 *          stay local to this function
 * R, G, B  output pointers, each written exactly once
 *
 * No clamping is done here; the callers in this file pass the results
 * through vec_packclp.
 */
245 static inline void cvtyuvtoRGB (SwsContext *c,
246 vector signed short Y, vector signed short U, vector signed short V,
247 vector signed short *R, vector signed short *G, vector signed short *B)
249 vector signed short vx,ux,uvx;
/* Scale luma by CY and add the OY offset. */
251 Y = vec_mradds (Y, c->CY, c->OY);
/* Remove the 128 chroma bias. The {128} literal only sets element 0, so
   vec_splat copies that element into every lane. */
252 U = vec_sub (U,(vector signed short)
253 vec_splat((vector signed short){128},0));
254 V = vec_sub (V,(vector signed short)
255 vec_splat((vector signed short){128},0));
/* Blue: pre-shifted U times CBU, accumulated onto the scaled luma. */
257 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
258 ux = vec_sl (U, c->CSHIFT);
259 *B = vec_mradds (ux, c->CBU, Y);
/* Red: pre-shifted V times CRV, accumulated onto the scaled luma. */
261 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
262 vx = vec_sl (V, c->CSHIFT);
263 *R = vec_mradds (vx, c->CRV, Y);
/* Green: both chroma terms are chained, with the luma folded into the
   first multiply-add. */
265 // uvx = ((CGU*u) + (CGV*v))>>15;
266 uvx = vec_mradds (U, c->CGU, Y);
267 *G = vec_mradds (V, c->CGV, uvx);
272 ------------------------------------------------------------------------------
274 ------------------------------------------------------------------------------
/*
 * DEFCSP420_CVT(name, out_pixels) defines a static converter named
 * altivec_<name> that turns planar Y/U/V input into packed output.
 *
 * name        suffix of the generated function
 * out_pixels  store macro, invoked as out_pixels(R, G, B, ptr)
 *
 * Each outer iteration handles two luma rows: y1i starts at in[0] and y2i
 * one input stride below it, and both rows reuse the same chroma terms.
 * The matching output rows are oute (row srcSliceY of oplanes[0]) and outo
 * (the row after it). Before each row pair, vec_dstst issues store hints
 * for both output rows.
 *
 * The inner loop runs w/16 times. Every input vector is fetched with the
 * vec_lvsl + vec_perm pair applied to two adjacent vectors, the usual
 * AltiVec idiom for loading from addresses that may not be 16-byte aligned.
 * Only the high half of the loaded u and v vectors is widened
 * (vec_unpackh); each chroma term is then duplicated with vec_mergeh /
 * vec_mergel so that it lines up with two neighbouring luma samples.
 *
 * instrides_scl[] holds the per-row pointer adjustments that remain after
 * the inner loop has advanced the input pointers (see the comments on the
 * lines that fill it). The output row pointers are stepped by
 * outstrides[0]>>4 vector elements per row pair.
 */
278 #define DEFCSP420_CVT(name,out_pixels) \
279 static int altivec_##name (SwsContext *c, \
280 const unsigned char **in, int *instrides, \
281 int srcSliceY, int srcSliceH, \
282 unsigned char **oplanes, int *outstrides) \
287 int instrides_scl[3]; \
288 vector unsigned char y0,y1; \
290 vector signed char u,v; \
292 vector signed short Y0,Y1,Y2,Y3; \
293 vector signed short U,V; \
294 vector signed short vx,ux,uvx; \
295 vector signed short vx0,ux0,uvx0; \
296 vector signed short vx1,ux1,uvx1; \
297 vector signed short R0,G0,B0; \
298 vector signed short R1,G1,B1; \
299 vector unsigned char R,G,B; \
301 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
302 vector unsigned char align_perm; \
304 vector signed short \
312 vector unsigned short lCSHIFT = c->CSHIFT; \
314 const ubyte *y1i = in[0]; \
315 const ubyte *y2i = in[0]+instrides[0]; \
316 const ubyte *ui = in[1]; \
317 const ubyte *vi = in[2]; \
319 vector unsigned char *oute \
320 = (vector unsigned char *) \
321 (oplanes[0]+srcSliceY*outstrides[0]); \
322 vector unsigned char *outo \
323 = (vector unsigned char *) \
324 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
327 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
328 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
329 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
332 for (i=0;i<h/2;i++) { \
333 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
334 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
336 for (j=0;j<w/16;j++) { \
338 y1ivP = (vector unsigned char *)y1i; \
339 y2ivP = (vector unsigned char *)y2i; \
340 uivP = (vector unsigned char *)ui; \
341 vivP = (vector unsigned char *)vi; \
343 align_perm = vec_lvsl (0, y1i); \
344 y0 = (vector unsigned char) \
345 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
347 align_perm = vec_lvsl (0, y2i); \
348 y1 = (vector unsigned char) \
349 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
351 align_perm = vec_lvsl (0, ui); \
352 u = (vector signed char) \
353 vec_perm (uivP[0], uivP[1], align_perm); \
355 align_perm = vec_lvsl (0, vi); \
356 v = (vector signed char) \
357 vec_perm (vivP[0], vivP[1], align_perm); \
359 u = (vector signed char) \
360 vec_sub (u,(vector signed char) \
361 vec_splat((vector signed char){128},0)); \
362 v = (vector signed char) \
363 vec_sub (v,(vector signed char) \
364 vec_splat((vector signed char){128},0)); \
366 U = vec_unpackh (u); \
367 V = vec_unpackh (v); \
375 Y0 = vec_mradds (Y0, lCY, lOY); \
376 Y1 = vec_mradds (Y1, lCY, lOY); \
377 Y2 = vec_mradds (Y2, lCY, lOY); \
378 Y3 = vec_mradds (Y3, lCY, lOY); \
380 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
381 ux = vec_sl (U, lCSHIFT); \
382 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \
383 ux0 = vec_mergeh (ux,ux); \
384 ux1 = vec_mergel (ux,ux); \
386 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
387 vx = vec_sl (V, lCSHIFT); \
388 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \
389 vx0 = vec_mergeh (vx,vx); \
390 vx1 = vec_mergel (vx,vx); \
392 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
393 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \
394 uvx = vec_mradds (V, lCGV, uvx); \
395 uvx0 = vec_mergeh (uvx,uvx); \
396 uvx1 = vec_mergel (uvx,uvx); \
398 R0 = vec_add (Y0,vx0); \
399 G0 = vec_add (Y0,uvx0); \
400 B0 = vec_add (Y0,ux0); \
401 R1 = vec_add (Y1,vx1); \
402 G1 = vec_add (Y1,uvx1); \
403 B1 = vec_add (Y1,ux1); \
405 R = vec_packclp (R0,R1); \
406 G = vec_packclp (G0,G1); \
407 B = vec_packclp (B0,B1); \
409 out_pixels(R,G,B,oute); \
411 R0 = vec_add (Y2,vx0); \
412 G0 = vec_add (Y2,uvx0); \
413 B0 = vec_add (Y2,ux0); \
414 R1 = vec_add (Y3,vx1); \
415 G1 = vec_add (Y3,uvx1); \
416 B1 = vec_add (Y3,ux1); \
417 R = vec_packclp (R0,R1); \
418 G = vec_packclp (G0,G1); \
419 B = vec_packclp (B0,B1); \
422 out_pixels(R,G,B,outo); \
431 outo += (outstrides[0])>>4; \
432 oute += (outstrides[0])>>4; \
434 ui += instrides_scl[1]; \
435 vi += instrides_scl[2]; \
436 y1i += instrides_scl[0]; \
437 y2i += instrides_scl[0]; \
/*
 * Output adaptors: map an (a, b, c) = (R, G, B) triple of byte vectors onto
 * the store macro for each destination layout. ptr is forwarded unchanged
 * to the store macro.
 *
 * The 32-bit layouts need an alpha plane that is 255 in every lane. A brace
 * initializer such as {255} sets only element 0 and zero-fills the other
 * fifteen, which would leave all but the first pixel of each group fully
 * transparent. The constant is therefore broadcast with vec_splat, the same
 * idiom this file uses for its 128 chroma bias.
 */
#define out_alpha_opaque(a)  vec_splat(((__typeof__ (a)){255}), 0)

#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),out_alpha_opaque(a),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,out_alpha_opaque(a),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,out_alpha_opaque(a),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),out_alpha_opaque(a),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
/* Instantiate one altivec_yuv2_<layout> converter per packed output layout,
   pairing each with its out_* store adaptor. */
450 DEFCSP420_CVT (yuv2_abgr, out_abgr)
451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
452 DEFCSP420_CVT (yuv2_rgba, out_rgba)
453 DEFCSP420_CVT (yuv2_argb, out_argb)
454 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
455 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
458 // uyvy|uyvy|uyvy|uyvy
459 // 0123 4567 89ab cdef
/*
 * vec_perm selector tables that pull the U, V and Y bytes out of a packed
 * UYVY vector. Every even entry is 0x10, i.e. a byte taken from the second
 * vec_perm operand (a zero vector at the call sites in this file), and every
 * odd entry picks a source byte, so each selected sample fills one 16-bit
 * lane. demux_u and demux_v repeat each chroma byte for two lanes; demux_y
 * takes consecutive luma bytes.
 */
461 const vector unsigned char
462 demux_u = {0x10,0x00,0x10,0x00,
465 0x10,0x0c,0x10,0x0c},
466 demux_v = {0x10,0x02,0x10,0x02,
469 0x10,0x0E,0x10,0x0E},
470 demux_y = {0x10,0x01,0x10,0x03,
473 0x10,0x0D,0x10,0x0F};
476 this is so I can play live CCIR raw video
/*
 * Convert packed UYVY input to 32-bit packed output.
 *
 * Output starts at row srcSliceY of oplanes[0]. Each inner iteration loads
 * two 16-byte vectors from img (byte offsets 0 and 16), splits each into
 * U, V and Y lanes with the demux_* selectors, converts them with
 * cvtyuvtoRGB, clamps and packs the two halves with vec_packclp and stores
 * the result with out_rgba.
 *
 * Note that, despite the "rgb32" in the name, the store macro used is
 * out_rgba.
 */
478 static int altivec_uyvy_rgb32 (SwsContext *c,
479 const unsigned char **in, int *instrides,
480 int srcSliceY, int srcSliceH,
481 unsigned char **oplanes, int *outstrides)
486 vector unsigned char uyvy;
487 vector signed short Y,U,V;
488 vector signed short R0,G0,B0,R1,G1,B1;
489 vector unsigned char R,G,B;
490 vector unsigned char *out;
494 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
497 for (j=0;j<w/16;j++) {
/* First 16 input bytes -> R0/G0/B0. */
498 uyvy = vec_ld (0, img);
499 U = (vector signed short)
500 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
502 V = (vector signed short)
503 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
505 Y = (vector signed short)
506 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
508 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
/* Next 16 input bytes -> R1/G1/B1. */
510 uyvy = vec_ld (16, img);
511 U = (vector signed short)
512 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
514 V = (vector signed short)
515 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
517 Y = (vector signed short)
518 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
520 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
/* Clamp to bytes and join the two halves into one vector per channel. */
522 R = vec_packclp (R0,R1);
523 G = vec_packclp (G0,G1);
524 B = vec_packclp (B0,B1);
526 // vec_mstbgr24 (R,G,B, out);
527 out_rgba (R,G,B,out);
537 /* Ok currently the acceleration routine only supports
538 inputs whose width is a multiple of 16
539 and whose height is a multiple of 2
541 So we just fall back to the C codes for this.
/*
 * Pick an AltiVec conversion routine for the context's source and
 * destination formats.
 *
 * c  conversion context; srcW, srcH, srcFormat and dstFormat are read.
 *
 * Returns one of the altivec_* converters defined in this file, or NULL
 * when the source width is not a multiple of 16 or the destination format
 * falls into one of the default cases. Every successful selection is
 * logged through av_log at AV_LOG_WARNING level.
 *
 * NOTE(review): PIX_FMT_YUV410P shares the branch of PIX_FMT_YUV420P, while
 * the comment at the top of this file says non-420 input is rejected -
 * confirm that this is intended.
 */
543 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
545 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
549 and this seems not to matter too much I tried a bunch of
550 videos with abnormal widths and MPlayer crashes elsewhere.
551 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
552 boom with X11 bad match.
555 if ((c->srcW & 0xf) != 0) return NULL;
557 switch (c->srcFormat) {
558 case PIX_FMT_YUV410P:
559 case PIX_FMT_YUV420P:
560 /*case IMGFMT_CLPL: ??? */
564 if ((c->srcH & 0x1) != 0)
567 switch(c->dstFormat) {
569 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
570 return altivec_yuv2_rgb24;
572 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
573 return altivec_yuv2_bgr24;
575 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
576 return altivec_yuv2_argb;
578 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
579 return altivec_yuv2_abgr;
581 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
582 return altivec_yuv2_rgba;
584 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
585 return altivec_yuv2_bgra;
586 default: return NULL;
590 case PIX_FMT_UYVY422:
591 switch(c->dstFormat) {
593 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
594 return altivec_uyvy_rgb32;
595 default: return NULL;
/*
 * Fill the context's AltiVec coefficient vectors from the colorspace table
 * and the brightness / contrast / saturation settings.
 *
 * c           context whose CSHIFT, CY, OY, CRV, CBU, CGU and CGV vectors
 *             are written
 * inv_table   four colorspace coefficients: [0] feeds CRV and [1] feeds CBU
 *             (each shifted right by 3), [2] feeds CGU and [3] feeds CGV
 *             (each shifted right by 1 and negated)
 * brightness  scaled by -256 to form the luma offset OY
 * contrast    scales the luma gain CY and all four chroma coefficients
 * saturation  scales the four chroma coefficients
 *
 * Six 16-bit scalars are computed into buf.tmp[0..5] and each one is then
 * broadcast with vec_splat into its own vector. CSHIFT is fixed at 2.
 *
 * NOTE(review): contrast and saturation are shifted right by 16 before
 * use, which suggests 16.16 fixed-point inputs - confirm against the
 * caller.
 */
603 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
606 DECLARE_ALIGNED(16, signed short, tmp)[8];
607 vector signed short vec;
610 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
611 buf.tmp[1] = -256*brightness; //oy
612 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
613 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
614 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
615 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
/* Broadcast each scalar into a full vector for use by the converters. */
618 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
619 c->CY = vec_splat ((vector signed short)buf.vec, 0);
620 c->OY = vec_splat ((vector signed short)buf.vec, 1);
621 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
622 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
623 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
624 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
/*
 * Vertically filter the luma and chroma source lines, convert the result
 * to RGB with cvtyuvtoRGB and store it in the packed layout selected by
 * target.
 *
 * c              context; vYCoeffsBank, vCCoeffsBank and dstFormat are read
 * lumFilter      not referenced in the lines shown here
 * lumSrc         luma source lines, lumFilterSize of them
 * chrFilter      not referenced in the lines shown here
 * chrUSrc/chrVSrc  chroma source lines, chrFilterSize of each
 * alpSrc         not referenced in the lines shown here
 * dest           output row
 * dstW           output width in pixels; processed 16 pixels per step
 * dstY           output row index, used to select the coefficient rows at
 *                dstY*lumFilterSize and dstY*chrFilterSize
 * target         output layout; the six layouts listed in the case labels
 *                are handled, anything else is reported through av_log
 *
 * The filtered sums are shifted right by 4 and clamped to [16, 235] with
 * vec_clip_s16 before colour conversion. Each chroma sample is duplicated
 * for two neighbouring luma samples.
 */
629 static av_always_inline void
630 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
631 const int16_t **lumSrc, int lumFilterSize,
632 const int16_t *chrFilter, const int16_t **chrUSrc,
633 const int16_t **chrVSrc, int chrFilterSize,
634 const int16_t **alpSrc, uint8_t *dest,
635 int dstW, int dstY, enum PixelFormat target)
638 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
639 vector signed short R0,G0,B0,R1,G1,B1;
641 vector unsigned char R,G,B;
642 vector unsigned char *out,*nout;
644 vector signed short RND = vec_splat_s16(1<<3);
645 vector unsigned short SCL = vec_splat_u16(4);
646 DECLARE_ALIGNED(16, unsigned int, scratch)[16];
648 vector signed short *YCoeffs, *CCoeffs;
/* Select the coefficient rows that belong to this output line. */
650 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
651 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
653 out = (vector unsigned char *)dest;
655 for (i=0; i<dstW; i+=16) {
658 /* extract 16 coeffs from lumSrc */
659 for (j=0; j<lumFilterSize; j++) {
660 X0 = vec_ld (0, &lumSrc[j][i]);
661 X1 = vec_ld (16, &lumSrc[j][i]);
662 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
663 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
668 /* extract 8 coeffs from U,V */
669 for (j=0; j<chrFilterSize; j++) {
670 X = vec_ld (0, &chrUSrc[j][i/2]);
671 U = vec_mradds (X, CCoeffs[j], U);
672 X = vec_ld (0, &chrVSrc[j][i/2]);
673 V = vec_mradds (X, CCoeffs[j], V);
676 /* scale and clip signals */
677 Y0 = vec_sra (Y0, SCL);
678 Y1 = vec_sra (Y1, SCL);
679 U = vec_sra (U, SCL);
680 V = vec_sra (V, SCL);
682 Y0 = vec_clip_s16 (Y0);
683 Y1 = vec_clip_s16 (Y1);
684 U = vec_clip_s16 (U);
685 V = vec_clip_s16 (V);
688 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
689 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
691 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
692 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
693 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
696 U0 = vec_mergeh (U,U);
697 V0 = vec_mergeh (V,V);
699 U1 = vec_mergel (U,U);
700 V1 = vec_mergel (V,V);
702 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
703 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
705 R = vec_packclp (R0,R1);
706 G = vec_packclp (G0,G1);
707 B = vec_packclp (B0,B1);
710 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
711 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
712 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
713 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
714 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
715 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
718 /* If this is reached, the caller should have called yuv2packedXinC
720 static int printed_error_message;
721 if (!printed_error_message) {
722 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
723 sws_format_name(c->dstFormat));
724 printed_error_message=1;
736 /* extract 16 coeffs from lumSrc */
737 for (j=0; j<lumFilterSize; j++) {
738 X0 = vec_ld (0, &lumSrc[j][i]);
739 X1 = vec_ld (16, &lumSrc[j][i]);
740 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
741 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
746 /* extract 8 coeffs from U,V */
747 for (j=0; j<chrFilterSize; j++) {
748 X = vec_ld (0, &chrUSrc[j][i/2]);
749 U = vec_mradds (X, CCoeffs[j], U);
750 X = vec_ld (0, &chrVSrc[j][i/2]);
751 V = vec_mradds (X, CCoeffs[j], V);
754 /* scale and clip signals */
755 Y0 = vec_sra (Y0, SCL);
756 Y1 = vec_sra (Y1, SCL);
757 U = vec_sra (U, SCL);
758 V = vec_sra (V, SCL);
760 Y0 = vec_clip_s16 (Y0);
761 Y1 = vec_clip_s16 (Y1);
762 U = vec_clip_s16 (U);
763 V = vec_clip_s16 (V);
766 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
767 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
769 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
770 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
771 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
774 U0 = vec_mergeh (U,U);
775 V0 = vec_mergeh (V,V);
777 U1 = vec_mergel (U,U);
778 V1 = vec_mergel (V,V);
780 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
781 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
783 R = vec_packclp (R0,R1);
784 G = vec_packclp (G0,G1);
785 B = vec_packclp (B0,B1);
/* This pass stores into the aligned scratch buffer instead of writing to
   dest directly; the bytes are copied out afterwards. */
787 nout = (vector unsigned char *)scratch;
789 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
790 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
791 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
792 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
793 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
794 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
796 /* Unreachable, I think. */
797 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
798 sws_format_name(c->dstFormat));
/* NOTE(review): dest is indexed in uint32_t units here, yet the byte count
   handed to memcpy is (dstW-i)/4. For 4-byte pixels (dstW-i)*4 would be
   expected, and the uint32_t indexing does not match the 3-byte RGB24 and
   BGR24 layouts either - confirm and fix. */
802 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
/*
 * YUV2PACKEDX_WRAPPER(suffix, pixfmt) defines the non-static function
 * ff_yuv2<suffix>_X_altivec. The generated function forwards all of its
 * arguments unchanged to ff_yuv2packedX_altivec and passes pixfmt as the
 * target argument.
 *
 * ff_yuv2packedX_altivec is declared av_always_inline, presumably so that
 * each wrapper gets a copy specialised for its constant pixfmt.
 */
807 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
808 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
809 const int16_t **lumSrc, int lumFilterSize, \
810 const int16_t *chrFilter, const int16_t **chrUSrc, \
811 const int16_t **chrVSrc, int chrFilterSize, \
812 const int16_t **alpSrc, uint8_t *dest, \
813 int dstW, int dstY) \
815 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
816 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
817 alpSrc, dest, dstW, dstY, pixfmt); \
/* Emit one ff_yuv2<layout>_X_altivec entry point per packed output layout
   handled by ff_yuv2packedX_altivec. */
820 YUV2PACKEDX_WRAPPER(abgr, PIX_FMT_ABGR);
821 YUV2PACKEDX_WRAPPER(bgra, PIX_FMT_BGRA);
822 YUV2PACKEDX_WRAPPER(argb, PIX_FMT_ARGB);
823 YUV2PACKEDX_WRAPPER(rgba, PIX_FMT_RGBA);
824 YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
825 YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);