2 * AltiVec acceleration for colorspace conversion
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
30 Lots of optimizations to be done here.
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
35 2. The inefficient use of chroma loading needs a bit of brushing up.
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to something in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
59 so we have roughly 10 clocks per pixel. This is too high, something has
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
75 GL libraries seem to be very slow on this machine 1.33Ghz PB running
76 Jaguar, this is not the case for my 1Ghz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
80 GL2 libraries work now with patch for RGB32.
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/cpu.h"
98 #include "libavutil/pixdesc.h"
99 #include "yuv2rgb_altivec.h"
101 #undef PROFILE_THE_BEAST
/* Short aliases for the unsigned and signed 8-bit char types. */
104 typedef unsigned char ubyte;
105 typedef signed char sbyte;
108 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
109 homogeneous vector registers x0,x1,x2 are interleaved with the
112 o0 = vec_mergeh (x0,x1);
113 o1 = vec_perm (o0, x2, perm_rgb_0);
114 o2 = vec_perm (o0, x2, perm_rgb_1);
115 o3 = vec_mergel (x0,x1);
116 o4 = vec_perm (o3,o2,perm_rgb_2);
117 o5 = vec_perm (o3,o2,perm_rgb_3);
119 perm_rgb_0: o0(RG).h v1(B) --> o1*
125 perm_rgb_1: o0(RG).h v1(B) --> o2
131 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
137 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
145 const vector unsigned char
146 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
147 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
148 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
149 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
150 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
151 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
152 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
153 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
/*
 * vec_merge3: interleave three input vectors into the three output vectors
 * y0, y1, y2 using vec_mergeh/vec_mergel and the perm_rgb_0..3 tables.
 * Note that the inputs are declared in the order (x2, x1, x0).
 * Each input is expanded more than once, so pass side-effect-free
 * expressions.
 */
155 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
157 __typeof__(x0) o0,o2,o3; \
158 o0 = vec_mergeh (x0,x1); \
159 y0 = vec_perm (o0, x2, perm_rgb_0); \
160 o2 = vec_perm (o0, x2, perm_rgb_1); \
161 o3 = vec_mergel (x0,x1); \
162 y1 = vec_perm (o3,o2,perm_rgb_2); \
163 y2 = vec_perm (o3,o2,perm_rgb_3); \
/*
 * vec_mstbgr24: interleave x0,x1,x2 with vec_merge3 (arguments passed in
 * the order x0,x1,x2) and store the three resulting vectors with vec_st.
 * ptr is post-incremented after every store, so it must be a modifiable
 * lvalue and ends up advanced by three elements.
 */
166 #define vec_mstbgr24(x0,x1,x2,ptr) \
168 __typeof__(x0) _0,_1,_2; \
169 vec_merge3 (x0,x1,x2,_0,_1,_2); \
170 vec_st (_0, 0, ptr++); \
171 vec_st (_1, 0, ptr++); \
172 vec_st (_2, 0, ptr++); \
/*
 * vec_mstrgb24: same as vec_mstbgr24 but hands the channels to vec_merge3
 * in the reverse order (x2,x1,x0), giving the opposite byte order in the
 * packed output.  ptr is post-incremented after each of the three vec_st
 * stores, so it must be a modifiable lvalue.
 */
175 #define vec_mstrgb24(x0,x1,x2,ptr) \
177 __typeof__(x0) _0,_1,_2; \
178 vec_merge3 (x2,x1,x0,_0,_1,_2); \
179 vec_st (_0, 0, ptr++); \
180 vec_st (_1, 0, ptr++); \
181 vec_st (_2, 0, ptr++); \
184 /* pack the pixels in rgb0 format
/*
 * vec_mstrgb32: interleave four byte-channel vectors x0..x3 into 4-byte
 * pixels (x0 is the first byte of each pixel, x3 the last) and store the
 * result as four vectors at byte offsets 0, 16, 32 and 48 from ptr.
 *   T    - vector type the merged halfword results are cast back to and
 *          the pointer type used for the stores.
 * The high halves of the inputs produce the first two stores, the low
 * halves the last two.
 */
188 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
191 _0 = vec_mergeh (x0,x1); \
192 _1 = vec_mergeh (x2,x3); \
193 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
194 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
195 vec_st (_2, 0*16, (T *)ptr); \
196 vec_st (_3, 1*16, (T *)ptr); \
197 _0 = vec_mergel (x0,x1); \
198 _1 = vec_mergel (x2,x3); \
199 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
200 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
201 vec_st (_2, 2*16, (T *)ptr); \
202 vec_st (_3, 3*16, (T *)ptr); \
209 | 1 -0.3441 -0.7142 |x| Cb|
216 Typical YUV conversion works on Y: 0-255; this version has been optimized for JPEG decode.
/*
 * Widen bytes 0..7 of x to 16-bit elements: vec_perm pairs each of those
 * bytes with selector 0x10, i.e. the first byte of the all-zero second
 * operand, and the result is cast to vector signed short.
 * NOTE(review): this reads as a zero-extension assuming big-endian element
 * layout -- confirm for the target.
 */
224 (vector signed short) \
225 vec_perm(x,(__typeof__(x)){0}, \
226 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
227 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
/*
 * Widen bytes 8..15 of x to 16-bit elements: same permute pattern as the
 * expression for bytes 0..7, with selector 0x10 supplying a zero byte from
 * the all-zero second operand in front of each source byte.
 */
229 (vector signed short) \
230 vec_perm(x,(__typeof__(x)){0}, \
231 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
232 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
/*
 * vec_clip_s16: clamp every signed 16-bit element of x to [16, 235].
 * The upper bound is applied first (vec_min), then the lower (vec_max).
 */
#define vec_clip_s16(x) \
    vec_max (vec_min (x, vec_splat ((vector signed short){235}, 0)), \
             vec_splat ((vector signed short){16}, 0))
/*
 * vec_packclp: pack two vectors of signed 16-bit values into one vector of
 * unsigned bytes.  Negative elements are first raised to zero with vec_max;
 * vec_packs then narrows the (now non-negative) values, reinterpreted as
 * unsigned shorts, to bytes.
 */
#define vec_packclp(x,y) \
    (vector unsigned char)vec_packs ( \
        (vector unsigned short)vec_max (x, vec_splat_s16 (0)), \
        (vector unsigned short)vec_max (y, vec_splat_s16 (0)))
243 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
/*
 * cvtyuvtoRGB: convert eight Y/U/V samples (one per 16-bit element) to
 * eight unclipped R, G, B values using the coefficients stored in c.
 *
 *   c      - context supplying CY, OY, CSHIFT, CBU, CRV, CGU, CGV vectors
 *   Y,U,V  - input samples; 128 is subtracted from every U and V element
 *   R,G,B  - output pointers receiving the results
 *
 * Luma is scaled and offset with vec_mradds (Y*CY, rounded and shifted,
 * plus OY).  Blue and red add the chroma term (U or V shifted left by
 * CSHIFT, times CBU or CRV) to that luma; green accumulates U*CGU and
 * V*CGV on top of it.  No clipping to 8 bits happens here.
 */
246 static inline void cvtyuvtoRGB (SwsContext *c,
247 vector signed short Y, vector signed short U, vector signed short V,
248 vector signed short *R, vector signed short *G, vector signed short *B)
250 vector signed short vx,ux,uvx;
252 Y = vec_mradds (Y, c->CY, c->OY);
253 U = vec_sub (U,(vector signed short)
254 vec_splat((vector signed short){128},0));
255 V = vec_sub (V,(vector signed short)
256 vec_splat((vector signed short){128},0));
258 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
259 ux = vec_sl (U, c->CSHIFT);
260 *B = vec_mradds (ux, c->CBU, Y);
262 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
263 vx = vec_sl (V, c->CSHIFT);
264 *R = vec_mradds (vx, c->CRV, Y);
266 // uvx = ((CGU*u) + (CGV*v))>>15;
267 uvx = vec_mradds (U, c->CGU, Y);
268 *G = vec_mradds (V, c->CGV, uvx);
273 ------------------------------------------------------------------------------
275 ------------------------------------------------------------------------------
/*
 * DEFCSP420_CVT(name, out_pixels): defines static int altivec_<name>(),
 * a planar-YUV to packed-RGB slice converter.
 *
 *   in / instrides        - source planes (Y, U, V) and their strides
 *   srcSliceY             - first output row; selects the start of the
 *                           even (oute) and odd (outo) output row pointers
 *   oplanes / outstrides  - destination plane and stride
 *   out_pixels(R,G,B,ptr) - store macro that emits one group of pixels
 *
 * The outer loop runs h/2 times and handles two luma rows (y1i, y2i) per
 * pass; the inner loop runs w/16 times.  All four source pointers are
 * loaded with the vec_lvsl + vec_perm idiom, so they need not be 16-byte
 * aligned.  One chroma contribution (ux/vx/uvx, duplicated pairwise with
 * vec_mergeh/vec_mergel) is computed per inner iteration and added to the
 * luma of both rows before the results are clipped and packed with
 * vec_packclp.  vec_dstst issues store-prefetch hints for both output rows
 * at the top of every outer iteration.
 *
 * NOTE(review): (vector signed char){128} does not fit a signed char;
 * the bias subtraction presumably relies on it wrapping to -128 -- confirm.
 * NOTE(review): w, h, i, j, Y0..Y3 and lCY/lOY/lCRV/lCBU/lCGU/lCGV are used
 * below; confirm their declarations/initialisation when editing.
 */
279 #define DEFCSP420_CVT(name,out_pixels) \
280 static int altivec_##name (SwsContext *c, \
281 const unsigned char **in, int *instrides, \
282 int srcSliceY, int srcSliceH, \
283 unsigned char **oplanes, int *outstrides) \
288 int instrides_scl[3]; \
289 vector unsigned char y0,y1; \
291 vector signed char u,v; \
293 vector signed short Y0,Y1,Y2,Y3; \
294 vector signed short U,V; \
295 vector signed short vx,ux,uvx; \
296 vector signed short vx0,ux0,uvx0; \
297 vector signed short vx1,ux1,uvx1; \
298 vector signed short R0,G0,B0; \
299 vector signed short R1,G1,B1; \
300 vector unsigned char R,G,B; \
302 const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
303 vector unsigned char align_perm; \
305 vector signed short \
313 vector unsigned short lCSHIFT = c->CSHIFT; \
315 const ubyte *y1i = in[0]; \
316 const ubyte *y2i = in[0]+instrides[0]; \
317 const ubyte *ui = in[1]; \
318 const ubyte *vi = in[2]; \
320 vector unsigned char *oute \
321 = (vector unsigned char *) \
322 (oplanes[0]+srcSliceY*outstrides[0]); \
323 vector unsigned char *outo \
324 = (vector unsigned char *) \
325 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
328 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
329 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
330 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
333 for (i=0;i<h/2;i++) { \
334 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
335 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
337 for (j=0;j<w/16;j++) { \
339 y1ivP = (const vector unsigned char *)y1i; \
340 y2ivP = (const vector unsigned char *)y2i; \
341 uivP = (const vector unsigned char *)ui; \
342 vivP = (const vector unsigned char *)vi; \
344 align_perm = vec_lvsl (0, y1i); \
345 y0 = (vector unsigned char) \
346 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
348 align_perm = vec_lvsl (0, y2i); \
349 y1 = (vector unsigned char) \
350 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
352 align_perm = vec_lvsl (0, ui); \
353 u = (vector signed char) \
354 vec_perm (uivP[0], uivP[1], align_perm); \
356 align_perm = vec_lvsl (0, vi); \
357 v = (vector signed char) \
358 vec_perm (vivP[0], vivP[1], align_perm); \
360 u = (vector signed char) \
361 vec_sub (u,(vector signed char) \
362 vec_splat((vector signed char){128},0)); \
363 v = (vector signed char) \
364 vec_sub (v,(vector signed char) \
365 vec_splat((vector signed char){128},0)); \
367 U = vec_unpackh (u); \
368 V = vec_unpackh (v); \
376 Y0 = vec_mradds (Y0, lCY, lOY); \
377 Y1 = vec_mradds (Y1, lCY, lOY); \
378 Y2 = vec_mradds (Y2, lCY, lOY); \
379 Y3 = vec_mradds (Y3, lCY, lOY); \
381 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
382 ux = vec_sl (U, lCSHIFT); \
383 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \
384 ux0 = vec_mergeh (ux,ux); \
385 ux1 = vec_mergel (ux,ux); \
387 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
388 vx = vec_sl (V, lCSHIFT); \
389 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \
390 vx0 = vec_mergeh (vx,vx); \
391 vx1 = vec_mergel (vx,vx); \
393 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
394 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \
395 uvx = vec_mradds (V, lCGV, uvx); \
396 uvx0 = vec_mergeh (uvx,uvx); \
397 uvx1 = vec_mergel (uvx,uvx); \
399 R0 = vec_add (Y0,vx0); \
400 G0 = vec_add (Y0,uvx0); \
401 B0 = vec_add (Y0,ux0); \
402 R1 = vec_add (Y1,vx1); \
403 G1 = vec_add (Y1,uvx1); \
404 B1 = vec_add (Y1,ux1); \
406 R = vec_packclp (R0,R1); \
407 G = vec_packclp (G0,G1); \
408 B = vec_packclp (B0,B1); \
410 out_pixels(R,G,B,oute); \
412 R0 = vec_add (Y2,vx0); \
413 G0 = vec_add (Y2,uvx0); \
414 B0 = vec_add (Y2,ux0); \
415 R1 = vec_add (Y3,vx1); \
416 G1 = vec_add (Y3,uvx1); \
417 B1 = vec_add (Y3,ux1); \
418 R = vec_packclp (R0,R1); \
419 G = vec_packclp (G0,G1); \
420 B = vec_packclp (B0,B1); \
423 out_pixels(R,G,B,outo); \
432 outo += (outstrides[0])>>4; \
433 oute += (outstrides[0])>>4; \
435 ui += instrides_scl[1]; \
436 vi += instrides_scl[2]; \
437 y1i += instrides_scl[0]; \
438 y2i += instrides_scl[0]; \
/*
 * Output adaptors: a, b, c are the packed R, G, B byte vectors and ptr is
 * the destination.  The 32-bit variants hand the channels plus a constant
 * alpha vector to vec_mstrgb32 in the byte order named by the macro; the
 * 24-bit variants forward to vec_mstrgb24 / vec_mstbgr24.
 *
 * The alpha vector is built with vec_splat(..., 0): a brace initialiser
 * such as (vector unsigned char){255} only sets element 0 and leaves the
 * other fifteen elements zero, which would make all but one pixel of each
 * group fully transparent.  Splatting element 0 gives 255 in every lane,
 * the same idiom the file uses for its 128 chroma bias.
 */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),((__typeof__ (a))vec_splat((__typeof__ (a)){255},0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
/*
 * Instantiate one planar-YUV converter per packed output layout:
 * altivec_yuv2_abgr, _bgra, _rgba, _argb, _rgb24 and _bgr24.
 */
451 DEFCSP420_CVT (yuv2_abgr, out_abgr)
452 DEFCSP420_CVT (yuv2_bgra, out_bgra)
453 DEFCSP420_CVT (yuv2_rgba, out_rgba)
454 DEFCSP420_CVT (yuv2_argb, out_argb)
455 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
456 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
459 // uyvy|uyvy|uyvy|uyvy
460 // 0123 4567 89ab cdef
/*
 * vec_perm control vectors that pull the U, V and Y samples out of one
 * vector of packed UYVY bytes as 16-bit elements.  Selector 0x10 picks the
 * first byte of the second vec_perm operand (an all-zero vector at the
 * call sites); each U and V byte is selected twice in a row, once per
 * luma sample that shares it.
 */
462 const vector unsigned char
463 demux_u = {0x10,0x00,0x10,0x00,
466 0x10,0x0c,0x10,0x0c},
467 demux_v = {0x10,0x02,0x10,0x02,
470 0x10,0x0E,0x10,0x0E},
471 demux_y = {0x10,0x01,0x10,0x03,
474 0x10,0x0D,0x10,0x0F};
477 this is so I can play live CCIR raw video
/*
 * altivec_uyvy_rgb32: convert packed UYVY input to 32-bit output written
 * with out_rgba, starting at row srcSliceY of oplanes[0].
 *
 * Each inner iteration (w/16 of them per row) loads two vectors of UYVY
 * bytes with vec_ld at byte offsets 0 and 16 from img, splits each into
 * Y/U/V with the demux_* tables, converts both halves with cvtyuvtoRGB and
 * packs the two results into one group of 16 pixels.
 *
 * NOTE(review): vec_ld is an aligned load, so img presumably has to be
 * 16-byte aligned -- confirm against the callers.
 */
479 static int altivec_uyvy_rgb32 (SwsContext *c,
480 const unsigned char **in, int *instrides,
481 int srcSliceY, int srcSliceH,
482 unsigned char **oplanes, int *outstrides)
487 vector unsigned char uyvy;
488 vector signed short Y,U,V;
489 vector signed short R0,G0,B0,R1,G1,B1;
490 vector unsigned char R,G,B;
491 vector unsigned char *out;
495 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
498 for (j=0;j<w/16;j++) {
499 uyvy = vec_ld (0, img);
500 U = (vector signed short)
501 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
503 V = (vector signed short)
504 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
506 Y = (vector signed short)
507 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
509 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
511 uyvy = vec_ld (16, img);
512 U = (vector signed short)
513 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
515 V = (vector signed short)
516 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
518 Y = (vector signed short)
519 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
521 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
523 R = vec_packclp (R0,R1);
524 G = vec_packclp (G0,G1);
525 B = vec_packclp (B0,B1);
527 // vec_mstbgr24 (R,G,B, out);
528 out_rgba (R,G,B,out);
538 /* Ok currently the acceleration routine only supports
539 inputs of widths a multiple of 16
540 and heights a multiple of 2
542 So we just fall back to the C codes for this.
/*
 * ff_yuv2rgb_init_altivec: pick the AltiVec conversion routine matching
 * c->srcFormat and c->dstFormat.
 *
 * Checks av_get_cpu_flags() for AV_CPU_FLAG_ALTIVEC first, and returns NULL
 * when c->srcW is not a multiple of 16.  Planar sources additionally test
 * whether c->srcH is odd.  For a supported destination format the matching
 * altivec_yuv2_* (or altivec_uyvy_rgb32 for UYVY422 input) function is
 * returned after logging the choice; unsupported destinations return NULL.
 *
 * NOTE(review): the selection messages are logged at AV_LOG_WARNING even
 * though they look informational -- confirm the intended level.
 * NOTE(review): PIX_FMT_YUV410P shares the YUV420P path; confirm that the
 * converters, which step chroma at half the luma width, are valid for it.
 */
544 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
546 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
550 and this seems not to matter too much I tried a bunch of
551 videos with abnormal widths and MPlayer crashes elsewhere.
552 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
553 boom with X11 bad match.
556 if ((c->srcW & 0xf) != 0) return NULL;
558 switch (c->srcFormat) {
559 case PIX_FMT_YUV410P:
560 case PIX_FMT_YUV420P:
561 /*case IMGFMT_CLPL: ??? */
565 if ((c->srcH & 0x1) != 0)
568 switch(c->dstFormat) {
570 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
571 return altivec_yuv2_rgb24;
573 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
574 return altivec_yuv2_bgr24;
576 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
577 return altivec_yuv2_argb;
579 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
580 return altivec_yuv2_abgr;
582 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
583 return altivec_yuv2_rgba;
585 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
586 return altivec_yuv2_bgra;
587 default: return NULL;
591 case PIX_FMT_UYVY422:
592 switch(c->dstFormat) {
594 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
595 return altivec_uyvy_rgb32;
596 default: return NULL;
/*
 * ff_yuv2rgb_init_tables_altivec: derive the vector conversion constants
 * stored in c from the colour-space table and picture controls.
 *
 *   inv_table[0..3] - coefficients feeding CRV, CBU, CGU and CGV
 *   brightness      - feeds the luma offset OY (-256 * brightness)
 *   contrast        - scales the luma gain CY and every chroma coefficient
 *   saturation      - scales every chroma coefficient
 *
 * Six 16-bit values are written into a 16-byte aligned scratch buffer and
 * each is broadcast with vec_splat into c->CY, OY, CRV, CBU, CGU, CGV.
 * The green coefficients are negated.  CSHIFT is fixed at 2.
 *
 * NOTE(review): contrast and saturation are used as (value >> 16), which
 * presumably means they arrive as 16.16 fixed point -- confirm.
 */
604 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
607 DECLARE_ALIGNED(16, signed short, tmp)[8];
608 vector signed short vec;
611 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
612 buf.tmp[1] = -256*brightness; //oy
613 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
614 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
615 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
616 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
619 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
620 c->CY = vec_splat ((vector signed short)buf.vec, 0);
621 c->OY = vec_splat ((vector signed short)buf.vec, 1);
622 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
623 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
624 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
625 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
/*
 * ff_yuv2packedX_altivec: vertical scaling plus YUV -> packed RGB output
 * for one destination row, 16 pixels per iteration.
 *
 *   lumSrc / lumFilterSize        - luma source rows and tap count
 *   chrUSrc, chrVSrc / chrFilterSize - chroma source rows and tap count
 *   dest, dstW, dstY              - output row, its width, and the row
 *                                   index used to select the coefficients
 *   target                        - output pixel format (one of the six
 *                                   ABGR/BGRA/RGBA/ARGB/RGB24/BGR24 cases)
 *
 * Filter taps are taken from c->vYCoeffsBank and c->vCCoeffsBank (indexed
 * by dstY), and the source rows are accumulated with vec_mradds.  The sums
 * are shifted right by 4, clipped with vec_clip_s16, chroma is duplicated
 * pairwise, and cvtyuvtoRGB + vec_packclp produce the bytes that the
 * out_* macros store.  Source rows are read with vec_ld (aligned loads).
 *
 * A second copy of the same pipeline renders into the aligned scratch
 * buffer and copies the result to dest with memcpy.
 *
 * lumFilter, chrFilter and alpSrc are not referenced in this body.
 */
630 static av_always_inline void
631 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
632 const int16_t **lumSrc, int lumFilterSize,
633 const int16_t *chrFilter, const int16_t **chrUSrc,
634 const int16_t **chrVSrc, int chrFilterSize,
635 const int16_t **alpSrc, uint8_t *dest,
636 int dstW, int dstY, enum PixelFormat target)
639 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
640 vector signed short R0,G0,B0,R1,G1,B1;
642 vector unsigned char R,G,B;
643 vector unsigned char *out,*nout;
645 vector signed short RND = vec_splat_s16(1<<3);
646 vector unsigned short SCL = vec_splat_u16(4);
647 DECLARE_ALIGNED(16, unsigned int, scratch)[16];
649 vector signed short *YCoeffs, *CCoeffs;
651 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
652 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
654 out = (vector unsigned char *)dest;
656 for (i=0; i<dstW; i+=16) {
659 /* extract 16 coeffs from lumSrc */
660 for (j=0; j<lumFilterSize; j++) {
661 X0 = vec_ld (0, &lumSrc[j][i]);
662 X1 = vec_ld (16, &lumSrc[j][i]);
663 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
664 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
669 /* extract 8 coeffs from U,V */
670 for (j=0; j<chrFilterSize; j++) {
671 X = vec_ld (0, &chrUSrc[j][i/2]);
672 U = vec_mradds (X, CCoeffs[j], U);
673 X = vec_ld (0, &chrVSrc[j][i/2]);
674 V = vec_mradds (X, CCoeffs[j], V);
677 /* scale and clip signals */
678 Y0 = vec_sra (Y0, SCL);
679 Y1 = vec_sra (Y1, SCL);
680 U = vec_sra (U, SCL);
681 V = vec_sra (V, SCL);
683 Y0 = vec_clip_s16 (Y0);
684 Y1 = vec_clip_s16 (Y1);
685 U = vec_clip_s16 (U);
686 V = vec_clip_s16 (V);
689 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
690 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
692 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
693 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
694 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
697 U0 = vec_mergeh (U,U);
698 V0 = vec_mergeh (V,V);
700 U1 = vec_mergel (U,U);
701 V1 = vec_mergel (V,V);
703 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
704 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
706 R = vec_packclp (R0,R1);
707 G = vec_packclp (G0,G1);
708 B = vec_packclp (B0,B1);
711 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
712 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
713 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
714 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
715 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
716 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
719 /* If this is reached, the caller should have called yuv2packedXinC
721 static int printed_error_message;
722 if (!printed_error_message) {
723 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
724 av_get_pix_fmt_name(c->dstFormat));
725 printed_error_message=1;
737 /* extract 16 coeffs from lumSrc */
738 for (j=0; j<lumFilterSize; j++) {
739 X0 = vec_ld (0, &lumSrc[j][i]);
740 X1 = vec_ld (16, &lumSrc[j][i]);
741 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
742 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
747 /* extract 8 coeffs from U,V */
748 for (j=0; j<chrFilterSize; j++) {
749 X = vec_ld (0, &chrUSrc[j][i/2]);
750 U = vec_mradds (X, CCoeffs[j], U);
751 X = vec_ld (0, &chrVSrc[j][i/2]);
752 V = vec_mradds (X, CCoeffs[j], V);
755 /* scale and clip signals */
756 Y0 = vec_sra (Y0, SCL);
757 Y1 = vec_sra (Y1, SCL);
758 U = vec_sra (U, SCL);
759 V = vec_sra (V, SCL);
761 Y0 = vec_clip_s16 (Y0);
762 Y1 = vec_clip_s16 (Y1);
763 U = vec_clip_s16 (U);
764 V = vec_clip_s16 (V);
767 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
768 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
770 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
771 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
772 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
775 U0 = vec_mergeh (U,U);
776 V0 = vec_mergeh (V,V);
778 U1 = vec_mergel (U,U);
779 V1 = vec_mergel (V,V);
781 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
782 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
784 R = vec_packclp (R0,R1);
785 G = vec_packclp (G0,G1);
786 B = vec_packclp (B0,B1);
788 nout = (vector unsigned char *)scratch;
790 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
791 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
792 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
793 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
794 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
795 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
797 /* Unreachable, I think. */
798 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
799 av_get_pix_fmt_name(c->dstFormat));
/* NOTE(review): the byte count below is (dstW-i)/4 although dest is indexed
   in uint32_t pixels; for 4-byte pixels the remaining size would be
   (dstW-i)*4, and for the 24-bit targets the uint32_t indexing itself looks
   wrong.  Confirm and fix together with the guard around this tail path. */
803 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
/*
 * YUV2PACKEDX_WRAPPER(suffix, pixfmt): defines the public entry point
 * ff_yuv2<suffix>_X_altivec(), which forwards all of its arguments to
 * ff_yuv2packedX_altivec() with pixfmt as the fixed target format.
 */
808 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
809 void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, const int16_t *lumFilter, \
810 const int16_t **lumSrc, int lumFilterSize, \
811 const int16_t *chrFilter, const int16_t **chrUSrc, \
812 const int16_t **chrVSrc, int chrFilterSize, \
813 const int16_t **alpSrc, uint8_t *dest, \
814 int dstW, int dstY) \
816 ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
817 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
818 alpSrc, dest, dstW, dstY, pixfmt); \
/*
 * One exported wrapper per supported packed output format:
 * ff_yuv2abgr_X_altivec, ff_yuv2bgra_X_altivec, ff_yuv2argb_X_altivec,
 * ff_yuv2rgba_X_altivec, ff_yuv2rgb24_X_altivec, ff_yuv2bgr24_X_altivec.
 */
821 YUV2PACKEDX_WRAPPER(abgr, PIX_FMT_ABGR);
822 YUV2PACKEDX_WRAPPER(bgra, PIX_FMT_BGRA);
823 YUV2PACKEDX_WRAPPER(argb, PIX_FMT_ARGB);
824 YUV2PACKEDX_WRAPPER(rgba, PIX_FMT_RGBA);
825 YUV2PACKEDX_WRAPPER(rgb24, PIX_FMT_RGB24);
826 YUV2PACKEDX_WRAPPER(bgr24, PIX_FMT_BGR24);