2 * AltiVec acceleration for colorspace conversion
4 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 Convert I420 YV12 to RGB in various formats,
25 it rejects images that are not in 420 formats,
26 it rejects images that don't have widths of multiples of 16,
27 it rejects images that don't have heights of multiples of 2.
28 Reject defers to C simulation code.
30 Lots of optimizations to be done here.
32 1. Need to fix saturation code. I just couldn't get it to fly with packs
33 and adds, so we currently use max/min to clip.
35 2. The inefficient use of chroma loading needs a bit of brushing up.
37 3. Analysis of pipeline stalls needs to be done. Use shark to identify
41 MODIFIED to calculate coeffs from currently selected color space.
42 MODIFIED core to be a macro where you specify the output format.
43 ADDED UYVY conversion which is never called due to something in swscale.
44 CORRECTED algorithm selection to be strict on input formats.
45 ADDED runtime detection of AltiVec.
47 ADDED altivec_yuv2packedX vertical scl + RGB converter
52 The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
54 The AltiVec version uses 10% of the processor or ~100Mips for D1 video
59 so we have roughly 10 clocks per pixel. This is too high, something has
62 OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
65 OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66 the input video frame, it was just decompressed so it probably resides in L1
67 caches. However, we are creating the output video stream. This needs to use the
68 DSTST instruction to optimize for the cache. We couple this with the fact that
69 we are not going to be visiting the input buffer again so we mark it Least
70 Recently Used. This shaves 25% of the processor cycles off.
72 Now memcpy is the largest mips consumer in the system, probably due
73 to the inefficient X11 stuff.
75 GL libraries seem to be very slow on this machine 1.33GHz PB running
76 Jaguar, this is not the case for my 1GHz PB. I thought it might be
77 a versioning issue, however I have libGL.1.2.dylib for both
78 machines. (We need to figure this out now.)
80 GL2 libraries work now with patch for RGB32.
82 NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
84 Integrated luma prescaling adjustment for saturation/contrast/brightness
94 #include "libswscale/rgb2rgb.h"
95 #include "libswscale/swscale.h"
96 #include "libswscale/swscale_internal.h"
97 #include "libavutil/cpu.h"
99 #undef PROFILE_THE_BEAST
102 typedef unsigned char ubyte;
103 typedef signed char sbyte;
106 /* RGB interleaver, 16 planar pels 8-bit samples per channel in
107 homogeneous vector registers x0,x1,x2 are interleaved with the
110 o0 = vec_mergeh (x0,x1);
111 o1 = vec_perm (o0, x2, perm_rgb_0);
112 o2 = vec_perm (o0, x2, perm_rgb_1);
113 o3 = vec_mergel (x0,x1);
114 o4 = vec_perm (o3,o2,perm_rgb_2);
115 o5 = vec_perm (o3,o2,perm_rgb_3);
117 perm_rgb_0: o0(RG).h v1(B) --> o1*
123 perm_rgb_1: o0(RG).h v1(B) --> o2
129 perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
135 perm_rgb_3: o3(RG).l o2(rgbB.l) ---> o5*
143 const vector unsigned char
144 perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
145 0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
146 perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
147 0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
148 perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
149 0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
150 perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
151 0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
// vec_merge3: interleave three planar channel vectors into three packed
// output vectors y0, y1, y2 using the perm_rgb_0..3 shuffle tables.
// Mind the parameter order: the FIRST macro argument is named x2 and the
// THIRD is named x0. x0 and x1 are merged pairwise, then x2 is woven in by
// the permutes.
153 #define vec_merge3(x2,x1,x0,y0,y1,y2) \
155 __typeof__(x0) o0,o2,o3; \
156 o0 = vec_mergeh (x0,x1); \
157 y0 = vec_perm (o0, x2, perm_rgb_0); \
158 o2 = vec_perm (o0, x2, perm_rgb_1); \
159 o3 = vec_mergel (x0,x1); \
160 y1 = vec_perm (o3,o2,perm_rgb_2); \
161 y2 = vec_perm (o3,o2,perm_rgb_3); \
// vec_mstbgr24: interleave x0,x1,x2 with vec_merge3 (arguments passed in the
// order x0,x1,x2) and store the three resulting vectors with vec_st.
// ptr is post-incremented three times, so the pointer expression supplied by
// the caller ends up advanced by three vectors.
164 #define vec_mstbgr24(x0,x1,x2,ptr) \
166 __typeof__(x0) _0,_1,_2; \
167 vec_merge3 (x0,x1,x2,_0,_1,_2); \
168 vec_st (_0, 0, ptr++); \
169 vec_st (_1, 0, ptr++); \
170 vec_st (_2, 0, ptr++); \
// vec_mstrgb24: same as vec_mstbgr24 except that the channels are handed to
// vec_merge3 in the reverse order (x2,x1,x0), which swaps the first and third
// channel in the packed output.
// ptr is post-incremented three times, so the pointer expression supplied by
// the caller ends up advanced by three vectors.
173 #define vec_mstrgb24(x0,x1,x2,ptr) \
175 __typeof__(x0) _0,_1,_2; \
176 vec_merge3 (x2,x1,x0,_0,_1,_2); \
177 vec_st (_0, 0, ptr++); \
178 vec_st (_1, 0, ptr++); \
179 vec_st (_2, 0, ptr++); \
182 /* pack the pixels in rgb0 format
// vec_mstrgb32: interleave four channel vectors x0..x3 bytewise (x0 first)
// into 16 packed 4-byte pixels and store them as four vectors at byte
// offsets 0, 16, 32 and 48 from ptr.
// T is the vector type used for the intermediate casts and for the store
// pointer. Unlike the 24-bit store macros, ptr itself is not advanced here.
// Each of x0..x3 is expanded twice, so arguments should be free of side
// effects.
186 #define vec_mstrgb32(T,x0,x1,x2,x3,ptr) \
189 _0 = vec_mergeh (x0,x1); \
190 _1 = vec_mergeh (x2,x3); \
191 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
192 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
193 vec_st (_2, 0*16, (T *)ptr); \
194 vec_st (_3, 1*16, (T *)ptr); \
195 _0 = vec_mergel (x0,x1); \
196 _1 = vec_mergel (x2,x3); \
197 _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
198 _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
199 vec_st (_2, 2*16, (T *)ptr); \
200 vec_st (_3, 3*16, (T *)ptr); \
207 | 1 -0.3441 -0.7142 |x| Cb|
214 typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
222 (vector signed short) \
223 vec_perm(x,(__typeof__(x)){0}, \
224 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
225 0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07}))
227 (vector signed short) \
228 vec_perm(x,(__typeof__(x)){0}, \
229 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
230 0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F}))
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                                               \
    vec_max(vec_min((x), ((vector signed short){235,235,235,235,235,235,235,235})),   \
            ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
/*
 * Pack two vectors of signed 16-bit lanes (x first, then y) into one vector
 * of 16 unsigned bytes.  Negative lanes are raised to 0 with vec_max before
 * the values are narrowed with vec_packs on unsigned 16-bit lanes.
 */
#define vec_packclp(x,y)                                                          \
    (vector unsigned char)vec_packs(                                              \
        (vector unsigned short)vec_max((x), ((vector signed short) {0})),         \
        (vector unsigned short)vec_max((y), ((vector signed short) {0})))
241 //#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
244 static inline void cvtyuvtoRGB (SwsContext *c,
245 vector signed short Y, vector signed short U, vector signed short V,
246 vector signed short *R, vector signed short *G, vector signed short *B)
248 vector signed short vx,ux,uvx;
250 Y = vec_mradds (Y, c->CY, c->OY);
251 U = vec_sub (U,(vector signed short)
252 vec_splat((vector signed short){128},0));
253 V = vec_sub (V,(vector signed short)
254 vec_splat((vector signed short){128},0));
256 // ux = (CBU*(u<<c->CSHIFT)+0x4000)>>15;
257 ux = vec_sl (U, c->CSHIFT);
258 *B = vec_mradds (ux, c->CBU, Y);
260 // vx = (CRV*(v<<c->CSHIFT)+0x4000)>>15;
261 vx = vec_sl (V, c->CSHIFT);
262 *R = vec_mradds (vx, c->CRV, Y);
264 // uvx = ((CGU*u) + (CGV*v))>>15;
265 uvx = vec_mradds (U, c->CGU, Y);
266 *G = vec_mradds (V, c->CGV, uvx);
271 ------------------------------------------------------------------------------
273 ------------------------------------------------------------------------------
// DEFCSP420_CVT(name, out_pixels) expands to a static converter function
// altivec_<name>() for planar Y/U/V input.
// Each outer iteration handles two luma rows (y1i and y2i, one input stride
// apart) that share one row of U and V samples. Each inner iteration handles
// 16 pixels of both rows and emits them through out_pixels() to oute (first
// output row of the pair) and outo (the row one output stride below).
// The chroma terms ux, vx and uvx are computed once and duplicated with
// vec_mergeh/vec_mergel so that each chroma sample is applied to two
// horizontally adjacent pixels. Luma is unaligned-loaded via vec_lvsl plus
// vec_perm.
// NOTE(review): the vec_dstst hint size is derived from w*3 no matter which
// out_pixels emitter is used -- confirm this is intended for 32-bit output.
277 #define DEFCSP420_CVT(name,out_pixels) \
278 static int altivec_##name (SwsContext *c, \
279 const unsigned char **in, int *instrides, \
280 int srcSliceY, int srcSliceH, \
281 unsigned char **oplanes, int *outstrides) \
286 int instrides_scl[3]; \
287 vector unsigned char y0,y1; \
289 vector signed char u,v; \
291 vector signed short Y0,Y1,Y2,Y3; \
292 vector signed short U,V; \
293 vector signed short vx,ux,uvx; \
294 vector signed short vx0,ux0,uvx0; \
295 vector signed short vx1,ux1,uvx1; \
296 vector signed short R0,G0,B0; \
297 vector signed short R1,G1,B1; \
298 vector unsigned char R,G,B; \
300 vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
301 vector unsigned char align_perm; \
303 vector signed short \
311 vector unsigned short lCSHIFT = c->CSHIFT; \
313 const ubyte *y1i = in[0]; \
314 const ubyte *y2i = in[0]+instrides[0]; \
315 const ubyte *ui = in[1]; \
316 const ubyte *vi = in[2]; \
318 vector unsigned char *oute \
319 = (vector unsigned char *) \
320 (oplanes[0]+srcSliceY*outstrides[0]); \
321 vector unsigned char *outo \
322 = (vector unsigned char *) \
323 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); \
326 instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */ \
327 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */ \
328 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */ \
331 for (i=0;i<h/2;i++) { \
332 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0); \
333 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1); \
335 for (j=0;j<w/16;j++) { \
337 y1ivP = (vector unsigned char *)y1i; \
338 y2ivP = (vector unsigned char *)y2i; \
339 uivP = (vector unsigned char *)ui; \
340 vivP = (vector unsigned char *)vi; \
342 align_perm = vec_lvsl (0, y1i); \
343 y0 = (vector unsigned char) \
344 vec_perm (y1ivP[0], y1ivP[1], align_perm); \
346 align_perm = vec_lvsl (0, y2i); \
347 y1 = (vector unsigned char) \
348 vec_perm (y2ivP[0], y2ivP[1], align_perm); \
350 align_perm = vec_lvsl (0, ui); \
351 u = (vector signed char) \
352 vec_perm (uivP[0], uivP[1], align_perm); \
354 align_perm = vec_lvsl (0, vi); \
355 v = (vector signed char) \
356 vec_perm (vivP[0], vivP[1], align_perm); \
358 u = (vector signed char) \
359 vec_sub (u,(vector signed char) \
360 vec_splat((vector signed char){128},0)); \
361 v = (vector signed char) \
362 vec_sub (v,(vector signed char) \
363 vec_splat((vector signed char){128},0)); \
365 U = vec_unpackh (u); \
366 V = vec_unpackh (v); \
374 Y0 = vec_mradds (Y0, lCY, lOY); \
375 Y1 = vec_mradds (Y1, lCY, lOY); \
376 Y2 = vec_mradds (Y2, lCY, lOY); \
377 Y3 = vec_mradds (Y3, lCY, lOY); \
379 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */ \
380 ux = vec_sl (U, lCSHIFT); \
381 ux = vec_mradds (ux, lCBU, (vector signed short){0}); \
382 ux0 = vec_mergeh (ux,ux); \
383 ux1 = vec_mergel (ux,ux); \
385 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */ \
386 vx = vec_sl (V, lCSHIFT); \
387 vx = vec_mradds (vx, lCRV, (vector signed short){0}); \
388 vx0 = vec_mergeh (vx,vx); \
389 vx1 = vec_mergel (vx,vx); \
391 /* uvx = ((CGU*u) + (CGV*v))>>15 */ \
392 uvx = vec_mradds (U, lCGU, (vector signed short){0}); \
393 uvx = vec_mradds (V, lCGV, uvx); \
394 uvx0 = vec_mergeh (uvx,uvx); \
395 uvx1 = vec_mergel (uvx,uvx); \
397 R0 = vec_add (Y0,vx0); \
398 G0 = vec_add (Y0,uvx0); \
399 B0 = vec_add (Y0,ux0); \
400 R1 = vec_add (Y1,vx1); \
401 G1 = vec_add (Y1,uvx1); \
402 B1 = vec_add (Y1,ux1); \
404 R = vec_packclp (R0,R1); \
405 G = vec_packclp (G0,G1); \
406 B = vec_packclp (B0,B1); \
408 out_pixels(R,G,B,oute); \
410 R0 = vec_add (Y2,vx0); \
411 G0 = vec_add (Y2,uvx0); \
412 B0 = vec_add (Y2,ux0); \
413 R1 = vec_add (Y3,vx1); \
414 G1 = vec_add (Y3,uvx1); \
415 B1 = vec_add (Y3,ux1); \
416 R = vec_packclp (R0,R1); \
417 G = vec_packclp (G0,G1); \
418 B = vec_packclp (B0,B1); \
421 out_pixels(R,G,B,outo); \
430 outo += (outstrides[0])>>4; \
431 oute += (outstrides[0])>>4; \
433 ui += instrides_scl[1]; \
434 vi += instrides_scl[2]; \
435 y1i += instrides_scl[0]; \
436 y2i += instrides_scl[0]; \
/*
 * Pixel emitters used as the out_pixels argument of the converters.
 * In the callers (a,b,c) are the packed R, G and B byte vectors and ptr is
 * the destination vector pointer.  The 32-bit variants add a constant alpha
 * channel and differ only in channel order; the 24-bit variants forward to
 * the vec_mstrgb24 / vec_mstbgr24 store macros.
 *
 * The alpha channel has to be 255 in every byte lane.  A brace initializer
 * such as (vector unsigned char){255} sets lane 0 only and zero-fills the
 * remaining lanes, which left 15 of every 16 pixels fully transparent.  The
 * value is therefore broadcast with vec_splat(), the same idiom this file
 * uses for the 128 chroma bias.
 */
#define vec_alpha_opaque(a) ((__typeof__(a))vec_splat((__typeof__(a)){255}, 0))

#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),vec_alpha_opaque(a),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,vec_alpha_opaque(a),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,vec_alpha_opaque(a),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),vec_alpha_opaque(a),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
// Instantiate the converter template: these two lines define
// altivec_yuv2_abgr() and altivec_yuv2_bgra().
449 DEFCSP420_CVT (yuv2_abgr, out_abgr)
451 DEFCSP420_CVT (yuv2_bgra, out_bgra)
// Hand-written variant of the planar converter; the colour arithmetic below
// matches the DEFCSP420_CVT expansion line for line.
// Differences visible here: luma is fetched with vec_ldl (no unaligned-load
// permute), the second luma row pointer starts at in[0]+w, and the luma
// pointers advance by instrides[0] per outer iteration.
// NOTE(review): despite the "bgra32" name both rows are emitted with
// out_argb(), and no branch of the format dispatcher visible in this file
// returns this function -- confirm whether it is still meant to be built.
453 static int altivec_yuv2_bgra32 (SwsContext *c,
454 unsigned char **in, int *instrides,
455 int srcSliceY, int srcSliceH,
456 unsigned char **oplanes, int *outstrides)
461 int instrides_scl[3];
462 vector unsigned char y0,y1;
464 vector signed char u,v;
466 vector signed short Y0,Y1,Y2,Y3;
467 vector signed short U,V;
468 vector signed short vx,ux,uvx;
469 vector signed short vx0,ux0,uvx0;
470 vector signed short vx1,ux1,uvx1;
471 vector signed short R0,G0,B0;
472 vector signed short R1,G1,B1;
473 vector unsigned char R,G,B;
475 vector unsigned char *uivP, *vivP;
476 vector unsigned char align_perm;
486 vector unsigned short lCSHIFT = c->CSHIFT;
489 ubyte *y2i = in[0]+w;
493 vector unsigned char *oute
494 = (vector unsigned char *)
495 (oplanes[0]+srcSliceY*outstrides[0]);
496 vector unsigned char *outo
497 = (vector unsigned char *)
498 (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
501 instrides_scl[0] = instrides[0];
502 instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */
503 instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */
506 for (i=0;i<h/2;i++) {
507 vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
508 vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
510 for (j=0;j<w/16;j++) {
512 y0 = vec_ldl (0,y1i);
513 y1 = vec_ldl (0,y2i);
514 uivP = (vector unsigned char *)ui;
515 vivP = (vector unsigned char *)vi;
517 align_perm = vec_lvsl (0, ui);
518 u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
520 align_perm = vec_lvsl (0, vi);
521 v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
522 u = (vector signed char)
523 vec_sub (u,(vector signed char)
524 vec_splat((vector signed char){128},0));
526 v = (vector signed char)
527 vec_sub (v, (vector signed char)
528 vec_splat((vector signed char){128},0));
539 Y0 = vec_mradds (Y0, lCY, lOY);
540 Y1 = vec_mradds (Y1, lCY, lOY);
541 Y2 = vec_mradds (Y2, lCY, lOY);
542 Y3 = vec_mradds (Y3, lCY, lOY);
544 /* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */
545 ux = vec_sl (U, lCSHIFT);
546 ux = vec_mradds (ux, lCBU, (vector signed short){0});
547 ux0 = vec_mergeh (ux,ux);
548 ux1 = vec_mergel (ux,ux);
550 /* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */
551 vx = vec_sl (V, lCSHIFT);
552 vx = vec_mradds (vx, lCRV, (vector signed short){0});
553 vx0 = vec_mergeh (vx,vx);
554 vx1 = vec_mergel (vx,vx);
555 /* uvx = ((CGU*u) + (CGV*v))>>15 */
556 uvx = vec_mradds (U, lCGU, (vector signed short){0});
557 uvx = vec_mradds (V, lCGV, uvx);
558 uvx0 = vec_mergeh (uvx,uvx);
559 uvx1 = vec_mergel (uvx,uvx);
560 R0 = vec_add (Y0,vx0);
561 G0 = vec_add (Y0,uvx0);
562 B0 = vec_add (Y0,ux0);
563 R1 = vec_add (Y1,vx1);
564 G1 = vec_add (Y1,uvx1);
565 B1 = vec_add (Y1,ux1);
566 R = vec_packclp (R0,R1);
567 G = vec_packclp (G0,G1);
568 B = vec_packclp (B0,B1);
570 out_argb(R,G,B,oute);
571 R0 = vec_add (Y2,vx0);
572 G0 = vec_add (Y2,uvx0);
573 B0 = vec_add (Y2,ux0);
574 R1 = vec_add (Y3,vx1);
575 G1 = vec_add (Y3,uvx1);
576 B1 = vec_add (Y3,ux1);
577 R = vec_packclp (R0,R1);
578 G = vec_packclp (G0,G1);
579 B = vec_packclp (B0,B1);
581 out_argb(R,G,B,outo);
589 outo += (outstrides[0])>>4;
590 oute += (outstrides[0])>>4;
592 ui += instrides_scl[1];
593 vi += instrides_scl[2];
594 y1i += instrides_scl[0];
595 y2i += instrides_scl[0];
// Instantiate the converter template for the remaining output layouts:
// altivec_yuv2_rgba(), altivec_yuv2_argb(), altivec_yuv2_rgb24() and
// altivec_yuv2_bgr24().
603 DEFCSP420_CVT (yuv2_rgba, out_rgba)
604 DEFCSP420_CVT (yuv2_argb, out_argb)
605 DEFCSP420_CVT (yuv2_rgb24, out_rgb24)
606 DEFCSP420_CVT (yuv2_bgr24, out_bgr24)
609 // uyvy|uyvy|uyvy|uyvy
610 // 0123 4567 89ab cdef
// vec_perm tables that split a vector of packed UYVY bytes into 16-bit lanes.
// Index 0x10 selects byte 0 of the second vec_perm operand, which is a zero
// vector at the call sites in altivec_uyvy_rgb32, so every selected sample is
// zero-extended to 16 bits.
// In the rows visible here demux_u and demux_v repeat each index pair twice
// (one chroma sample feeds two adjacent lanes), while demux_y uses a
// different byte for every lane.
612 const vector unsigned char
613 demux_u = {0x10,0x00,0x10,0x00,
616 0x10,0x0c,0x10,0x0c},
617 demux_v = {0x10,0x02,0x10,0x02,
620 0x10,0x0E,0x10,0x0E},
621 demux_y = {0x10,0x01,0x10,0x03,
624 0x10,0x0D,0x10,0x0F};
627 this is so I can play live CCIR raw video
// Convert packed UYVY input to 32-bit packed pixels.
// Each inner iteration loads two 16-byte UYVY vectors with vec_ld (offsets 0
// and 16 from img), splits each into Y/U/V 16-bit lanes with the demux_*
// tables, converts both halves with cvtyuvtoRGB, packs the two halves with
// vec_packclp and emits the result through out_rgba().
// Output starts at oplanes[0] + srcSliceY*outstrides[0].
// NOTE(review): the function is named "rgb32" but the byte order written is
// that of out_rgba() -- confirm this matches the destination format it is
// registered for.
629 static int altivec_uyvy_rgb32 (SwsContext *c,
630 const unsigned char **in, int *instrides,
631 int srcSliceY, int srcSliceH,
632 unsigned char **oplanes, int *outstrides)
637 vector unsigned char uyvy;
638 vector signed short Y,U,V;
639 vector signed short R0,G0,B0,R1,G1,B1;
640 vector unsigned char R,G,B;
641 vector unsigned char *out;
645 out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
648 for (j=0;j<w/16;j++) {
649 uyvy = vec_ld (0, img);
650 U = (vector signed short)
651 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
653 V = (vector signed short)
654 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
656 Y = (vector signed short)
657 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
659 cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
661 uyvy = vec_ld (16, img);
662 U = (vector signed short)
663 vec_perm (uyvy, (vector unsigned char){0}, demux_u);
665 V = (vector signed short)
666 vec_perm (uyvy, (vector unsigned char){0}, demux_v);
668 Y = (vector signed short)
669 vec_perm (uyvy, (vector unsigned char){0}, demux_y);
671 cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
673 R = vec_packclp (R0,R1);
674 G = vec_packclp (G0,G1);
675 B = vec_packclp (B0,B1);
677 // vec_mstbgr24 (R,G,B, out);
678 out_rgba (R,G,B,out);
688 /* Ok currently the acceleration routine only supports
689 inputs of widths a multiple of 16
690 and heights a multiple of 2
692 So we just fall back to the C codes for this.
// Pick an AltiVec conversion routine for the context's source and
// destination formats. The CPU flags are checked for AV_CPU_FLAG_ALTIVEC
// first. NULL is returned when the source width is not a multiple of 16 or
// when the destination format has no accelerated routine.
// Each successful selection is announced through av_log.
// NOTE(review): PIX_FMT_YUV410P shares the branch with PIX_FMT_YUV420P,
// while the planar converters step the chroma pointers by w/2 per pair of
// luma rows -- confirm 4:1:0 input is really handled correctly.
// NOTE(review): the "ALTIVEC: Color Space ..." messages are informational but
// are logged at AV_LOG_WARNING -- confirm the intended log level.
694 SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
696 if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
700 and this seems not to matter too much I tried a bunch of
701 videos with abnormal widths and MPlayer crashes elsewhere.
702 mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
703 boom with X11 bad match.
706 if ((c->srcW & 0xf) != 0) return NULL;
708 switch (c->srcFormat) {
709 case PIX_FMT_YUV410P:
710 case PIX_FMT_YUV420P:
711 /*case IMGFMT_CLPL: ??? */
715 if ((c->srcH & 0x1) != 0)
718 switch(c->dstFormat) {
720 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
721 return altivec_yuv2_rgb24;
723 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
724 return altivec_yuv2_bgr24;
726 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
727 return altivec_yuv2_argb;
729 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
730 return altivec_yuv2_abgr;
732 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
733 return altivec_yuv2_rgba;
735 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
736 return altivec_yuv2_bgra;
737 default: return NULL;
741 case PIX_FMT_UYVY422:
742 switch(c->dstFormat) {
744 av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
745 return altivec_uyvy_rgb32;
746 default: return NULL;
// Derive the fixed-point conversion coefficients from inv_table and the
// brightness / contrast / saturation settings, then store each one in the
// context broadcast across all lanes of a vector:
//   CY  (slot 0) and OY (slot 1) are the multiplier and addend applied to
//   luma by vec_mradds in the converters;
//   CRV, CBU, CGU, CGV (slots 2..5) are the chroma coefficients, with the
//   two green terms negated.
// CSHIFT is set to a splat of 2.
// The scalar values are staged in an aligned buffer (buf.tmp) and read back
// as a vector (buf.vec) so vec_splat can pick individual lanes.
754 void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
757 DECLARE_ALIGNED(16, signed short, tmp)[8];
758 vector signed short vec;
761 buf.tmp[0] = ((0xffffLL) * contrast>>8)>>9; //cy
762 buf.tmp[1] = -256*brightness; //oy
763 buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv
764 buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu
765 buf.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu
766 buf.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv
769 c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
770 c->CY = vec_splat ((vector signed short)buf.vec, 0);
771 c->OY = vec_splat ((vector signed short)buf.vec, 1);
772 c->CRV = vec_splat ((vector signed short)buf.vec, 2);
773 c->CBU = vec_splat ((vector signed short)buf.vec, 3);
774 c->CGU = vec_splat ((vector signed short)buf.vec, 4);
775 c->CGV = vec_splat ((vector signed short)buf.vec, 5);
// Vertical filter plus colour conversion, 16 output pixels per iteration.
// For each group of 16 pixels the lumFilterSize luma lines in lumSrc and the
// chrFilterSize chroma lines in chrUSrc / chrVSrc are accumulated with
// vec_mradds, using the per-line weights read from c->vYCoeffsBank and
// c->vCCoeffsBank at the offset for dstY. The sums are shifted right by SCL,
// clipped with vec_clip_s16, converted with cvtyuvtoRGB, packed with
// vec_packclp and written to dest in the layout selected by c->dstFormat.
// The visible switch handles ABGR, BGRA, RGBA, ARGB, RGB24 and BGR24; for
// any other format the visible code only logs an error.
781 ff_yuv2packedX_altivec(SwsContext *c, const int16_t *lumFilter,
782 const int16_t **lumSrc, int lumFilterSize,
783 const int16_t *chrFilter, const int16_t **chrUSrc,
784 const int16_t **chrVSrc, int chrFilterSize,
785 uint8_t *dest, int dstW, int dstY)
788 vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
789 vector signed short R0,G0,B0,R1,G1,B1;
791 vector unsigned char R,G,B;
792 vector unsigned char *out,*nout;
794 vector signed short RND = vec_splat_s16(1<<3);
795 vector unsigned short SCL = vec_splat_u16(4);
796 DECLARE_ALIGNED(16, unsigned long, scratch)[16];
798 vector signed short *YCoeffs, *CCoeffs;
800 YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
801 CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
803 out = (vector unsigned char *)dest;
805 for (i=0; i<dstW; i+=16) {
808 /* extract 16 coeffs from lumSrc */
809 for (j=0; j<lumFilterSize; j++) {
810 X0 = vec_ld (0, &lumSrc[j][i]);
811 X1 = vec_ld (16, &lumSrc[j][i]);
812 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
813 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
818 /* extract 8 coeffs from U,V */
819 for (j=0; j<chrFilterSize; j++) {
820 X = vec_ld (0, &chrUSrc[j][i/2]);
821 U = vec_mradds (X, CCoeffs[j], U);
822 X = vec_ld (0, &chrVSrc[j][i/2]);
823 V = vec_mradds (X, CCoeffs[j], V);
826 /* scale and clip signals */
827 Y0 = vec_sra (Y0, SCL);
828 Y1 = vec_sra (Y1, SCL);
829 U = vec_sra (U, SCL);
830 V = vec_sra (V, SCL);
832 Y0 = vec_clip_s16 (Y0);
833 Y1 = vec_clip_s16 (Y1);
834 U = vec_clip_s16 (U);
835 V = vec_clip_s16 (V);
838 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
839 U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7
841 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
842 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
843 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
846 U0 = vec_mergeh (U,U);
847 V0 = vec_mergeh (V,V);
849 U1 = vec_mergel (U,U);
850 V1 = vec_mergel (V,V);
852 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
853 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
855 R = vec_packclp (R0,R1);
856 G = vec_packclp (G0,G1);
857 B = vec_packclp (B0,B1);
859 switch(c->dstFormat) {
860 case PIX_FMT_ABGR: out_abgr (R,G,B,out); break;
861 case PIX_FMT_BGRA: out_bgra (R,G,B,out); break;
862 case PIX_FMT_RGBA: out_rgba (R,G,B,out); break;
863 case PIX_FMT_ARGB: out_argb (R,G,B,out); break;
864 case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
865 case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
868 /* If this is reached, the caller should have called yuv2packedXinC
870 static int printed_error_message;
871 if (!printed_error_message) {
872 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
873 sws_format_name(c->dstFormat));
874 printed_error_message=1;
886 /* extract 16 coeffs from lumSrc */
887 for (j=0; j<lumFilterSize; j++) {
888 X0 = vec_ld (0, &lumSrc[j][i]);
889 X1 = vec_ld (16, &lumSrc[j][i]);
890 Y0 = vec_mradds (X0, YCoeffs[j], Y0);
891 Y1 = vec_mradds (X1, YCoeffs[j], Y1);
896 /* extract 8 coeffs from U,V */
897 for (j=0; j<chrFilterSize; j++) {
898 X = vec_ld (0, &chrUSrc[j][i/2]);
899 U = vec_mradds (X, CCoeffs[j], U);
900 X = vec_ld (0, &chrVSrc[j][i/2]);
901 V = vec_mradds (X, CCoeffs[j], V);
904 /* scale and clip signals */
905 Y0 = vec_sra (Y0, SCL);
906 Y1 = vec_sra (Y1, SCL);
907 U = vec_sra (U, SCL);
908 V = vec_sra (V, SCL);
910 Y0 = vec_clip_s16 (Y0);
911 Y1 = vec_clip_s16 (Y1);
912 U = vec_clip_s16 (U);
913 V = vec_clip_s16 (V);
916 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
917 U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
919 Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15
920 U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7
921 V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7
924 U0 = vec_mergeh (U,U);
925 V0 = vec_mergeh (V,V);
927 U1 = vec_mergel (U,U);
928 V1 = vec_mergel (V,V);
930 cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
931 cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
933 R = vec_packclp (R0,R1);
934 G = vec_packclp (G0,G1);
935 B = vec_packclp (B0,B1);
// This second copy of the pipeline writes the converted pixels into the
// aligned scratch buffer instead of dest; the memcpy at the end moves them
// to their final position.
937 nout = (vector unsigned char *)scratch;
938 switch(c->dstFormat) {
939 case PIX_FMT_ABGR: out_abgr (R,G,B,nout); break;
940 case PIX_FMT_BGRA: out_bgra (R,G,B,nout); break;
941 case PIX_FMT_RGBA: out_rgba (R,G,B,nout); break;
942 case PIX_FMT_ARGB: out_argb (R,G,B,nout); break;
943 case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
944 case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
946 /* Unreachable, I think. */
947 av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
948 sws_format_name(c->dstFormat));
// NOTE(review): memcpy takes a byte count, but (dstW-i)/4 divides the number
// of remaining pixels by four while the destination is indexed in 4-byte
// units. (dstW-i)*4 looks like the intended length for the 32-bit formats.
// The uint32_t indexing also does not match the 3-byte RGB24 / BGR24 cases
// handled by the switch above. Confirm against the guard around this tail
// (not visible here) before changing it.
952 memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);