X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=postproc%2Frgb2rgb.c;h=7668a395c8dc484263b88350ad31111bffb0481e;hb=21316f3c84876cbc4bdd71726e36ead47c9e4694;hp=5779bf5e309363f0933046a30e466f6306bfb37e;hpb=90226a43e64e50d74932210e97fb9d4f27a01fdb;p=ffmpeg diff --git a/postproc/rgb2rgb.c b/postproc/rgb2rgb.c index 5779bf5e309..7668a395c8d 100644 --- a/postproc/rgb2rgb.c +++ b/postproc/rgb2rgb.c @@ -1,120 +1,155 @@ /* * * rgb2rgb.c, Software RGB to RGB convertor + * pluralize by Software PAL8 to RGB convertor + * Software YUV to YUV convertor + * Software YUV to RGB convertor * Written by Nick Kurshev. - * palette stuff & yuv stuff by Michael + * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL) */ #include #include "../config.h" #include "rgb2rgb.h" -#include "../mmx_defs.h" +#include "../cpudetect.h" -#ifdef HAVE_MMX +#ifdef ARCH_X86 +#define CAN_COMPILE_X86_ASM +#endif + +#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit + +#ifdef CAN_COMPILE_X86_ASM +static const uint64_t mask32b __attribute__((aligned(8))) = 0x000000FF000000FFULL; +static const uint64_t mask32g __attribute__((aligned(8))) = 0x0000FF000000FF00ULL; +static const uint64_t mask32r __attribute__((aligned(8))) = 0x00FF000000FF0000ULL; static const uint64_t mask32 __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; static const uint64_t mask24l __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; static const uint64_t mask24h __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; +static const uint64_t mask24hh __attribute__((aligned(8))) = 0xffff000000000000ULL; +static const uint64_t mask24hhh __attribute__((aligned(8))) = 0xffffffff00000000ULL; +static const uint64_t mask24hhhh __attribute__((aligned(8))) = 0xffffffffffff0000ULL; static const uint64_t mask15b __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */ static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */ static const uint64_t mask15s __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; +static const uint64_t red_16mask __attribute__((aligned(8))) = 0x0000f8000000f800ULL; +static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL; +static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; +static const uint64_t red_15mask __attribute__((aligned(8))) = 0x00007c000000f800ULL; +static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000007e0ULL; +static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; + +#ifdef FAST_BGR2YV12 +static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000000210041000DULL; +static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000FFEEFFDC0038ULL; +static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00000038FFD2FFF8ULL; +#else +static const uint64_t bgr2YCoeff __attribute__((aligned(8))) = 0x000020E540830C8BULL; +static const uint64_t bgr2UCoeff __attribute__((aligned(8))) = 0x0000ED0FDAC23831ULL; +static const uint64_t bgr2VCoeff __attribute__((aligned(8))) = 0x00003831D0E6F6EAULL; +#endif +static const uint64_t bgr2YOffset __attribute__((aligned(8))) = 0x1010101010101010ULL; +static const uint64_t bgr2UVOffset __attribute__((aligned(8)))= 0x8080808080808080ULL; +static const uint64_t w1111 __attribute__((aligned(8))) = 0x0001000100010001ULL; + +#if 0 +static volatile uint64_t __attribute__((aligned(8))) b5Dither; +static volatile uint64_t __attribute__((aligned(8))) g5Dither; +static volatile uint64_t __attribute__((aligned(8))) g6Dither; +static volatile uint64_t __attribute__((aligned(8))) r5Dither; + +static uint64_t __attribute__((aligned(8))) dither4[2]={ + 0x0103010301030103LL, + 0x0200020002000200LL,}; + +static uint64_t __attribute__((aligned(8))) dither8[2]={ + 0x0602060206020602LL, + 0x0004000400040004LL,}; +#endif #endif +#define RGB2YUV_SHIFT 8 +#define BY ((int)( 0.098*(1<>2; - for(i=0; i>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb32to16_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb32to16_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb32to16_MMX(src, dst, src_size); + else + rgb32to16_C(src, dst, src_size); +#else + rgb32to16_C(src, dst, src_size); +#endif } void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size) { - unsigned j,i,num_pixels=src_size/4; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j>3) | ((g&0xF8)<<3) | ((r&0xF8)<<7); - } +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb32to15_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb32to15_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb32to15_MMX(src, dst, src_size); + else + rgb32to15_C(src, dst, src_size); +#else + rgb32to15_C(src, dst, src_size); +#endif } void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size) { - unsigned j,i,num_pixels=src_size/3; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); - } +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb24to16_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb24to16_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb24to16_MMX(src, dst, src_size); + else + rgb24to16_C(src, dst, src_size); +#else + rgb24to16_C(src, dst, src_size); +#endif } void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size) { - unsigned j,i,num_pixels=src_size/3; - uint16_t *d = (uint16_t *)dst; - for(i=0,j=0; j>3) | ((g&0xF8)<<3) | ((r&0xF8)<<7); - } +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb24to15_MMX2(src, dst, src_size); + else if(gCpuCaps.has3DNow) + rgb24to15_3DNow(src, dst, src_size); + else if(gCpuCaps.hasMMX) + rgb24to15_MMX(src, dst, src_size); + else + rgb24to15_C(src, dst, src_size); +#else + rgb24to15_C(src, dst, src_size); +#endif } /** @@ -289,125 +293,118 @@ void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, cons for(i=0; i>1) - : "memory", "%eax" - ); - +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + yv12toyuy2_MMX2(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); + else if(gCpuCaps.has3DNow) + yv12toyuy2_3DNow(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); + else if(gCpuCaps.hasMMX) + yv12toyuy2_MMX(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); + else + yv12toyuy2_C(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride); #else - int i; - num_pixels>>=1; - for(i=0; i>1) - : "memory", "%eax" - ); +/** + * + * height should be a multiple of 2 and width should be a multiple of 2 (if this is a + * problem for anyone then tell me, and ill fix it) + * chrominance data is only taken from every secound line others are ignored FIXME write HQ version + */ +void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, + unsigned int width, unsigned int height, + unsigned int lumStride, unsigned int chromStride, unsigned int srcStride) +{ +#ifdef CAN_COMPILE_X86_ASM + // ordered per speed fasterst first + if(gCpuCaps.hasMMX2) + rgb24toyv12_MMX2(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); + else if(gCpuCaps.has3DNow) + rgb24toyv12_3DNow(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); + else if(gCpuCaps.hasMMX) + rgb24toyv12_MMX(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); + else + rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); #else - int i; - num_pixels>>=1; - for(i=0; i