X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libswscale%2Fswscale_template.c;h=9aa9bd8d417eb04fc4c8ab3c2f41ddac3cd203cd;hb=4b19045566d5b67be200eebb4eba577c865d133a;hp=49ce0d32d51c4e1edb407e9ea6c4bdad6626990f;hpb=40fa514033aa6709bce49f70214a78ad3eab33eb;p=ffmpeg diff --git a/libswscale/swscale_template.c b/libswscale/swscale_template.c index 49ce0d32d51..9aa9bd8d417 100644 --- a/libswscale/swscale_template.c +++ b/libswscale/swscale_template.c @@ -25,132 +25,128 @@ #undef MOVNTQ #undef PAVGB #undef PREFETCH -#undef PREFETCHW -#if HAVE_AMD3DNOW +#if COMPILE_TEMPLATE_AMD3DNOW #define PREFETCH "prefetch" -#define PREFETCHW "prefetchw" -#elif HAVE_MMX2 +#elif COMPILE_TEMPLATE_MMX2 #define PREFETCH "prefetchnta" -#define PREFETCHW "prefetcht0" #else #define PREFETCH " # nop" -#define PREFETCHW " # nop" #endif -#if HAVE_MMX2 +#if COMPILE_TEMPLATE_MMX2 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" -#elif HAVE_AMD3DNOW +#elif COMPILE_TEMPLATE_AMD3DNOW #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" #endif -#if HAVE_MMX2 +#if COMPILE_TEMPLATE_MMX2 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" #else #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t" #endif #define MOVNTQ(a,b) REAL_MOVNTQ(a,b) -#if HAVE_ALTIVEC +#if COMPILE_TEMPLATE_ALTIVEC #include "ppc/swscale_altivec_template.c" #endif #define YSCALEYUV2YV12X(x, offset, dest, width) \ __asm__ volatile(\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ - "lea " offset "(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - ASMALIGN(4) /* FIXME Unroll? */\ - "1: \n\t"\ - "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ - "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ - "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ - "add $16, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "pmulhw %%mm0, %%mm2 \n\t"\ - "pmulhw %%mm0, %%mm5 \n\t"\ - "paddw %%mm2, %%mm3 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - " jnz 1b \n\t"\ - "psraw $3, %%mm3 \n\t"\ - "psraw $3, %%mm4 \n\t"\ - "packuswb %%mm4, %%mm3 \n\t"\ - MOVNTQ(%%mm3, (%1, %%REGa))\ - "add $8, %%"REG_a" \n\t"\ - "cmp %2, %%"REG_a" \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ - "lea " offset "(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "jb 1b \n\t"\ - :: "r" (&c->redDither),\ - "r" (dest), "g" (width)\ - : "%"REG_a, "%"REG_d, "%"REG_S\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ + "movq %%mm3, %%mm4 \n\t"\ + "lea " offset "(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ASMALIGN(4) /* FIXME Unroll? */\ + "1: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ + "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm5 \n\t" /* srcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + " jnz 1b \n\t"\ + "psraw $3, %%mm3 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "packuswb %%mm4, %%mm3 \n\t"\ + MOVNTQ(%%mm3, (%1, %%REGa))\ + "add $8, %%"REG_a" \n\t"\ + "cmp %2, %%"REG_a" \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ + "movq %%mm3, %%mm4 \n\t"\ + "lea " offset "(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "r" (&c->redDither),\ + "r" (dest), "g" (width)\ + : "%"REG_a, "%"REG_d, "%"REG_S\ ); #define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \ __asm__ volatile(\ - "lea " offset "(%0), %%"REG_d" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - ASMALIGN(4) \ - "1: \n\t"\ - "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ - "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ - "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ - "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ - "movq %%mm0, %%mm3 \n\t"\ - "punpcklwd %%mm1, %%mm0 \n\t"\ - "punpckhwd %%mm1, %%mm3 \n\t"\ - "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ - "pmaddwd %%mm1, %%mm0 \n\t"\ - "pmaddwd %%mm1, %%mm3 \n\t"\ - "paddd %%mm0, %%mm4 \n\t"\ - "paddd %%mm3, %%mm5 \n\t"\ - "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ - "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ - "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "punpcklwd %%mm3, %%mm2 \n\t"\ - "punpckhwd %%mm3, %%mm0 \n\t"\ - "pmaddwd %%mm1, %%mm2 \n\t"\ - "pmaddwd %%mm1, %%mm0 \n\t"\ - "paddd %%mm2, %%mm6 \n\t"\ - "paddd %%mm0, %%mm7 \n\t"\ - " jnz 1b \n\t"\ - "psrad $16, %%mm4 \n\t"\ - "psrad $16, %%mm5 \n\t"\ - "psrad $16, %%mm6 \n\t"\ - "psrad $16, %%mm7 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ - "packssdw %%mm5, %%mm4 \n\t"\ - "packssdw %%mm7, %%mm6 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm0, %%mm6 \n\t"\ - "psraw $3, %%mm4 \n\t"\ - "psraw $3, %%mm6 \n\t"\ - "packuswb %%mm6, %%mm4 \n\t"\ - MOVNTQ(%%mm4, (%1, %%REGa))\ - "add $8, %%"REG_a" \n\t"\ - "cmp %2, %%"REG_a" \n\t"\ - "lea " offset "(%0), %%"REG_d" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "jb 1b \n\t"\ - :: "r" (&c->redDither),\ - "r" (dest), "g" (width)\ - : "%"REG_a, "%"REG_d, "%"REG_S\ + "lea " offset "(%0), %%"REG_d" \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + "pxor %%mm4, %%mm4 \n\t"\ + "pxor %%mm5, %%mm5 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + ASMALIGN(4) \ + "1: \n\t"\ + "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* srcData */\ + "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* srcData */\ + "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ + "movq " x "(%%"REG_S", %%"REG_a", 2), %%mm1 \n\t" /* srcData */\ + "movq %%mm0, %%mm3 \n\t"\ + "punpcklwd %%mm1, %%mm0 \n\t"\ + "punpckhwd %%mm1, %%mm3 \n\t"\ + "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm1 \n\t" /* filterCoeff */\ + "pmaddwd %%mm1, %%mm0 \n\t"\ + "pmaddwd %%mm1, %%mm3 \n\t"\ + "paddd %%mm0, %%mm4 \n\t"\ + "paddd %%mm3, %%mm5 \n\t"\ + "movq 8+" x "(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* srcData */\ + "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ + "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "punpcklwd %%mm3, %%mm2 \n\t"\ + "punpckhwd %%mm3, %%mm0 \n\t"\ + "pmaddwd %%mm1, %%mm2 \n\t"\ + "pmaddwd %%mm1, %%mm0 \n\t"\ + "paddd %%mm2, %%mm6 \n\t"\ + "paddd %%mm0, %%mm7 \n\t"\ + " jnz 1b \n\t"\ + "psrad $16, %%mm4 \n\t"\ + "psrad $16, %%mm5 \n\t"\ + "psrad $16, %%mm6 \n\t"\ + "psrad $16, %%mm7 \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ + "packssdw %%mm5, %%mm4 \n\t"\ + "packssdw %%mm7, %%mm6 \n\t"\ + "paddw %%mm0, %%mm4 \n\t"\ + "paddw %%mm0, %%mm6 \n\t"\ + "psraw $3, %%mm4 \n\t"\ + "psraw $3, %%mm6 \n\t"\ + "packuswb %%mm6, %%mm4 \n\t"\ + MOVNTQ(%%mm4, (%1, %%REGa))\ + "add $8, %%"REG_a" \n\t"\ + "cmp %2, %%"REG_a" \n\t"\ + "lea " offset "(%0), %%"REG_d" \n\t"\ + "pxor %%mm4, %%mm4 \n\t"\ + "pxor %%mm5, %%mm5 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "jb 1b \n\t"\ + :: "r" (&c->redDither),\ + "r" (dest), "g" (width)\ + : "%"REG_a, "%"REG_d, "%"REG_S\ ); #define YSCALEYUV2YV121 \ @@ -193,27 +189,27 @@ */ #define YSCALEYUV2PACKEDX_UV \ __asm__ volatile(\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - ASMALIGN(4)\ - "nop \n\t"\ - "1: \n\t"\ - "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ - "movq %%mm3, %%mm4 \n\t"\ - ASMALIGN(4)\ - "2: \n\t"\ - "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ - "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ - "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ - "add $16, %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "pmulhw %%mm0, %%mm2 \n\t"\ - "pmulhw %%mm0, %%mm5 \n\t"\ - "paddw %%mm2, %%mm3 \n\t"\ - "paddw %%mm5, %%mm4 \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - " jnz 2b \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + ASMALIGN(4)\ + "nop \n\t"\ + "1: \n\t"\ + "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ + "movq %%mm3, %%mm4 \n\t"\ + ASMALIGN(4)\ + "2: \n\t"\ + "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ + "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ + "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ + "add $16, %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "pmulhw %%mm0, %%mm2 \n\t"\ + "pmulhw %%mm0, %%mm5 \n\t"\ + "paddw %%mm2, %%mm3 \n\t"\ + "paddw %%mm5, %%mm4 \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + " jnz 2b \n\t"\ #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ "lea "offset"(%0), %%"REG_d" \n\t"\ @@ -238,62 +234,62 @@ YSCALEYUV2PACKEDX_UV \ YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ -#define YSCALEYUV2PACKEDX_END \ - :: "r" (&c->redDither), \ - "m" (dummy), "m" (dummy), "m" (dummy),\ - "r" (dest), "m" (dstW) \ - : "%"REG_a, "%"REG_d, "%"REG_S \ +#define YSCALEYUV2PACKEDX_END \ + :: "r" (&c->redDither), \ + "m" (dummy), "m" (dummy), "m" (dummy),\ + "r" (dest), "m" (dstW) \ + : "%"REG_a, "%"REG_d, "%"REG_S \ ); #define YSCALEYUV2PACKEDX_ACCURATE_UV \ __asm__ volatile(\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - ASMALIGN(4)\ - "nop \n\t"\ - "1: \n\t"\ - "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ - "mov (%%"REG_d"), %%"REG_S" \n\t"\ - "pxor %%mm4, %%mm4 \n\t"\ - "pxor %%mm5, %%mm5 \n\t"\ - "pxor %%mm6, %%mm6 \n\t"\ - "pxor %%mm7, %%mm7 \n\t"\ - ASMALIGN(4)\ - "2: \n\t"\ - "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ - "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ - "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ - "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ - "movq %%mm0, %%mm3 \n\t"\ - "punpcklwd %%mm1, %%mm0 \n\t"\ - "punpckhwd %%mm1, %%mm3 \n\t"\ - "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ - "pmaddwd %%mm1, %%mm0 \n\t"\ - "pmaddwd %%mm1, %%mm3 \n\t"\ - "paddd %%mm0, %%mm4 \n\t"\ - "paddd %%mm3, %%mm5 \n\t"\ - "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ - "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ - "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ - "test %%"REG_S", %%"REG_S" \n\t"\ - "movq %%mm2, %%mm0 \n\t"\ - "punpcklwd %%mm3, %%mm2 \n\t"\ - "punpckhwd %%mm3, %%mm0 \n\t"\ - "pmaddwd %%mm1, %%mm2 \n\t"\ - "pmaddwd %%mm1, %%mm0 \n\t"\ - "paddd %%mm2, %%mm6 \n\t"\ - "paddd %%mm0, %%mm7 \n\t"\ - " jnz 2b \n\t"\ - "psrad $16, %%mm4 \n\t"\ - "psrad $16, %%mm5 \n\t"\ - "psrad $16, %%mm6 \n\t"\ - "psrad $16, %%mm7 \n\t"\ - "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ - "packssdw %%mm5, %%mm4 \n\t"\ - "packssdw %%mm7, %%mm6 \n\t"\ - "paddw %%mm0, %%mm4 \n\t"\ - "paddw %%mm0, %%mm6 \n\t"\ - "movq %%mm4, "U_TEMP"(%0) \n\t"\ - "movq %%mm6, "V_TEMP"(%0) \n\t"\ + "xor %%"REG_a", %%"REG_a" \n\t"\ + ASMALIGN(4)\ + "nop \n\t"\ + "1: \n\t"\ + "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ + "mov (%%"REG_d"), %%"REG_S" \n\t"\ + "pxor %%mm4, %%mm4 \n\t"\ + "pxor %%mm5, %%mm5 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + ASMALIGN(4)\ + "2: \n\t"\ + "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ + "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ + "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ + "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ + "movq %%mm0, %%mm3 \n\t"\ + "punpcklwd %%mm1, %%mm0 \n\t"\ + "punpckhwd %%mm1, %%mm3 \n\t"\ + "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ + "pmaddwd %%mm1, %%mm0 \n\t"\ + "pmaddwd %%mm1, %%mm3 \n\t"\ + "paddd %%mm0, %%mm4 \n\t"\ + "paddd %%mm3, %%mm5 \n\t"\ + "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ + "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ + "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ + "test %%"REG_S", %%"REG_S" \n\t"\ + "movq %%mm2, %%mm0 \n\t"\ + "punpcklwd %%mm3, %%mm2 \n\t"\ + "punpckhwd %%mm3, %%mm0 \n\t"\ + "pmaddwd %%mm1, %%mm2 \n\t"\ + "pmaddwd %%mm1, %%mm0 \n\t"\ + "paddd %%mm2, %%mm6 \n\t"\ + "paddd %%mm0, %%mm7 \n\t"\ + " jnz 2b \n\t"\ + "psrad $16, %%mm4 \n\t"\ + "psrad $16, %%mm5 \n\t"\ + "psrad $16, %%mm6 \n\t"\ + "psrad $16, %%mm7 \n\t"\ + "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\ + "packssdw %%mm5, %%mm4 \n\t"\ + "packssdw %%mm7, %%mm6 \n\t"\ + "paddw %%mm0, %%mm4 \n\t"\ + "paddw %%mm0, %%mm6 \n\t"\ + "movq %%mm4, "U_TEMP"(%0) \n\t"\ + "movq %%mm6, "V_TEMP"(%0) \n\t"\ #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ "lea "offset"(%0), %%"REG_d" \n\t"\ @@ -351,14 +347,14 @@ "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\ "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\ "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\ -/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ + /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\ "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\ "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\ "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\ "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\ "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\ "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\ -/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ + /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ "paddw %%mm3, %%mm4 \n\t"\ "movq %%mm2, %%mm0 \n\t"\ "movq %%mm5, %%mm6 \n\t"\ @@ -875,7 +871,7 @@ "cmp "#dstw", "#index" \n\t"\ " jb 1b \n\t" -#if HAVE_MMX2 +#if COMPILE_TEMPLATE_MMX2 #undef WRITEBGR24 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX2(dst, dstw, index) #else @@ -905,24 +901,24 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) { -#if HAVE_MMX - if(!(c->flags & SWS_BITEXACT)){ - if (c->flags & SWS_ACCURATE_RND){ - if (uDest){ +#if COMPILE_TEMPLATE_MMX + if(!(c->flags & SWS_BITEXACT)) { + if (c->flags & SWS_ACCURATE_RND) { + if (uDest) { YSCALEYUV2YV12X_ACCURATE( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) } - if (CONFIG_SWSCALE_ALPHA && aDest){ + if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X_ACCURATE( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) } YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW) - }else{ - if (uDest){ + } else { + if (uDest) { YSCALEYUV2YV12X( "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW) YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW) } - if (CONFIG_SWSCALE_ALPHA && aDest){ + if (CONFIG_SWSCALE_ALPHA && aDest) { YSCALEYUV2YV12X( "0", ALP_MMX_FILTER_OFFSET, aDest, dstW) } @@ -931,40 +927,40 @@ static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, con return; } #endif -#if HAVE_ALTIVEC -yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, - chrFilter, chrSrc, chrFilterSize, - dest, uDest, vDest, dstW, chrDstW); -#else //HAVE_ALTIVEC -yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, - chrFilter, chrSrc, chrFilterSize, - alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW); -#endif //!HAVE_ALTIVEC +#if COMPILE_TEMPLATE_ALTIVEC + yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize, + chrFilter, chrSrc, chrFilterSize, + dest, uDest, vDest, dstW, chrDstW); +#else //COMPILE_TEMPLATE_ALTIVEC + yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize, + chrFilter, chrSrc, chrFilterSize, + alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW); +#endif //!COMPILE_TEMPLATE_ALTIVEC } static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, - uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat) + uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat) { -yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, - chrFilter, chrSrc, chrFilterSize, - dest, uDest, dstW, chrDstW, dstFormat); + yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, + chrFilter, chrSrc, chrFilterSize, + dest, uDest, dstW, chrDstW, dstFormat); } static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc, uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW) { int i; -#if HAVE_MMX - if(!(c->flags & SWS_BITEXACT)){ +#if COMPILE_TEMPLATE_MMX + if(!(c->flags & SWS_BITEXACT)) { long p= 4; uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW}; uint8_t *dst[4]= {aDest, dest, uDest, vDest}; x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW}; - if (c->flags & SWS_ACCURATE_RND){ - while(p--){ - if (dst[p]){ + if (c->flags & SWS_ACCURATE_RND) { + while(p--) { + if (dst[p]) { __asm__ volatile( YSCALEYUV2YV121_ACCURATE :: "r" (src[p]), "r" (dst[p] + counter[p]), @@ -973,9 +969,9 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const ); } } - }else{ - while(p--){ - if (dst[p]){ + } else { + while(p--) { + if (dst[p]) { __asm__ volatile( YSCALEYUV2YV121 :: "r" (src[p]), "r" (dst[p] + counter[p]), @@ -988,11 +984,10 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const return; } #endif - for (i=0; i>7; - if (val&256){ + if (val&256) { if (val<0) val=0; else val=255; } @@ -1001,12 +996,11 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const } if (uDest) - for (i=0; i>7; int v=(chrSrc[i + VOFW]+64)>>7; - if ((u|v)&256){ + if ((u|v)&256) { if (u<0) u=0; else if (u>255) u=255; if (v<0) v=0; @@ -1018,7 +1012,7 @@ static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const } if (CONFIG_SWSCALE_ALPHA && aDest) - for (i=0; i>7; aDest[i]= av_clip_uint8(val); } @@ -1032,13 +1026,13 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY) { -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX x86_reg dummy=0; - if(!(c->flags & SWS_BITEXACT)){ - if (c->flags & SWS_ACCURATE_RND){ - switch(c->dstFormat){ + if(!(c->flags & SWS_BITEXACT)) { + if (c->flags & SWS_ACCURATE_RND) { + switch(c->dstFormat) { case PIX_FMT_RGB32: - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "movq %%mm2, "U_TEMP"(%0) \n\t" @@ -1052,7 +1046,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) YSCALEYUV2PACKEDX_END - }else{ + } else { YSCALEYUV2PACKEDX_ACCURATE YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" @@ -1116,11 +1110,10 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, YSCALEYUV2PACKEDX_END return; } - }else{ - switch(c->dstFormat) - { + } else { + switch(c->dstFormat) { case PIX_FMT_RGB32: - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { YSCALEYUV2PACKEDX YSCALEYUV2RGBX YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) @@ -1129,7 +1122,7 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, "packuswb %%mm7, %%mm1 \n\t" WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) YSCALEYUV2PACKEDX_END - }else{ + } else { YSCALEYUV2PACKEDX YSCALEYUV2RGBX "pcmpeqd %%mm7, %%mm7 \n\t" @@ -1193,14 +1186,14 @@ static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, } } } -#endif /* HAVE_MMX */ -#if HAVE_ALTIVEC +#endif /* COMPILE_TEMPLATE_MMX */ +#if COMPILE_TEMPLATE_ALTIVEC /* The following list of supported dstFormat values should match what's found in the body of ff_yuv2packedX_altivec() */ if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf && - (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || - c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || - c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) + (c->dstFormat==PIX_FMT_ABGR || c->dstFormat==PIX_FMT_BGRA || + c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 || + c->dstFormat==PIX_FMT_RGBA || c->dstFormat==PIX_FMT_ARGB)) ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, chrFilter, chrSrc, chrFilterSize, dest, dstW, dstY); @@ -1221,31 +1214,30 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons int uvalpha1=4095-uvalpha; int i; -#if HAVE_MMX - if(!(c->flags & SWS_BITEXACT)){ - switch(c->dstFormat) - { - //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( - case PIX_FMT_RGB32: - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ +#if COMPILE_TEMPLATE_MMX + if(!(c->flags & SWS_BITEXACT)) { + switch(c->dstFormat) { + //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :( + case PIX_FMT_RGB32: + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { #if ARCH_X86_64 - __asm__ volatile( - YSCALEYUV2RGB(%%REGBP, %5) - YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7) + __asm__ volatile( + YSCALEYUV2RGB(%%r8, %5) + YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ "packuswb %%mm7, %%mm1 \n\t" - WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) + WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest), "a" (&c->redDither) ,"r" (abuf0), "r" (abuf1) - : "%"REG_BP - ); + : "%r8" + ); #else - *(uint16_t **)(&c->u_temp)=abuf0; - *(uint16_t **)(&c->v_temp)=abuf1; - __asm__ volatile( + *(uint16_t **)(&c->u_temp)=abuf0; + *(uint16_t **)(&c->v_temp)=abuf1; + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" @@ -1266,10 +1258,10 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); + ); #endif - }else{ - __asm__ volatile( + } else { + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" @@ -1281,11 +1273,11 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - } - return; - case PIX_FMT_BGR24: - __asm__ volatile( + ); + } + return; + case PIX_FMT_BGR24: + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" @@ -1296,10 +1288,10 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - return; - case PIX_FMT_RGB555: - __asm__ volatile( + ); + return; + case PIX_FMT_RGB555: + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" @@ -1318,10 +1310,10 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - return; - case PIX_FMT_RGB565: - __asm__ volatile( + ); + return; + case PIX_FMT_RGB565: + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" @@ -1339,10 +1331,10 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - return; - case PIX_FMT_YUYV422: - __asm__ volatile( + ); + return; + case PIX_FMT_YUYV422: + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" @@ -1352,20 +1344,20 @@ static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, cons "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - return; - default: break; + ); + return; + default: break; } } -#endif //HAVE_MMX -YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) +#endif //COMPILE_TEMPLATE_MMX + YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C) } /** * YV12 to RGB without scaling or interpolating */ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1, - const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y) + const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y) { const int yalpha1=0; int i; @@ -1373,238 +1365,231 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1 const int yalpha= 4096; //FIXME ... - if (flags&SWS_FULL_CHR_H_INT) - { + if (flags&SWS_FULL_CHR_H_INT) { c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y); return; } -#if HAVE_MMX - if(!(flags & SWS_BITEXACT)){ - if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster - { - switch(dstFormat) - { +#if COMPILE_TEMPLATE_MMX + if(!(flags & SWS_BITEXACT)) { + if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster + switch(dstFormat) { case PIX_FMT_RGB32: - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + YSCALEYUV2RGB1_ALPHA(%%REGBP) + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); - }else{ + } else { __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + } + return; + case PIX_FMT_BGR24: + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2RGB1(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pxor %%mm7, %%mm7 \n\t" + WRITEBGR24(%%REGb, 8280(%5), %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - } - return; - case PIX_FMT_BGR24: - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) ); return; case PIX_FMT_RGB555: __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ #ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + WRITERGB15(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); return; case PIX_FMT_RGB565: __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ #ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + WRITERGB16(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); return; case PIX_FMT_YUYV422: __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2PACKED1(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) - ); - return; - } - } - else - { - switch(dstFormat) - { - case PIX_FMT_RGB32: - if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){ - __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - YSCALEYUV2RGB1_ALPHA(%%REGBP) - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + YSCALEYUV2PACKED1(%%REGBP, %5) + WRITEYUY2(%%REGb, 8280(%5), %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) + ); + return; + } + } else { + switch(dstFormat) { + case PIX_FMT_RGB32: + if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) { + __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + YSCALEYUV2RGB1_ALPHA(%%REGBP) + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); - }else{ + } else { __asm__ volatile( + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pcmpeqd %%mm7, %%mm7 \n\t" + WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) + ); + } + return; + case PIX_FMT_BGR24: + __asm__ volatile( "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" "mov %4, %%"REG_b" \n\t" "push %%"REG_BP" \n\t" YSCALEYUV2RGB1b(%%REGBP, %5) - "pcmpeqd %%mm7, %%mm7 \n\t" - WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) + "pxor %%mm7, %%mm7 \n\t" + WRITEBGR24(%%REGb, 8280(%5), %%REGBP) "pop %%"REG_BP" \n\t" "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), "a" (&c->redDither) - ); - } - return; - case PIX_FMT_BGR24: - __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - WRITEBGR24(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) ); return; case PIX_FMT_RGB555: __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ #ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB15(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + WRITERGB15(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); return; case PIX_FMT_RGB565: __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2RGB1b(%%REGBP, %5) - "pxor %%mm7, %%mm7 \n\t" - /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2RGB1b(%%REGBP, %5) + "pxor %%mm7, %%mm7 \n\t" + /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ #ifdef DITHER1XBPP - "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" - "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" - "paddusb "RED_DITHER"(%5), %%mm5 \n\t" + "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" + "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" + "paddusb "RED_DITHER"(%5), %%mm5 \n\t" #endif - WRITERGB16(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + WRITERGB16(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); return; case PIX_FMT_YUYV422: __asm__ volatile( - "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" - "mov %4, %%"REG_b" \n\t" - "push %%"REG_BP" \n\t" - YSCALEYUV2PACKED1b(%%REGBP, %5) - WRITEYUY2(%%REGb, 8280(%5), %%REGBP) - "pop %%"REG_BP" \n\t" - "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" + "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" + "mov %4, %%"REG_b" \n\t" + "push %%"REG_BP" \n\t" + YSCALEYUV2PACKED1b(%%REGBP, %5) + WRITEYUY2(%%REGb, 8280(%5), %%REGBP) + "pop %%"REG_BP" \n\t" + "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" - :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), - "a" (&c->redDither) + :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest), + "a" (&c->redDither) ); return; } } } -#endif /* HAVE_MMX */ - if (uvalpha < 2048) - { +#endif /* COMPILE_TEMPLATE_MMX */ + if (uvalpha < 2048) { YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) - }else{ + } else { YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C) } } @@ -1613,21 +1598,21 @@ static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, cons static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) { -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX __asm__ volatile( - "movq "MANGLE(bm01010101)", %%mm2 \n\t" - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",2), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" - "pand %%mm2, %%mm0 \n\t" - "pand %%mm2, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, (%2, %%"REG_a") \n\t" - "add $8, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) - : "%"REG_a + "movq "MANGLE(bm01010101)", %%mm2 \n\t" + "mov %0, %%"REG_a" \n\t" + "1: \n\t" + "movq (%1, %%"REG_a",2), %%mm0 \n\t" + "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" + "pand %%mm2, %%mm0 \n\t" + "pand %%mm2, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "movq %%mm0, (%2, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width) + : "%"REG_a ); #else int i; @@ -1638,32 +1623,31 @@ static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) { -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX __asm__ volatile( - "movq "MANGLE(bm01010101)", %%mm4 \n\t" - "mov %0, %%"REG_a" \n\t" - "1: \n\t" - "movq (%1, %%"REG_a",4), %%mm0 \n\t" - "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "psrlw $8, %%mm1 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "movq %%mm0, %%mm1 \n\t" - "psrlw $8, %%mm0 \n\t" - "pand %%mm4, %%mm1 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "packuswb %%mm1, %%mm1 \n\t" - "movd %%mm0, (%3, %%"REG_a") \n\t" - "movd %%mm1, (%2, %%"REG_a") \n\t" - "add $4, %%"REG_a" \n\t" - " js 1b \n\t" - : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) - : "%"REG_a + "movq "MANGLE(bm01010101)", %%mm4 \n\t" + "mov %0, %%"REG_a" \n\t" + "1: \n\t" + "movq (%1, %%"REG_a",4), %%mm0 \n\t" + "movq 8(%1, %%"REG_a",4), %%mm1 \n\t" + "psrlw $8, %%mm0 \n\t" + "psrlw $8, %%mm1 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "movq %%mm0, %%mm1 \n\t" + "psrlw $8, %%mm0 \n\t" + "pand %%mm4, %%mm1 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "packuswb %%mm1, %%mm1 \n\t" + "movd %%mm0, (%3, %%"REG_a") \n\t" + "movd %%mm1, (%2, %%"REG_a") \n\t" + "add $4, %%"REG_a" \n\t" + " js 1b \n\t" + : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width) + : "%"REG_a ); #else int i; - for (i=0; i>shb)&maskb;\ - int g= (((const type*)src)[i]>>shg)&maskg;\ - int r= (((const type*)src)[i]>>shr)&maskr;\ -\ - dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\ - }\ +static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) +{ +#if COMPILE_TEMPLATE_MMX + __asm__ volatile( + "movq "MANGLE(bm01010101)", %%mm4 \n\t" + "mov %0, %%"REG_a" \n\t" + "1: \n\t" + "movq (%1, %%"REG_a",2), %%mm0 \n\t" + "movq 8(%1, %%"REG_a",2), %%mm1 \n\t" + "movq (%2, %%"REG_a",2), %%mm2 \n\t" + "movq 8(%2, %%"REG_a",2), %%mm3 \n\t" + "pand %%mm4, %%mm0 \n\t" + "pand %%mm4, %%mm1 \n\t" + "pand %%mm4, %%mm2 \n\t" + "pand %%mm4, %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "packuswb %%mm3, %%mm2 \n\t" + "movq %%mm0, (%3, %%"REG_a") \n\t" + "movq %%mm2, (%4, %%"REG_a") \n\t" + "add $8, %%"REG_a" \n\t" + " js 1b \n\t" + : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width) + : "%"REG_a + ); +#else + int i; + for (i=0; i>shb;\ - int g= (((const type*)src)[i]&maskg)>>shg;\ - int r= (((const type*)src)[i]&maskr)>>shr;\ -\ - dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\ - dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\ - }\ -}\ -static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\ -{\ - int i;\ - for (i=0; i>shb;\ - int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\ - g&= maskg|(2*maskg);\ -\ - g>>=shg;\ -\ - dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\ - dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\ - }\ +static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV, + const uint8_t *src1, const uint8_t *src2, + long width, uint32_t *unused) +{ + RENAME(nvXXtoUV)(dstU, dstV, src1, width); } -BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8) -BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU , BU<< 8, RV<< 8, GV , BV<< 8, RGB2YUV_SHIFT+8) -BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU , RV<<11, GV<<5, BV , RGB2YUV_SHIFT+8) -BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU , RV<<10, GV<<5, BV , RGB2YUV_SHIFT+7) -BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU , GU<<5, BU<<11, RV , GV<<5, BV<<11, RGB2YUV_SHIFT+8) -BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU , GU<<5, BU<<10, RV , GV<<5, BV<<10, RGB2YUV_SHIFT+7) +static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV, + const uint8_t *src1, const uint8_t *src2, + long width, uint32_t *unused) +{ + RENAME(nvXXtoUV)(dstV, dstU, src1, width); +} -#if HAVE_MMX -static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat) +#if COMPILE_TEMPLATE_MMX +static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat) { - if(srcFormat == PIX_FMT_BGR24){ + if(srcFormat == PIX_FMT_BGR24) { __asm__ volatile( "movq "MANGLE(ff_bgr24toY1Coeff)", %%mm5 \n\t" "movq "MANGLE(ff_bgr24toY2Coeff)", %%mm6 \n\t" : ); - }else{ + } else { __asm__ volatile( "movq "MANGLE(ff_rgb24toY1Coeff)", %%mm5 \n\t" "movq "MANGLE(ff_rgb24toY2Coeff)", %%mm6 \n\t" @@ -1854,7 +1882,7 @@ static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long w ); } -static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat) +static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat) { __asm__ volatile( "movq 24+%4, %%mm6 \n\t" @@ -1915,29 +1943,27 @@ static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uin static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) { -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24); #else int i; - for (i=0; i>RGB2YUV_SHIFT); } -#endif /* HAVE_MMX */ +#endif /* COMPILE_TEMPLATE_MMX */ } static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) { -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24); #else int i; - for (i=0; i>RGB2YUV_SHIFT; dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT; } -#endif /* HAVE_MMX */ +#endif /* COMPILE_TEMPLATE_MMX */ assert(src1 == src2); } static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused) { int i; - for (i=0; i>8; - dstV[i]= p>>16; - } -} - -static inline void RENAME(monowhite2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) -{ - int i, j; - for (i=0; i>(7-j))&1)*255; - } -} - -static inline void RENAME(monoblack2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused) -{ - int i, j; - for (i=0; i>(7-j))&1)*255; - } -} - // bilinear / bicubic scaling static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc, const int16_t *filter, const int16_t *filterPos, long filterSize) { -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX assert(filterSize % 4 == 0 && filterSize>0); - if (filterSize==4) // Always true for upscaling, sometimes for down, too. - { + if (filterSize==4) { // Always true for upscaling, sometimes for down, too. x86_reg counter= -2*dstW; filter-= counter*2; filterPos-= counter/2; dst-= counter/2; __asm__ volatile( #if defined(PIC) - "push %%"REG_b" \n\t" + "push %%"REG_b" \n\t" #endif - "pxor %%mm7, %%mm7 \n\t" - "push %%"REG_BP" \n\t" // we use 7 regs here ... - "mov %%"REG_a", %%"REG_BP" \n\t" - ASMALIGN(4) - "1: \n\t" - "movzwl (%2, %%"REG_BP"), %%eax \n\t" - "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" - "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" - "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" - "movd (%3, %%"REG_a"), %%mm0 \n\t" - "movd (%3, %%"REG_b"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "pmaddwd %%mm1, %%mm0 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "movq %%mm0, %%mm4 \n\t" - "punpckldq %%mm3, %%mm0 \n\t" - "punpckhdq %%mm3, %%mm4 \n\t" - "paddd %%mm4, %%mm0 \n\t" - "psrad $7, %%mm0 \n\t" - "packssdw %%mm0, %%mm0 \n\t" - "movd %%mm0, (%4, %%"REG_BP") \n\t" - "add $4, %%"REG_BP" \n\t" - " jnc 1b \n\t" - - "pop %%"REG_BP" \n\t" + "pxor %%mm7, %%mm7 \n\t" + "push %%"REG_BP" \n\t" // we use 7 regs here ... + "mov %%"REG_a", %%"REG_BP" \n\t" + ASMALIGN(4) + "1: \n\t" + "movzwl (%2, %%"REG_BP"), %%eax \n\t" + "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" + "movq (%1, %%"REG_BP", 4), %%mm1 \n\t" + "movq 8(%1, %%"REG_BP", 4), %%mm3 \n\t" + "movd (%3, %%"REG_a"), %%mm0 \n\t" + "movd (%3, %%"REG_b"), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpckldq %%mm3, %%mm0 \n\t" + "punpckhdq %%mm3, %%mm4 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "psrad $7, %%mm0 \n\t" + "packssdw %%mm0, %%mm0 \n\t" + "movd %%mm0, (%4, %%"REG_BP") \n\t" + "add $4, %%"REG_BP" \n\t" + " jnc 1b \n\t" + + "pop %%"REG_BP" \n\t" #if defined(PIC) - "pop %%"REG_b" \n\t" + "pop %%"REG_b" \n\t" #endif - : "+a" (counter) - : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) + : "+a" (counter) + : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) #if !defined(PIC) - : "%"REG_b + : "%"REG_b #endif ); - } - else if (filterSize==8) - { + } else if (filterSize==8) { x86_reg counter= -2*dstW; filter-= counter*4; filterPos-= counter/2; dst-= counter/2; __asm__ volatile( #if defined(PIC) - "push %%"REG_b" \n\t" + "push %%"REG_b" \n\t" #endif - "pxor %%mm7, %%mm7 \n\t" - "push %%"REG_BP" \n\t" // we use 7 regs here ... - "mov %%"REG_a", %%"REG_BP" \n\t" - ASMALIGN(4) - "1: \n\t" - "movzwl (%2, %%"REG_BP"), %%eax \n\t" - "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" - "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" - "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" - "movd (%3, %%"REG_a"), %%mm0 \n\t" - "movd (%3, %%"REG_b"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "pmaddwd %%mm1, %%mm0 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - - "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" - "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" - "movd 4(%3, %%"REG_a"), %%mm4 \n\t" - "movd 4(%3, %%"REG_b"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "pmaddwd %%mm1, %%mm4 \n\t" - "pmaddwd %%mm2, %%mm5 \n\t" - "paddd %%mm4, %%mm0 \n\t" - "paddd %%mm5, %%mm3 \n\t" - "movq %%mm0, %%mm4 \n\t" - "punpckldq %%mm3, %%mm0 \n\t" - "punpckhdq %%mm3, %%mm4 \n\t" - "paddd %%mm4, %%mm0 \n\t" - "psrad $7, %%mm0 \n\t" - "packssdw %%mm0, %%mm0 \n\t" - "movd %%mm0, (%4, %%"REG_BP") \n\t" - "add $4, %%"REG_BP" \n\t" - " jnc 1b \n\t" - - "pop %%"REG_BP" \n\t" + "pxor %%mm7, %%mm7 \n\t" + "push %%"REG_BP" \n\t" // we use 7 regs here ... + "mov %%"REG_a", %%"REG_BP" \n\t" + ASMALIGN(4) + "1: \n\t" + "movzwl (%2, %%"REG_BP"), %%eax \n\t" + "movzwl 2(%2, %%"REG_BP"), %%ebx \n\t" + "movq (%1, %%"REG_BP", 8), %%mm1 \n\t" + "movq 16(%1, %%"REG_BP", 8), %%mm3 \n\t" + "movd (%3, %%"REG_a"), %%mm0 \n\t" + "movd (%3, %%"REG_b"), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + + "movq 8(%1, %%"REG_BP", 8), %%mm1 \n\t" + "movq 24(%1, %%"REG_BP", 8), %%mm5 \n\t" + "movd 4(%3, %%"REG_a"), %%mm4 \n\t" + "movd 4(%3, %%"REG_b"), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm4 \n\t" + "pmaddwd %%mm2, %%mm5 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "paddd %%mm5, %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "punpckldq %%mm3, %%mm0 \n\t" + "punpckhdq %%mm3, %%mm4 \n\t" + "paddd %%mm4, %%mm0 \n\t" + "psrad $7, %%mm0 \n\t" + "packssdw %%mm0, %%mm0 \n\t" + "movd %%mm0, (%4, %%"REG_BP") \n\t" + "add $4, %%"REG_BP" \n\t" + " jnc 1b \n\t" + + "pop %%"REG_BP" \n\t" #if defined(PIC) - "pop %%"REG_b" \n\t" + "pop %%"REG_b" \n\t" #endif - : "+a" (counter) - : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) + : "+a" (counter) + : "c" (filter), "d" (filterPos), "S" (src), "D" (dst) #if !defined(PIC) - : "%"REG_b + : "%"REG_b #endif ); - } - else - { + } else { uint8_t *offset = src+filterSize; x86_reg counter= -2*dstW; //filter-= counter*filterSize/2; filterPos-= counter/2; dst-= counter/2; __asm__ volatile( - "pxor %%mm7, %%mm7 \n\t" - ASMALIGN(4) - "1: \n\t" - "mov %2, %%"REG_c" \n\t" - "movzwl (%%"REG_c", %0), %%eax \n\t" - "movzwl 2(%%"REG_c", %0), %%edx \n\t" - "mov %5, %%"REG_c" \n\t" - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" - "2: \n\t" - "movq (%1), %%mm1 \n\t" - "movq (%1, %6), %%mm3 \n\t" - "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" - "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "pmaddwd %%mm1, %%mm0 \n\t" - "pmaddwd %%mm2, %%mm3 \n\t" - "paddd %%mm3, %%mm5 \n\t" - "paddd %%mm0, %%mm4 \n\t" - "add $8, %1 \n\t" - "add $4, %%"REG_c" \n\t" - "cmp %4, %%"REG_c" \n\t" - " jb 2b \n\t" - "add %6, %1 \n\t" - "movq %%mm4, %%mm0 \n\t" - "punpckldq %%mm5, %%mm4 \n\t" - "punpckhdq %%mm5, %%mm0 \n\t" - "paddd %%mm0, %%mm4 \n\t" - "psrad $7, %%mm4 \n\t" - "packssdw %%mm4, %%mm4 \n\t" - "mov %3, %%"REG_a" \n\t" - "movd %%mm4, (%%"REG_a", %0) \n\t" - "add $4, %0 \n\t" - " jnc 1b \n\t" - - : "+r" (counter), "+r" (filter) - : "m" (filterPos), "m" (dst), "m"(offset), - "m" (src), "r" ((x86_reg)filterSize*2) - : "%"REG_a, "%"REG_c, "%"REG_d + "pxor %%mm7, %%mm7 \n\t" + ASMALIGN(4) + "1: \n\t" + "mov %2, %%"REG_c" \n\t" + "movzwl (%%"REG_c", %0), %%eax \n\t" + "movzwl 2(%%"REG_c", %0), %%edx \n\t" + "mov %5, %%"REG_c" \n\t" + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" + "2: \n\t" + "movq (%1), %%mm1 \n\t" + "movq (%1, %6), %%mm3 \n\t" + "movd (%%"REG_c", %%"REG_a"), %%mm0 \n\t" + "movd (%%"REG_c", %%"REG_d"), %%mm2 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "pmaddwd %%mm1, %%mm0 \n\t" + "pmaddwd %%mm2, %%mm3 \n\t" + "paddd %%mm3, %%mm5 \n\t" + "paddd %%mm0, %%mm4 \n\t" + "add $8, %1 \n\t" + "add $4, %%"REG_c" \n\t" + "cmp %4, %%"REG_c" \n\t" + " jb 2b \n\t" + "add %6, %1 \n\t" + "movq %%mm4, %%mm0 \n\t" + "punpckldq %%mm5, %%mm4 \n\t" + "punpckhdq %%mm5, %%mm0 \n\t" + "paddd %%mm0, %%mm4 \n\t" + "psrad $7, %%mm4 \n\t" + "packssdw %%mm4, %%mm4 \n\t" + "mov %3, %%"REG_a" \n\t" + "movd %%mm4, (%%"REG_a", %0) \n\t" + "add $4, %0 \n\t" + " jnc 1b \n\t" + + : "+r" (counter), "+r" (filter) + : "m" (filterPos), "m" (dst), "m"(offset), + "m" (src), "r" ((x86_reg)filterSize*2) + : "%"REG_a, "%"REG_c, "%"REG_d ); } #else -#if HAVE_ALTIVEC +#if COMPILE_TEMPLATE_ALTIVEC hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize); #else int i; - for (i=0; i>7, (1<<15)-1); // the cubic equation does overflow ... //dst[i] = val>>7; } -#endif /* HAVE_ALTIVEC */ -#endif /* HAVE_MMX */ +#endif /* COMPILE_ALTIVEC */ +#endif /* COMPILE_MMX */ } -static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, - int dstWidth, const uint8_t *src, int srcW, - int xInc) +//FIXME all pal and rgb srcFormats could do this convertion as well +//FIXME all scalers more complex than bilinear could do half of this transform +static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width) { int i; - unsigned int xpos=0; - for (i=0;i>16; - register unsigned int xalpha=(xpos&0xFFFF)>>9; - dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; - xpos+=xInc; + for (i = 0; i < width; i++) { + dst[i ] = (FFMIN(dst[i ],30775)*4663 - 9289992)>>12; //-264 + dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264 } } - - // *** horizontal scale Y line to temp buffer -static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc, - int flags, const int16_t *hLumFilter, - const int16_t *hLumFilterPos, int hLumFilterSize, - int srcFormat, uint8_t *formatConvBuffer, - uint32_t *pal, int isAlpha) +static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width) { - int32_t *mmx2FilterPos = c->lumMmx2FilterPos; - int16_t *mmx2Filter = c->lumMmx2Filter; - int canMMX2BeUsed = c->canMMX2BeUsed; - void *funnyYCode = c->funnyYCode; - - if (isAlpha) { - if (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32 ) - src += 3; - } else { - if (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1) - src += ALT32_CORR; + int i; + for (i = 0; i < width; i++) { + dst[i ] = (dst[i ]*1799 + 4081085)>>11; //1469 + dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469 } +} +static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width) +{ + int i; + for (i = 0; i < width; i++) + dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14; +} +static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width) +{ + int i; + for (i = 0; i < width; i++) + dst[i] = (dst[i]*14071 + 33561947)>>14; +} - if (c->hyscale_internal) { - c->hyscale_internal(formatConvBuffer, src, srcW, pal); - src= formatConvBuffer; - } +#define FAST_BILINEAR_X86 \ + "subl %%edi, %%esi \n\t" /* src[xx+1] - src[xx] */ \ + "imull %%ecx, %%esi \n\t" /* (src[xx+1] - src[xx])*xalpha */ \ + "shll $16, %%edi \n\t" \ + "addl %%edi, %%esi \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */ \ + "mov %1, %%"REG_D"\n\t" \ + "shrl $9, %%esi \n\t" \ -#if HAVE_MMX - // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). - if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) -#else - if (!(flags&SWS_FAST_BILINEAR)) -#endif - { - c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); - } - else // fast bilinear upscale / crap downscale - { +static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, + long dstWidth, const uint8_t *src, int srcW, + int xInc) +{ #if ARCH_X86 && CONFIG_GPL -#if HAVE_MMX2 - int i; +#if COMPILE_TEMPLATE_MMX2 + int32_t *mmx2FilterPos = c->lumMmx2FilterPos; + int16_t *mmx2Filter = c->lumMmx2Filter; + int canMMX2BeUsed = c->canMMX2BeUsed; + void *mmx2FilterCode= c->lumMmx2FilterCode; + int i; #if defined(PIC) - uint64_t ebxsave __attribute__((aligned(8))); + DECLARE_ALIGNED(8, uint64_t, ebxsave); #endif - if (canMMX2BeUsed) - { - __asm__ volatile( + if (canMMX2BeUsed) { + __asm__ volatile( #if defined(PIC) "mov %%"REG_b", %5 \n\t" #endif @@ -2321,7 +2285,7 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, #if ARCH_X86_64 -#define FUNNY_Y_CODE \ +#define CALL_MMX2_FILTER_CODE \ "movl (%%"REG_b"), %%esi \n\t"\ "call *%4 \n\t"\ "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ @@ -2331,7 +2295,7 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, #else -#define FUNNY_Y_CODE \ +#define CALL_MMX2_FILTER_CODE \ "movl (%%"REG_b"), %%esi \n\t"\ "call *%4 \n\t"\ "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ @@ -2340,20 +2304,20 @@ static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, #endif /* ARCH_X86_64 */ -FUNNY_Y_CODE -FUNNY_Y_CODE -FUNNY_Y_CODE -FUNNY_Y_CODE -FUNNY_Y_CODE -FUNNY_Y_CODE -FUNNY_Y_CODE -FUNNY_Y_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE #if defined(PIC) "mov %5, %%"REG_b" \n\t" #endif :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), - "m" (funnyYCode) + "m" (mmx2FilterCode) #if defined(PIC) ,"m" (ebxsave) #endif @@ -2361,44 +2325,32 @@ FUNNY_Y_CODE #if !defined(PIC) ,"%"REG_b #endif - ); - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; - } - else - { -#endif /* HAVE_MMX2 */ - x86_reg xInc_shr16 = xInc >> 16; - uint16_t xInc_mask = xInc & 0xffff; - //NO MMX just normal asm ... - __asm__ volatile( + ); + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128; + } else { +#endif /* COMPILE_TEMPLATE_MMX2 */ + x86_reg xInc_shr16 = xInc >> 16; + uint16_t xInc_mask = xInc & 0xffff; + //NO MMX just normal asm ... + __asm__ volatile( "xor %%"REG_a", %%"REG_a" \n\t" // i "xor %%"REG_d", %%"REG_d" \n\t" // xx - "xorl %%ecx, %%ecx \n\t" // 2*xalpha + "xorl %%ecx, %%ecx \n\t" // xalpha ASMALIGN(4) "1: \n\t" "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] - "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] - "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $16, %%edi \n\t" - "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) - "mov %1, %%"REG_D" \n\t" - "shrl $9, %%esi \n\t" + FAST_BILINEAR_X86 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" - "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF - "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry + "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF + "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry "movzbl (%0, %%"REG_d"), %%edi \n\t" //src[xx] "movzbl 1(%0, %%"REG_d"), %%esi \n\t" //src[xx+1] - "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] - "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $16, %%edi \n\t" - "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) - "mov %1, %%"REG_D" \n\t" - "shrl $9, %%esi \n\t" + FAST_BILINEAR_X86 "movw %%si, 2(%%"REG_D", %%"REG_a", 2) \n\t" - "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF - "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry + "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF + "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry "add $2, %%"REG_a" \n\t" @@ -2408,95 +2360,66 @@ FUNNY_Y_CODE :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask) : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" - ); -#if HAVE_MMX2 - } //if MMX2 can't be used + ); +#if COMPILE_TEMPLATE_MMX2 + } //if MMX2 can't be used #endif #else - c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc); -#endif /* ARCH_X86 */ - } - - if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ - int i; - //FIXME all pal and rgb srcFormats could do this convertion as well - //FIXME all scalers more complex than bilinear could do half of this transform - if(c->srcRange){ - for (i=0; i>14; - }else{ - for (i=0; i>14; - } - } -} - -static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, - int dstWidth, const uint8_t *src1, - const uint8_t *src2, int srcW, int xInc) -{ int i; unsigned int xpos=0; - for (i=0;i>16; register unsigned int xalpha=(xpos&0xFFFF)>>9; - dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); - dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); - /* slower - dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; - dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; - */ + dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha; xpos+=xInc; } +#endif /* ARCH_X86 */ } -inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2, - int srcW, int xInc, int flags, const int16_t *hChrFilter, - const int16_t *hChrFilterPos, int hChrFilterSize, - int srcFormat, uint8_t *formatConvBuffer, - uint32_t *pal) + // *** horizontal scale Y line to temp buffer +static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc, + const int16_t *hLumFilter, + const int16_t *hLumFilterPos, int hLumFilterSize, + uint8_t *formatConvBuffer, + uint32_t *pal, int isAlpha) { - int32_t *mmx2FilterPos = c->chrMmx2FilterPos; - int16_t *mmx2Filter = c->chrMmx2Filter; - int canMMX2BeUsed = c->canMMX2BeUsed; - void *funnyUVCode = c->funnyUVCode; + void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12; + void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange; - if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE) - return; - - if (srcFormat==PIX_FMT_RGB32_1) { - src1 += ALT32_CORR; - src2 += ALT32_CORR; - } + src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset; - if (c->hcscale_internal) { - c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); - src1= formatConvBuffer; - src2= formatConvBuffer+VOFW; + if (toYV12) { + toYV12(formatConvBuffer, src, srcW, pal); + src= formatConvBuffer; } -#if HAVE_MMX - // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). - if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) -#else - if (!(flags&SWS_FAST_BILINEAR)) -#endif + if (!c->hyscale_fast) { - c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); - c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); + c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); + } else { // fast bilinear upscale / crap downscale + c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc); } - else // fast bilinear upscale / crap downscale - { + + if (convertRange) + convertRange(dst, dstWidth); +} + +static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst, + long dstWidth, const uint8_t *src1, + const uint8_t *src2, int srcW, int xInc) +{ #if ARCH_X86 && CONFIG_GPL -#if HAVE_MMX2 - int i; +#if COMPILE_TEMPLATE_MMX2 + int32_t *mmx2FilterPos = c->chrMmx2FilterPos; + int16_t *mmx2Filter = c->chrMmx2Filter; + int canMMX2BeUsed = c->canMMX2BeUsed; + void *mmx2FilterCode= c->chrMmx2FilterCode; + int i; #if defined(PIC) - uint64_t ebxsave __attribute__((aligned(8))); + DECLARE_ALIGNED(8, uint64_t, ebxsave); #endif - if (canMMX2BeUsed) - { - __asm__ volatile( + if (canMMX2BeUsed) { + __asm__ volatile( #if defined(PIC) "mov %%"REG_b", %6 \n\t" #endif @@ -2510,31 +2433,10 @@ inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, PREFETCH" 32(%%"REG_c") \n\t" PREFETCH" 64(%%"REG_c") \n\t" -#if ARCH_X86_64 - -#define FUNNY_UV_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ - "add %%"REG_S", %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#else - -#define FUNNY_UV_CODE \ - "movl (%%"REG_b"), %%esi \n\t"\ - "call *%4 \n\t"\ - "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ - "add %%"REG_a", %%"REG_D" \n\t"\ - "xor %%"REG_a", %%"REG_a" \n\t"\ - -#endif /* ARCH_X86_64 */ - -FUNNY_UV_CODE -FUNNY_UV_CODE -FUNNY_UV_CODE -FUNNY_UV_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE "xor %%"REG_a", %%"REG_a" \n\t" // i "mov %5, %%"REG_c" \n\t" // src "mov %1, %%"REG_D" \n\t" // buf1 @@ -2543,107 +2445,120 @@ FUNNY_UV_CODE PREFETCH" 32(%%"REG_c") \n\t" PREFETCH" 64(%%"REG_c") \n\t" -FUNNY_UV_CODE -FUNNY_UV_CODE -FUNNY_UV_CODE -FUNNY_UV_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE + CALL_MMX2_FILTER_CODE #if defined(PIC) "mov %6, %%"REG_b" \n\t" #endif :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos), - "m" (funnyUVCode), "m" (src2) + "m" (mmx2FilterCode), "m" (src2) #if defined(PIC) ,"m" (ebxsave) #endif : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D #if !defined(PIC) - ,"%"REG_b + ,"%"REG_b #endif - ); - for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) - { - //printf("%d %d %d\n", dstWidth, i, srcW); - dst[i] = src1[srcW-1]*128; - dst[i+VOFW] = src2[srcW-1]*128; - } + ); + for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) { + //printf("%d %d %d\n", dstWidth, i, srcW); + dst[i] = src1[srcW-1]*128; + dst[i+VOFW] = src2[srcW-1]*128; } - else - { -#endif /* HAVE_MMX2 */ - x86_reg xInc_shr16 = (x86_reg) (xInc >> 16); - uint16_t xInc_mask = xInc & 0xffff; - __asm__ volatile( + } else { +#endif /* COMPILE_TEMPLATE_MMX2 */ + x86_reg xInc_shr16 = (x86_reg) (xInc >> 16); + uint16_t xInc_mask = xInc & 0xffff; + __asm__ volatile( "xor %%"REG_a", %%"REG_a" \n\t" // i "xor %%"REG_d", %%"REG_d" \n\t" // xx - "xorl %%ecx, %%ecx \n\t" // 2*xalpha + "xorl %%ecx, %%ecx \n\t" // xalpha ASMALIGN(4) "1: \n\t" "mov %0, %%"REG_S" \n\t" "movzbl (%%"REG_S", %%"REG_d"), %%edi \n\t" //src[xx] "movzbl 1(%%"REG_S", %%"REG_d"), %%esi \n\t" //src[xx+1] - "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] - "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $16, %%edi \n\t" - "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) - "mov %1, %%"REG_D" \n\t" - "shrl $9, %%esi \n\t" + FAST_BILINEAR_X86 "movw %%si, (%%"REG_D", %%"REG_a", 2) \n\t" "movzbl (%5, %%"REG_d"), %%edi \n\t" //src[xx] "movzbl 1(%5, %%"REG_d"), %%esi \n\t" //src[xx+1] - "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] - "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha - "shll $16, %%edi \n\t" - "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) - "mov %1, %%"REG_D" \n\t" - "shrl $9, %%esi \n\t" + FAST_BILINEAR_X86 "movw %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2) \n\t" - "addw %4, %%cx \n\t" //2*xalpha += xInc&0xFF - "adc %3, %%"REG_d" \n\t" //xx+= xInc>>8 + carry + "addw %4, %%cx \n\t" //xalpha += xInc&0xFFFF + "adc %3, %%"REG_d" \n\t" //xx+= xInc>>16 + carry "add $1, %%"REG_a" \n\t" "cmp %2, %%"REG_a" \n\t" " jb 1b \n\t" /* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here, - which is needed to support GCC 4.0. */ -#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) +which is needed to support GCC 4.0. */ +#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4) :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), #else :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask), #endif "r" (src2) : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi" - ); -#if HAVE_MMX2 - } //if MMX2 can't be used + ); +#if COMPILE_TEMPLATE_MMX2 + } //if MMX2 can't be used #endif #else - c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc); + int i; + unsigned int xpos=0; + for (i=0;i>16; + register unsigned int xalpha=(xpos&0xFFFF)>>9; + dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); + dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); + /* slower + dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha; + dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha; + */ + xpos+=xInc; + } #endif /* ARCH_X86 */ +} + +inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2, + int srcW, int xInc, const int16_t *hChrFilter, + const int16_t *hChrFilterPos, int hChrFilterSize, + uint8_t *formatConvBuffer, + uint32_t *pal) +{ + + src1 += c->chrSrcOffset; + src2 += c->chrSrcOffset; + + if (c->chrToYV12) { + c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal); + src1= formatConvBuffer; + src2= formatConvBuffer+VOFW; } - if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){ - int i; - //FIXME all pal and rgb srcFormats could do this convertion as well - //FIXME all scalers more complex than bilinear could do half of this transform - if(c->srcRange){ - for (i=0; i>11; //1469 - dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469 - } - }else{ - for (i=0; i>12; //-264 - dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264 - } - } + + if (!c->hcscale_fast) + { + c->hScale(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); + c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); + } else { // fast bilinear upscale / crap downscale + c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc); } + + if (c->chrConvertRange) + c->chrConvertRange(dst, dstWidth); } -static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, - int srcSliceH, uint8_t* dst[], int dstStride[]){ +#define DEBUG_SWSCALE_BUFFERS 0 +#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__) +static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY, + int srcSliceH, uint8_t* dst[], int dstStride[]) +{ /* load a few things into local vars to make the code more readable? and faster */ const int srcW= c->srcW; const int dstW= c->dstW; @@ -2652,8 +2567,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s const int chrSrcW= c->chrSrcW; const int lumXInc= c->lumXInc; const int chrXInc= c->chrXInc; - const int dstFormat= c->dstFormat; - const int srcFormat= c->srcFormat; + const enum PixelFormat dstFormat= c->dstFormat; const int flags= c->flags; int16_t *vLumFilterPos= c->vLumFilterPos; int16_t *vChrFilterPos= c->vChrFilterPos; @@ -2688,7 +2602,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s int lastInLumBuf= c->lastInLumBuf; int lastInChrBuf= c->lastInChrBuf; - if (isPacked(c->srcFormat)){ + if (isPacked(c->srcFormat)) { src[0]= src[1]= src[2]= @@ -2701,27 +2615,17 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s srcStride[1]<<= c->vChrDrop; srcStride[2]<<= c->vChrDrop; - //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2], - // (int)dst[0], (int)dst[1], (int)dst[2]); + DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n", + src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3], + dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]); + DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n", + srcSliceY, srcSliceH, dstY, dstH); + DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n", + vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize); -#if 0 //self test FIXME move to a vfilter or something - { - static volatile int i=0; - i++; - if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH) - selfTest(src, srcStride, c->srcW, c->srcH); - i--; - } -#endif - - //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2], - //dstStride[0],dstStride[1],dstStride[2]); - - if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) - { + if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) { static int warnedAlready=0; //FIXME move this into the context perhaps - if (flags & SWS_PRINT_INFO && !warnedAlready) - { + if (flags & SWS_PRINT_INFO && !warnedAlready) { av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" " ->cannot do aligned memory accesses anymore\n"); warnedAlready=1; @@ -2731,9 +2635,9 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s /* Note the user might start scaling the picture in the middle so this will not get executed. This is not really intended but works currently, so people might do it. */ - if (srcSliceY ==0){ - lumBufIndex=0; - chrBufIndex=0; + if (srcSliceY ==0) { + lumBufIndex=-1; + chrBufIndex=-1; dstY=0; lastInLumBuf= -1; lastInChrBuf= -1; @@ -2741,7 +2645,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s lastDstY= dstY; - for (;dstY < dstH; dstY++){ + for (;dstY < dstH; dstY++) { unsigned char *dest =dst[0]+dstStride[0]*dstY; const int chrDstY= dstY>>c->chrDstVSubSample; unsigned char *uDest=dst[1]+dstStride[1]*chrDstY; @@ -2750,114 +2654,75 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input - const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input - const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input + int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input + int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input + int enough_lines; - //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n", - // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize, c->chrSrcVSubSample); //handle holes (FAST_BILINEAR & weird filters) if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1; if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1; - //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize); assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1); assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1); // Do we have enough lines in this slice to output the dstY line - if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample)) - { - //Do horizontal scaling - while(lastInLumBuf < lastLumSrcY) - { - uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; - uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3]; - lumBufIndex++; - //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf, lastLumSrcY); - assert(lumBufIndex < 2*vLumBufSize); - assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); - assert(lastInLumBuf + 1 - srcSliceY >= 0); - //printf("%d %d\n", lumBufIndex, vLumBufSize); - RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc, - flags, hLumFilter, hLumFilterPos, hLumFilterSize, - c->srcFormat, formatConvBuffer, - pal, 0); - if (CONFIG_SWSCALE_ALPHA && alpPixBuf) - RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc, - flags, hLumFilter, hLumFilterPos, hLumFilterSize, - c->srcFormat, formatConvBuffer, - pal, 1); - lastInLumBuf++; - } - while(lastInChrBuf < lastChrSrcY) - { - uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; - uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; - chrBufIndex++; - assert(chrBufIndex < 2*vChrBufSize); - assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); - assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); - //FIXME replace parameters through context struct (some at least) - - if (!(isGray(srcFormat) || isGray(dstFormat))) - RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, - flags, hChrFilter, hChrFilterPos, hChrFilterSize, - c->srcFormat, formatConvBuffer, - pal); - lastInChrBuf++; - } - //wrap buf index around to stay inside the ring buffer - if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; - if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; + enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample); + if (!enough_lines) { + lastLumSrcY = srcSliceY + srcSliceH - 1; + lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1; } - else // not enough lines left in this slice -> load the rest in the buffer - { - /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n", - firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY, - lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize, - vChrBufSize, vLumBufSize);*/ - - //Do horizontal scaling - while(lastInLumBuf+1 < srcSliceY + srcSliceH) - { - uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; - uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3]; - lumBufIndex++; - assert(lumBufIndex < 2*vLumBufSize); - assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); - assert(lastInLumBuf + 1 - srcSliceY >= 0); - RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc, - flags, hLumFilter, hLumFilterPos, hLumFilterSize, - c->srcFormat, formatConvBuffer, - pal, 0); - if (CONFIG_SWSCALE_ALPHA && alpPixBuf) - RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc, - flags, hLumFilter, hLumFilterPos, hLumFilterSize, - c->srcFormat, formatConvBuffer, - pal, 1); - lastInLumBuf++; - } - while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH)) - { - uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; - uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; - chrBufIndex++; - assert(chrBufIndex < 2*vChrBufSize); - assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH); - assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); - - if (!(isGray(srcFormat) || isGray(dstFormat))) - RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, - flags, hChrFilter, hChrFilterPos, hChrFilterSize, - c->srcFormat, formatConvBuffer, - pal); - lastInChrBuf++; - } - //wrap buf index around to stay inside the ring buffer - if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; - if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; - break; //we can't output a dstY line so let's try with the next slice + + DEBUG_BUFFERS("dstY: %d\n", dstY); + DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n", + firstLumSrcY, lastLumSrcY, lastInLumBuf); + DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n", + firstChrSrcY, lastChrSrcY, lastInChrBuf); + + //Do horizontal scaling + while(lastInLumBuf < lastLumSrcY) { + uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0]; + uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3]; + lumBufIndex++; + DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n", + lumBufIndex, lastInLumBuf); + assert(lumBufIndex < 2*vLumBufSize); + assert(lastInLumBuf + 1 - srcSliceY < srcSliceH); + assert(lastInLumBuf + 1 - srcSliceY >= 0); + RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc, + hLumFilter, hLumFilterPos, hLumFilterSize, + formatConvBuffer, + pal, 0); + if (CONFIG_SWSCALE_ALPHA && alpPixBuf) + RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc, + hLumFilter, hLumFilterPos, hLumFilterSize, + formatConvBuffer, + pal, 1); + lastInLumBuf++; } + while(lastInChrBuf < lastChrSrcY) { + uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1]; + uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2]; + chrBufIndex++; + DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n", + chrBufIndex, lastInChrBuf); + assert(chrBufIndex < 2*vChrBufSize); + assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH)); + assert(lastInChrBuf + 1 - chrSrcSliceY >= 0); + //FIXME replace parameters through context struct (some at least) + + if (c->needs_hcscale) + RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc, + hChrFilter, hChrFilterPos, hChrFilterSize, + formatConvBuffer, + pal); + lastInChrBuf++; + } + //wrap buf index around to stay inside the ring buffer + if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize; + if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize; + if (!enough_lines) + break; //we can't output a dstY line so let's try with the next slice -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX c->blueDither= ff_dither8[dstY&1]; if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555) c->greenDither= ff_dither8[dstY&1]; @@ -2865,176 +2730,169 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s c->greenDither= ff_dither4[dstY&1]; c->redDither= ff_dither8[(dstY+1)&1]; #endif - if (dstY < dstH-2) - { + if (dstY < dstH-2) { const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX int i; - if (flags & SWS_ACCURATE_RND){ - int s= APCK_SIZE / 8; - for (i=0; i1)]; - lumMmxFilter[s*i+APCK_COEF/4 ]= - lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] - + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); - if (CONFIG_SWSCALE_ALPHA && alpPixBuf){ - *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; - *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; - alpMmxFilter[s*i+APCK_COEF/4 ]= - alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ]; + if (flags & SWS_ACCURATE_RND) { + int s= APCK_SIZE / 8; + for (i=0; i1)]; + lumMmxFilter[s*i+APCK_COEF/4 ]= + lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ] + + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0); + if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + *(void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ]; + *(void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)]; + alpMmxFilter[s*i+APCK_COEF/4 ]= + alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ]; + } } - } - for (i=0; i1)]; - chrMmxFilter[s*i+APCK_COEF/4 ]= - chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] - + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); - } - }else{ - for (i=0; i> 32; - lumMmxFilter[4*i+2]= - lumMmxFilter[4*i+3]= - ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; - if (CONFIG_SWSCALE_ALPHA && alpPixBuf){ - alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i]; - alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32; - alpMmxFilter[4*i+2]= - alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; + for (i=0; i1)]; + chrMmxFilter[s*i+APCK_COEF/4 ]= + chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ] + + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0); + } + } else { + for (i=0; i> 32; + lumMmxFilter[4*i+2]= + lumMmxFilter[4*i+3]= + ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001; + if (CONFIG_SWSCALE_ALPHA && alpPixBuf) { + alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i]; + alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32; + alpMmxFilter[4*i+2]= + alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2]; + } + } + for (i=0; i> 32; + chrMmxFilter[4*i+2]= + chrMmxFilter[4*i+3]= + ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; } } - for (i=0; i> 32; - chrMmxFilter[4*i+2]= - chrMmxFilter[4*i+3]= - ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001; - } - } #endif - if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ + if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { const int chrSkipMask= (1<chrDstVSubSample)-1; if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi c->yuv2nv12X(c, - vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, - vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - dest, uDest, dstW, chrDstW, dstFormat); - } - else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like - { + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + dest, uDest, dstW, chrDstW, dstFormat); + } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like const int chrSkipMask= (1<chrDstVSubSample)-1; if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi - if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12 - { - int16_t *lumBuf = lumPixBuf[0]; - int16_t *chrBuf= chrPixBuf[0]; - int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL; + if (is16BPS(dstFormat)) { + yuv2yuvX16inC( + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, + dstFormat); + } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12 + int16_t *lumBuf = lumSrcPtr[0]; + int16_t *chrBuf= chrSrcPtr[0]; + int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL; c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW); - } - else //General YV12 - { + } else { //General YV12 c->yuv2yuvX(c, - vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, - vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); } - } - else - { + } else { assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); - if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB - { + if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB int chrAlpha= vChrFilter[2*dstY+1]; - if(flags & SWS_FULL_CHR_H_INT){ + if(flags & SWS_FULL_CHR_H_INT) { yuv2rgbXinC_full(c, //FIXME write a packed1_full function - vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, - vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, dstY); - }else{ + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, dstY); + } else { c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), - alpPixBuf ? *alpSrcPtr : NULL, - dest, dstW, chrAlpha, dstFormat, flags, dstY); + alpPixBuf ? *alpSrcPtr : NULL, + dest, dstW, chrAlpha, dstFormat, flags, dstY); } - } - else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB - { + } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB int lumAlpha= vLumFilter[2*dstY+1]; int chrAlpha= vChrFilter[2*dstY+1]; lumMmxFilter[2]= lumMmxFilter[3]= vLumFilter[2*dstY ]*0x10001; chrMmxFilter[2]= chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001; - if(flags & SWS_FULL_CHR_H_INT){ + if(flags & SWS_FULL_CHR_H_INT) { yuv2rgbXinC_full(c, //FIXME write a packed2_full function - vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, - vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, dstY); - }else{ + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, dstY); + } else { c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), - alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL, - dest, dstW, lumAlpha, chrAlpha, dstY); + alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL, + dest, dstW, lumAlpha, chrAlpha, dstY); } - } - else //general RGB - { - if(flags & SWS_FULL_CHR_H_INT){ + } else { //general RGB + if(flags & SWS_FULL_CHR_H_INT) { yuv2rgbXinC_full(c, - vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, - vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, dstY); - }else{ + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, dstY); + } else { c->yuv2packedX(c, - vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, - vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, dstY); + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, dstY); } } } - } - else // hmm looks like we can't use MMX here without overwriting this array's tail - { + } else { // hmm looks like we can't use MMX here without overwriting this array's tail const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize; const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize; const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL; - if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){ + if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) { const int chrSkipMask= (1<chrDstVSubSample)-1; if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi yuv2nv12XinC( - vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, - vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - dest, uDest, dstW, chrDstW, dstFormat); - } - else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 - { + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + dest, uDest, dstW, chrDstW, dstFormat); + } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 const int chrSkipMask= (1<chrDstVSubSample)-1; if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi - yuv2yuvXinC( - vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, - vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); - } - else - { + if (is16BPS(dstFormat)) { + yuv2yuvX16inC( + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW, + dstFormat); + } else { + yuv2yuvXinC( + vLumFilter+dstY*vLumFilterSize , lumSrcPtr, vLumFilterSize, + vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW); + } + } else { assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); - if(flags & SWS_FULL_CHR_H_INT){ + if(flags & SWS_FULL_CHR_H_INT) { yuv2rgbXinC_full(c, - vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, - vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, dstY); - }else{ + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, dstY); + } else { yuv2packedXinC(c, - vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, - vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, - alpSrcPtr, dest, dstW, dstY); + vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, + vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize, + alpSrcPtr, dest, dstW, dstY); } } } @@ -3043,7 +2901,7 @@ static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int s if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf) fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255); -#if HAVE_MMX +#if COMPILE_TEMPLATE_MMX if (flags & SWS_CPU_CAPS_MMX2 ) __asm__ volatile("sfence":::"memory"); /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */ if (flags & SWS_CPU_CAPS_3DNOW) __asm__ volatile("femms" :::"memory"); @@ -3072,80 +2930,137 @@ static void RENAME(sws_init_swScale)(SwsContext *c) c->hScale = RENAME(hScale ); - c->hyscale_fast = RENAME(hyscale_fast); - c->hcscale_fast = RENAME(hcscale_fast); +#if COMPILE_TEMPLATE_MMX + // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one). + if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) +#else + if (c->flags & SWS_FAST_BILINEAR) +#endif + { + c->hyscale_fast = RENAME(hyscale_fast); + c->hcscale_fast = RENAME(hcscale_fast); + } - c->hcscale_internal = NULL; + c->chrToYV12 = NULL; switch(srcFormat) { - case PIX_FMT_YUYV422 : c->hcscale_internal = RENAME(yuy2ToUV); break; - case PIX_FMT_UYVY422 : c->hcscale_internal = RENAME(uyvyToUV); break; + case PIX_FMT_YUYV422 : c->chrToYV12 = RENAME(yuy2ToUV); break; + case PIX_FMT_UYVY422 : c->chrToYV12 = RENAME(uyvyToUV); break; + case PIX_FMT_NV12 : c->chrToYV12 = RENAME(nv12ToUV); break; + case PIX_FMT_NV21 : c->chrToYV12 = RENAME(nv21ToUV); break; case PIX_FMT_RGB8 : case PIX_FMT_BGR8 : case PIX_FMT_PAL8 : case PIX_FMT_BGR4_BYTE: - case PIX_FMT_RGB4_BYTE: c->hcscale_internal = RENAME(palToUV); break; + case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break; + case PIX_FMT_YUV420P16BE: + case PIX_FMT_YUV422P16BE: + case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break; + case PIX_FMT_YUV420P16LE: + case PIX_FMT_YUV422P16LE: + case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break; } if (c->chrSrcHSubSample) { switch(srcFormat) { + case PIX_FMT_RGB48BE: + case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break; case PIX_FMT_RGB32 : - case PIX_FMT_RGB32_1: c->hcscale_internal = RENAME(bgr32ToUV_half); break; - case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV_half); break; - case PIX_FMT_BGR565 : c->hcscale_internal = RENAME(bgr16ToUV_half); break; - case PIX_FMT_BGR555 : c->hcscale_internal = RENAME(bgr15ToUV_half); break; + case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break; + case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV_half); break; + case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break; + case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break; case PIX_FMT_BGR32 : - case PIX_FMT_BGR32_1: c->hcscale_internal = RENAME(rgb32ToUV_half); break; - case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV_half); break; - case PIX_FMT_RGB565 : c->hcscale_internal = RENAME(rgb16ToUV_half); break; - case PIX_FMT_RGB555 : c->hcscale_internal = RENAME(rgb15ToUV_half); break; + case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break; + case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV_half); break; + case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break; + case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break; } } else { switch(srcFormat) { + case PIX_FMT_RGB48BE: + case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break; case PIX_FMT_RGB32 : - case PIX_FMT_RGB32_1: c->hcscale_internal = RENAME(bgr32ToUV); break; - case PIX_FMT_BGR24 : c->hcscale_internal = RENAME(bgr24ToUV); break; - case PIX_FMT_BGR565 : c->hcscale_internal = RENAME(bgr16ToUV); break; - case PIX_FMT_BGR555 : c->hcscale_internal = RENAME(bgr15ToUV); break; + case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break; + case PIX_FMT_BGR24 : c->chrToYV12 = RENAME(bgr24ToUV); break; + case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break; + case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break; case PIX_FMT_BGR32 : - case PIX_FMT_BGR32_1: c->hcscale_internal = RENAME(rgb32ToUV); break; - case PIX_FMT_RGB24 : c->hcscale_internal = RENAME(rgb24ToUV); break; - case PIX_FMT_RGB565 : c->hcscale_internal = RENAME(rgb16ToUV); break; - case PIX_FMT_RGB555 : c->hcscale_internal = RENAME(rgb15ToUV); break; + case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break; + case PIX_FMT_RGB24 : c->chrToYV12 = RENAME(rgb24ToUV); break; + case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break; + case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break; } } - c->hyscale_internal = NULL; + c->lumToYV12 = NULL; + c->alpToYV12 = NULL; switch (srcFormat) { case PIX_FMT_YUYV422 : - case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break; + case PIX_FMT_YUV420P16BE: + case PIX_FMT_YUV422P16BE: + case PIX_FMT_YUV444P16BE: + case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break; case PIX_FMT_UYVY422 : - case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break; - case PIX_FMT_BGR24 : c->hyscale_internal = RENAME(bgr24ToY); break; - case PIX_FMT_BGR565 : c->hyscale_internal = RENAME(bgr16ToY); break; - case PIX_FMT_BGR555 : c->hyscale_internal = RENAME(bgr15ToY); break; - case PIX_FMT_RGB24 : c->hyscale_internal = RENAME(rgb24ToY); break; - case PIX_FMT_RGB565 : c->hyscale_internal = RENAME(rgb16ToY); break; - case PIX_FMT_RGB555 : c->hyscale_internal = RENAME(rgb15ToY); break; + case PIX_FMT_YUV420P16LE: + case PIX_FMT_YUV422P16LE: + case PIX_FMT_YUV444P16LE: + case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break; + case PIX_FMT_BGR24 : c->lumToYV12 = RENAME(bgr24ToY); break; + case PIX_FMT_BGR565 : c->lumToYV12 = bgr16ToY; break; + case PIX_FMT_BGR555 : c->lumToYV12 = bgr15ToY; break; + case PIX_FMT_RGB24 : c->lumToYV12 = RENAME(rgb24ToY); break; + case PIX_FMT_RGB565 : c->lumToYV12 = rgb16ToY; break; + case PIX_FMT_RGB555 : c->lumToYV12 = rgb15ToY; break; case PIX_FMT_RGB8 : case PIX_FMT_BGR8 : case PIX_FMT_PAL8 : case PIX_FMT_BGR4_BYTE: - case PIX_FMT_RGB4_BYTE: c->hyscale_internal = RENAME(palToY); break; - case PIX_FMT_MONOBLACK: c->hyscale_internal = RENAME(monoblack2Y); break; - case PIX_FMT_MONOWHITE: c->hyscale_internal = RENAME(monowhite2Y); break; + case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break; + case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break; + case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break; + case PIX_FMT_RGB32 : + case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break; + case PIX_FMT_BGR32 : + case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break; + case PIX_FMT_RGB48BE: + case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break; } if (c->alpPixBuf) { switch (srcFormat) { case PIX_FMT_RGB32 : case PIX_FMT_RGB32_1: case PIX_FMT_BGR32 : - case PIX_FMT_BGR32_1: c->hyscale_internal = RENAME(abgrToA); break; + case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break; } - } else { - switch (srcFormat) { - case PIX_FMT_RGB32 : - case PIX_FMT_RGB32_1: c->hyscale_internal = RENAME(bgr32ToY); break; - case PIX_FMT_BGR32 : - case PIX_FMT_BGR32_1: c->hyscale_internal = RENAME(rgb32ToY); break; + } + + switch (srcFormat) { + case PIX_FMT_RGB32 : + case PIX_FMT_BGR32 : + c->alpSrcOffset = 3; + break; + case PIX_FMT_RGB32_1: + case PIX_FMT_BGR32_1: + c->lumSrcOffset = ALT32_CORR; + c->chrSrcOffset = ALT32_CORR; + break; + case PIX_FMT_RGB48LE: + c->lumSrcOffset = 1; + c->chrSrcOffset = 1; + c->alpSrcOffset = 1; + break; + } + + if (c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) { + if (c->srcRange) { + c->lumConvertRange = RENAME(lumRangeFromJpeg); + c->chrConvertRange = RENAME(chrRangeFromJpeg); + } else { + c->lumConvertRange = RENAME(lumRangeToJpeg); + c->chrConvertRange = RENAME(chrRangeToJpeg); } } + + if (!(isGray(srcFormat) || isGray(c->dstFormat) || + srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE)) + c->needs_hcscale = 1; }