From: Maksym Veremeyenko
Date: Thu, 16 Feb 2012 17:10:00 +0000 (+0200)
Subject: use SSE2 instructions for line compositing
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=372d8d2e3f9a5ee8254075b921a78657b5fd2e15;p=mlt

use SSE2 instructions for line compositing
---

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
new file mode 100644
index 00000000..2ed48018
--- /dev/null
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -0,0 +1,167 @@
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight)
+{
+    static const unsigned char const1[] =
+    {
+        0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
+    };
+
+    __asm__ volatile
+    (
+        "pxor %%xmm0, %%xmm0            \n\t"   /* clear zero register */
+        "movdqu (%4), %%xmm9            \n\t"   /* load const1 */
+        "movd %0, %%xmm1                \n\t"   /* load weight and decompose */
+        "movlhps %%xmm1, %%xmm1         \n\t"
+        "pshuflw $0, %%xmm1, %%xmm1     \n\t"
+        "pshufhw $0, %%xmm1, %%xmm1     \n\t"
+
+        /*
+            xmm1 (weight)
+
+                00 W  00 W  00 W  00 W  00 W  00 W  00 W  00 W
+        */
+        "loop_start:                    \n\t"
+        "movq (%1), %%xmm2              \n\t"   /* load source alpha */
+        "punpcklbw %%xmm0, %%xmm2       \n\t"   /* unpack 8 8-bit alpha values to 8 16-bit values */
+
+        /*
+            xmm2 (src alpha)
+            xmm3 (dst alpha)
+
+                00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+        */
+        "pmullw %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */
+        "psrlw $8, %%xmm2               \n\t"
+
+        /*
+            xmm2 (premultiplied)
+
+                00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+        */
+
+
+        /*
+            DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
+        */
+        "movq (%5), %%xmm3              \n\t"   /* load dst alpha */
+        "punpcklbw %%xmm0, %%xmm3       \n\t"   /* unpack 8 8-bit dst alpha values to 8 16-bit values */
+        "movdqa %%xmm9, %%xmm4          \n\t"
+        "psubw %%xmm3, %%xmm4           \n\t"
+        "pmullw %%xmm2, %%xmm4          \n\t"
+        "psrlw $8, %%xmm4               \n\t"
+        "paddw %%xmm4, %%xmm3           \n\t"
+        "packuswb %%xmm0, %%xmm3        \n\t"
+        "movq %%xmm3, (%5)              \n\t"   /* save dst alpha */
+
+        "movdqu (%2), %%xmm3            \n\t"   /* load src */
+        "movdqu (%3), %%xmm4            \n\t"   /* load dst */
+        "movdqa %%xmm3, %%xmm5          \n\t"   /* dup src */
+        "movdqa %%xmm4, %%xmm6          \n\t"   /* dup dst */
+
+        /*
+            xmm3 (src)
+            xmm4 (dst)
+            xmm5 (src)
+            xmm6 (dst)
+
+                U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+        */
+
+        "punpcklbw %%xmm0, %%xmm5       \n\t"   /* unpack src low */
+        "punpcklbw %%xmm0, %%xmm6       \n\t"   /* unpack dst low */
+        "punpckhbw %%xmm0, %%xmm3       \n\t"   /* unpack src high */
+        "punpckhbw %%xmm0, %%xmm4       \n\t"   /* unpack dst high */
+
+        /*
+            xmm5 (src_l)
+            xmm6 (dst_l)
+
+                00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+            xmm3 (src_u)
+            xmm4 (dst_u)
+
+                00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+        */
+
+        "movdqa %%xmm2, %%xmm7          \n\t"   /* dup alpha */
+        "movdqa %%xmm2, %%xmm8          \n\t"   /* dup alpha */
+        "movlhps %%xmm7, %%xmm7         \n\t"   /* dup low */
+        "movhlps %%xmm8, %%xmm8         \n\t"   /* dup high */
+
+        /*
+            xmm7 (src alpha)
+
+                00 A4 00 A3 00 A2 00 A1 00 A4 00 A3 00 A2 00 A1
+            xmm8 (src alpha)
+
+                00 A8 00 A7 00 A6 00 A5 00 A8 00 A7 00 A6 00 A5
+        */
+
+        "pshuflw $0x50, %%xmm7, %%xmm7  \n\t"
+        "pshuflw $0x50, %%xmm8, %%xmm8  \n\t"
+        "pshufhw $0xFA, %%xmm7, %%xmm7  \n\t"
+        "pshufhw $0xFA, %%xmm8, %%xmm8  \n\t"
+
+        /*
+            xmm7 (src alpha lower)
+
+                00 A4 00 A4 00 A3 00 A3 00 A2 00 A2 00 A1 00 A1
+
+            xmm8 (src alpha upper)
+                00 A8 00 A8 00 A7 00 A7 00 A6 00 A6 00 A5 00 A5
+        */
+
+
+        /*
+            DST = SRC * ALPHA + DST * (0xFF - ALPHA)
+                = SRC * ALPHA + DST * 0xFF - DST * ALPHA
+                = (SRC - DST) * ALPHA + DST * 0xFF
+
+        */
+        "psubw %%xmm4, %%xmm3           \n\t"   /* src = src - dst */
+        "psubw %%xmm6, %%xmm5           \n\t"
+        "pmullw %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */
+        "pmullw %%xmm7, %%xmm5          \n\t"
+        "pmullw %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */
+        "pmullw %%xmm9, %%xmm6          \n\t"
+        "paddw %%xmm3, %%xmm4           \n\t"   /* dst = dst + src */
+        "paddw %%xmm5, %%xmm6           \n\t"
+        "psrlw $8, %%xmm4               \n\t"   /* dst = dst >> 8 */
+        "psrlw $8, %%xmm6               \n\t"
+//      "pminsw %%xmm9, %%xmm4          \n\t"   /* clamp values */
+//      "pminsw %%xmm9, %%xmm6          \n\t"
+
+        /*
+            xmm6 (dst_l)
+
+                00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+            xmm4 (dst_u)
+
+                00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+        */
+        "packuswb %%xmm4, %%xmm6        \n\t"
+
+        /*
+            xmm6 (dst)
+
+                U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+        */
+        "movdqu %%xmm6, (%3)            \n\t"   /* store dst */
+
+        /*
+            increment pointers
+        */
+        "add $0x08, %1                  \n\t"
+        "add $0x08, %5                  \n\t"
+        "add $0x10, %2                  \n\t"
+        "add $0x10, %3                  \n\t"
+
+        "dec %6                         \n\t"
+        "jnz loop_start                 \n\t"
+
+        :
+        : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1), "r" (alpha_a), "r" (width / 8)
+        : //"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
+    );
+}
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index f7054acb..ca904718 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -365,13 +365,28 @@ static inline uint8_t sample_mix( uint8_t dest, uint8_t src, int mix )
 /** Composite a source line over a destination line
 */
 
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+#include "composite_line_yuv_sse2_simple.c"
+#endif
 static void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int soft, uint32_t step )
 {
-	register int j;
+	register int j = 0;
 	register int mix;
 
-	for ( j = 0; j < width; j ++ )
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+	if ( !luma && width > 7 )
+	{
+		composite_line_yuv_sse2_simple(dest, src, width, alpha_b, alpha_a, weight);
+		j = width - width % 8;
+		dest += j * 2;
+		src += j * 2;
+		alpha_a += j;
+		alpha_b += j;
+	}
+#endif
+
+	for ( ; j < width; j ++ )
 	{
 		mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step );
 		*dest = sample_mix( *dest, *src++, mix );
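
For reference, the per-pixel math that the SSE2 loop implements can also be written in plain C. The sketch below is illustrative only and is not part of the patch: the name composite_line_yuv_scalar_ref is hypothetical, and it assumes packed YUV data at 2 bytes per pixel plus the same weight scaling (weight >> 8) that the asm receives in operand %0.

#include <stdint.h>

/* Hypothetical scalar reference for composite_line_yuv_sse2_simple (not in the patch). */
static void composite_line_yuv_scalar_ref(uint8_t *dest, uint8_t *src, int width,
		uint8_t *alpha_b, uint8_t *alpha_a, int weight)
{
	int w = weight >> 8;
	int j;

	for (j = 0; j < width; j++)
	{
		/* premultiply source alpha by the transition weight */
		int a = (alpha_b[j] * w) >> 8;

		/* DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8 */
		alpha_a[j] = alpha_a[j] + ((a * (0xFF - alpha_a[j])) >> 8);

		/* DST = ((SRC - DST) * ALPHA + DST * 0xFF) >> 8, applied to both bytes of the pixel */
		dest[2 * j]     = ((src[2 * j]     - dest[2 * j])     * a + dest[2 * j]     * 0xFF) >> 8;
		dest[2 * j + 1] = ((src[2 * j + 1] - dest[2 * j + 1]) * a + dest[2 * j + 1] * 0xFF) >> 8;
	}
}

The SSE2 routine performs this computation 8 pixels per iteration, which is why composite_line_yuv() only dispatches to it when luma is NULL and width > 7, then finishes the remaining width % 8 pixels with the scalar loop.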