From: Maksym Veremeyenko
Date: Thu, 16 Feb 2012 17:10:00 +0000 (+0200)
Subject: use SSE2 instructions for line compositing
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=372d8d2e3f9a5ee8254075b921a78657b5fd2e15;p=mlt

use SSE2 instructions for line compositing
---

diff --git a/src/modules/core/composite_line_yuv_sse2_simple.c b/src/modules/core/composite_line_yuv_sse2_simple.c
new file mode 100644
index 00000000..2ed48018
--- /dev/null
+++ b/src/modules/core/composite_line_yuv_sse2_simple.c
@@ -0,0 +1,167 @@
+void composite_line_yuv_sse2_simple(uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight)
+{
+    static const unsigned char const1[] =
+    {
+        0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00
+    };
+
+    __asm__ volatile
+    (
+        "pxor %%xmm0, %%xmm0            \n\t"   /* clear zero register */
+        "movdqu (%4), %%xmm9            \n\t"   /* load const1 */
+        "movd %0, %%xmm1                \n\t"   /* load weight and decompose */
+        "movlhps %%xmm1, %%xmm1         \n\t"
+        "pshuflw $0, %%xmm1, %%xmm1     \n\t"
+        "pshufhw $0, %%xmm1, %%xmm1     \n\t"
+
+        /*
+            xmm1 (weight)
+
+                00 W  00 W  00 W  00 W  00 W  00 W  00 W  00 W
+        */
+        "loop_start:                    \n\t"
+        "movq (%1), %%xmm2              \n\t"   /* load source alpha */
+        "punpcklbw %%xmm0, %%xmm2       \n\t"   /* unpack 8 8-bit alpha values to 8 16-bit values */
+
+        /*
+            xmm2 (src alpha)
+            xmm3 (dst alpha)
+
+                00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+        */
+        "pmullw %%xmm1, %%xmm2          \n\t"   /* premultiply source alpha */
+        "psrlw $8, %%xmm2               \n\t"
+
+        /*
+            xmm2 (premultiplied)
+
+                00 A8 00 A7 00 A6 00 A5 00 A4 00 A3 00 A2 00 A1
+        */
+
+
+        /*
+            DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8
+        */
+        "movq (%5), %%xmm3              \n\t"   /* load dst alpha */
+        "punpcklbw %%xmm0, %%xmm3       \n\t"   /* unpack 8 8-bit dst alpha values to 8 16-bit values */
+        "movdqa %%xmm9, %%xmm4          \n\t"
+        "psubw %%xmm3, %%xmm4           \n\t"
+        "pmullw %%xmm2, %%xmm4          \n\t"
+        "psrlw $8, %%xmm4               \n\t"
+        "paddw %%xmm4, %%xmm3           \n\t"
+        "packuswb %%xmm0, %%xmm3        \n\t"
+        "movq %%xmm3, (%5)              \n\t"   /* save dst alpha */
+
+        "movdqu (%2), %%xmm3            \n\t"   /* load src */
+        "movdqu (%3), %%xmm4            \n\t"   /* load dst */
+        "movdqa %%xmm3, %%xmm5          \n\t"   /* dup src */
+        "movdqa %%xmm4, %%xmm6          \n\t"   /* dup dst */
+
+        /*
+            xmm3 (src)
+            xmm4 (dst)
+            xmm5 (src)
+            xmm6 (dst)
+
+                U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+        */
+
+        "punpcklbw %%xmm0, %%xmm5       \n\t"   /* unpack src low */
+        "punpcklbw %%xmm0, %%xmm6       \n\t"   /* unpack dst low */
+        "punpckhbw %%xmm0, %%xmm3       \n\t"   /* unpack src high */
+        "punpckhbw %%xmm0, %%xmm4       \n\t"   /* unpack dst high */
+
+        /*
+            xmm5 (src_l)
+            xmm6 (dst_l)
+
+                00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+            xmm3 (src_u)
+            xmm4 (dst_u)
+
+                00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+        */
+
+        "movdqa %%xmm2, %%xmm7          \n\t"   /* dup alpha */
+        "movdqa %%xmm2, %%xmm8          \n\t"   /* dup alpha */
+        "movlhps %%xmm7, %%xmm7         \n\t"   /* dup low */
+        "movhlps %%xmm8, %%xmm8         \n\t"   /* dup high */
+
+        /*
+            xmm7 (src alpha)
+
+                00 A4 00 A3 00 A2 00 A1 00 A4 00 A3 00 A2 00 A1
+            xmm8 (src alpha)
+
+                00 A8 00 A7 00 A6 00 A5 00 A8 00 A7 00 A6 00 A5
+        */
+
+        "pshuflw $0x50, %%xmm7, %%xmm7  \n\t"
+        "pshuflw $0x50, %%xmm8, %%xmm8  \n\t"
+        "pshufhw $0xFA, %%xmm7, %%xmm7  \n\t"
+        "pshufhw $0xFA, %%xmm8, %%xmm8  \n\t"
+
+        /*
+            xmm7 (src alpha lower)
+
+                00 A4 00 A4 00 A3 00 A3 00 A2 00 A2 00 A1 00 A1
+
+            xmm8 (src alpha upper)
+                00 A8 00 A8 00 A7 00 A7 00 A6 00 A6 00 A5 00 A5
+        */
+
+
+        /*
+            DST = SRC * ALPHA + DST * (0xFF - ALPHA)
+                = SRC * ALPHA + DST * 0xFF - DST * ALPHA
+                = (SRC - DST) * ALPHA + DST * 0xFF
+
+        */
+        "psubw %%xmm4, %%xmm3           \n\t"   /* src = src - dst */
+        "psubw %%xmm6, %%xmm5           \n\t"
+        "pmullw %%xmm8, %%xmm3          \n\t"   /* src = src * alpha */
+        "pmullw %%xmm7, %%xmm5          \n\t"
+        "pmullw %%xmm9, %%xmm4          \n\t"   /* dst = dst * 0xFF */
+        "pmullw %%xmm9, %%xmm6          \n\t"
+        "paddw %%xmm3, %%xmm4           \n\t"   /* dst = dst + src */
+        "paddw %%xmm5, %%xmm6           \n\t"
+        "psrlw $8, %%xmm4               \n\t"   /* dst = dst >> 8 */
+        "psrlw $8, %%xmm6               \n\t"
+//      "pminsw %%xmm9, %%xmm4          \n\t"   /* clamp values */
+//      "pminsw %%xmm9, %%xmm6          \n\t"
+
+        /*
+            xmm6 (dst_l)
+
+                00 U4 00 V4 00 U3 00 V3 00 U2 00 V2 00 U1 00 V1
+
+            xmm4 (dst_u)
+
+                00 U8 00 V8 00 U7 00 V7 00 U6 00 V6 00 U5 00 V5
+        */
+        "packuswb %%xmm4, %%xmm6        \n\t"
+
+        /*
+            xmm6 (dst)
+
+                U8 V8 U7 V7 U6 V6 U5 V5 U4 V4 U3 V3 U2 V2 U1 V1
+        */
+        "movdqu %%xmm6, (%3)            \n\t"   /* store dst */
+
+        /*
+            increment pointers
+        */
+        "add $0x08, %1                  \n\t"
+        "add $0x08, %5                  \n\t"
+        "add $0x10, %2                  \n\t"
+        "add $0x10, %3                  \n\t"
+
+        "dec %6                         \n\t"
+        "jnz loop_start                 \n\t"
+
+        :
+        : "r" (weight >> 8), "r" (alpha_b), "r" (src), "r" (dest), "r" (const1), "r" (alpha_a), "r" (width / 8)
+        : //"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9", "memory"
+    );
+}
diff --git a/src/modules/core/transition_composite.c b/src/modules/core/transition_composite.c
index f7054acb..ca904718 100644
--- a/src/modules/core/transition_composite.c
+++ b/src/modules/core/transition_composite.c
@@ -365,13 +365,28 @@ static inline uint8_t sample_mix( uint8_t dest, uint8_t src, int mix )
 /** Composite a source line over a destination line
 */
 
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+#include "composite_line_yuv_sse2_simple.c"
+#endif
 static void composite_line_yuv( uint8_t *dest, uint8_t *src, int width, uint8_t *alpha_b, uint8_t *alpha_a, int weight, uint16_t *luma, int soft, uint32_t step )
 {
-	register int j;
+	register int j = 0;
 	register int mix;
 
-	for ( j = 0; j < width; j ++ )
+#if defined(USE_SSE) && defined(ARCH_X86_64)
+	if ( !luma && width > 7 )
+	{
+		composite_line_yuv_sse2_simple(dest, src, width, alpha_b, alpha_a, weight);
+		j = width - width % 8;
+		dest += j * 2;
+		src += j * 2;
+		alpha_a += j;
+		alpha_b += j;
+	}
+#endif
+
+	for ( ; j < width; j ++ )
 	{
 		mix = calculate_mix( luma, j, soft, weight, *alpha_b ++, step );
 		*dest = sample_mix( *dest, *src++, mix );
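
For reference, the per-pixel math that the SSE2 loop implements can also be written in plain C. The sketch below is illustrative only and is not part of the patch: the name composite_line_yuv_scalar_ref is hypothetical, and it assumes packed YUV data at 2 bytes per pixel plus the same weight scaling (weight >> 8) that the asm receives in operand %0.

#include <stdint.h>

/* Hypothetical scalar reference for composite_line_yuv_sse2_simple (not in the patch). */
static void composite_line_yuv_scalar_ref(uint8_t *dest, uint8_t *src, int width,
		uint8_t *alpha_b, uint8_t *alpha_a, int weight)
{
	int w = weight >> 8;
	int j;

	for (j = 0; j < width; j++)
	{
		/* premultiply source alpha by the transition weight */
		int a = (alpha_b[j] * w) >> 8;

		/* DSTa = DSTa + (SRCa * (0xFF - DSTa)) >> 8 */
		alpha_a[j] = alpha_a[j] + ((a * (0xFF - alpha_a[j])) >> 8);

		/* DST = ((SRC - DST) * ALPHA + DST * 0xFF) >> 8, applied to both bytes of the pixel */
		dest[2 * j]     = ((src[2 * j]     - dest[2 * j])     * a + dest[2 * j]     * 0xFF) >> 8;
		dest[2 * j + 1] = ((src[2 * j + 1] - dest[2 * j + 1]) * a + dest[2 * j + 1] * 0xFF) >> 8;
	}
}

The SSE2 routine performs this computation 8 pixels per iteration, which is why composite_line_yuv() only dispatches to it when luma is NULL and width > 7, then finishes the remaining width % 8 pixels with the scalar loop.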