From 4b1b33664e401495b19c378953f7b64305026882 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sesse@debian.org>
Date: Sun, 27 Sep 2009 21:58:50 +0200
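Subject: [PATCH] Use SSE intrinsics for the horizontal scaling as well.

The haddps intrinsic (__builtin_ia32_haddps) is only available when GCC
targets SSE3, so the Makefile now builds with -msse3 instead of -msse.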

---
 Makefile    |  2 +-
 libqscale.c | 48 +++++++++++++++++++-----------------------------
 2 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/Makefile b/Makefile
index d199187..06689ed 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 CC=gcc
 OBJS=qscale.o libqscale.o
 
-CFLAGS=-Wall -O2 -msse -fPIC
+CFLAGS=-Wall -O2 -msse3 -fPIC
 LDFLAGS=-lm -ljpeg
 
 qscale: qscale.o libqscale.o
diff --git a/libqscale.c b/libqscale.c
index 9a37853..91a6f78 100644
--- a/libqscale.c
+++ b/libqscale.c
@@ -300,36 +300,26 @@ static void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsi
 		unsigned char ch;
 		for (x = 0; x < nw; ++x) {
 #if USE_HORIZONTAL_SSE
+			v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f };
+			static const v4sf low = { 0.0f, 0.0f, 0.0f, 0.0f };
+			static const v4sf high = { 255.0f, 255.0f, 255.0f, 255.0f };
 			int result;
-			float acc;
-			long tmp;
-			static const float low = 0.0, high = 255.0;
-			__asm__ (
-				"pxor %1, %1               \n"
-				"xor %2, %2                \n"
-				"0:                        \n"
-				"movups (%4,%2),%%xmm1     \n"
-				"movups (%3,%2),%%xmm2     \n"
-				"mulps %%xmm2,%%xmm1       \n"
-				"addps %%xmm1,%1           \n"
-				"add $16,%2                \n"
-				"dec %5                    \n"
-				"jnz 0b                    \n"
-				"haddps %1,%1              \n"
-				"haddps %1,%1              \n"
-				"maxss %6,%1               \n"
-				"minss %7,%1               \n"
-				"cvtss2si %1,%0            \n"
-				: "=r" (result),
-				  "=&x" (acc),
-				  "=&r" (tmp)
-				: "r" (&coeffs[pd[x].startcoeff]),
-				  "r" (&sptr[pd[x].start]),
-				  "r" ((pd[x].end - pd[x].start + 1)/4),
-				  "m" (low),
-				  "m" (high)
-				: "memory", "xmm1", "xmm2"
-			);
+			int i;
+		
+			const float *sptr_xmm = &sptr[pd[x].start];
+			const float *coeffptr = &coeffs[pd[x].startcoeff];
+			const int filter_len = (pd[x].end - pd[x].start + 1) / 4;
+
+			for (i = 0; i < filter_len; ++i) {
+				v4sf pixels = __builtin_ia32_loadups(&sptr_xmm[i * 4]);
+				v4sf coeffs = __builtin_ia32_loadups(&coeffptr[i * 4]);
+				acc = __builtin_ia32_addps(acc, __builtin_ia32_mulps(pixels, coeffs));
+			}
+			acc = __builtin_ia32_haddps(acc, acc);	
+			acc = __builtin_ia32_haddps(acc, acc);
+			acc = __builtin_ia32_maxss(acc, low);
+			acc = __builtin_ia32_minss(acc, high);
+			result = __builtin_ia32_cvtss2si(acc);
 
 			*dptr++ = (unsigned char)result;
 #else
-- 
2.39.2