Use SSE intrinsics for the horizontal scaling as well.

author Steinar H. Gunderson <sesse@debian.org>

Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)

committer Steinar H. Gunderson <sesse@debian.org>

Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
author Steinar H. Gunderson <sesse@debian.org>
Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
committer Steinar H. Gunderson <sesse@debian.org>
Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
diff --git a/Makefile b/Makefile

index d19918720fc270ede45c25cb1b17fe81bbb78c0e..06689ed6d9472900e4f4f0737683af506ceafccd 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
  CC=gcc
  OBJS=qscale.o libqscale.o
  
-CFLAGS=-Wall -O2 -msse -fPIC
+CFLAGS=-Wall -O2 -msse3 -fPIC
  LDFLAGS=-lm -ljpeg
  
  qscale: qscale.o libqscale.o
diff --git a/libqscale.c b/libqscale.c

index 9a37853f0d15e8f5ecde44a45d3088dd904be0f7..91a6f788063f53aa2bbf48d2bdb3b6557e2481cc 100644 (file)
--- a/libqscale.c
+++ b/libqscale.c
@@ -300,36 +300,26 @@ static void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsi
                 unsigned char ch;
                 for (x = 0; x < nw; ++x) {
  #if USE_HORIZONTAL_SSE
+                       v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       static const v4sf low = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       static const v4sf high = { 255.0f, 255.0f, 255.0f, 255.0f };
                         int result;
-                       float acc;
-                       long tmp;
-                       static const float low = 0.0, high = 255.0;
-                       __asm__ (
-                               "pxor %1, %1               \n"
-                               "xor %2, %2                \n"
-                               "0:                        \n"
-                               "movups (%4,%2),%%xmm1     \n"
-                               "movups (%3,%2),%%xmm2     \n"
-                               "mulps %%xmm2,%%xmm1       \n"
-                               "addps %%xmm1,%1           \n"
-                               "add $16,%2                \n"
-                               "dec %5                    \n"
-                               "jnz 0b                    \n"
-                               "haddps %1,%1              \n"
-                               "haddps %1,%1              \n"
-                               "maxss %6,%1               \n"
-                               "minss %7,%1               \n"
-                               "cvtss2si %1,%0            \n"
-                               : "=r" (result),
-                                 "=&x" (acc),
-                                 "=&r" (tmp)
-                               : "r" (&coeffs[pd[x].startcoeff]),
-                                 "r" (&sptr[pd[x].start]),
-                                 "r" ((pd[x].end - pd[x].start + 1)/4),
-                                 "m" (low),
-                                 "m" (high)
-                               : "memory", "xmm1", "xmm2"
-                       );
+                       int i;
+               
+                       const float *sptr_xmm = &sptr[pd[x].start];
+                       const float *coeffptr = &coeffs[pd[x].startcoeff];
+                       const int filter_len = (pd[x].end - pd[x].start + 1) / 4;
+
+                       for (i = 0; i < filter_len; ++i) {
+                               v4sf pixels = __builtin_ia32_loadups(&sptr_xmm[i * 4]);
+                               v4sf coeffs = __builtin_ia32_loadups(&coeffptr[i * 4]);
+                               acc = __builtin_ia32_addps(acc, __builtin_ia32_mulps(pixels, coeffs));
+                       }
+                       acc = __builtin_ia32_haddps(acc, acc);  
+                       acc = __builtin_ia32_haddps(acc, acc);
+                       acc = __builtin_ia32_maxss(acc, low);
+                       acc = __builtin_ia32_minss(acc, high);
+                       result = __builtin_ia32_cvtss2si(acc);
  
                         *dptr++ = (unsigned char)result;
  #else
author	Steinar H. Gunderson <sesse@debian.org>
	Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
committer	Steinar H. Gunderson <sesse@debian.org>
	Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
Makefile		patch | blob | history
libqscale.c		patch | blob | history