git.sesse.net Git - qscale/commitdiff
Use SSE intrinsics for the horizontal scaling as well.
author Steinar H. Gunderson <sesse@debian.org>
Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
committer Steinar H. Gunderson <sesse@debian.org>
Sun, 27 Sep 2009 19:58:50 +0000 (21:58 +0200)
Makefile
libqscale.c

index d19918720fc270ede45c25cb1b17fe81bbb78c0e..06689ed6d9472900e4f4f0737683af506ceafccd 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 CC=gcc
 OBJS=qscale.o libqscale.o
 
-CFLAGS=-Wall -O2 -msse -fPIC
+CFLAGS=-Wall -O2 -msse3 -fPIC
 LDFLAGS=-lm -ljpeg
 
 qscale: qscale.o libqscale.o
index 9a37853f0d15e8f5ecde44a45d3088dd904be0f7..91a6f788063f53aa2bbf48d2bdb3b6557e2481cc 100644 (file)
--- a/libqscale.c
+++ b/libqscale.c
@@ -300,36 +300,26 @@ static void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsi
                unsigned char ch;
                for (x = 0; x < nw; ++x) {
 #if USE_HORIZONTAL_SSE
+                       v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       static const v4sf low = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       static const v4sf high = { 255.0f, 255.0f, 255.0f, 255.0f };
                        int result;
-                       float acc;
-                       long tmp;
-                       static const float low = 0.0, high = 255.0;
-                       __asm__ (
-                               "pxor %1, %1               \n"
-                               "xor %2, %2                \n"
-                               "0:                        \n"
-                               "movups (%4,%2),%%xmm1     \n"
-                               "movups (%3,%2),%%xmm2     \n"
-                               "mulps %%xmm2,%%xmm1       \n"
-                               "addps %%xmm1,%1           \n"
-                               "add $16,%2                \n"
-                               "dec %5                    \n"
-                               "jnz 0b                    \n"
-                               "haddps %1,%1              \n"
-                               "haddps %1,%1              \n"
-                               "maxss %6,%1               \n"
-                               "minss %7,%1               \n"
-                               "cvtss2si %1,%0            \n"
-                               : "=r" (result),
-                                 "=&x" (acc),
-                                 "=&r" (tmp)
-                               : "r" (&coeffs[pd[x].startcoeff]),
-                                 "r" (&sptr[pd[x].start]),
-                                 "r" ((pd[x].end - pd[x].start + 1)/4),
-                                 "m" (low),
-                                 "m" (high)
-                               : "memory", "xmm1", "xmm2"
-                       );
+                       int i;
+               
+                       const float *sptr_xmm = &sptr[pd[x].start];
+                       const float *coeffptr = &coeffs[pd[x].startcoeff];
+                       const int filter_len = (pd[x].end - pd[x].start + 1) / 4;
+
+                       for (i = 0; i < filter_len; ++i) {
+                               v4sf pixels = __builtin_ia32_loadups(&sptr_xmm[i * 4]);
+                               v4sf coeffs = __builtin_ia32_loadups(&coeffptr[i * 4]);
+                               acc = __builtin_ia32_addps(acc, __builtin_ia32_mulps(pixels, coeffs));
+                       }
+                       acc = __builtin_ia32_haddps(acc, acc);  
+                       acc = __builtin_ia32_haddps(acc, acc);
+                       acc = __builtin_ia32_maxss(acc, low);
+                       acc = __builtin_ia32_minss(acc, high);
+                       result = __builtin_ia32_cvtss2si(acc);
 
                        *dptr++ = (unsigned char)result;
 #else