From 4b1b33664e401495b19c378953f7b64305026882 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 27 Sep 2009 21:58:50 +0200 Subject: [PATCH] Use SSE intrinsics for the horizontal scaling as well. --- Makefile | 2 +- libqscale.c | 48 +++++++++++++++++++----------------------------- 2 files changed, 20 insertions(+), 30 deletions(-) diff --git a/Makefile b/Makefile index d199187..06689ed 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC=gcc OBJS=qscale.o libqscale.o -CFLAGS=-Wall -O2 -msse -fPIC +CFLAGS=-Wall -O2 -msse3 -fPIC LDFLAGS=-lm -ljpeg qscale: qscale.o libqscale.o diff --git a/libqscale.c b/libqscale.c index 9a37853..91a6f78 100644 --- a/libqscale.c +++ b/libqscale.c @@ -300,36 +300,26 @@ static void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsi unsigned char ch; for (x = 0; x < nw; ++x) { #if USE_HORIZONTAL_SSE + v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f }; + static const v4sf low = { 0.0f, 0.0f, 0.0f, 0.0f }; + static const v4sf high = { 255.0f, 255.0f, 255.0f, 255.0f }; int result; - float acc; - long tmp; - static const float low = 0.0, high = 255.0; - __asm__ ( - "pxor %1, %1 \n" - "xor %2, %2 \n" - "0: \n" - "movups (%4,%2),%%xmm1 \n" - "movups (%3,%2),%%xmm2 \n" - "mulps %%xmm2,%%xmm1 \n" - "addps %%xmm1,%1 \n" - "add $16,%2 \n" - "dec %5 \n" - "jnz 0b \n" - "haddps %1,%1 \n" - "haddps %1,%1 \n" - "maxss %6,%1 \n" - "minss %7,%1 \n" - "cvtss2si %1,%0 \n" - : "=r" (result), - "=&x" (acc), - "=&r" (tmp) - : "r" (&coeffs[pd[x].startcoeff]), - "r" (&sptr[pd[x].start]), - "r" ((pd[x].end - pd[x].start + 1)/4), - "m" (low), - "m" (high) - : "memory", "xmm1", "xmm2" - ); + int i; + + const float *sptr_xmm = &sptr[pd[x].start]; + const float *coeffptr = &coeffs[pd[x].startcoeff]; + const int filter_len = (pd[x].end - pd[x].start + 1) / 4; + + for (i = 0; i < filter_len; ++i) { + v4sf pixels = __builtin_ia32_loadups(&sptr_xmm[i * 4]); + v4sf coeffs = __builtin_ia32_loadups(&coeffptr[i * 4]); + acc = __builtin_ia32_addps(acc, __builtin_ia32_mulps(pixels, coeffs)); + } + acc = __builtin_ia32_haddps(acc, acc); + acc = __builtin_ia32_haddps(acc, acc); + acc = __builtin_ia32_maxss(acc, low); + acc = __builtin_ia32_minss(acc, high); + result = __builtin_ia32_cvtss2si(acc); *dptr++ = (unsigned char)result; #else -- 2.39.2