From f9fa8c079cf510206fa4f3cf37f0bf7208c0ca85 Mon Sep 17 00:00:00 2001 From: "sgunderson@bigfoot.com" <> Date: Mon, 4 Feb 2008 02:11:09 +0100 Subject: [PATCH] Make the SSE code optional. --- qscale.c | 65 ++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/qscale.c b/qscale.c index ca7c7dd..d89c8b9 100644 --- a/qscale.c +++ b/qscale.c @@ -23,8 +23,20 @@ #include #include "jpeglib.h" +/* The number of pixels to process at a time when scaling vertically. */ #define CACHE_LINE_FACTOR 16 +/* Whether to use SSE for horizontal scaling or not (requires SSE3). */ +#define USE_HORIZONTAL_SSE 1 + +/* Whether to use SSE for vertical scaling or not (requires only SSE1). */ +#define USE_VERTICAL_SSE 1 + +#if USE_VERTICAL_SSE +#undef CACHE_LINE_FACTOR +#define CACHE_LINE_FACTOR 16 +#endif + #ifndef M_PI #define M_PI 3.14159265358979323846264 #endif @@ -84,6 +96,7 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw end = w - 1; } +#if USE_HORIZONTAL_SSE /* round up so we get a multiple of four for the SSE code */ int num = (end - start + 1); if (num % 4 != 0) { @@ -96,6 +109,7 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw end += 4 - (num % 4); } } +#endif pd[x].start = start; pd[x].end = end; @@ -123,6 +137,7 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw unsigned char *dptr = npix + y*dstride; unsigned char ch; for (x = 0; x < nw; ++x) { +#if USE_HORIZONTAL_SSE int result; float acc; long tmp; @@ -154,7 +169,16 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw : "memory", "xmm1", "xmm2" ); -#if 0 + *dptr++ = (unsigned char)result; +#else + float acc = 0.0; + float *cf = &coeffs[pd[x].startcoeff]; + unsigned sx; + + for (sx = pd[x].start; sx <= pd[x].end; ++sx) { + acc += sptr[sx] * *cf++; + } + if (acc < 0.0) ch = 0; else if (acc > 255.0) @@ -163,7 +187,6 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw ch = (unsigned char)acc; *dptr++ = ch; #endif - *dptr++ = (unsigned char)result; } ch = dptr[-1]; for ( ; x < dstride; ++x) { @@ -221,25 +244,7 @@ void vscale(unsigned char *pix, float *npix, unsigned w, unsigned h, unsigned nh unsigned char *sptr = pix + x; float *dptr = npix + x; for (y = 0; y < nh; ++y) { -#if 0 - int i; - float acc[CACHE_LINE_FACTOR]; - for (i = 0; i < CACHE_LINE_FACTOR; ++i) - acc[i] = 0.0; - float *cf = &coeffs[pd[y].startcoeff]; - unsigned sy; - - for (sy = pd[y].start; sy <= pd[y].end; ++sy) { - for (i = 0; i < CACHE_LINE_FACTOR; ++i) { - acc[i] += sptr[sy * w + i] * *cf; - } - ++cf; - } - - for (i = 0; i < CACHE_LINE_FACTOR; ++i) { - dptr[i] = acc[i]; - } -#else +#if USE_VERTICAL_SSE /* * xmm0 - xmm3: acc[0..15] * xmm4: current filter coefficient @@ -327,6 +332,24 @@ void vscale(unsigned char *pix, float *npix, unsigned w, unsigned h, unsigned nh "r" (dptr) /* 4: dstptr base */ : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); +#else + int i; + float acc[CACHE_LINE_FACTOR]; + for (i = 0; i < CACHE_LINE_FACTOR; ++i) + acc[i] = 0.0; + float *cf = &coeffs[pd[y].startcoeff]; + unsigned sy; + + for (sy = pd[y].start; sy <= pd[y].end; ++sy) { + for (i = 0; i < CACHE_LINE_FACTOR; ++i) { + acc[i] += sptr[sy * w + i] * *cf; + } + ++cf; + } + + for (i = 0; i < CACHE_LINE_FACTOR; ++i) { + dptr[i] = acc[i]; + } #endif dptr += dstride; } -- 2.39.5