git.sesse.net Git - qscale/commitdiff
Use SSE intrinsics instead of asm() for vertical SSE, finally offloading the register...
author Steinar H. Gunderson <sesse@debian.org>
Sun, 27 Sep 2009 19:58:13 +0000 (21:58 +0200)
committer Steinar H. Gunderson <sesse@debian.org>
Sun, 27 Sep 2009 19:58:13 +0000 (21:58 +0200)
README
libqscale.c

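The change replaces a hand-written asm() block, with xmm0-xmm7 assigned by
hand, by GCC vector types and __builtin_ia32_* intrinsics, so the compiler
handles register allocation itself. As a rough standalone sketch of that
pattern (illustration only, not code from the commit; assumes GCC on x86 with
-msse), a replicated-coefficient multiply-accumulate could look like this:

#include <stdio.h>

typedef float v4sf __attribute__((vector_size(16)));

/* Multiply four packed floats by a replicated coefficient and accumulate;
 * the compiler is free to pick whichever SSE registers it likes. */
static v4sf muladd(v4sf acc, v4sf src, float coeff)
{
        v4sf c = { coeff, coeff, coeff, coeff };
        return __builtin_ia32_addps(acc, __builtin_ia32_mulps(src, c));
}

int main(void)
{
        union { v4sf v; float f[4]; } out;
        v4sf acc = { 0.0f, 0.0f, 0.0f, 0.0f };
        v4sf src = { 1.0f, 2.0f, 3.0f, 4.0f };

        out.v = muladd(acc, src, 0.5f);
        printf("%f %f %f %f\n", out.f[0], out.f[1], out.f[2], out.f[3]);
        return 0;
}

The diff below uses the same addps/mulps builtins, plus punpck*/cvtdq2ps
steps to unpack 16 source bytes into four float vectors per iteration.
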
diff --git a/README b/README
index 98b88c3f406aba47113484f73520ebdf7cd11d41..be6feb972d34f9ede69615b1eb5059712ebeb6b0 100644 (file)
--- a/README
+++ b/README
@@ -75,9 +75,6 @@ The following optimizations are possible but not done (yet?):
    filter kernel is large (ie. large amounts of scaling).
  - qscale does not use multiple processors or cores (although different cores 
    can of course work on different images at the same time).
- - qscale does not make very good use of the extra eight SSE registers found
-   on 64-bit x86-compatible (usually called amd64 or x86-64) machines. In
-   fact, out of the box it might not even compile on such machines.
 
 Finally, a word of advice: Even the fastest scaler can never be as fast as not
 scaling. If you can do less work, or move it to a time where it is less user
diff --git a/libqscale.c b/libqscale.c
index 47b6fe8959c8d0db09545c0f3448a012658d6dde..9a37853f0d15e8f5ecde44a45d3088dd904be0f7 100644 (file)
--- a/libqscale.c
+++ b/libqscale.c
 #define M_PI 3.14159265358979323846264
 #endif
 
+#if USE_VERTICAL_SSE || USE_HORIZONTAL_SSE
+typedef float v4sf __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef char v16qi __attribute__((vector_size(16)));
+#endif
+
 qscale_img *qscale_load_jpeg(const char *filename)
 {
        FILE *file = fopen(filename, "rb");
@@ -411,93 +418,51 @@ static void vscale(unsigned char *pix, float *npix, unsigned w, unsigned h, unsi
                float *dptr = npix + x;
                for (y = 0; y < nh; ++y) {
 #if USE_VERTICAL_SSE
-                       /*
-                        * xmm0 - xmm3: acc[0..15]
-                        * xmm4: current filter coefficient
-                        * xmm5, xmm6, xmm7: scratchpad
-                        */
-                       __asm__ (
-                               /* clear */
-                               "pxor %%xmm0, %%xmm0          \n"
-                               "pxor %%xmm1, %%xmm1          \n"
-                               "pxor %%xmm2, %%xmm2          \n"
-                               "pxor %%xmm3, %%xmm3          \n"
-
-                               /* main loop */
-                               "0:                           \n"
-                               
-                               /* a zero is useful during unpacking */
-                               "pxor %%xmm4, %%xmm4          \n"
-                               
-                               /* fetch all 16 source bytes */
-                               "movups (%0), %%xmm5          \n"
-                               "prefetcht0 (%0,%3,4)         \n"
-
-                               /* unpack into words (xmm5, xmm7) */
-                               "movaps %%xmm5, %%xmm7        \n"
-                               "punpcklbw %%xmm4, %%xmm5     \n"
-                               "punpckhbw %%xmm4, %%xmm7     \n"
-
-                               /* unpack xmm5 into dwords (xmm5, xmm6) */
-                               "movaps %%xmm5, %%xmm6        \n"
-                               "punpcklwd %%xmm4, %%xmm5     \n"
-                               "punpckhwd %%xmm4, %%xmm6     \n"
-
-                               /* convert xmm5, xmm6 to floats */
-                               "cvtdq2ps %%xmm5, %%xmm5      \n"
-                               "cvtdq2ps %%xmm6, %%xmm6      \n"
-
-                               /* fetch the coefficient */
-                               "movss (%2), %%xmm4           \n"
-                               "shufps $0x0, %%xmm4, %%xmm4  \n"
-
-                               /* do the muls for xmm5 and xmm6 */
-                               "mulps %%xmm4, %%xmm5         \n"
-                               "mulps %%xmm4, %%xmm6         \n"
-                               "addps %%xmm5, %%xmm0         \n"
-                               "addps %%xmm6, %%xmm1         \n"
-
-                               /* get the zero back again */
-                               "pxor %%xmm4, %%xmm4          \n"
-
-                               /* unpack xmm7 into dwords (xmm7, xmm6) */
-                               "movaps %%xmm7, %%xmm6        \n"
-                               "punpcklwd %%xmm4, %%xmm7     \n"
-                               "punpckhwd %%xmm4, %%xmm6     \n"
-
-                               /* convert xmm7, xmm6 to floats */
-                               "cvtdq2ps %%xmm7, %%xmm7      \n"
-                               "cvtdq2ps %%xmm6, %%xmm6      \n"
-
-                               /* fetch the coefficient */
-                               "movss (%2), %%xmm4           \n"
-                               "shufps $0x0, %%xmm4, %%xmm4  \n"
-
-                               /* do the second set of muls */
-                               "mulps %%xmm4, %%xmm7         \n"
-                               "mulps %%xmm4, %%xmm6         \n"
-                               "addps %%xmm7, %%xmm2         \n"
-                               "addps %%xmm6, %%xmm3         \n"
-
-                               /* move along, and loop */
-                               "add $4, %2                   \n"
-                               "add %3, %0                   \n"
-                               "dec %1                       \n"
-                               "jnz 0b                       \n"
-
-                               /* store the values */
-                               "movaps %%xmm0, (%4)          \n"
-                               "movaps %%xmm1, 16(%4)        \n"
-                               "movaps %%xmm2, 32(%4)        \n"
-                               "movaps %%xmm3, 48(%4)        \n"
-                               : :
-                               "r" (&sptr[pd[y].start * w]),        /* 0: srcptr base */
-                               "r" (pd[y].end - pd[y].start + 1),   /* 1: filter len */
-                               "r" (&coeffs[pd[y].startcoeff]),     /* 2: coeffs base */
-                               "r" ((long)w),                       /* 3: stride */
-                               "r" (dptr)                           /* 4: dstptr base */
-                               : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-                       );
+                       /* A zero is useful during unpacking. */
+                       static const v4sf zero = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       const unsigned char *sptr_xmm = &sptr[pd[y].start * w];
+                       const float *coeffptr = &coeffs[pd[y].startcoeff];
+                       const int filter_len = pd[y].end - pd[y].start + 1;
+                       int i;
+
+                       v4sf acc0 = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       v4sf acc1 = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       v4sf acc2 = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       v4sf acc3 = { 0.0f, 0.0f, 0.0f, 0.0f };
+                       
+                       for (i = 0; i < filter_len; ++i, ++coeffptr, sptr_xmm += w) {
+                               __builtin_prefetch(sptr_xmm + w, 0);
+                               v16qi src = (v16qi)__builtin_ia32_loadups((float*)sptr_xmm);
+
+                               // unpack into words
+                               v8hi src_lo = (v8hi)__builtin_ia32_punpcklbw128(src, (v16qi)zero);
+                               v8hi src_hi = (v8hi)__builtin_ia32_punpckhbw128(src, (v16qi)zero);
+
+                               // unpack into dwords, convert to floats
+                               v4si src0_i = (v4si)__builtin_ia32_punpcklwd128(src_lo, (v8hi)zero);
+                               v4si src1_i = (v4si)__builtin_ia32_punpckhwd128(src_lo, (v8hi)zero);
+                               v4si src2_i = (v4si)__builtin_ia32_punpcklwd128(src_hi, (v8hi)zero);
+                               v4si src3_i = (v4si)__builtin_ia32_punpckhwd128(src_hi, (v8hi)zero);
+
+                               v4sf src0 = __builtin_ia32_cvtdq2ps(src0_i);
+                               v4sf src1 = __builtin_ia32_cvtdq2ps(src1_i);
+                               v4sf src2 = __builtin_ia32_cvtdq2ps(src2_i);
+                               v4sf src3 = __builtin_ia32_cvtdq2ps(src3_i);
+                       
+                               // fetch the coefficient, and replicate it
+                               v4sf coeff = { *coeffptr, *coeffptr, *coeffptr, *coeffptr };
+
+                               // do the actual muladds
+                               acc0 = __builtin_ia32_addps(acc0, __builtin_ia32_mulps(src0, coeff));
+                               acc1 = __builtin_ia32_addps(acc1, __builtin_ia32_mulps(src1, coeff));
+                               acc2 = __builtin_ia32_addps(acc2, __builtin_ia32_mulps(src2, coeff));
+                               acc3 = __builtin_ia32_addps(acc3, __builtin_ia32_mulps(src3, coeff));
+                       }
+
+                       *(v4sf *)(&dptr[0]) = acc0;
+                       *(v4sf *)(&dptr[4]) = acc1;
+                       *(v4sf *)(&dptr[8]) = acc2;
+                       *(v4sf *)(&dptr[12]) = acc3;
 #else
                        int i;
                        float acc[CACHE_LINE_FACTOR];