X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=qscale.c;h=0f9598f32a29de1fed6b8ba694802a4dae1810a1;hb=544e75829b0ab9b74f7fe2539801c168c4ab531d;hp=7943f90bc58f5dc8aa372ea5c17c9ee52d3f031e;hpb=90ce069cb73a68a368113ff1e7e33d658c9d44cb;p=qscale diff --git a/qscale.c b/qscale.c index 7943f90..0f9598f 100644 --- a/qscale.c +++ b/qscale.c @@ -101,32 +101,35 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw unsigned char *dptr = npix + y*dstride; unsigned char ch; for (x = 0; x < nw; ++x) { + int result; float acc; - int tmp; + long tmp; static const float low = 0.0, high = 255.0; asm ( - "pxor %0, %0 \n" - "xor %1, %1 \n" + "pxor %1, %1 \n" + "xor %2, %2 \n" ".lbl2: \n" - "movups (%3,%1),%%xmm1 \n" - "movups (%2,%1),%%xmm2 \n" + "movups (%4,%2),%%xmm1 \n" + "movups (%3,%2),%%xmm2 \n" "mulps %%xmm2,%%xmm1 \n" - "addps %%xmm1,%0 \n" - "addl $16,%1 \n" - "dec %4 \n" + "addps %%xmm1,%1 \n" + "add $16,%2 \n" + "dec %5 \n" "jnz .lbl2 \n" - "haddps %0,%0 \n" - "haddps %0,%0 \n" - "maxss %5,%0 \n" - "minss %6,%0 \n" - : "=&x" (acc), + "haddps %1,%1 \n" + "haddps %1,%1 \n" + "maxss %6,%1 \n" + "minss %7,%1 \n" + "cvtss2si %1,%0 \n" + : "=r" (result), + "=&x" (acc), "=&r" (tmp) : "r" (&coeffs[pd[x].startcoeff]), "r" (&sptr[pd[x].start]), "r" ((pd[x].end - pd[x].start + 1)/4), "m" (low), "m" (high) - : "xmm1", "xmm2" + : "memory", "xmm1", "xmm2" ); #if 0 @@ -138,7 +141,7 @@ void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw ch = (unsigned char)acc; *dptr++ = ch; #endif - *dptr++ = (unsigned char)acc; + *dptr++ = (unsigned char)result; } ch = dptr[-1]; for ( ; x < dstride; ++x) { @@ -222,83 +225,83 @@ void vscale(unsigned char *pix, float *npix, unsigned w, unsigned h, unsigned nh */ asm ( /* clear */ - "pxor %%xmm0, %%xmm0 \n" - "pxor %%xmm1, %%xmm1 \n" - "pxor %%xmm2, %%xmm2 \n" - "pxor %%xmm3, %%xmm3 \n" + "pxor %%xmm0, %%xmm0 \n" + "pxor %%xmm1, %%xmm1 \n" + "pxor %%xmm2, %%xmm2 \n" + "pxor %%xmm3, %%xmm3 \n" /* main loop */ - ".lbl: \n" + ".lbl: \n" /* a zero is useful during unpacking */ - "pxor %%xmm4, %%xmm4 \n" + "pxor %%xmm4, %%xmm4 \n" /* fetch all 16 source bytes */ - "movups (%0), %%xmm5 \n" - "prefetcht0 (%0,%3,4) \n" + "movups (%0), %%xmm5 \n" + "prefetcht0 (%0,%3,4) \n" /* unpack into words (xmm5, xmm7) */ - "movaps %%xmm5, %%xmm7 \n" - "punpcklbw %%xmm4, %%xmm5 \n" - "punpckhbw %%xmm4, %%xmm7 \n" + "movaps %%xmm5, %%xmm7 \n" + "punpcklbw %%xmm4, %%xmm5 \n" + "punpckhbw %%xmm4, %%xmm7 \n" /* unpack xmm5 into dwords (xmm5, xmm6) */ - "movaps %%xmm5, %%xmm6 \n" - "punpcklwd %%xmm4, %%xmm5 \n" - "punpckhwd %%xmm4, %%xmm6 \n" + "movaps %%xmm5, %%xmm6 \n" + "punpcklwd %%xmm4, %%xmm5 \n" + "punpckhwd %%xmm4, %%xmm6 \n" /* convert xmm5, xmm6 to floats */ - "cvtdq2ps %%xmm5, %%xmm5 \n" - "cvtdq2ps %%xmm6, %%xmm6 \n" + "cvtdq2ps %%xmm5, %%xmm5 \n" + "cvtdq2ps %%xmm6, %%xmm6 \n" /* fetch the coefficient */ - "movss (%2), %%xmm4 \n" - "shufps $0x0, %%xmm4, %%xmm4 \n" + "movss (%2), %%xmm4 \n" + "shufps $0x0, %%xmm4, %%xmm4 \n" /* do the muls for xmm5 and xmm6 */ - "mulps %%xmm4, %%xmm5 \n" - "mulps %%xmm4, %%xmm6 \n" - "addps %%xmm5, %%xmm0 \n" - "addps %%xmm6, %%xmm1 \n" + "mulps %%xmm4, %%xmm5 \n" + "mulps %%xmm4, %%xmm6 \n" + "addps %%xmm5, %%xmm0 \n" + "addps %%xmm6, %%xmm1 \n" /* get the zero back again */ - "pxor %%xmm4, %%xmm4 \n" + "pxor %%xmm4, %%xmm4 \n" /* unpack xmm7 into dwords (xmm7, xmm6) */ - "movaps %%xmm7, %%xmm6 \n" - "punpcklwd %%xmm4, %%xmm7 \n" - "punpckhwd %%xmm4, %%xmm6 \n" + "movaps %%xmm7, %%xmm6 \n" + "punpcklwd %%xmm4, %%xmm7 \n" + "punpckhwd %%xmm4, %%xmm6 \n" /* convert xmm7, xmm6 to floats */ - "cvtdq2ps %%xmm7, %%xmm7 \n" - "cvtdq2ps %%xmm6, %%xmm6 \n" + "cvtdq2ps %%xmm7, %%xmm7 \n" + "cvtdq2ps %%xmm6, %%xmm6 \n" /* fetch the coefficient */ - "movss (%2), %%xmm4 \n" - "shufps $0x0, %%xmm4, %%xmm4 \n" + "movss (%2), %%xmm4 \n" + "shufps $0x0, %%xmm4, %%xmm4 \n" /* do the second set of muls */ - "mulps %%xmm4, %%xmm7 \n" - "mulps %%xmm4, %%xmm6 \n" - "addps %%xmm7, %%xmm2 \n" - "addps %%xmm6, %%xmm3 \n" + "mulps %%xmm4, %%xmm7 \n" + "mulps %%xmm4, %%xmm6 \n" + "addps %%xmm7, %%xmm2 \n" + "addps %%xmm6, %%xmm3 \n" /* move along, and loop */ - "add $4, %2 \n" - "add %3, %0 \n" - "dec %1 \n" - "jnz .lbl \n" + "add $4, %2 \n" + "add %3, %0 \n" + "dec %1 \n" + "jnz .lbl \n" /* store the values */ - "movaps %%xmm0, (%4) \n" - "movaps %%xmm1, 16(%4) \n" - "movaps %%xmm2, 32(%4) \n" - "movaps %%xmm3, 48(%4) \n" + "movaps %%xmm0, (%4) \n" + "movaps %%xmm1, 16(%4) \n" + "movaps %%xmm2, 32(%4) \n" + "movaps %%xmm3, 48(%4) \n" : : "r" (&sptr[pd[y].start * w]), /* 0: srcptr base */ "r" (pd[y].end - pd[y].start + 1), /* 1: filter len */ "r" (&coeffs[pd[y].startcoeff]), /* 2: coeffs base */ - "r" (w), /* 3: stride */ + "r" ((long)w), /* 3: stride */ "r" (dptr) /* 4: dstptr base */ : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); @@ -363,8 +366,6 @@ int main(int argc, char **argv) unsigned w2 = dinfo.image_width * dinfo.comp_info[2].h_samp_factor / dinfo.max_h_samp_factor; unsigned h2 = dinfo.image_height * dinfo.comp_info[2].v_samp_factor / dinfo.max_v_samp_factor; - fprintf(stderr, "h0 = %u, h1=%u, h2=%u\n", h0, h1, h2); - fprintf(stderr, "Scaling using Lanczos filter:\n"); fprintf(stderr, " Y component: %ux%u -> %ux%u\n", dinfo.comp_info[0].width_in_blocks * DCTSIZE, dinfo.comp_info[0].height_in_blocks * DCTSIZE, nw0, nh0); fprintf(stderr, " Cb component: %ux%u -> %ux%u\n", dinfo.comp_info[1].width_in_blocks * DCTSIZE, dinfo.comp_info[1].height_in_blocks * DCTSIZE, nw1, nh1);