From 0ab8a3ee1805b253d0dbc3a20bcb0c751bfccc74 Mon Sep 17 00:00:00 2001 From: "sgunderson@bigfoot.com" <> Date: Sun, 3 Feb 2008 17:12:59 +0100 Subject: [PATCH] Invert directions again, increasing speed but adding bugs. --- qscale.c | 175 +++++++++++++++++++------------------------------------ 1 file changed, 61 insertions(+), 114 deletions(-) diff --git a/qscale.c b/qscale.c index c847a12..8fecd66 100644 --- a/qscale.c +++ b/qscale.c @@ -25,25 +25,22 @@ double lanczos_tap(double x) return sinc(x*M_PI) * sinc(x*M_PI / 3.0); } -struct filter { - struct pix_desc *pd; - float *coeffs; -}; + struct pix_desc { unsigned start, end; unsigned startcoeff; }; -void hscale_calc_filter(struct filter *filter, unsigned w, unsigned h, unsigned nw) +void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw, unsigned dstride) { struct pix_desc *pd = (struct pix_desc *)malloc(nw * sizeof(struct pix_desc)); int size_coeffs = 8; float *coeffs = (float *)malloc(size_coeffs * sizeof(float)); int num_coeffs = 0; - int x, sx; + int x, y, sx; double sf = (double)w / (double)nw; double support = (w > nw) ? (3.0 * sf) : (3.0 / sf); - + /* calculate the filter */ for (x = 0; x < nw; ++x) { int start = ceil(x * sf - support); @@ -77,39 +74,35 @@ void hscale_calc_filter(struct filter *filter, unsigned w, unsigned h, unsigned coeffs[pd[x].startcoeff + sx - start] /= sum; } } - - filter->pd = pd; - filter->coeffs = coeffs; -} - -void hscale(struct filter *filter, unsigned char *pix, float *npix, unsigned w, unsigned h, unsigned nw, unsigned sstride, unsigned dstride) -{ - struct pix_desc *pd = filter->pd; - float *coeffs = filter->coeffs; - int y; - + for (y = 0; y < h; ++y) { - unsigned char *sptr = pix + y*sstride; - float *dptr = npix + y*dstride; - float acc = 0.0; - int x; + float *sptr = pix + y*w; + unsigned char *dptr = npix + y*dstride; + unsigned char ch = 0; for (x = 0; x < nw; ++x) { + float acc = 0.0; float *cf = &coeffs[pd[x].startcoeff]; unsigned sx; - acc = 0.0; for (sx = pd[x].start; sx <= pd[x].end; ++sx) { acc += sptr[sx] * *cf++; } - *dptr++ = acc; + + if (acc < 0.0) + ch = 0; + else if (acc > 255.0) + ch = 255; + else + ch = (unsigned char)acc; + *dptr++ = ch; } for ( ; x < dstride; ++x) { - *dptr++ = acc; + *dptr++ = ch; } } } -void vscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nh, unsigned dstride) +void vscale(unsigned char *pix, float *npix, unsigned w, unsigned h, unsigned nh, unsigned dstride) { struct pix_desc *pd = (struct pix_desc *)malloc(nh * sizeof(struct pix_desc)); int size_coeffs = 8; @@ -155,8 +148,8 @@ void vscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nh #if CACHE_LINE_FACTOR > 1 for (x = 0; x < w; x += CACHE_LINE_FACTOR) { - float *sptr = pix + x; - unsigned char *dptr = npix + x; + unsigned char *sptr = pix + x; + float *dptr = npix + x; for (y = 0; y < nh; ++y) { int i; float acc[CACHE_LINE_FACTOR]; @@ -166,6 +159,7 @@ void vscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nh unsigned sy; for (sy = pd[y].start; sy <= pd[y].end; ++sy) { + //asm volatile ("prefetcht0 %0" :: "m" (sptr[(sy+1) * w])); for (i = 0; i < CACHE_LINE_FACTOR; ++i) { acc[i] += sptr[sy * w + i] * *cf; } @@ -173,14 +167,7 @@ void vscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nh } for (i = 0; i < CACHE_LINE_FACTOR; ++i) { - unsigned char ch; - if (acc[i] < 0.0) - ch = 0; - else if (acc[i] > 255.0) - ch = 255; - else - ch = (unsigned char)acc[i]; - dptr[i] = ch; + dptr[i] = acc[i]; } dptr += dstride; } @@ -189,8 +176,8 @@ void vscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nh #else for (x = 0; x < w; ++x) { #endif - float *sptr = pix + x; - unsigned char *dptr = npix + x; + unsigned char *sptr = pix + x; + float *dptr = npix + x; for (y = 0; y < nh; ++y) { float acc = 0.0; float *cf = &coeffs[pd[y].startcoeff]; @@ -200,14 +187,7 @@ void vscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nh acc += sptr[sy * w] * *cf++; } - unsigned char ch; - if (acc < 0.0) - ch = 0; - else if (acc > 255.0) - ch = 255; - else - ch = (unsigned char)acc; - *dptr = ch; + *dptr = acc; dptr += dstride; } } @@ -244,38 +224,15 @@ int main(int argc, char **argv) fprintf(stderr, " Y component: %ux%u -> %ux%u\n", dinfo.comp_info[0].width_in_blocks * DCTSIZE, dinfo.comp_info[0].height_in_blocks * DCTSIZE, nw0, nh0); fprintf(stderr, " Cb component: %ux%u -> %ux%u\n", dinfo.comp_info[1].width_in_blocks * DCTSIZE, dinfo.comp_info[1].height_in_blocks * DCTSIZE, nw1, nh1); fprintf(stderr, " Cr component: %ux%u -> %ux%u\n", dinfo.comp_info[2].width_in_blocks * DCTSIZE, dinfo.comp_info[2].height_in_blocks * DCTSIZE, nw2, nh2); - - float *npix_y = (float *)malloc((dinfo.comp_info[0].height_in_blocks + dinfo.comp_info[0].v_samp_factor - 1) * DCTSIZE * stride0 * sizeof(float)); - float *npix_cb = (float *)malloc((dinfo.comp_info[1].height_in_blocks + dinfo.comp_info[1].v_samp_factor - 1) * DCTSIZE * stride1 * sizeof(float)); - float *npix_cr = (float *)malloc((dinfo.comp_info[2].height_in_blocks + dinfo.comp_info[2].v_samp_factor - 1) * DCTSIZE * stride2 * sizeof(float)); - JSAMPLE *data_y = (unsigned char *)malloc(nh0 * stride0); - JSAMPLE *data_cb = (unsigned char *)malloc(nh1 * stride1); - JSAMPLE *data_cr = (unsigned char *)malloc(nh2 * stride2); - - struct filter filt0, filt1, filt2; - - hscale_calc_filter(&filt0, - /* w= */ dinfo.image_width * dinfo.comp_info[0].h_samp_factor / dinfo.max_h_samp_factor, - /* h= */ dinfo.comp_info[0].v_samp_factor * DCTSIZE, - /* nw= */ nw0 - ); - hscale_calc_filter(&filt1, - /* w= */ dinfo.image_width * dinfo.comp_info[1].h_samp_factor / dinfo.max_h_samp_factor, - /* h= */ dinfo.comp_info[1].v_samp_factor * DCTSIZE, - /* nw= */ nw1 - ); - hscale_calc_filter(&filt2, - /* w= */ dinfo.image_width * dinfo.comp_info[2].h_samp_factor / dinfo.max_h_samp_factor, - /* h= */ dinfo.comp_info[2].v_samp_factor * DCTSIZE, - /* nw= */ nw2 - ); + + JSAMPLE *data_y = (JSAMPLE*)malloc(dinfo.comp_info[0].height_in_blocks * dinfo.comp_info[0].width_in_blocks * DCTSIZE * DCTSIZE); + JSAMPLE *data_cb = (JSAMPLE*)malloc(dinfo.comp_info[1].height_in_blocks * dinfo.comp_info[1].width_in_blocks * DCTSIZE * DCTSIZE); + JSAMPLE *data_cr = (JSAMPLE*)malloc(dinfo.comp_info[2].height_in_blocks * dinfo.comp_info[2].width_in_blocks * DCTSIZE * DCTSIZE); + JSAMPLE *data_ny, *data_ncb, *data_ncr; int total_lines = 0, blocks = 0; while (total_lines < dinfo.comp_info[0].height_in_blocks * DCTSIZE) { unsigned max_lines = dinfo.max_v_samp_factor * DCTSIZE; - JSAMPLE tmp_y [(dinfo.comp_info[0].width_in_blocks * DCTSIZE) * (DCTSIZE * dinfo.comp_info[0].v_samp_factor)]; - JSAMPLE tmp_cb[(dinfo.comp_info[1].width_in_blocks * DCTSIZE) * (DCTSIZE * dinfo.comp_info[1].v_samp_factor)]; - JSAMPLE tmp_cr[(dinfo.comp_info[2].width_in_blocks * DCTSIZE) * (DCTSIZE * dinfo.comp_info[2].v_samp_factor)]; JSAMPROW y_row_ptrs[max_lines]; JSAMPROW cb_row_ptrs[max_lines]; @@ -284,49 +241,39 @@ int main(int argc, char **argv) int i; for (i = 0; i < max_lines; ++i) { - y_row_ptrs[i] = tmp_y + i * dinfo.comp_info[0].width_in_blocks * DCTSIZE; - cb_row_ptrs[i] = tmp_cb + i * dinfo.comp_info[1].width_in_blocks * DCTSIZE; - cr_row_ptrs[i] = tmp_cr + i * dinfo.comp_info[2].width_in_blocks * DCTSIZE; + y_row_ptrs[i] = data_y + (i+blocks*DCTSIZE*dinfo.comp_info[0].v_samp_factor) * dinfo.comp_info[0].width_in_blocks * DCTSIZE; + cb_row_ptrs[i] = data_cb + (i+blocks*DCTSIZE*dinfo.comp_info[1].v_samp_factor) * dinfo.comp_info[1].width_in_blocks * DCTSIZE; + cr_row_ptrs[i] = data_cr + (i+blocks*DCTSIZE*dinfo.comp_info[2].v_samp_factor) * dinfo.comp_info[2].width_in_blocks * DCTSIZE; } - int lines = jpeg_read_raw_data(&dinfo, ptrs, max_lines); - if (lines == 0) - break; - - hscale(/* filter= */ &filt0, - /* from= */ tmp_y, - /* to= */ npix_y + blocks * DCTSIZE * dinfo.comp_info[0].v_samp_factor * stride0, - /* w= */ dinfo.image_width * dinfo.comp_info[0].h_samp_factor / dinfo.max_h_samp_factor, - /* h= */ dinfo.comp_info[0].v_samp_factor * DCTSIZE, - /* nw= */ nw0, - /* sstride= */ dinfo.comp_info[0].width_in_blocks * DCTSIZE, - /* dstride= */ stride0); - - hscale(/* filter= */ &filt1, - /* from= */ tmp_cb, - /* to= */ npix_cb + blocks * DCTSIZE * dinfo.comp_info[1].v_samp_factor * stride1, - /* w= */ dinfo.image_width * dinfo.comp_info[1].h_samp_factor / dinfo.max_h_samp_factor, - /* h= */ dinfo.comp_info[1].v_samp_factor * DCTSIZE, - /* nw= */ nw1, - /* sstride= */ dinfo.comp_info[1].width_in_blocks * DCTSIZE, - /* dstride= */ stride1); - - hscale(/* filter= */ &filt2, - /* from= */ tmp_cr, - /* to= */ npix_cr + blocks * DCTSIZE * dinfo.comp_info[2].v_samp_factor * stride2, - /* w= */ dinfo.image_width * dinfo.comp_info[2].h_samp_factor / dinfo.max_h_samp_factor, - /* h= */ dinfo.comp_info[2].v_samp_factor * DCTSIZE, - /* nw= */ nw2, - /* sstride= */ dinfo.comp_info[2].width_in_blocks * DCTSIZE, - /* dstride= */ stride2); - total_lines += max_lines; ++blocks; + + if (jpeg_read_raw_data(&dinfo, ptrs, max_lines) == 0) + break; } - vscale(npix_y, data_y, stride0, dinfo.image_height * dinfo.comp_info[0].v_samp_factor / dinfo.max_v_samp_factor, nh0, stride0); - vscale(npix_cb, data_cb, stride1, dinfo.image_height * dinfo.comp_info[1].v_samp_factor / dinfo.max_v_samp_factor, nh1, stride1); - vscale(npix_cr, data_cr, stride2, dinfo.image_height * dinfo.comp_info[2].v_samp_factor / dinfo.max_v_samp_factor, nh2, stride2); + { + float *npix = (float*)malloc(dinfo.comp_info[0].width_in_blocks * DCTSIZE * nh0 * sizeof(float)); + vscale(data_y, npix, dinfo.comp_info[0].width_in_blocks * DCTSIZE, dinfo.comp_info[0].height_in_blocks * DCTSIZE, nh0, dinfo.comp_info[0].width_in_blocks * DCTSIZE); + data_ny = (unsigned char *)malloc(nw0 * stride0); + hscale(npix, data_ny, dinfo.comp_info[0].width_in_blocks * DCTSIZE, nh0, nw0, stride0); + free(npix); + } + { + float *npix = (float*)malloc(dinfo.comp_info[1].width_in_blocks * DCTSIZE * nh1 * sizeof(float)); + vscale(data_cr, npix, dinfo.comp_info[1].width_in_blocks * DCTSIZE, dinfo.comp_info[1].height_in_blocks * DCTSIZE, nh1, dinfo.comp_info[1].width_in_blocks * DCTSIZE); + data_ncr = (unsigned char *)malloc(nw1 * stride1); + hscale(npix, data_ncr, dinfo.comp_info[1].width_in_blocks * DCTSIZE, nh1, nw1, stride1); + free(npix); + } + { + float *npix = (float*)malloc(dinfo.comp_info[2].width_in_blocks * DCTSIZE * nh2 * sizeof(float)); + vscale(data_cb, npix, dinfo.comp_info[2].width_in_blocks * DCTSIZE, dinfo.comp_info[2].height_in_blocks * DCTSIZE, nh2, dinfo.comp_info[2].width_in_blocks * DCTSIZE); + data_ncb = (unsigned char *)malloc(nw2 * stride2); + hscale(npix, data_ncb, dinfo.comp_info[2].width_in_blocks * DCTSIZE, nh2, nw2, stride2); + free(npix); + } jpeg_destroy_decompress(&dinfo); struct jpeg_compress_struct cinfo; @@ -373,9 +320,9 @@ int main(int argc, char **argv) if (crline > nh2 - 1) crline = nh2 - 1; - y_row_ptrs[i] = data_y + yline * stride0; - cb_row_ptrs[i] = data_cb + cbline * stride1; - cr_row_ptrs[i] = data_cr + crline * stride2; + y_row_ptrs[i] = data_ny + yline * stride0; + cb_row_ptrs[i] = data_ncb + cbline * stride1; + cr_row_ptrs[i] = data_ncr + crline * stride2; } total_lines += max_lines; -- 2.39.2