/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/**
 * @file imgresample.c
 * High quality image resampling with polyphase filters .
 */
29 #include "fastmemcpy.h"
32 #define NB_COMPONENTS 3
35 #define NB_PHASES (1 << PHASE_BITS)
37 #define FCENTER 1 /* index of the center of the filter */
38 //#define TEST 1 /* Test it */
40 #define POS_FRAC_BITS 16
41 #define POS_FRAC (1 << POS_FRAC_BITS)
42 /* 6 bits precision is needed for MMX */
45 #define LINE_BUF_HEIGHT (NB_TAPS * 4)
47 struct ImgReSampleContext {
48 int iwidth, iheight, owidth, oheight, topBand, bottomBand, leftBand, rightBand;
50 int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
51 int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
55 static inline int get_phase(int pos)
57 return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
60 /* This function must be optimized */
61 static void h_resample_fast(uint8_t *dst, int dst_width, const uint8_t *src,
62 int src_width, int src_start, int src_incr,
65 int src_pos, phase, sum, i;
70 for(i=0;i<dst_width;i++) {
73 if ((src_pos >> POS_FRAC_BITS) < 0 ||
74 (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
77 s = src + (src_pos >> POS_FRAC_BITS);
78 phase = get_phase(src_pos);
79 filter = filters + phase * NB_TAPS;
81 sum = s[0] * filter[0] +
89 for(j=0;j<NB_TAPS;j++)
90 sum += s[j] * filter[j];
93 sum = sum >> FILTER_BITS;
104 /* This function must be optimized */
105 static void v_resample(uint8_t *dst, int dst_width, const uint8_t *src,
106 int wrap, int16_t *filter)
112 for(i=0;i<dst_width;i++) {
114 sum = s[0 * wrap] * filter[0] +
115 s[1 * wrap] * filter[1] +
116 s[2 * wrap] * filter[2] +
117 s[3 * wrap] * filter[3];
124 for(j=0;j<NB_TAPS;j++) {
125 sum += s1[0] * filter[j];
130 sum = sum >> FILTER_BITS;
143 #include "i386/mmx.h"
145 #define FILTER4(reg) \
147 s = src + (src_pos >> POS_FRAC_BITS);\
148 phase = get_phase(src_pos);\
149 filter = filters + phase * NB_TAPS;\
151 punpcklbw_r2r(mm7, reg);\
152 movq_m2r(*filter, mm6);\
153 pmaddwd_r2r(reg, mm6);\
156 paddd_r2r(mm6, reg);\
157 psrad_i2r(FILTER_BITS, reg);\
158 src_pos += src_incr;\
161 #define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
163 /* XXX: do four pixels at a time */
164 static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
165 const uint8_t *src, int src_width,
166 int src_start, int src_incr, int16_t *filters)
176 while (dst_width >= 4) {
183 packuswb_r2r(mm7, mm0);
184 packuswb_r2r(mm7, mm1);
185 packuswb_r2r(mm7, mm3);
186 packuswb_r2r(mm7, mm2);
198 while (dst_width > 0) {
200 packuswb_r2r(mm7, mm0);
209 static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
210 int wrap, int16_t *filter)
227 while (dst_width >= 4) {
228 movq_m2r(s[0 * wrap], mm0);
229 punpcklbw_r2r(mm7, mm0);
230 movq_m2r(s[1 * wrap], mm1);
231 punpcklbw_r2r(mm7, mm1);
232 movq_m2r(s[2 * wrap], mm2);
233 punpcklbw_r2r(mm7, mm2);
234 movq_m2r(s[3 * wrap], mm3);
235 punpcklbw_r2r(mm7, mm3);
237 pmullw_m2r(coefs[0], mm0);
238 pmullw_m2r(coefs[1], mm1);
239 pmullw_m2r(coefs[2], mm2);
240 pmullw_m2r(coefs[3], mm3);
245 psraw_i2r(FILTER_BITS, mm0);
247 packuswb_r2r(mm7, mm0);
250 *(uint32_t *)dst = tmp.ud[0];
255 while (dst_width > 0) {
256 sum = s[0 * wrap] * filter[0] +
257 s[1 * wrap] * filter[1] +
258 s[2 * wrap] * filter[2] +
259 s[3 * wrap] * filter[3];
260 sum = sum >> FILTER_BITS;
276 vector unsigned char v;
281 vector signed short v;
285 void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
286 int wrap, int16_t *filter)
290 vector unsigned char *tv, tmp, dstv, zero;
291 vec_ss_t srchv[4], srclv[4], fv[4];
292 vector signed short zeros, sumhv, sumlv;
298 The vec_madds later on does an implicit >>15 on the result.
299 Since FILTER_BITS is 8, and we have 15 bits of magnitude in
300 a signed short, we have just enough bits to pre-shift our
301 filter constants <<7 to compensate for vec_madds.
303 fv[i].s[0] = filter[i] << (15-FILTER_BITS);
304 fv[i].v = vec_splat(fv[i].v, 0);
307 zero = vec_splat_u8(0);
308 zeros = vec_splat_s16(0);
312 When we're resampling, we'd ideally like both our input buffers,
313 and output buffers to be 16-byte aligned, so we can do both aligned
314 reads and writes. Sadly we can't always have this at the moment, so
315 we opt for aligned writes, as unaligned writes have a huge overhead.
316 To do this, do enough scalar resamples to get dst 16-byte aligned.
318 i = (-(int)dst) & 0xf;
320 sum = s[0 * wrap] * filter[0] +
321 s[1 * wrap] * filter[1] +
322 s[2 * wrap] * filter[2] +
323 s[3 * wrap] * filter[3];
324 sum = sum >> FILTER_BITS;
325 if (sum<0) sum = 0; else if (sum>255) sum=255;
333 /* Do our altivec resampling on 16 pixels at once. */
334 while(dst_width>=16) {
336 Read 16 (potentially unaligned) bytes from each of
337 4 lines into 4 vectors, and split them into shorts.
338 Interleave the multipy/accumulate for the resample
339 filter with the loads to hide the 3 cycle latency
342 tv = (vector unsigned char *) &s[0 * wrap];
343 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
344 srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
345 srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
346 sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
347 sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
349 tv = (vector unsigned char *) &s[1 * wrap];
350 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
351 srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
352 srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
353 sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
354 sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
356 tv = (vector unsigned char *) &s[2 * wrap];
357 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
358 srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
359 srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
360 sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
361 sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
363 tv = (vector unsigned char *) &s[3 * wrap];
364 tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
365 srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
366 srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
367 sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
368 sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
371 Pack the results into our destination vector,
372 and do an aligned write of that back to memory.
374 dstv = vec_packsu(sumhv, sumlv) ;
375 vec_st(dstv, 0, (vector unsigned char *) dst);
383 If there are any leftover pixels, resample them
384 with the slow scalar method.
387 sum = s[0 * wrap] * filter[0] +
388 s[1 * wrap] * filter[1] +
389 s[2 * wrap] * filter[2] +
390 s[3 * wrap] * filter[3];
391 sum = sum >> FILTER_BITS;
392 if (sum<0) sum = 0; else if (sum>255) sum=255;
401 /* slow version to handle limit cases. Does not need optimisation */
402 static void h_resample_slow(uint8_t *dst, int dst_width,
403 const uint8_t *src, int src_width,
404 int src_start, int src_incr, int16_t *filters)
406 int src_pos, phase, sum, j, v, i;
407 const uint8_t *s, *src_end;
410 src_end = src + src_width;
412 for(i=0;i<dst_width;i++) {
413 s = src + (src_pos >> POS_FRAC_BITS);
414 phase = get_phase(src_pos);
415 filter = filters + phase * NB_TAPS;
417 for(j=0;j<NB_TAPS;j++) {
420 else if (s >= src_end)
424 sum += v * filter[j];
427 sum = sum >> FILTER_BITS;
438 static void h_resample(uint8_t *dst, int dst_width, const uint8_t *src,
439 int src_width, int src_start, int src_incr,
445 n = (0 - src_start + src_incr - 1) / src_incr;
446 h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
449 src_start += n * src_incr;
451 src_end = src_start + dst_width * src_incr;
452 if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
453 n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
459 if ((mm_flags & MM_MMX) && NB_TAPS == 4)
460 h_resample_fast4_mmx(dst, n,
461 src, src_width, src_start, src_incr, filters);
464 h_resample_fast(dst, n,
465 src, src_width, src_start, src_incr, filters);
469 src_start += n * src_incr;
470 h_resample_slow(dst, dst_width,
471 src, src_width, src_start, src_incr, filters);
475 static void component_resample(ImgReSampleContext *s,
476 uint8_t *output, int owrap, int owidth, int oheight,
477 uint8_t *input, int iwrap, int iwidth, int iheight)
479 int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
480 uint8_t *new_line, *src_line;
482 last_src_y = - FCENTER - 1;
483 /* position of the bottom of the filter in the source image */
484 src_y = (last_src_y + NB_TAPS) * POS_FRAC;
485 ring_y = NB_TAPS; /* position in ring buffer */
486 for(y=0;y<oheight;y++) {
487 /* apply horizontal filter on new lines from input if needed */
488 src_y1 = src_y >> POS_FRAC_BITS;
489 while (last_src_y < src_y1) {
490 if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
493 /* handle limit conditions : replicate line (slightly
494 inefficient because we filter multiple times) */
498 } else if (y1 >= iheight) {
501 src_line = input + y1 * iwrap;
502 new_line = s->line_buf + ring_y * owidth;
503 /* apply filter and handle limit cases correctly */
504 h_resample(new_line, owidth,
505 src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
506 &s->h_filters[0][0]);
507 /* handle ring buffer wraping */
508 if (ring_y >= LINE_BUF_HEIGHT) {
509 memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
513 /* apply vertical filter */
514 phase_y = get_phase(src_y);
516 /* desactivated MMX because loss of precision */
517 if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
518 v_resample4_mmx(output, owidth,
519 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
520 &s->v_filters[phase_y][0]);
524 if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
525 v_resample16_altivec(output, owidth,
526 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
527 &s->v_filters[phase_y][0]);
530 v_resample(output, owidth,
531 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
532 &s->v_filters[phase_y][0]);
539 /* XXX: the following filter is quite naive, but it seems to suffice
541 static void build_filter(int16_t *filter, float factor)
544 float x, y, tab[NB_TAPS], norm, mult;
546 /* if upsampling, only need to interpolate, no filter */
550 for(ph=0;ph<NB_PHASES;ph++) {
552 for(i=0;i<NB_TAPS;i++) {
554 x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
563 /* normalize so that an uniform color remains the same */
564 mult = (float)(1 << FILTER_BITS) / norm;
565 for(i=0;i<NB_TAPS;i++) {
566 v = (int)(tab[i] * mult);
567 filter[ph * NB_TAPS + i] = v;
572 ImgReSampleContext *img_resample_init(int owidth, int oheight,
573 int iwidth, int iheight)
575 return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0);
578 ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
579 int iwidth, int iheight,
580 int topBand, int bottomBand,
581 int leftBand, int rightBand)
583 ImgReSampleContext *s;
585 s = av_mallocz(sizeof(ImgReSampleContext));
588 s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
593 s->oheight = oheight;
595 s->iheight = iheight;
596 s->topBand = topBand;
597 s->bottomBand = bottomBand;
598 s->leftBand = leftBand;
599 s->rightBand = rightBand;
601 s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth;
602 s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight;
604 build_filter(&s->h_filters[0][0], (float) owidth / (float) (iwidth - leftBand - rightBand));
605 build_filter(&s->v_filters[0][0], (float) oheight / (float) (iheight - topBand - bottomBand));
613 void img_resample(ImgReSampleContext *s,
614 AVPicture *output, const AVPicture *input)
619 shift = (i == 0) ? 0 : 1;
620 component_resample(s, output->data[i], output->linesize[i],
621 s->owidth >> shift, s->oheight >> shift,
622 input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift),
623 input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift),
624 (s->iheight - s->topBand - s->bottomBand) >> shift);
628 void img_resample_close(ImgReSampleContext *s)
630 av_free(s->line_buf);
636 void *av_mallocz(int size)
640 memset(ptr, 0, size);
644 void av_free(void *ptr)
646 /* XXX: this test should not be needed on most libcs */
654 uint8_t img[XSIZE * YSIZE];
659 uint8_t img1[XSIZE1 * YSIZE1];
660 uint8_t img2[XSIZE1 * YSIZE1];
662 void save_pgm(const char *filename, uint8_t *img, int xsize, int ysize)
665 f=fopen(filename,"w");
666 fprintf(f,"P5\n%d %d\n%d\n", xsize, ysize, 255);
667 fwrite(img,1, xsize * ysize,f);
671 static void dump_filter(int16_t *filter)
675 for(ph=0;ph<NB_PHASES;ph++) {
677 for(i=0;i<NB_TAPS;i++) {
678 printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
688 int main(int argc, char **argv)
690 int x, y, v, i, xsize, ysize;
691 ImgReSampleContext *s;
692 float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
695 /* build test image */
696 for(y=0;y<YSIZE;y++) {
697 for(x=0;x<XSIZE;x++) {
698 if (x < XSIZE/2 && y < YSIZE/2) {
699 if (x < XSIZE/4 && y < YSIZE/4) {
705 } else if (x < XSIZE/4) {
710 } else if (y < XSIZE/4) {
722 if (((x+3) % 4) <= 1 &&
729 } else if (x < XSIZE/2) {
730 v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
731 } else if (y < XSIZE/2) {
732 v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
734 v = ((x + y - XSIZE) * 255) / XSIZE;
736 img[(YSIZE - y) * XSIZE + (XSIZE - x)] = v;
739 save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
740 for(i=0;i<sizeof(factors)/sizeof(float);i++) {
742 xsize = (int)(XSIZE * fact);
743 ysize = (int)((YSIZE - 100) * fact);
744 s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50 ,50, 0, 0);
745 printf("Factor=%0.2f\n", fact);
746 dump_filter(&s->h_filters[0][0]);
747 component_resample(s, img1, xsize, xsize, ysize,
748 img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
749 img_resample_close(s);
751 sprintf(buf, "/tmp/out%d.pgm", i);
752 save_pgm(buf, img1, xsize, ysize);
757 printf("MMX test\n");
759 xsize = (int)(XSIZE * fact);
760 ysize = (int)(YSIZE * fact);
762 s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
763 component_resample(s, img1, xsize, xsize, ysize,
764 img, XSIZE, XSIZE, YSIZE);
767 s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
768 component_resample(s, img2, xsize, xsize, ysize,
769 img, XSIZE, XSIZE, YSIZE);
770 if (memcmp(img1, img2, xsize * ysize) != 0) {
771 fprintf(stderr, "mmx error\n");