]> git.sesse.net Git - qscale/blob - libqscale.c
fa7f2c3f8018c7329632a884eadb9699cc03866c
[qscale] / libqscale.c
1 #include <math.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <malloc.h>
5
6 #include "libqscale.h"
7
8 /* The number of pixels to process at a time when scaling vertically. */
9 #define CACHE_LINE_FACTOR 16
10
11 /* Whether to use SSE for horizontal scaling or not (requires SSE3). */
12 #define USE_HORIZONTAL_SSE 1
13
14 /* Whether to use SSE for vertical scaling or not (requires only SSE1). */
15 #define USE_VERTICAL_SSE 1
16
17 #if USE_VERTICAL_SSE
18 #undef CACHE_LINE_FACTOR
19 #define CACHE_LINE_FACTOR 16
20 #endif
21
22 #ifndef M_PI
23 #define M_PI 3.14159265358979323846264
24 #endif
25
26 qscale_img *qscale_load_jpeg(const char *filename)
27 {
28         FILE *file = fopen(filename, "rb");
29         qscale_img *img;
30         if (file == NULL) {
31                 return NULL;
32         }
33
34         img = qscale_load_jpeg_from_stdio(file);
35
36         fclose(file);
37         return img;
38 }
39
40 qscale_img *qscale_load_jpeg_from_stdio(FILE *file)
41 {
42         qscale_img *img = (qscale_img *)malloc(sizeof(qscale_img));
43         if (img == NULL) {
44                 return NULL;
45         }
46
47         img->data_y = img->data_cb = img->data_cr = NULL;
48
49         /* FIXME: Better error handling here (ie., return NULL). */
50         struct jpeg_decompress_struct dinfo;
51         struct jpeg_error_mgr jerr;
52         dinfo.err = jpeg_std_error(&jerr);
53         jpeg_create_decompress(&dinfo);
54         jpeg_stdio_src(&dinfo, stdin);
55         jpeg_read_header(&dinfo, TRUE);
56         dinfo.raw_data_out = TRUE;
57         jpeg_start_decompress(&dinfo);
58         
59         /* We do not handle anything but YCbCr images (yet?). */
60         if (dinfo.num_components != 3) {
61                 qscale_destroy(img);
62                 return NULL;
63         }
64
65         img->width = dinfo.image_width;
66         img->height = dinfo.image_height;
67
68         img->w0 = dinfo.image_width * dinfo.comp_info[0].h_samp_factor / dinfo.max_h_samp_factor;
69         img->h0 = dinfo.image_height * dinfo.comp_info[0].v_samp_factor / dinfo.max_v_samp_factor;
70
71         img->w1 = dinfo.image_width * dinfo.comp_info[1].h_samp_factor / dinfo.max_h_samp_factor;
72         img->h1 = dinfo.image_height * dinfo.comp_info[1].v_samp_factor / dinfo.max_v_samp_factor;
73
74         img->w2 = dinfo.image_width * dinfo.comp_info[2].h_samp_factor / dinfo.max_h_samp_factor;
75         img->h2 = dinfo.image_height * dinfo.comp_info[2].v_samp_factor / dinfo.max_v_samp_factor;
76
77         img->data_y  = (JSAMPLE*)memalign(16, dinfo.comp_info[0].height_in_blocks * dinfo.comp_info[0].width_in_blocks * DCTSIZE * DCTSIZE);
78         img->data_cb = (JSAMPLE*)memalign(16, dinfo.comp_info[1].height_in_blocks * dinfo.comp_info[1].width_in_blocks * DCTSIZE * DCTSIZE);
79         img->data_cr = (JSAMPLE*)memalign(16, dinfo.comp_info[2].height_in_blocks * dinfo.comp_info[2].width_in_blocks * DCTSIZE * DCTSIZE);
80
81         if (img->data_y == NULL || img->data_cb == NULL || img->data_cr == NULL) {
82                 qscale_destroy(img);
83                 return NULL;
84         }
85
86         int total_lines = 0, blocks = 0;
87         while (total_lines < dinfo.comp_info[0].height_in_blocks * DCTSIZE) {
88                 unsigned max_lines = dinfo.max_v_samp_factor * DCTSIZE;
89
90                 JSAMPROW y_row_ptrs[max_lines];
91                 JSAMPROW cb_row_ptrs[max_lines];
92                 JSAMPROW cr_row_ptrs[max_lines];
93                 JSAMPROW* ptrs[] = { y_row_ptrs, cb_row_ptrs, cr_row_ptrs };
94
95                 int i;
96                 for (i = 0; i < max_lines; ++i) {
97                         y_row_ptrs[i]  = img->data_y  + (i+blocks*DCTSIZE*dinfo.comp_info[0].v_samp_factor) * dinfo.comp_info[0].width_in_blocks * DCTSIZE;
98                         cb_row_ptrs[i] = img->data_cb + (i+blocks*DCTSIZE*dinfo.comp_info[1].v_samp_factor) * dinfo.comp_info[1].width_in_blocks * DCTSIZE;
99                         cr_row_ptrs[i] = img->data_cr + (i+blocks*DCTSIZE*dinfo.comp_info[2].v_samp_factor) * dinfo.comp_info[2].width_in_blocks * DCTSIZE;
100                 }
101
102                 total_lines += max_lines;
103                 ++blocks;
104
105                 if (jpeg_read_raw_data(&dinfo, ptrs, max_lines) == 0)
106                         break;
107         }
108
109         jpeg_destroy_decompress(&dinfo);
110         return img;
111 }
112
113 void qscale_destroy(qscale_img *img)
114 {
115         free(img->data_y);
116         free(img->data_cb);
117         free(img->data_cr);
118         free(img);
119 }
120
121
122 static double sinc(double x)
123 {
124         static const double cutoff = 1.220703668e-4;  /* sqrt(sqrt(eps)) */
125
126         if (abs(x) < cutoff) {
127                 /* For small |x|, use Taylor series instead */
128                 const double x2 = x * x;
129                 const double x4 = x2 * x2;
130
131                 return 1.0 - x2 / 6.0 + x4 / 120.0;
132         } else {
133                 return sin(x) / x;
134         }
135 }
136
137 static double lanczos_tap(double x)
138 {
139         if (x < -3.0 || x > 3.0)
140                 return 0.0;
141         if (x < 0.0)
142                 return sinc(-x*M_PI) * sinc(-x*M_PI / 3.0);
143         else
144                 return sinc(x*M_PI) * sinc(x*M_PI / 3.0);
145 }
146
147 struct pix_desc {
148         unsigned start, end;
149         unsigned startcoeff;
150 };
151
152 static void hscale(float *pix, unsigned char *npix, unsigned w, unsigned h, unsigned nw, unsigned sstride, unsigned dstride)
153 {
154         struct pix_desc *pd = (struct pix_desc *)malloc(nw * sizeof(struct pix_desc));
155         int size_coeffs = 8;
156         float *coeffs = (float *)malloc(size_coeffs * sizeof(float));
157         int num_coeffs = 0;
158         int x, y;
159         double sf = (double)w / (double)nw;
160         double support = (w > nw) ? (3.0 * sf) : (3.0 / sf);
161
162         /* calculate the filter */
163         for (x = 0; x < nw; ++x) {
164                 int start = ceil(x * sf - support);
165                 int end = floor(x * sf + support);
166                 int sx;
167                 double sum = 0.0;
168
169                 if (start < 0) {
170                         start = 0;
171                 }
172                 if (end > w - 1) {
173                         end = w - 1;
174                 }
175
176 #if USE_HORIZONTAL_SSE
177                 /* round up so we get a multiple of four for the SSE code */
178                 int num = (end - start + 1);
179                 if (num % 4 != 0) {
180                         /* prefer aligning it if possible */
181                         if (start % 4 != 0 && start % 4 <= num % 4) {
182                                 num += start % 4;
183                                 start -= start % 4;
184                         }
185                         if (num % 4 != 0) {
186                                 end += 4 - (num % 4);
187                         }
188                 }
189 #endif
190
191                 pd[x].start = start;
192                 pd[x].end = end;
193                 pd[x].startcoeff = num_coeffs;
194
195                 for (sx = start; sx <= end; ++sx) {
196                         double nd = (w > nw) ? (sx/sf - x) : (sx - x*sf);
197                         double f = lanczos_tap(nd);
198                         if (num_coeffs == size_coeffs) {
199                                 size_coeffs <<= 1;
200                                 coeffs = (float *)realloc(coeffs, size_coeffs * sizeof(float));
201                         }
202
203                         coeffs[num_coeffs++] = f;
204                         sum += f;
205                 }
206
207                 for (sx = start; sx <= end; ++sx) {
208                         coeffs[pd[x].startcoeff + sx - start] /= sum;
209                 }
210         }
211
212         for (y = 0; y < h; ++y) {
213                 float *sptr = pix + y*sstride;
214                 unsigned char *dptr = npix + y*dstride;
215                 unsigned char ch;
216                 for (x = 0; x < nw; ++x) {
217 #if USE_HORIZONTAL_SSE
218                         int result;
219                         float acc;
220                         long tmp;
221                         static const float low = 0.0, high = 255.0;
222                         __asm__ (
223                                 "pxor %1, %1               \n"
224                                 "xor %2, %2                \n"
225                                 "0:                        \n"
226                                 "movups (%4,%2),%%xmm1     \n"
227                                 "movups (%3,%2),%%xmm2     \n"
228                                 "mulps %%xmm2,%%xmm1       \n"
229                                 "addps %%xmm1,%1           \n"
230                                 "add $16,%2                \n"
231                                 "dec %5                    \n"
232                                 "jnz 0b                    \n"
233                                 "haddps %1,%1              \n"
234                                 "haddps %1,%1              \n"
235                                 "maxss %6,%1               \n"
236                                 "minss %7,%1               \n"
237                                 "cvtss2si %1,%0            \n"
238                                 : "=r" (result),
239                                   "=&x" (acc),
240                                   "=&r" (tmp)
241                                 : "r" (&coeffs[pd[x].startcoeff]),
242                                   "r" (&sptr[pd[x].start]),
243                                   "r" ((pd[x].end - pd[x].start + 1)/4),
244                                   "m" (low),
245                                   "m" (high)
246                                 : "memory", "xmm1", "xmm2"
247                         );
248
249                         *dptr++ = (unsigned char)result;
250 #else
251                         float acc = 0.0;
252                         float *cf = &coeffs[pd[x].startcoeff];
253                         unsigned sx;
254                         
255                         for (sx = pd[x].start; sx <= pd[x].end; ++sx) {
256                                 acc += sptr[sx] * *cf++;
257                         }
258
259                         if (acc < 0.0)
260                                 ch = 0;
261                         else if (acc > 255.0)
262                                 ch = 255;
263                         else
264                                 ch = (unsigned char)acc;
265                         *dptr++ = ch;
266 #endif
267                 }
268                 ch = dptr[-1];
269                 for ( ; x < dstride; ++x) {
270                         *dptr++ = ch;
271                 }
272         }
273 }
274
275 static void vscale(unsigned char *pix, float *npix, unsigned w, unsigned h, unsigned nh, unsigned dstride)
276 {
277         struct pix_desc *pd = (struct pix_desc *)malloc(nh * sizeof(struct pix_desc));
278         int size_coeffs = 8;
279         float *coeffs = (float *)malloc(size_coeffs * sizeof(float));
280         int num_coeffs = 0;
281         int x, y, sy;
282         double sf = (double)h / (double)nh;
283         double support = (h > nh) ? (3.0 * sf) : (3.0 / sf);
284
285         /* calculate the filter */
286         for (y = 0; y < nh; ++y) {
287                 int start = ceil(y * sf - support);
288                 int end = floor(y * sf + support);
289                 double sum = 0.0;
290
291                 if (start < 0) {
292                         start = 0;
293                 }
294                 if (end > h - 1) {
295                         end = h - 1;
296                 }
297
298                 pd[y].start = start;
299                 pd[y].end = end;
300                 pd[y].startcoeff = num_coeffs;
301
302                 for (sy = start; sy <= end; ++sy) {
303                         double nd = (h > nh) ? (sy/sf - y) : (sy - y*sf);
304                         double f = lanczos_tap(nd);
305                         if (num_coeffs == size_coeffs) {
306                                 size_coeffs <<= 1;
307                                 coeffs = (float *)realloc(coeffs, size_coeffs * sizeof(float));
308                         }
309                         
310                         coeffs[num_coeffs++] = f;
311                         sum += f;
312                 }
313
314                 for (sy = start; sy <= end; ++sy) {
315                         coeffs[pd[y].startcoeff + sy - start] /= sum;
316                 }
317         }
318
319 #if CACHE_LINE_FACTOR > 1
320         for (x = 0; x < (w/CACHE_LINE_FACTOR) * CACHE_LINE_FACTOR; x += CACHE_LINE_FACTOR) {
321                 unsigned char *sptr = pix + x;
322                 float *dptr = npix + x;
323                 for (y = 0; y < nh; ++y) {
324 #if USE_VERTICAL_SSE
325                         /*
326                          * xmm0 - xmm3: acc[0..15]
327                          * xmm4: current filter coefficient
328                          * xmm5, xmm6, xmm7: scratchpad
329                          */
330                         __asm__ (
331                                 /* clear */
332                                 "pxor %%xmm0, %%xmm0          \n"
333                                 "pxor %%xmm1, %%xmm1          \n"
334                                 "pxor %%xmm2, %%xmm2          \n"
335                                 "pxor %%xmm3, %%xmm3          \n"
336
337                                 /* main loop */
338                                 "0:                           \n"
339                                 
340                                 /* a zero is useful during unpacking */
341                                 "pxor %%xmm4, %%xmm4          \n"
342                                 
343                                 /* fetch all 16 source bytes */
344                                 "movups (%0), %%xmm5          \n"
345                                 "prefetcht0 (%0,%3,4)         \n"
346
347                                 /* unpack into words (xmm5, xmm7) */
348                                 "movaps %%xmm5, %%xmm7        \n"
349                                 "punpcklbw %%xmm4, %%xmm5     \n"
350                                 "punpckhbw %%xmm4, %%xmm7     \n"
351
352                                 /* unpack xmm5 into dwords (xmm5, xmm6) */
353                                 "movaps %%xmm5, %%xmm6        \n"
354                                 "punpcklwd %%xmm4, %%xmm5     \n"
355                                 "punpckhwd %%xmm4, %%xmm6     \n"
356
357                                 /* convert xmm5, xmm6 to floats */
358                                 "cvtdq2ps %%xmm5, %%xmm5      \n"
359                                 "cvtdq2ps %%xmm6, %%xmm6      \n"
360
361                                 /* fetch the coefficient */
362                                 "movss (%2), %%xmm4           \n"
363                                 "shufps $0x0, %%xmm4, %%xmm4  \n"
364
365                                 /* do the muls for xmm5 and xmm6 */
366                                 "mulps %%xmm4, %%xmm5         \n"
367                                 "mulps %%xmm4, %%xmm6         \n"
368                                 "addps %%xmm5, %%xmm0         \n"
369                                 "addps %%xmm6, %%xmm1         \n"
370
371                                 /* get the zero back again */
372                                 "pxor %%xmm4, %%xmm4          \n"
373
374                                 /* unpack xmm7 into dwords (xmm7, xmm6) */
375                                 "movaps %%xmm7, %%xmm6        \n"
376                                 "punpcklwd %%xmm4, %%xmm7     \n"
377                                 "punpckhwd %%xmm4, %%xmm6     \n"
378
379                                 /* convert xmm7, xmm6 to floats */
380                                 "cvtdq2ps %%xmm7, %%xmm7      \n"
381                                 "cvtdq2ps %%xmm6, %%xmm6      \n"
382
383                                 /* fetch the coefficient */
384                                 "movss (%2), %%xmm4           \n"
385                                 "shufps $0x0, %%xmm4, %%xmm4  \n"
386
387                                 /* do the second set of muls */
388                                 "mulps %%xmm4, %%xmm7         \n"
389                                 "mulps %%xmm4, %%xmm6         \n"
390                                 "addps %%xmm7, %%xmm2         \n"
391                                 "addps %%xmm6, %%xmm3         \n"
392
393                                 /* move along, and loop */
394                                 "add $4, %2                   \n"
395                                 "add %3, %0                   \n"
396                                 "dec %1                       \n"
397                                 "jnz 0b                       \n"
398
399                                 /* store the values */
400                                 "movaps %%xmm0, (%4)          \n"
401                                 "movaps %%xmm1, 16(%4)        \n"
402                                 "movaps %%xmm2, 32(%4)        \n"
403                                 "movaps %%xmm3, 48(%4)        \n"
404                                 : :
405                                 "r" (&sptr[pd[y].start * w]),        /* 0: srcptr base */
406                                 "r" (pd[y].end - pd[y].start + 1),   /* 1: filter len */
407                                 "r" (&coeffs[pd[y].startcoeff]),     /* 2: coeffs base */
408                                 "r" ((long)w),                       /* 3: stride */
409                                 "r" (dptr)                           /* 4: dstptr base */
410                                 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
411                         );
412 #else
413                         int i;
414                         float acc[CACHE_LINE_FACTOR];
415                         for (i = 0; i < CACHE_LINE_FACTOR; ++i)
416                                 acc[i] = 0.0;
417                         float *cf = &coeffs[pd[y].startcoeff];
418                         unsigned sy;
419                 
420                         for (sy = pd[y].start; sy <= pd[y].end; ++sy) {
421                                 for (i = 0; i < CACHE_LINE_FACTOR; ++i) {
422                                         acc[i] += sptr[sy * w + i] * *cf;
423                                 }
424                                 ++cf;
425                         }
426
427                         for (i = 0; i < CACHE_LINE_FACTOR; ++i) {
428                                 dptr[i] = acc[i];
429                         }
430 #endif
431                         dptr += dstride;
432                 }
433         }
434         for (x = (x/CACHE_LINE_FACTOR)*CACHE_LINE_FACTOR; x < w; ++x) {
435 #else
436         for (x = 0; x < w; ++x) {
437 #endif
438                 unsigned char *sptr = pix + x;
439                 float *dptr = npix + x;
440                 for (y = 0; y < nh; ++y) {
441                         float acc = 0.0;
442                         float *cf = &coeffs[pd[y].startcoeff];
443                         unsigned sy;
444                         
445                         for (sy = pd[y].start; sy <= pd[y].end; ++sy) {
446                                 acc += sptr[sy * w] * *cf++;
447                         }
448
449                         *dptr = acc;
450                         dptr += dstride;
451                 }
452         }
453 }
454
455 qscale_img *qscale_scale(qscale_img *src, unsigned width, unsigned height, unsigned samp_h0, unsigned samp_v0, unsigned samp_h1, unsigned samp_v1, unsigned samp_h2, unsigned samp_v2, enum qscale_scaling_filter scaling_filter)
456 {
457         qscale_img *dst = (qscale_img *)malloc(sizeof(qscale_img));
458         if (dst == NULL) {
459                 return NULL;
460         }
461
462         dst->width = width;
463         dst->height = height;
464
465         unsigned max_samp_h, max_samp_v;
466         max_samp_h = samp_h0;
467         if (samp_h1 > max_samp_h)
468                 max_samp_h = samp_h1;
469         if (samp_h2 > max_samp_h)
470                 max_samp_h = samp_h2;
471
472         max_samp_v = samp_v0;
473         if (samp_v1 > max_samp_v)
474                 max_samp_v = samp_v1;
475         if (samp_v2 > max_samp_v)
476                 max_samp_v = samp_v2;
477
478         dst->w0 = width * samp_h0 / max_samp_h;
479         dst->h0 = height * samp_v0 / max_samp_v;
480
481         dst->w1 = width * samp_h1 / max_samp_h;
482         dst->h1 = height * samp_v1 / max_samp_v;
483
484         dst->w2 = width * samp_h2 / max_samp_h;
485         dst->h2 = height * samp_v2 / max_samp_v;
486
487         unsigned dstride0 = (dst->w0 + DCTSIZE-1) & ~(DCTSIZE-1);
488         unsigned dstride1 = (dst->w1 + DCTSIZE-1) & ~(DCTSIZE-1);
489         unsigned dstride2 = (dst->w2 + DCTSIZE-1) & ~(DCTSIZE-1);
490
491         unsigned sstride0 = (src->w0 + DCTSIZE-1) & ~(DCTSIZE-1);
492         unsigned sstride1 = (src->w1 + DCTSIZE-1) & ~(DCTSIZE-1);
493         unsigned sstride2 = (src->w2 + DCTSIZE-1) & ~(DCTSIZE-1);
494
495         /* FIXME: handle out-of-memory gracefully */
496         {
497                 float *npix = (float*)memalign(16, sstride0 * dst->h0 * sizeof(float));
498                 vscale(src->data_y, npix, sstride0, src->h0, dst->h0, sstride0);
499                 dst->data_y = (unsigned char *)malloc(dst->h0 * dstride0);
500                 hscale(npix, dst->data_y, src->w0, dst->h0, dst->w0, sstride0, dstride0);
501                 free(npix);
502         }
503         {
504                 float *npix = (float*)memalign(16, sstride1 * dst->h1 * sizeof(float));
505                 vscale(src->data_cr, npix, sstride1, src->h1, dst->h1, sstride1);
506                 dst->data_cr = (unsigned char *)malloc(dst->h1 * dstride1);
507                 hscale(npix, dst->data_cr, src->w1, dst->h1, dst->w1, sstride1, dstride1);
508                 free(npix);
509         }
510         {
511                 float *npix = (float*)memalign(16, sstride2 * dst->h2 * sizeof(float));
512                 vscale(src->data_cb, npix, sstride2, src->h2, dst->h2, sstride2);
513                 dst->data_cb = (unsigned char *)malloc(dst->h2 * dstride2);
514                 hscale(npix, dst->data_cb, src->w2, dst->h2, dst->w2, sstride2, dstride2);
515                 free(npix);
516         }
517
518         return dst;
519 }