git.sesse.net Git - ffmpeg/blob - libavfilter/vf_fspp.c

   1 /*
   2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
   4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  */
  22
  23 /**
  24  * @file
  25  * Fast Simple Post-processing filter
  26  * This implementation is based on an algorithm described in
  27  * "Aria Nosratinia Embedded Post-Processing for
  28  * Enhancement of Compressed Images (1999)"
  29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
  30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
  31  * them can be performed once per block, not per pixel. This allows for much
  32  * higher speed.
  33  *
  34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
  35  * project, and ported by Arwa Arif for FFmpeg.
  36  */
  37
  38 #include "libavutil/avassert.h"
  39 #include "libavutil/imgutils.h"
  40 #include "libavutil/mem_internal.h"
  41 #include "libavutil/opt.h"
  42 #include "libavutil/pixdesc.h"
  43 #include "internal.h"
  44 #include "vf_fspp.h"
  45
/* Byte offset of field x inside FSPPContext; binds each option to its field. */
#define OFFSET(x) offsetof(FSPPContext, x)
/* Flag set shared by every option below. */
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
/*
 * User-visible options of the filter. Each entry lists: option name, help
 * text, destination field, value type, default value, then what appear to be
 * the minimum and maximum accepted values, and the flags.
 *
 * - quality is stored in log2_count; filter() derives its row step from it
 *   (step = 6 - log2_count).
 * - qp, when non-zero, forces one constant quantizer instead of the per-frame
 *   QP table.
 * - strength is added to a base of 16 in filter_frame() to scale the
 *   threshold table.
 * - use_bframe_qp selects whether the QP table of B-frames is used directly.
 */
static const AVOption fspp_options[] = {
    { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
    { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
    { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
    { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_BOOL,{.i64 = 0},   0, 1,         FLAGS },
    { NULL }
};

/* Presumably defines fspp_class, which ff_vf_fspp references as its priv_class. */
AVFILTER_DEFINE_CLASS(fspp);
  57
/*
 * 8x8 dither matrix with entries in the range 0..63.
 * The store_slice functions index it by output row and by pixel position
 * within a group of 8, and add the entry (shifted right by log2_scale) to the
 * accumulated sum before the final right shift.
 */
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
    {  0,  48,  12,  60,   3,  51,  15,  63, },
    { 32,  16,  44,  28,  35,  19,  47,  31, },
    {  8,  56,   4,  52,  11,  59,   7,  55, },
    { 40,  24,  36,  20,  43,  27,  39,  23, },
    {  2,  50,  14,  62,   1,  49,  13,  61, },
    { 34,  18,  46,  30,  33,  17,  45,  29, },
    { 10,  58,   6,  54,   9,  57,   5,  53, },
    { 42,  26,  38,  22,  41,  25,  37,  21, },
};
  68
/*
 * Base threshold table, 8 rows of 8 values. filter_frame() scales every entry
 * by (16 + strength) / 71.0 and packs the result into threshold_mtx_noq.
 */
static const short custom_threshold[64] = {
// The values (the largest is 296) must not be too high: large values make
// the result depend too strongly on the quantizer, or may overflow
// (to be checked), which shows up as flashing.
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
  82
  83 //This func reads from 1 slice, 1 and clears 0 & 1
  84 static void store_slice_c(uint8_t *dst, int16_t *src,
  85                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
  86                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
  87 {
  88     int y, x;
  89 #define STORE(pos)                                                             \
  90     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
  91     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
  92     if (temp & 0x100) temp = ~(temp >> 31);                                    \
  93     dst[x + pos] = temp;
  94
  95     for (y = 0; y < height; y++) {
  96         const uint8_t *d = dither[y];
  97         for (x = 0; x < width; x += 8) {
  98             int temp;
  99             STORE(0);
 100             STORE(1);
 101             STORE(2);
 102             STORE(3);
 103             STORE(4);
 104             STORE(5);
 105             STORE(6);
 106             STORE(7);
 107         }
 108         src += src_stride;
 109         dst += dst_stride;
 110     }
 111 }
 112
 113 //This func reads from 2 slices, 0 & 2  and clears 2-nd
 114 static void store_slice2_c(uint8_t *dst, int16_t *src,
 115                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
 116                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 117 {
 118     int y, x;
 119 #define STORE2(pos)                                                                                       \
 120     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
 121     src[x + pos + 16 * src_stride] = 0;                                                                   \
 122     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
 123     dst[x + pos] = temp;
 124
 125     for (y = 0; y < height; y++) {
 126         const uint8_t *d = dither[y];
 127         for (x = 0; x < width; x += 8) {
 128             int temp;
 129             STORE2(0);
 130             STORE2(1);
 131             STORE2(2);
 132             STORE2(3);
 133             STORE2(4);
 134             STORE2(5);
 135             STORE2(6);
 136             STORE2(7);
 137         }
 138         src += src_stride;
 139         dst += dst_stride;
 140     }
 141 }
 142
 143 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 144 {
 145     int a;
 146     for (a = 0; a < 64; a++)
 147         thr_adr[a] = q * thr_adr_noq[a];
 148 }
 149
/*
 * Run the post-processing filter on a single plane.
 *
 * p:          filter context (scratch buffers p->src / p->temp, DSP function
 *             pointers, threshold matrices)
 * dst, src:   destination and source planes; if either is NULL the function
 *             returns without doing anything
 * dst_stride, src_stride: line sizes of dst and src
 * width, height: dimensions of this plane
 * qp_store:   quantizer table; only read when p->qp is 0
 * qp_stride:  number of entries per row of qp_store
 * is_luma:    non-zero for the luma plane; selects the scratch stride and
 *             disables the chroma-subsampling correction of the QP lookup
 *
 * Outline: the plane is copied into p->src with a mirrored 8-pixel border,
 * then for every `step` rows a forward transform, thresholding and inverse
 * transform are run over sliding windows and accumulated into p->temp; each
 * time 8 output rows are complete they are written to dst by
 * store_slice / store_slice2.
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;

    // Luma uses the stride computed in config_input(); other planes use a
    // tighter stride of width + 16 inside the same scratch buffers.
    const int stride = is_luma ? p->temp_stride : (width + 16);
    const int step = 6 - p->log2_count;
    // Shifts that map a pixel coordinate of this plane to a QP table index:
    // 4 for luma, reduced by the chroma subsampling for chroma.
    const int qpsh = 4 - p->hsub * !is_luma;
    const int qpsv = 4 - p->vsub * !is_luma;

    // One int32 array carved into two int16 work buffers: block holds the
    // row_fdct output, block3 the accumulated column_fidct output.
    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *block  = (int16_t *)block_align;
    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);

    // Clears the first 4 * 8 * BLOCKSZ bytes of block3 (not the whole buffer).
    memset(block3, 0, 4 * 8 * BLOCKSZ);

    if (!src || !dst) return;

    // Copy the plane into p->src at offset (8, 8) and mirror 8 pixels on the
    // left and right of every row.
    for (y = 0; y < height; y++) {
        int index = 8 + 8 * stride + y * stride;
        memcpy(p->src + index, src + y * src_stride, width);
        for (x = 0; x < 8; x++) {
            p->src[index         - x - 1] = p->src[index +         x    ];
            p->src[index + width + x    ] = p->src[index + width - x - 1];
        }
    }

    // Mirror 8 full rows above the top and below the bottom of the plane.
    for (y = 0; y < 8; y++) {
        memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
        memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
    }
    //FIXME (try edge emu)

    // Zero rows 8..23 of the accumulator before the first pass.
    for (y = 8; y < 24; y++)
        memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));

    for (y = step; y < height + 8; y += step) {    //step= 1,2
        const int y1 = y - 8 + step;                 //l5-7  l4-6;
        qy = y - 4;

        // Clamp the row used for the QP lookup to the plane.
        if (qy > height - 1) qy = height - 1;
        if (qy < 0) qy = 0;

        // From here on qy is the offset of the QP table row.
        qy = (qy >> qpsv) * qp_stride;
        // Prime the first 8 columns of block for this row.
        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        // Process the row in runs of 8 * (BLOCKSZ - 1) columns.
        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            if (p->qp)
                // Constant quantizer: threshold_mtx was already scaled in
                // filter_frame(), so the whole run is processed in one call.
                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                // Per-block quantizer: look up the QP for every 8 columns.
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same

                    if (t < 0) t = 0;                   //t always < width-2

                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

                    // Rescale the threshold matrix only when the quantizer changed.
                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            // Carry the tail of both work buffers over to the start for the next run.
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // Remaining columns of the row that did not fill a complete run.
        es = width + 8 - x0; //  8, ...
        if (es > 8)
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
        if (es > 3)
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        // Every time y1 reaches a non-zero multiple of 8, write 8 finished
        // rows to dst; bit 3 of y1 selects which store function is used.
        if (!(y1 & 7) && y1) {
            if (y1 & 8)
                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
                               dst_stride, stride, width, 8, 5 - p->log2_count);
            else
                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
                                dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    // Write the remaining (y & 7) rows that did not complete a slice of 8.
    if (y & 7) {  // height % 8 != 0
        if (y & 8)
            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
                           dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
}
 247
/*
 * C implementation of the combined column pass: a forward transform of 8
 * values, thresholding of the coefficients, and the inverse transform, with
 * the result accumulated into output.
 *
 * thr_adr: threshold matrix, read as threshold[k * 8] for k = 0..7 while
 *          advancing by one element per column
 * data:    input values, read at dataptr[DCTSIZE * k] for k = 0..7
 * output:  accumulator; rows 0..5 of each window are added to, rows 6 and 7
 *          are overwritten
 * cnt:     the outer loop runs while cnt > 0 and decrements cnt by 2 per
 *          start position
 *
 * Each start position handles DCTSIZE columns; data and output then advance
 * by a further 8 elements before the next start position. THRESHOLD,
 * MULTIPLY16H and the FIX_* constants are defined outside this file section.
 */
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *dataptr;
    int16_t *wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        // Restart at the first threshold column for every start position.
        threshold = (int16_t *)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];

            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];

            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];

            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];

            // Even part of FDCT

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT

            // Threshold the even coefficients d0, d2, d4, d6 into tmp0..tmp3.
            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            // Rounding offset for the >> 2 applied below.
            tmp0 += 2;
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT

            // Threshold the odd coefficients d1, d3, d5, d7 into tmp4..tmp7.
            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) << 1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) << 1;

            tmp7  = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            // Rows 0..5 accumulate onto what is already in the workspace;
            // rows 6 and 7 are written fresh.
            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr += 8; //skip each second start pos
        wsptr   += 8;
    }
}
 370
/*
 * C implementation of the inverse row transform.
 *
 * workspace:     input coefficients, DCTSIZE values per iteration, in the
 *                permuted order written by row_fdct_c (even part in
 *                elements 0..3, odd part in elements 4..7)
 * output_adr:    accumulator; each iteration adds 8 results to
 *                outptr[k * output_stride], k = 0..7, then moves one element
 *                to the right
 * output_stride: distance in elements between consecutive output rows
 * cnt:           number of groups of 4 iterations (the loop runs cnt * 4 times)
 */
static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t *outptr;
    int16_t *wsptr;

    cnt *= 4;
    wsptr = workspace;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        //Simd version reads 4x4 block and transposes it
        tmp10 = wsptr[2] +  wsptr[3];
        tmp11 = wsptr[2] -  wsptr[3];

        tmp13 = wsptr[0] +  wsptr[1];
        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow

        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;

        // Odd part
        //Also transpose, with previous:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];

        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);

        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_

        tmp6 = (tmp12 << 3) - tmp7;
        tmp5 = (tmp11 << 3) - tmp6;
        tmp4 = (tmp10 << 3) + tmp5;

        // Final output stage: descale and write column
        // All eight results are accumulated (+=) onto the output buffer.
        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;

        wsptr += DCTSIZE;       // advance pointer to next row
    }
}
 432
/*
 * C implementation of the forward row transform.
 *
 * data:      output; DCTSIZE coefficients are written per iteration, in the
 *            permuted order described inside the loop
 * pixels:    input; each iteration reads 8 samples at pixels[line_size * k],
 *            k = 0..7, then moves one sample to the right
 * line_size: distance in bytes between the 8 samples of one iteration
 * cnt:       number of groups of 4 iterations (the loop runs cnt * 4 times)
 */
static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
    int16_t *dataptr;

    cnt *= 4;
    // Pass 1: process rows.

    dataptr = data;
    for (; cnt > 0; cnt--) {
        // Butterfly: sums and differences of mirrored sample pairs.
        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];

        // Even part

        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;
        // The even coefficients are written first, which gives a different
        // column order in column_fidct(); the columns are processed
        // independently there, so this is fine. row_idct() later reads the
        // columns in the same order.
        dataptr[2] = tmp10 + tmp11;
        dataptr[3] = tmp10 - tmp11;

        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
        dataptr[0] = tmp13 + z1;
        dataptr[1] = tmp13 - z1;

        // Odd part

        tmp10 = (tmp4 + tmp5) << 2;
        tmp11 = (tmp5 + tmp6) << 2;
        tmp12 = (tmp6 + tmp7) << 2;

        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);

        z11 = tmp7 + z3;
        z13 = tmp7 - z3;

        dataptr[4] = z13 + z2;
        dataptr[5] = z13 - z2;
        dataptr[6] = z11 + z4;
        dataptr[7] = z11 - z4;

        pixels++;               // advance pointer to next column
        dataptr += DCTSIZE;
    }
}
 493
 494 static int query_formats(AVFilterContext *ctx)
 495 {
 496     static const enum AVPixelFormat pix_fmts[] = {
 497         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 498         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 499         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 500         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 501         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 502         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
 503         AV_PIX_FMT_NONE
 504     };
 505
 506     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
 507     if (!fmts_list)
 508         return AVERROR(ENOMEM);
 509     return ff_set_common_formats(ctx, fmts_list);
 510 }
 511
 512 static int config_input(AVFilterLink *inlink)
 513 {
 514     AVFilterContext *ctx = inlink->dst;
 515     FSPPContext *fspp = ctx->priv;
 516     const int h = FFALIGN(inlink->h + 16, 16);
 517     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 518
 519     fspp->hsub = desc->log2_chroma_w;
 520     fspp->vsub = desc->log2_chroma_h;
 521
 522     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
 523     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
 524     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
 525
 526     if (!fspp->temp || !fspp->src)
 527         return AVERROR(ENOMEM);
 528
 529     if (!fspp->use_bframe_qp && !fspp->qp) {
 530         fspp->non_b_qp_alloc_size = AV_CEIL_RSHIFT(inlink->w, 4) * AV_CEIL_RSHIFT(inlink->h, 4);
 531         fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
 532         if (!fspp->non_b_qp_table)
 533             return AVERROR(ENOMEM);
 534     }
 535
 536     fspp->store_slice  = store_slice_c;
 537     fspp->store_slice2 = store_slice2_c;
 538     fspp->mul_thrmat   = mul_thrmat_c;
 539     fspp->column_fidct = column_fidct_c;
 540     fspp->row_idct     = row_idct_c;
 541     fspp->row_fdct     = row_fdct_c;
 542
 543     if (ARCH_X86)
 544         ff_fspp_init_x86(fspp);
 545
 546     return 0;
 547 }
 548
 549 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 550 {
 551     AVFilterContext *ctx = inlink->dst;
 552     FSPPContext *fspp = ctx->priv;
 553     AVFilterLink *outlink = ctx->outputs[0];
 554     AVFrame *out = in;
 555
 556     int qp_stride = 0;
 557     uint8_t *qp_table = NULL;
 558     int i, bias;
 559     int custom_threshold_m[64];
 560
 561     bias = (1 << 4) + fspp->strength;
 562
 563     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
 564         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 565
 566     for (i = 0; i < 8; i++) {
 567         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
 568                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
 569                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
 570                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
 571
 572         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
 573                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
 574                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
 575                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
 576     }
 577
 578     if (fspp->qp)
 579         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 580
 581     /* if we are not in a constant user quantizer mode and we don't want to use
 582      * the quantizers from the B-frames (B-frames often have a higher QP), we
 583      * need to save the qp table from the last non B-frame; this is what the
 584      * following code block does */
 585     if (!fspp->qp) {
 586         qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
 587
 588         if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 589             int w, h;
 590
 591             /* if the qp stride is not set, it means the QP are only defined on
 592              * a line basis */
 593            if (!qp_stride) {
 594                 w = AV_CEIL_RSHIFT(inlink->w, 4);
 595                 h = 1;
 596             } else {
 597                 w = qp_stride;
 598                 h = AV_CEIL_RSHIFT(inlink->h, 4);
 599             }
 600             if (w * h > fspp->non_b_qp_alloc_size) {
 601                 int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
 602                 if (ret < 0) {
 603                     fspp->non_b_qp_alloc_size = 0;
 604                     return ret;
 605                 }
 606                 fspp->non_b_qp_alloc_size = w * h;
 607             }
 608
 609             av_assert0(w * h <= fspp->non_b_qp_alloc_size);
 610             memcpy(fspp->non_b_qp_table, qp_table, w * h);
 611         }
 612     }
 613
 614     if (fspp->log2_count && !ctx->is_disabled) {
 615         if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
 616             qp_table = fspp->non_b_qp_table;
 617
 618         if (qp_table || fspp->qp) {
 619             const int cw = AV_CEIL_RSHIFT(inlink->w, fspp->hsub);
 620             const int ch = AV_CEIL_RSHIFT(inlink->h, fspp->vsub);
 621
 622             /* get a new frame if in-place is not possible or if the dimensions
 623              * are not multiple of 8 */
 624             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 625                 const int aligned_w = FFALIGN(inlink->w, 8);
 626                 const int aligned_h = FFALIGN(inlink->h, 8);
 627
 628                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 629                 if (!out) {
 630                     av_frame_free(&in);
 631                     return AVERROR(ENOMEM);
 632                 }
 633                 av_frame_copy_props(out, in);
 634                 out->width = in->width;
 635                 out->height = in->height;
 636             }
 637
 638             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
 639                    inlink->w, inlink->h, qp_table, qp_stride, 1);
 640             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
 641                    cw,        ch,        qp_table, qp_stride, 0);
 642             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
 643                    cw,        ch,        qp_table, qp_stride, 0);
 644             emms_c();
 645         }
 646     }
 647
 648     if (in != out) {
 649         if (in->data[3])
 650             av_image_copy_plane(out->data[3], out->linesize[3],
 651                                 in ->data[3], in ->linesize[3],
 652                                 inlink->w, inlink->h);
 653         av_frame_free(&in);
 654     }
 655     return ff_filter_frame(outlink, out);
 656 }
 657
 658 static av_cold void uninit(AVFilterContext *ctx)
 659 {
 660     FSPPContext *fspp = ctx->priv;
 661     av_freep(&fspp->temp);
 662     av_freep(&fspp->src);
 663     av_freep(&fspp->non_b_qp_table);
 664 }
 665
/*
 * Input pads: a single video input. config_input() is attached as the
 * configuration callback and filter_frame() as the per-frame callback.
 * The { NULL } entry terminates the list.
 */
static const AVFilterPad fspp_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
        .filter_frame = filter_frame,
    },
    { NULL }
};
 675
/*
 * Output pads: a single video output with no callbacks of its own.
 * The { NULL } entry terminates the list.
 */
static const AVFilterPad fspp_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
    },
    { NULL }
};
 683
/*
 * Filter definition for "fspp". It ties together the private context size,
 * the uninit and query_formats callbacks, the pad lists and the option class.
 * The AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL flag goes with the explicit
 * ctx->is_disabled check in filter_frame().
 */
AVFilter ff_vf_fspp = {
    .name            = "fspp",
    .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
    .priv_size       = sizeof(FSPPContext),
    .uninit          = uninit,
    .query_formats   = query_formats,
    .inputs          = fspp_inputs,
    .outputs         = fspp_outputs,
    .priv_class      = &fspp_class,
    .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
};