git.sesse.net Git - ffmpeg/blob - libavfilter/vf_fspp.c

   1 /*
   2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
   4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  */
  22
  23 /**
  24  * @file
  25  * Fast Simple Post-processing filter
  26  * This implementation is based on an algorithm described in
  27  * "Aria Nosratinia Embedded Post-Processing for
  28  * Enhancement of Compressed Images (1999)"
  29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
  30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
  31  * them can be performed once per block, not per pixel. This allows for much
  32  * higher speed.
  33  *
  34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
  35  * project, and ported by Arwa Arif for FFmpeg.
  36  */
  37
  38 #include "libavutil/avassert.h"
  39 #include "libavutil/imgutils.h"
  40 #include "libavutil/opt.h"
  41 #include "libavutil/pixdesc.h"
  42 #include "internal.h"
  43 #include "libavcodec/avcodec.h" //for reference to FF_QSCALE_TYPE
  44 #include "vf_fspp.h"
  45
  46 #define OFFSET(x) offsetof(FSPPContext, x)
  47 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
  48 static const AVOption fspp_options[] = {
  49     { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
  50     { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
  51     { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
  52     { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_INT, {.i64 = 0},   0, 1,         FLAGS },
  53     { NULL }
  54 };
  55
  56 AVFILTER_DEFINE_CLASS(fspp);
  57
  58 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
  59     {  0,  48,  12,  60,   3,  51,  15,  63, },
  60     { 32,  16,  44,  28,  35,  19,  47,  31, },
  61     {  8,  56,   4,  52,  11,  59,   7,  55, },
  62     { 40,  24,  36,  20,  43,  27,  39,  23, },
  63     {  2,  50,  14,  62,   1,  49,  13,  61, },
  64     { 34,  18,  46,  30,  33,  17,  45,  29, },
  65     { 10,  58,   6,  54,   9,  57,   5,  53, },
  66     { 42,  26,  38,  22,  41,  25,  37,  21, },
  67 };
  68
  69 static const short custom_threshold[64] = {
  70 // values (296) can't be too high
  71 // -it causes too big quant dependence
  72 // or maybe overflow(check), which results in some flashing
  73      71, 296, 295, 237,  71,  40,  38,  19,
  74     245, 193, 185, 121, 102,  73,  53,  27,
  75     158, 129, 141, 107,  97,  73,  50,  26,
  76     102, 116, 109,  98,  82,  66,  45,  23,
  77      71,  94,  95,  81,  70,  56,  38,  20,
  78      56,  77,  74,  66,  56,  44,  30,  15,
  79      38,  53,  50,  45,  38,  30,  21,  11,
  80      20,  27,  26,  23,  20,  15,  11,   5
  81 };
  82
  83 static inline int norm_qscale(int qscale, int type)
  84 {
  85     switch (type) {
  86     case FF_QSCALE_TYPE_MPEG1: return qscale;
  87     case FF_QSCALE_TYPE_MPEG2: return qscale >> 1;
  88     case FF_QSCALE_TYPE_H264:  return qscale >> 2;
  89     case FF_QSCALE_TYPE_VP56:  return (63 - qscale + 2) >> 2;
  90     }
  91     return qscale;
  92 }
  93
  94 //This func reads from 1 slice, 1 and clears 0 & 1
  95 static void store_slice_c(uint8_t *dst, int16_t *src,
  96                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
  97                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
  98 {
  99     int y, x;
 100 #define STORE(pos)                                                             \
 101     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
 102     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
 103     if (temp & 0x100) temp = ~(temp >> 31);                                    \
 104     dst[x + pos] = temp;
 105
 106     for (y = 0; y < height; y++) {
 107         const uint8_t *d = dither[y];
 108         for (x = 0; x < width; x += 8) {
 109             int temp;
 110             STORE(0);
 111             STORE(1);
 112             STORE(2);
 113             STORE(3);
 114             STORE(4);
 115             STORE(5);
 116             STORE(6);
 117             STORE(7);
 118         }
 119         src += src_stride;
 120         dst += dst_stride;
 121     }
 122 }
 123
 124 //This func reads from 2 slices, 0 & 2  and clears 2-nd
 125 static void store_slice2_c(uint8_t *dst, int16_t *src,
 126                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
 127                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 128 {
 129     int y, x;
 130 #define STORE2(pos)                                                                                       \
 131     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
 132     src[x + pos + 16 * src_stride] = 0;                                                                   \
 133     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
 134     dst[x + pos] = temp;
 135
 136     for (y = 0; y < height; y++) {
 137         const uint8_t *d = dither[y];
 138         for (x = 0; x < width; x += 8) {
 139             int temp;
 140             STORE2(0);
 141             STORE2(1);
 142             STORE2(2);
 143             STORE2(3);
 144             STORE2(4);
 145             STORE2(5);
 146             STORE2(6);
 147             STORE2(7);
 148         }
 149         src += src_stride;
 150         dst += dst_stride;
 151     }
 152 }
 153
 154 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 155 {
 156     int a;
 157     for (a = 0; a < 64; a++)
 158         thr_adr[a] = q * thr_adr_noq[a];
 159 }
 160
     // Post-process one plane.
     //
     // The plane is copied into p->src with an 8-pixel mirrored border, pushed
     // through the p->row_fdct / p->column_fidct / p->row_idct callbacks with the
     // results accumulated in p->temp, and finished rows are written to dst in
     // slices of up to 8 rows through p->store_slice / p->store_slice2.
     //
     //   p                       filter context (scratch buffers, callbacks, options)
     //   dst, src                destination / source plane; returns early if either is NULL
     //   dst_stride, src_stride  line sizes of dst and src, in bytes
     //   width, height           plane dimensions in pixels
     //   qp_store, qp_stride     quantizer table and its row stride; only read when p->qp is 0
     //   is_luma                 non-zero: use p->temp_stride and QP shifts of 4;
     //                           zero: use a stride of width + 16 and reduce the QP
     //                           shifts by p->hsub / p->vsub
 161 static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
 162                    int dst_stride, int src_stride,
 163                    int width, int height,
 164                    uint8_t *qp_store, int qp_stride, int is_luma)
 165 {
 166     int x, x0, y, es, qy, t;
 167
 168     const int stride = is_luma ? p->temp_stride : (width + 16);
 169     const int step = 6 - p->log2_count;
 170     const int qpsh = 4 - p->hsub * !is_luma;
 171     const int qpsv = 4 - p->vsub * !is_luma;
 172
         // One aligned buffer split in two halves: `block` receives row_fdct
         // output, `block3` receives column_fidct output.
 173     DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
 174     int16_t *block  = (int16_t *)block_align;
 175     int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
 176
         // Zeroes the first 4 * 8 * BLOCKSZ bytes of block3 (the size is in bytes,
         // not in int16_t elements).
 177     memset(block3, 0, 4 * 8 * BLOCKSZ);
 178
 179     if (!src || !dst) return;
 180
         // Copy each source row to p->src at offset (8, 8) and mirror 8 pixels
         // outwards on the left and right edges.
 181     for (y = 0; y < height; y++) {
 182         int index = 8 + 8 * stride + y * stride;
 183         memcpy(p->src + index, src + y * src_stride, width);
 184         for (x = 0; x < 8; x++) {
 185             p->src[index         - x - 1] = p->src[index +         x    ];
 186             p->src[index + width + x    ] = p->src[index + width - x - 1];
 187         }
 188     }
 189
         // Mirror 8 full rows above the first and below the last copied row.
 190     for (y = 0; y < 8; y++) {
 191         memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
 192         memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
 193     }
 194     //FIXME (try edge emu)
 195
         // Clear `width` accumulators (starting at column 8) in rows 8..23 of p->temp.
 196     for (y = 8; y < 24; y++)
 197         memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));
 198
 199     for (y = step; y < height + 8; y += step) {    //step= 1,2
 200         const int y1 = y - 8 + step;                 //l5-7  l4-6;
             // QP table row for this line: y - 4 clamped to [0, height - 1],
             // scaled down by qpsv and converted to an offset into qp_store.
 201         qy = y - 4;
 202
 203         if (qy > height - 1) qy = height - 1;
 204         if (qy < 0) qy = 0;
 205
 206         qy = (qy >> qpsv) * qp_stride;
 207         p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
 208
 209         for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
 210             p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
 211
                 // Constant quantizer: the threshold matrix is already set up, so
                 // the whole run is processed in one call. Otherwise look up the
                 // QP per 8-column block and rescale the matrix when it changes.
 212             if (p->qp)
 213                 p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
 214             else
 215                 for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
 216                     t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same
 217
 218                     if (t < 0) t = 0;                   //t always < width-2
 219
 220                     t = qp_store[qy + (t >> qpsh)];
 221                     t = norm_qscale(t, p->qscale_type);
 222
 223                     if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
 224                     p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
 225                 }
 226             p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
                 // Move the tail of both work buffers to the front so the next
                 // iteration continues from it.
 227             memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
 228             memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
 229         }
 230
             // Remaining columns that did not fill a complete run.
 231         es = width + 8 - x0; //  8, ...
 232         if (es > 8)
 233             p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
 234
 235         p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
 236         p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
 237
             // Whenever y1 reaches a non-zero multiple of 8, write out 8 rows;
             // bit 3 of y1 selects which of the two store routines is used.
 238         if (!(y1 & 7) && y1) {
 239             if (y1 & 8)
 240                 p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
 241                                dst_stride, stride, width, 8, 5 - p->log2_count);
 242             else
 243                 p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
 244                                 dst_stride, stride, width, 8, 5 - p->log2_count);
 245         }
 246     }
 247
         // Write the final partial slice of (y & 7) rows, if any.
 248     if (y & 7) {  // height % 8 != 0
 249         if (y & 8)
 250             p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
 251                            dst_stride, stride, width, y&7, 5 - p->log2_count);
 252         else
 253             p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
 254                             dst_stride, stride, width, y&7, 5 - p->log2_count);
 255     }
 256 }
 257
     // Column pass: forward transform, thresholding and inverse transform in one go.
     //
     // For every start position, DCTSIZE columns of `data` are read (8 values
     // each, DCTSIZE elements apart), transformed, passed through the THRESHOLD
     // macro against the matching thr_adr entries, transformed back and combined
     // with `output`: rows 0..5 are accumulated (+=), rows 6..7 are overwritten (=).
     //
     //   thr_adr  threshold matrix, read as 8 rows of 8 int16_t
     //   data     input coefficients
     //   output   result workspace
     //   cnt      loop budget; decremented by 2 per start position
     //
     // NOTE(review): THRESHOLD, MULTIPLY16H and the FIX_* constants come from
     // vf_fspp.h; THRESHOLD presumably zeroes coefficients whose magnitude is
     // below the threshold - confirm against the header.
 258 static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
 259 {
 260     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 261     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 262     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
 263     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
 264
 265     int16_t *dataptr;
 266     int16_t *wsptr;
 267     int16_t *threshold;
 268     int ctr;
 269
 270     dataptr = data;
 271     wsptr = output;
 272
 273     for (; cnt > 0; cnt -= 2) { //start positions
             // Restart at the first threshold column for each start position.
 274         threshold = (int16_t *)thr_adr;//threshold_mtx
 275         for (ctr = DCTSIZE; ctr > 0; ctr--) {
 276             // Process columns from input, add to output.
 277             tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
 278             tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
 279
 280             tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
 281             tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
 282
 283             tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
 284             tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
 285
 286             tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
 287             tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
 288
 289             // Even part of FDCT
 290
 291             tmp10 = tmp0 + tmp3;
 292             tmp13 = tmp0 - tmp3;
 293             tmp11 = tmp1 + tmp2;
 294             tmp12 = tmp1 - tmp2;
 295
 296             d0 = tmp10 + tmp11;
 297             d4 = tmp10 - tmp11;
 298
 299             z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
 300             d2 = tmp13 + z1;
 301             d6 = tmp13 - z1;
 302
 303             // Even part of IDCT
 304
                 // Coefficient dN is tested against threshold row N of the current column.
 305             THRESHOLD(tmp0, d0, threshold[0 * 8]);
 306             THRESHOLD(tmp1, d2, threshold[2 * 8]);
 307             THRESHOLD(tmp2, d4, threshold[4 * 8]);
 308             THRESHOLD(tmp3, d6, threshold[6 * 8]);
                 // Rounding offset for the >> 2 that follows.
 309             tmp0 += 2;
 310             tmp10 = (tmp0 + tmp2) >> 2;
 311             tmp11 = (tmp0 - tmp2) >> 2;
 312
 313             tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
 314             tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
 315
 316             tmp0 = tmp10 + tmp13; //->temps
 317             tmp3 = tmp10 - tmp13; //->temps
 318             tmp1 = tmp11 + tmp12; //->temps
 319             tmp2 = tmp11 - tmp12; //->temps
 320
 321             // Odd part of FDCT
 322
 323             tmp10 = tmp4 + tmp5;
 324             tmp11 = tmp5 + tmp6;
 325             tmp12 = tmp6 + tmp7;
 326
 327             z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
 328             z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
 329             z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
 330             z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
 331
 332             z11 = tmp7 + z3;
 333             z13 = tmp7 - z3;
 334
 335             d5 = z13 + z2;
 336             d3 = z13 - z2;
 337             d1 = z11 + z4;
 338             d7 = z11 - z4;
 339
 340             // Odd part of IDCT
 341
 342             THRESHOLD(tmp4, d1, threshold[1 * 8]);
 343             THRESHOLD(tmp5, d3, threshold[3 * 8]);
 344             THRESHOLD(tmp6, d5, threshold[5 * 8]);
 345             THRESHOLD(tmp7, d7, threshold[7 * 8]);
 346
 347             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
 348             z13 = tmp6 + tmp5;
 349             z10 = (tmp6 - tmp5) << 1;
 350             z11 = tmp4 + tmp7;
 351             z12 = (tmp4 - tmp7) << 1;
 352
 353             tmp7  = (z11 + z13) >> 2; //+2 !
 354             tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
 355             z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
 356             tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
 357             tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
 358
 359             tmp6 = tmp12 - tmp7;
 360             tmp5 = tmp11 - tmp6;
 361             tmp4 = tmp10 + tmp5;
 362
                 // Rows 0..5 add to what earlier start positions left in the
                 // workspace; rows 6..7 are written fresh.
 363             wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
 364             wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
 365             wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
 366             wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
 367             wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
 368             wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
 369             wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
 370             wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
 371             //
 372             dataptr++; //next column
 373             wsptr++;
 374             threshold++;
 375         }
 376         dataptr += 8; //skip each second start pos
 377         wsptr   += 8;
 378     }
 379 }
 380
     // Row pass of the inverse transform.
     //
     // Consumes cnt * 4 workspace rows of DCTSIZE coefficients. Each row yields 8
     // values that go through DESCALE(..., 3) and are added to one column of the
     // output: value k lands at outptr[k * output_stride], and outptr advances by
     // one element per workspace row.
     //
     //   workspace      input coefficients (the column_fidct output)
     //   output_adr     int16_t accumulator plane, updated with +=
     //   output_stride  distance between output rows, in elements
     //   cnt            number of groups of 4 rows to process
     //
     // NOTE(review): DESCALE and MULTIPLY16H are defined in vf_fspp.h; their
     // exact rounding behaviour is not visible here.
 381 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
 382 {
 383     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 384     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 385     int_simd16_t z5, z10, z11, z12, z13;
 386     int16_t *outptr;
 387     int16_t *wsptr;
 388
 389     cnt *= 4;
 390     wsptr = workspace;
 391     outptr = output_adr;
 392     for (; cnt > 0; cnt--) {
 393         // Even part
 394         //Simd version reads 4x4 block and transposes it
             // Input order matches what row_fdct_c() writes: elements 0..3 hold
             // the even part, 4..7 the odd part.
 395         tmp10 = wsptr[2] +  wsptr[3];
 396         tmp11 = wsptr[2] -  wsptr[3];
 397
 398         tmp13 = wsptr[0] +  wsptr[1];
 399         tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
 400
 401         tmp0 = tmp10 + tmp13; //->temps
 402         tmp3 = tmp10 - tmp13; //->temps
 403         tmp1 = tmp11 + tmp12;
 404         tmp2 = tmp11 - tmp12;
 405
 406         // Odd part
 407         //Also transpose, with previous:
 408         // ---- ----      ||||
 409         // ---- ---- idct ||||
 410         // ---- ---- ---> ||||
 411         // ---- ----      ||||
 412         z13 = wsptr[4] + wsptr[5];
 413         z10 = wsptr[4] - wsptr[5];
 414         z11 = wsptr[6] + wsptr[7];
 415         z12 = wsptr[6] - wsptr[7];
 416
 417         tmp7 = z11 + z13;
 418         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
 419
 420         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
 421         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
 422         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
 423
 424         tmp6 = (tmp12 << 3) - tmp7;
 425         tmp5 = (tmp11 << 3) - tmp6;
 426         tmp4 = (tmp10 << 3) + tmp5;
 427
 428         // Final output stage: descale and write column
 429         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
 430         outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
 431         outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
 432         outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
 433         outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
 434         outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
 435         outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
 436         outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
 437         outptr++;
 438
 439         wsptr += DCTSIZE;       // advance pointer to next row
 440     }
 441 }
 442
     // Row pass of the forward transform.
     //
     // Processes cnt * 4 pixel columns. For each one, 8 pixels spaced line_size
     // bytes apart are transformed into DCTSIZE coefficients stored consecutively
     // in `data`; `pixels` then advances by one byte and `data` by DCTSIZE
     // elements. The even-part results are stored at indices 2, 3, 0, 1 and the
     // odd-part results at 4..7, which is the order row_idct_c() reads back.
     //
     //   data       output coefficients, DCTSIZE per processed column
     //   pixels     first of the 8 input pixels of the first column
     //   line_size  distance between vertically adjacent pixels, in bytes
     //   cnt        number of groups of 4 columns to process
 443 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
 444 {
 445     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 446     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 447     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
 448     int16_t *dataptr;
 449
 450     cnt *= 4;
 451     // Pass 1: process rows.
 452
 453     dataptr = data;
 454     for (; cnt > 0; cnt--) {
             // Butterfly of the 8 input pixels: sums feed the even part,
             // differences the odd part.
 455         tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
 456         tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
 457         tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
 458         tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
 459         tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
 460         tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
 461         tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
 462         tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
 463
 464         // Even part
 465
 466         tmp10 = tmp0 + tmp3;
 467         tmp13 = tmp0 - tmp3;
 468         tmp11 = tmp1 + tmp2;
 469         tmp12 = tmp1 - tmp2;
 470         //Even columns are written first, this leads to different order of columns
 471         //in column_fidct(), but they are processed independently, so all ok.
 472         //Later in the row_idct() columns readed at the same order.
 473         dataptr[2] = tmp10 + tmp11;
 474         dataptr[3] = tmp10 - tmp11;
 475
 476         z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
 477         dataptr[0] = tmp13 + z1;
 478         dataptr[1] = tmp13 - z1;
 479
 480         // Odd part
 481
 482         tmp10 = (tmp4 + tmp5) << 2;
 483         tmp11 = (tmp5 + tmp6) << 2;
 484         tmp12 = (tmp6 + tmp7) << 2;
 485
 486         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
 487         z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
 488         z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
 489         z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
 490
 491         z11 = tmp7 + z3;
 492         z13 = tmp7 - z3;
 493
 494         dataptr[4] = z13 + z2;
 495         dataptr[5] = z13 - z2;
 496         dataptr[6] = z11 + z4;
 497         dataptr[7] = z11 - z4;
 498
 499         pixels++;               // advance pointer to next column
 500         dataptr += DCTSIZE;
 501     }
 502 }
 503
 504 static int query_formats(AVFilterContext *ctx)
 505 {
 506     static const enum PixelFormat pix_fmts[] = {
 507         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 508         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 509         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 510         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 511         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 512         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
 513         AV_PIX_FMT_NONE
 514     };
 515     ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
 516     return 0;
 517 }
 518
 519 static int config_input(AVFilterLink *inlink)
 520 {
 521     AVFilterContext *ctx = inlink->dst;
 522     FSPPContext *fspp = ctx->priv;
 523     const int h = FFALIGN(inlink->h + 16, 16);
 524     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 525
 526     fspp->hsub = desc->log2_chroma_w;
 527     fspp->vsub = desc->log2_chroma_h;
 528
 529     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
 530     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
 531     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
 532
 533     if (!fspp->temp || !fspp->src)
 534         return AVERROR(ENOMEM);
 535
 536     if (!fspp->use_bframe_qp && !fspp->qp) {
 537         fspp->non_b_qp_alloc_size = FF_CEIL_RSHIFT(inlink->w, 4) * FF_CEIL_RSHIFT(inlink->h, 4);
 538         fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
 539         if (!fspp->non_b_qp_table)
 540             return AVERROR(ENOMEM);
 541     }
 542
 543     fspp->store_slice  = store_slice_c;
 544     fspp->store_slice2 = store_slice2_c;
 545     fspp->mul_thrmat   = mul_thrmat_c;
 546     fspp->column_fidct = column_fidct_c;
 547     fspp->row_idct     = row_idct_c;
 548     fspp->row_fdct     = row_fdct_c;
 549
 550     if (ARCH_X86)
 551         ff_fspp_init_x86(fspp);
 552
 553     return 0;
 554 }
 555
 556 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 557 {
 558     AVFilterContext *ctx = inlink->dst;
 559     FSPPContext *fspp = ctx->priv;
 560     AVFilterLink *outlink = ctx->outputs[0];
 561     AVFrame *out = in;
 562
 563     int qp_stride = 0;
 564     uint8_t *qp_table = NULL;
 565     int i, bias;
 566     int custom_threshold_m[64];
 567
 568     bias = (1 << 4) + fspp->strength;
 569
 570     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
 571         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 572
 573     for (i = 0; i < 8; i++) {
 574         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
 575                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
 576                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
 577                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
 578
 579         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
 580                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
 581                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
 582                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
 583     }
 584
 585     if (fspp->qp)
 586         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 587
 588     /* if we are not in a constant user quantizer mode and we don't want to use
 589      * the quantizers from the B-frames (B-frames often have a higher QP), we
 590      * need to save the qp table from the last non B-frame; this is what the
 591      * following code block does */
 592     if (!fspp->qp) {
 593         qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
 594
 595         if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 596             int w, h;
 597
 598             /* if the qp stride is not set, it means the QP are only defined on
 599              * a line basis */
 600            if (!qp_stride) {
 601                 w = FF_CEIL_RSHIFT(inlink->w, 4);
 602                 h = 1;
 603             } else {
 604                 w = qp_stride;
 605                 h = FF_CEIL_RSHIFT(inlink->h, 4);
 606             }
 607             if (w * h > fspp->non_b_qp_alloc_size) {
 608                 int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
 609                 if (ret < 0) {
 610                     fspp->non_b_qp_alloc_size = 0;
 611                     return ret;
 612                 }
 613                 fspp->non_b_qp_alloc_size = w * h;
 614             }
 615
 616             av_assert0(w * h <= fspp->non_b_qp_alloc_size);
 617             memcpy(fspp->non_b_qp_table, qp_table, w * h);
 618         }
 619     }
 620
 621     if (fspp->log2_count && !ctx->is_disabled) {
 622         if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
 623             qp_table = fspp->non_b_qp_table;
 624
 625         if (qp_table || fspp->qp) {
 626             const int cw = FF_CEIL_RSHIFT(inlink->w, fspp->hsub);
 627             const int ch = FF_CEIL_RSHIFT(inlink->h, fspp->vsub);
 628
 629             /* get a new frame if in-place is not possible or if the dimensions
 630              * are not multiple of 8 */
 631             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 632                 const int aligned_w = FFALIGN(inlink->w, 8);
 633                 const int aligned_h = FFALIGN(inlink->h, 8);
 634
 635                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 636                 if (!out) {
 637                     av_frame_free(&in);
 638                     return AVERROR(ENOMEM);
 639                 }
 640                 av_frame_copy_props(out, in);
 641             }
 642
 643             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
 644                    inlink->w, inlink->h, qp_table, qp_stride, 1);
 645             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
 646                    cw,        ch,        qp_table, qp_stride, 0);
 647             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
 648                    cw,        ch,        qp_table, qp_stride, 0);
 649             emms_c();
 650         }
 651     }
 652
 653     if (in != out) {
 654         if (in->data[3])
 655             av_image_copy_plane(out->data[3], out->linesize[3],
 656                                 in ->data[3], in ->linesize[3],
 657                                 inlink->w, inlink->h);
 658         av_frame_free(&in);
 659     }
 660     return ff_filter_frame(outlink, out);
 661 }
 662
 663 static av_cold void uninit(AVFilterContext *ctx)
 664 {
 665     FSPPContext *fspp = ctx->priv;
 666     av_freep(&fspp->temp);
 667     av_freep(&fspp->src);
 668     av_freep(&fspp->non_b_qp_table);
 669 }
 670
 671 static const AVFilterPad fspp_inputs[] = {
 672     {
 673         .name         = "default",
 674         .type         = AVMEDIA_TYPE_VIDEO,
 675         .config_props = config_input,
 676         .filter_frame = filter_frame,
 677     },
 678     { NULL }
 679 };
 680
 681 static const AVFilterPad fspp_outputs[] = {
 682     {
 683         .name = "default",
 684         .type = AVMEDIA_TYPE_VIDEO,
 685     },
 686     { NULL }
 687 };
 688
 689 AVFilter ff_vf_fspp = {
 690     .name            = "fspp",
 691     .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
 692     .priv_size       = sizeof(FSPPContext),
 693     .uninit          = uninit,
 694     .query_formats   = query_formats,
 695     .inputs          = fspp_inputs,
 696     .outputs         = fspp_outputs,
 697     .priv_class      = &fspp_class,
 698     .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
 699 };