git.sesse.net Git - ffmpeg/blob - libavfilter/vf_fspp.c

   1 /*
   2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
   4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  */
  22
  23 /**
  24  * @file
  25  * Fast Simple Post-processing filter
  26  * This implementation is based on an algorithm described in
  27  * "Aria Nosratinia Embedded Post-Processing for
  28  * Enhancement of Compressed Images (1999)"
  29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
  30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
  31  * them can be performed once per block, not per pixel. This allows for much
  32  * higher speed.
  33  *
  34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
  35  * project, and ported by Arwa Arif for FFmpeg.
  36  */
  37
  38 #include "libavutil/avassert.h"
  39 #include "libavutil/imgutils.h"
  40 #include "libavutil/mem_internal.h"
  41 #include "libavutil/opt.h"
  42 #include "libavutil/pixdesc.h"
  43 #include "internal.h"
  44 #include "qp_table.h"
  45 #include "vf_fspp.h"
  46
/* Byte offset of a field inside FSPPContext; binds an option to its storage. */
#define OFFSET(x) offsetof(FSPPContext, x)
/* Flag set shared by every option in the table below. */
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
/*
 * User-settable options. Each row lists: name, help text, target field,
 * value type, default value, minimum, maximum and flags.
 *  - quality is stored in log2_count; filter() derives its row step from it.
 *  - qp, when non-zero, makes filter_frame()/filter() use one constant
 *    quantizer instead of the per-frame QP table.
 *  - strength is added to a base bias of 16 when filter_frame() scales the
 *    threshold table.
 *  - use_bframe_qp selects whether the QP table of B-frames is used.
 */
static const AVOption fspp_options[] = {
    { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
    { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
    { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
    { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_BOOL,{.i64 = 0},   0, 1,         FLAGS },
    { NULL }
};

/* NOTE(review): presumably defines the fspp_class object that ff_vf_fspp
 * references through .priv_class -- confirm against the macro definition. */
AVFILTER_DEFINE_CLASS(fspp);
  58
/*
 * 8x8 dither matrix with entries in the range 0..63.
 * store_slice_c() and store_slice2_c() select a row by output line and add
 * the entry (shifted right by log2_scale) to each sample before the final
 * down-shift to 8 bits.
 */
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
    {  0,  48,  12,  60,   3,  51,  15,  63, },
    { 32,  16,  44,  28,  35,  19,  47,  31, },
    {  8,  56,   4,  52,  11,  59,   7,  55, },
    { 40,  24,  36,  20,  43,  27,  39,  23, },
    {  2,  50,  14,  62,   1,  49,  13,  61, },
    { 34,  18,  46,  30,  33,  17,  45,  29, },
    { 10,  58,   6,  54,   9,  57,   5,  53, },
    { 42,  26,  38,  22,  41,  25,  37,  21, },
};
  69
/*
 * Base threshold table (64 entries, read as 8 rows of 8).
 * filter_frame() rescales every entry by (16 + strength) / 71 and repacks the
 * result into FSPPContext.threshold_mtx_noq.
 */
static const short custom_threshold[64] = {
// The values (peak is 296) must not be too high:
// - that makes the result depend too strongly on the quantizer,
// - or it may overflow (still to be checked), which shows up as flashing.
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
  83
  84 //This func reads from 1 slice, 1 and clears 0 & 1
  85 static void store_slice_c(uint8_t *dst, int16_t *src,
  86                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
  87                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
  88 {
  89     int y, x;
  90 #define STORE(pos)                                                             \
  91     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
  92     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
  93     if (temp & 0x100) temp = ~(temp >> 31);                                    \
  94     dst[x + pos] = temp;
  95
  96     for (y = 0; y < height; y++) {
  97         const uint8_t *d = dither[y];
  98         for (x = 0; x < width; x += 8) {
  99             int temp;
 100             STORE(0);
 101             STORE(1);
 102             STORE(2);
 103             STORE(3);
 104             STORE(4);
 105             STORE(5);
 106             STORE(6);
 107             STORE(7);
 108         }
 109         src += src_stride;
 110         dst += dst_stride;
 111     }
 112 }
 113
 114 //This func reads from 2 slices, 0 & 2  and clears 2-nd
 115 static void store_slice2_c(uint8_t *dst, int16_t *src,
 116                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
 117                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 118 {
 119     int y, x;
 120 #define STORE2(pos)                                                                                       \
 121     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
 122     src[x + pos + 16 * src_stride] = 0;                                                                   \
 123     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
 124     dst[x + pos] = temp;
 125
 126     for (y = 0; y < height; y++) {
 127         const uint8_t *d = dither[y];
 128         for (x = 0; x < width; x += 8) {
 129             int temp;
 130             STORE2(0);
 131             STORE2(1);
 132             STORE2(2);
 133             STORE2(3);
 134             STORE2(4);
 135             STORE2(5);
 136             STORE2(6);
 137             STORE2(7);
 138         }
 139         src += src_stride;
 140         dst += dst_stride;
 141     }
 142 }
 143
 144 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 145 {
 146     int a;
 147     for (a = 0; a < 64; a++)
 148         thr_adr[a] = q * thr_adr_noq[a];
 149 }
 150
/**
 * Run the post-processing filter over a single plane.
 *
 * The plane is first copied into p->src with an 8-pixel mirrored border on
 * every side. It is then walked in steps of `step` rows: each pass does a
 * forward transform (row_fdct), a combined threshold/transform stage
 * (column_fidct) and an inverse transform (row_idct) that accumulates into
 * p->temp. Whenever 8 output rows are complete they are converted to 8-bit
 * and written to dst through store_slice / store_slice2.
 *
 * @param p          filter context (work buffers, options, function pointers)
 * @param dst        destination plane; nothing is done when NULL
 * @param src        source plane; nothing is done when NULL
 * @param dst_stride destination line size in bytes
 * @param src_stride source line size in bytes
 * @param width      plane width in pixels
 * @param height     plane height in pixels
 * @param qp_store   per-block quantizer table; only read when p->qp is 0
 * @param qp_stride  number of entries per row of qp_store
 * @param is_luma    non-zero for the first plane; selects the work-buffer
 *                   stride and disables the chroma shift on QP coordinates
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;

    // Row pitch used inside p->src and p->temp for this plane.
    const int stride = is_luma ? p->temp_stride : (width + 16);
    // Vertical advance per pass, derived from the "quality" option.
    const int step = 6 - p->log2_count;
    // Shifts converting pixel coordinates to QP-table coordinates; reduced by
    // the chroma subsampling shifts for non-luma planes.
    const int qpsh = 4 - p->hsub * !is_luma;
    const int qpsv = 4 - p->vsub * !is_luma;

    // One aligned scratch array split in two halves: block receives row_fdct
    // output, block3 is the output workspace of column_fidct.
    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *block  = (int16_t *)block_align;
    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);

    // NOTE(review): the size is in bytes, so only the leading part of block3
    // is cleared here -- presumably enough because column_fidct assigns (not
    // accumulates) its last two rows; confirm.
    memset(block3, 0, 4 * 8 * BLOCKSZ);

    // filter_frame() calls this for planes 1 and 2 unconditionally, so a
    // missing plane pointer is skipped here.
    if (!src || !dst) return;

    // Copy the plane into p->src at offset (8, 8) and mirror 8 pixels past
    // the left and right edge of every line.
    for (y = 0; y < height; y++) {
        int index = 8 + 8 * stride + y * stride;
        memcpy(p->src + index, src + y * src_stride, width);
        for (x = 0; x < 8; x++) {
            p->src[index         - x - 1] = p->src[index +         x    ];
            p->src[index + width + x    ] = p->src[index + width - x - 1];
        }
    }

    // Mirror 8 full lines above the top and below the bottom of the plane.
    for (y = 0; y < 8; y++) {
        memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
        memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
    }
    //FIXME (try edge emu)

    // Clear rows 8..23 of the accumulation buffer before the first pass.
    for (y = 8; y < 24; y++)
        memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));

    for (y = step; y < height + 8; y += step) {    //step= 1,2
        const int y1 = y - 8 + step;                 //l5-7  l4-6;
        // Pick the QP-table row for this pass: y - 4 clamped to the plane.
        qy = y - 4;

        if (qy > height - 1) qy = height - 1;
        if (qy < 0) qy = 0;

        qy = (qy >> qpsv) * qp_stride;
        // Prime the first 8 columns of block for this row.
        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        // Process the row in chunks of 8 * (BLOCKSZ - 1) columns.
        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            // Constant-QP mode: one call covers the whole chunk with the
            // thresholds prepared in filter_frame().
            if (p->qp)
                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                // Per-block QP mode: look up the quantizer for every group of
                // 8 columns and rescale the thresholds only when it changes.
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but it's the same

                    if (t < 0) t = 0;                   //t always < width-2

                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            // Carry the tail of both buffers over to the start for the next chunk.
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // Remaining columns of the row that did not fill a whole chunk.
        es = width + 8 - x0; //  8, ...
        if (es > 8)
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

        // The tail reuses whichever thresholds are currently in threshold_mtx.
        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
        if (es > 3)
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        // Every time y1 reaches a non-zero multiple of 8, write 8 finished
        // rows; the two store functions alternate with bit 3 of y1.
        if (!(y1 & 7) && y1) {
            if (y1 & 8)
                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
                               dst_stride, stride, width, 8, 5 - p->log2_count);
            else
                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
                                dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    // Write the remaining y & 7 rows when the last slice is not a full 8 rows.
    if (y & 7) {  // height % 8 != 0
        if (y & 8)
            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
                           dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
}
 248
/**
 * Combined forward transform, thresholding and inverse transform over columns.
 *
 * For each start position, DCTSIZE consecutive columns are processed. A
 * column consists of the 8 values data[DCTSIZE * 0..7]. The forward stage
 * produces d0..d7, each is passed through the THRESHOLD macro together with
 * the matching threshold entry, and the inverse stage result is accumulated
 * into output rows 0..5 and assigned to output rows 6..7.
 *
 * @param thr_adr threshold matrix, read as threshold[k * 8] for k = 0..7 and
 *                advanced by one element per column
 * @param data    input coefficients (as produced by row_fdct)
 * @param output  workspace that receives the result (later read by row_idct)
 * @param cnt     loop count; one start position is processed per 2 units
 */
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *dataptr;
    int16_t *wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        // The threshold pointer restarts at the matrix origin for every start position.
        threshold = (int16_t *)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];

            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];

            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];

            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];

            // Even part of FDCT

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT

            // Threshold the even coefficients, then start the inverse transform.
            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            // Bias added before the >> 2 below.
            tmp0 += 2;
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT

            // Threshold the odd coefficients.
            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            // The SIMD version takes a shortcut here when tmp5, tmp6 and tmp7 are all 0.
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) << 1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) << 1;

            tmp7  = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            // Rows 0..5 accumulate onto what earlier start positions wrote;
            // rows 6..7 are plain assignments.
            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr += 8; //skip each second start pos
        wsptr   += 8;
    }
}
 371
/**
 * Inverse transform of workspace rows, accumulated into the output buffer.
 *
 * Each iteration reads the 8 values workspace[0..7], transforms them and adds
 * the 8 descaled results to output_adr[0..7 * output_stride], i.e. down one
 * output column. The output pointer then moves one column to the right and
 * the workspace pointer moves on by DCTSIZE.
 *
 * @param workspace     input coefficients, in the order written by row_fdct /
 *                      column_fidct (even terms first)
 * @param output_adr    accumulation buffer; values are added, not stored
 * @param output_stride distance between output rows, in int16_t elements
 * @param cnt           number of groups of 4 columns to process
 */
static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t *outptr;
    int16_t *wsptr;

    // cnt is given in units of 4 columns.
    cnt *= 4;
    wsptr = workspace;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        // The SIMD version reads a 4x4 block and transposes it.
        tmp10 = wsptr[2] +  wsptr[3];
        tmp11 = wsptr[2] -  wsptr[3];

        tmp13 = wsptr[0] +  wsptr[1];
        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow

        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;

        // Odd part
        // Also transposed, together with the previous part:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];

        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);

        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_

        tmp6 = (tmp12 << 3) - tmp7;
        tmp5 = (tmp11 << 3) - tmp6;
        tmp4 = (tmp10 << 3) + tmp5;

        // Final output stage: descale and accumulate into one output column
        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;

        wsptr += DCTSIZE;       // advance pointer to next row
    }
}
 433
/**
 * Forward transform of 8-pixel groups read from the padded source plane.
 *
 * Each iteration reads the 8 pixels pixels[0..7 * line_size] (one pixel from
 * each of 8 consecutive lines), transforms them and writes 8 coefficients to
 * data[0..7]. The pixel pointer then moves one column to the right and the
 * data pointer moves on by DCTSIZE.
 *
 * The coefficients are stored even terms first; column_fidct() and
 * row_idct() use the same ordering.
 *
 * @param data      output coefficients, DCTSIZE values per processed column
 * @param pixels    top pixel of the first column to transform
 * @param line_size distance between source lines, in bytes
 * @param cnt       number of groups of 4 columns to process
 */
static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
    int16_t *dataptr;

    // cnt is given in units of 4 columns.
    cnt *= 4;
    // Pass 1: process rows.

    dataptr = data;
    for (; cnt > 0; cnt--) {
        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];

        // Even part

        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;
        // Even columns are written first. This gives column_fidct() a
        // different column order, but the columns are processed
        // independently, so that is fine.
        // Later, row_idct() reads the columns in the same order.
        dataptr[2] = tmp10 + tmp11;
        dataptr[3] = tmp10 - tmp11;

        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
        dataptr[0] = tmp13 + z1;
        dataptr[1] = tmp13 - z1;

        // Odd part

        tmp10 = (tmp4 + tmp5) << 2;
        tmp11 = (tmp5 + tmp6) << 2;
        tmp12 = (tmp6 + tmp7) << 2;

        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);

        z11 = tmp7 + z3;
        z13 = tmp7 - z3;

        dataptr[4] = z13 + z2;
        dataptr[5] = z13 - z2;
        dataptr[6] = z11 + z4;
        dataptr[7] = z11 - z4;

        pixels++;               // advance pointer to next column
        dataptr += DCTSIZE;
    }
}
 494
 495 static int query_formats(AVFilterContext *ctx)
 496 {
 497     static const enum AVPixelFormat pix_fmts[] = {
 498         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 499         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 500         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 501         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 502         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 503         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
 504         AV_PIX_FMT_NONE
 505     };
 506
 507     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
 508     if (!fmts_list)
 509         return AVERROR(ENOMEM);
 510     return ff_set_common_formats(ctx, fmts_list);
 511 }
 512
 513 static int config_input(AVFilterLink *inlink)
 514 {
 515     AVFilterContext *ctx = inlink->dst;
 516     FSPPContext *fspp = ctx->priv;
 517     const int h = FFALIGN(inlink->h + 16, 16);
 518     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 519
 520     fspp->hsub = desc->log2_chroma_w;
 521     fspp->vsub = desc->log2_chroma_h;
 522
 523     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
 524     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
 525     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
 526
 527     if (!fspp->temp || !fspp->src)
 528         return AVERROR(ENOMEM);
 529
 530     fspp->store_slice  = store_slice_c;
 531     fspp->store_slice2 = store_slice2_c;
 532     fspp->mul_thrmat   = mul_thrmat_c;
 533     fspp->column_fidct = column_fidct_c;
 534     fspp->row_idct     = row_idct_c;
 535     fspp->row_fdct     = row_fdct_c;
 536
 537     if (ARCH_X86)
 538         ff_fspp_init_x86(fspp);
 539
 540     return 0;
 541 }
 542
 543 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 544 {
 545     AVFilterContext *ctx = inlink->dst;
 546     FSPPContext *fspp = ctx->priv;
 547     AVFilterLink *outlink = ctx->outputs[0];
 548     AVFrame *out = in;
 549
 550     int qp_stride = 0;
 551     int8_t *qp_table = NULL;
 552     int i, bias;
 553     int ret = 0;
 554     int custom_threshold_m[64];
 555
 556     bias = (1 << 4) + fspp->strength;
 557
 558     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
 559         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 560
 561     for (i = 0; i < 8; i++) {
 562         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
 563                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
 564                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
 565                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
 566
 567         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
 568                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
 569                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
 570                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
 571     }
 572
 573     if (fspp->qp)
 574         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 575
 576     /* if we are not in a constant user quantizer mode and we don't want to use
 577      * the quantizers from the B-frames (B-frames often have a higher QP), we
 578      * need to save the qp table from the last non B-frame; this is what the
 579      * following code block does */
 580     if (!fspp->qp && (fspp->use_bframe_qp || in->pict_type != AV_PICTURE_TYPE_B)) {
 581         ret = ff_qp_table_extract(in, &qp_table, &qp_stride, NULL, &fspp->qscale_type);
 582         if (ret < 0) {
 583             av_frame_free(&in);
 584             return ret;
 585         }
 586
 587         if (!fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 588             av_freep(&fspp->non_b_qp_table);
 589             fspp->non_b_qp_table  = qp_table;
 590             fspp->non_b_qp_stride = qp_stride;
 591         }
 592     }
 593
 594     if (fspp->log2_count && !ctx->is_disabled) {
 595         if (!fspp->use_bframe_qp && fspp->non_b_qp_table) {
 596             qp_table = fspp->non_b_qp_table;
 597             qp_stride = fspp->non_b_qp_stride;
 598         }
 599
 600         if (qp_table || fspp->qp) {
 601             const int cw = AV_CEIL_RSHIFT(inlink->w, fspp->hsub);
 602             const int ch = AV_CEIL_RSHIFT(inlink->h, fspp->vsub);
 603
 604             /* get a new frame if in-place is not possible or if the dimensions
 605              * are not multiple of 8 */
 606             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 607                 const int aligned_w = FFALIGN(inlink->w, 8);
 608                 const int aligned_h = FFALIGN(inlink->h, 8);
 609
 610                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 611                 if (!out) {
 612                     av_frame_free(&in);
 613                     ret = AVERROR(ENOMEM);
 614                     goto finish;
 615                 }
 616                 av_frame_copy_props(out, in);
 617                 out->width = in->width;
 618                 out->height = in->height;
 619             }
 620
 621             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
 622                    inlink->w, inlink->h, qp_table, qp_stride, 1);
 623             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
 624                    cw,        ch,        qp_table, qp_stride, 0);
 625             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
 626                    cw,        ch,        qp_table, qp_stride, 0);
 627             emms_c();
 628         }
 629     }
 630
 631     if (in != out) {
 632         if (in->data[3])
 633             av_image_copy_plane(out->data[3], out->linesize[3],
 634                                 in ->data[3], in ->linesize[3],
 635                                 inlink->w, inlink->h);
 636         av_frame_free(&in);
 637     }
 638     ret = ff_filter_frame(outlink, out);
 639 finish:
 640     if (qp_table != fspp->non_b_qp_table)
 641         av_freep(&qp_table);
 642     return ret;
 643 }
 644
 645 static av_cold void uninit(AVFilterContext *ctx)
 646 {
 647     FSPPContext *fspp = ctx->priv;
 648     av_freep(&fspp->temp);
 649     av_freep(&fspp->src);
 650     av_freep(&fspp->non_b_qp_table);
 651 }
 652
/* Input pads: a single video pad. config_input() is its configuration
 * callback and filter_frame() its per-frame callback. The list is
 * terminated by a NULL entry. */
static const AVFilterPad fspp_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
        .filter_frame = filter_frame,
    },
    { NULL }
};
 662
/* Output pads: a single video pad with no callbacks of its own. The list is
 * terminated by a NULL entry. */
static const AVFilterPad fspp_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
    },
    { NULL }
};
 670
/* Filter definition for "fspp": ties together the private context size, the
 * option class, the format query, the pads and the teardown callback.
 * The timeline flag is the "internal" variant; filter_frame() checks
 * ctx->is_disabled itself and forwards the frame unfiltered when it is set. */
const AVFilter ff_vf_fspp = {
    .name            = "fspp",
    .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
    .priv_size       = sizeof(FSPPContext),
    .uninit          = uninit,
    .query_formats   = query_formats,
    .inputs          = fspp_inputs,
    .outputs         = fspp_outputs,
    .priv_class      = &fspp_class,
    .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
};