git.sesse.net Git - ffmpeg/blob - libavfilter/vf_fspp.c

   1 /*
   2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
   4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  */
  22
  23 /**
  24  * @file
  25  * Fast Simple Post-processing filter
 * This implementation is based on the algorithm described in
 * Aria Nosratinia, "Embedded Post-Processing for Enhancement of
 * Compressed Images" (1999)
 * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf).
 * In addition, the (I)DCT is split into a horizontal and a vertical pass, so
 * that one of the two passes can be performed once per block instead of once
 * per pixel. This makes the filter much faster.
  33  *
  34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
  35  * project, and ported by Arwa Arif for FFmpeg.
  36  */
  37
  38 #include "libavutil/avassert.h"
  39 #include "libavutil/imgutils.h"
  40 #include "libavutil/opt.h"
  41 #include "libavutil/pixdesc.h"
  42 #include "internal.h"
  43 #include "vf_fspp.h"
  44
  45 #define OFFSET(x) offsetof(FSPPContext, x)
  46 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
  47 static const AVOption fspp_options[] = {
  48     { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
  49     { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
  50     { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
  51     { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_INT, {.i64 = 0},   0, 1,         FLAGS },
  52     { NULL }
  53 };
  54
  55 AVFILTER_DEFINE_CLASS(fspp);
  56
  57 DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
  58     {  0,  48,  12,  60,   3,  51,  15,  63, },
  59     { 32,  16,  44,  28,  35,  19,  47,  31, },
  60     {  8,  56,   4,  52,  11,  59,   7,  55, },
  61     { 40,  24,  36,  20,  43,  27,  39,  23, },
  62     {  2,  50,  14,  62,   1,  49,  13,  61, },
  63     { 34,  18,  46,  30,  33,  17,  45,  29, },
  64     { 10,  58,   6,  54,   9,  57,   5,  53, },
  65     { 42,  26,  38,  22,  41,  25,  37,  21, },
  66 };
  67
  68 static const short custom_threshold[64] = {
  69 // values (296) can't be too high
  70 // -it causes too big quant dependence
  71 // or maybe overflow(check), which results in some flashing
  72      71, 296, 295, 237,  71,  40,  38,  19,
  73     245, 193, 185, 121, 102,  73,  53,  27,
  74     158, 129, 141, 107,  97,  73,  50,  26,
  75     102, 116, 109,  98,  82,  66,  45,  23,
  76      71,  94,  95,  81,  70,  56,  38,  20,
  77      56,  77,  74,  66,  56,  44,  30,  15,
  78      38,  53,  50,  45,  38,  30,  21,  11,
  79      20,  27,  26,  23,  20,  15,  11,   5
  80 };
  81
  82 //This func reads from 1 slice, 1 and clears 0 & 1
  83 static void store_slice_c(uint8_t *dst, int16_t *src,
  84                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
  85                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
  86 {
  87     int y, x;
  88 #define STORE(pos)                                                             \
  89     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
  90     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
  91     if (temp & 0x100) temp = ~(temp >> 31);                                    \
  92     dst[x + pos] = temp;
  93
  94     for (y = 0; y < height; y++) {
  95         const uint8_t *d = dither[y];
  96         for (x = 0; x < width; x += 8) {
  97             int temp;
  98             STORE(0);
  99             STORE(1);
 100             STORE(2);
 101             STORE(3);
 102             STORE(4);
 103             STORE(5);
 104             STORE(6);
 105             STORE(7);
 106         }
 107         src += src_stride;
 108         dst += dst_stride;
 109     }
 110 }
 111
 112 //This func reads from 2 slices, 0 & 2  and clears 2-nd
 113 static void store_slice2_c(uint8_t *dst, int16_t *src,
 114                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
 115                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 116 {
 117     int y, x;
 118 #define STORE2(pos)                                                                                       \
 119     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
 120     src[x + pos + 16 * src_stride] = 0;                                                                   \
 121     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
 122     dst[x + pos] = temp;
 123
 124     for (y = 0; y < height; y++) {
 125         const uint8_t *d = dither[y];
 126         for (x = 0; x < width; x += 8) {
 127             int temp;
 128             STORE2(0);
 129             STORE2(1);
 130             STORE2(2);
 131             STORE2(3);
 132             STORE2(4);
 133             STORE2(5);
 134             STORE2(6);
 135             STORE2(7);
 136         }
 137         src += src_stride;
 138         dst += dst_stride;
 139     }
 140 }
 141
 142 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 143 {
 144     int a;
 145     for (a = 0; a < 64; a++)
 146         thr_adr[a] = q * thr_adr_noq[a];
 147 }
 148
/**
 * Run the postprocessing filter on one plane.
 *
 * The plane is first copied into p->src with an 8-pixel mirrored border on
 * every side. The padded copy is then walked in steps of
 * (6 - p->log2_count) rows; every visited row is passed through
 * p->row_fdct, p->column_fidct and p->row_idct, with p->temp as the output
 * of row_idct. Each time another 8 output rows are complete they are
 * written to dst through p->store_slice or p->store_slice2.
 *
 * @param p          filter context (work buffers, DSP function pointers, options)
 * @param dst        destination plane; the function returns early if NULL
 * @param src        source plane; the function returns early if NULL
 * @param dst_stride distance between two rows of dst, in bytes
 * @param src_stride distance between two rows of src, in bytes
 * @param width      plane width in pixels
 * @param height     plane height in pixels
 * @param qp_store   quantizer table, read only when p->qp is 0
 * @param qp_stride  distance between two rows of qp_store
 * @param is_luma    nonzero selects p->temp_stride as the work buffer stride
 *                   and disables the chroma subsampling shift applied to
 *                   the QP table coordinates
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;

    const int stride = is_luma ? p->temp_stride : (width + 16);
    const int step = 6 - p->log2_count;
    // shifts converting plane coordinates to QP table coordinates
    const int qpsh = 4 - p->hsub * !is_luma;
    const int qpsv = 4 - p->vsub * !is_luma;

    // one aligned buffer split in two halves: block (row_fdct output) and
    // block3 (column_fidct output)
    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *block  = (int16_t *)block_align;
    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);

    // NOTE(review): the length is in bytes, so only the first quarter of
    // block3 is cleared here -- presumably sufficient because the rest is
    // overwritten before it is accumulated into; confirm.
    memset(block3, 0, 4 * 8 * BLOCKSZ);

    if (!src || !dst) return;

    // copy the plane into p->src, leaving an 8-pixel margin, and mirror
    // 8 pixels to the left and to the right of every row
    for (y = 0; y < height; y++) {
        int index = 8 + 8 * stride + y * stride;
        memcpy(p->src + index, src + y * src_stride, width);
        for (x = 0; x < 8; x++) {
            p->src[index         - x - 1] = p->src[index +         x    ];
            p->src[index + width + x    ] = p->src[index + width - x - 1];
        }
    }

    // mirror 8 rows above the first and below the last row of the plane
    for (y = 0; y < 8; y++) {
        memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
        memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
    }
    //FIXME (try edge emu)

    // clear rows 8..23 of the accumulation buffer
    for (y = 8; y < 24; y++)
        memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));

    for (y = step; y < height + 8; y += step) {    //step= 1,2
        const int y1 = y - 8 + step;                 //l5-7  l4-6;
        qy = y - 4;

        // clamp the row used for the QP lookup to the plane
        if (qy > height - 1) qy = height - 1;
        if (qy < 0) qy = 0;

        qy = (qy >> qpsv) * qp_stride;
        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        // main part of the row, handled in chunks of 8 * (BLOCKSZ - 1) columns
        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            if (p->qp)
                // constant quantizer: the threshold matrix is already set up
                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same

                    if (t < 0) t = 0;                   //t always < width-2

                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

                    // rescale the threshold matrix only when the quantizer changes
                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            // move the tail of both buffers to the front for the next chunk
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // remaining columns of the row
        es = width + 8 - x0; //  8, ...
        if (es > 8)
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
        if (es > 3)
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        // y1 is a nonzero multiple of 8: 8 more rows are ready to be stored;
        // bit 3 of y1 selects which store function is used
        if (!(y1 & 7) && y1) {
            if (y1 & 8)
                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
                               dst_stride, stride, width, 8, 5 - p->log2_count);
            else
                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
                                dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    // store the last, partial slice of (y & 7) rows
    if (y & 7) {  // height % 8 != 0
        if (y & 8)
            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
                           dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
}
 246
/**
 * Column pass: forward DCT, thresholding and inverse DCT in one go.
 *
 * Columns are handled in groups of DCTSIZE. For every column the 8 input
 * values (spaced DCTSIZE apart) are forward transformed, each coefficient is
 * passed through THRESHOLD() together with the matching entry of thr_adr,
 * and the inverse transform of the result is written to output: rows 0-5
 * are added to what is already there, rows 6 and 7 are overwritten.
 *
 * After each group both pointers have advanced by 16 entries (8 processed
 * columns plus 8 skipped) and cnt is decremented by 2.
 *
 * THRESHOLD, MULTIPLY16H and the FIX_* constants come from vf_fspp.h.
 *
 * @param thr_adr threshold matrix, restarted from its beginning for each group
 * @param data    input coefficients, as produced by the row FDCT
 * @param output  buffer the result is accumulated into
 * @param cnt     amount of work; the outer loop runs while it is positive
 */
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *dataptr;
    int16_t *wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        threshold = (int16_t *)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            // First butterfly stage: sums and differences of mirrored rows.
            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];

            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];

            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];

            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];

            // Even part of FDCT

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT

            // threshold the even coefficients d0, d2, d4, d6
            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            tmp0 += 2;
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT

            // threshold the odd coefficients d1, d3, d5, d7
            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            // The SIMD version takes a shortcut here when tmp5, tmp6 and tmp7 are all 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) << 1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) << 1;

            tmp7  = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            // rows 0-5 accumulate into the output, rows 6 and 7 overwrite it
            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr += 8; //skip each second start pos
        wsptr   += 8;
    }
}
 369
 370 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
 371 {
 372     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 373     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 374     int_simd16_t z5, z10, z11, z12, z13;
 375     int16_t *outptr;
 376     int16_t *wsptr;
 377
 378     cnt *= 4;
 379     wsptr = workspace;
 380     outptr = output_adr;
 381     for (; cnt > 0; cnt--) {
 382         // Even part
 383         //Simd version reads 4x4 block and transposes it
 384         tmp10 = wsptr[2] +  wsptr[3];
 385         tmp11 = wsptr[2] -  wsptr[3];
 386
 387         tmp13 = wsptr[0] +  wsptr[1];
 388         tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
 389
 390         tmp0 = tmp10 + tmp13; //->temps
 391         tmp3 = tmp10 - tmp13; //->temps
 392         tmp1 = tmp11 + tmp12;
 393         tmp2 = tmp11 - tmp12;
 394
 395         // Odd part
 396         //Also transpose, with previous:
 397         // ---- ----      ||||
 398         // ---- ---- idct ||||
 399         // ---- ---- ---> ||||
 400         // ---- ----      ||||
 401         z13 = wsptr[4] + wsptr[5];
 402         z10 = wsptr[4] - wsptr[5];
 403         z11 = wsptr[6] + wsptr[7];
 404         z12 = wsptr[6] - wsptr[7];
 405
 406         tmp7 = z11 + z13;
 407         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
 408
 409         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
 410         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
 411         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
 412
 413         tmp6 = (tmp12 << 3) - tmp7;
 414         tmp5 = (tmp11 << 3) - tmp6;
 415         tmp4 = (tmp10 << 3) + tmp5;
 416
 417         // Final output stage: descale and write column
 418         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
 419         outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
 420         outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
 421         outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
 422         outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
 423         outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
 424         outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
 425         outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
 426         outptr++;
 427
 428         wsptr += DCTSIZE;       // advance pointer to next row
 429     }
 430 }
 431
 432 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
 433 {
 434     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 435     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 436     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
 437     int16_t *dataptr;
 438
 439     cnt *= 4;
 440     // Pass 1: process rows.
 441
 442     dataptr = data;
 443     for (; cnt > 0; cnt--) {
 444         tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
 445         tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
 446         tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
 447         tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
 448         tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
 449         tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
 450         tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
 451         tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
 452
 453         // Even part
 454
 455         tmp10 = tmp0 + tmp3;
 456         tmp13 = tmp0 - tmp3;
 457         tmp11 = tmp1 + tmp2;
 458         tmp12 = tmp1 - tmp2;
 459         //Even columns are written first, this leads to different order of columns
 460         //in column_fidct(), but they are processed independently, so all ok.
 461         //Later in the row_idct() columns readed at the same order.
 462         dataptr[2] = tmp10 + tmp11;
 463         dataptr[3] = tmp10 - tmp11;
 464
 465         z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
 466         dataptr[0] = tmp13 + z1;
 467         dataptr[1] = tmp13 - z1;
 468
 469         // Odd part
 470
 471         tmp10 = (tmp4 + tmp5) << 2;
 472         tmp11 = (tmp5 + tmp6) << 2;
 473         tmp12 = (tmp6 + tmp7) << 2;
 474
 475         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
 476         z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
 477         z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
 478         z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
 479
 480         z11 = tmp7 + z3;
 481         z13 = tmp7 - z3;
 482
 483         dataptr[4] = z13 + z2;
 484         dataptr[5] = z13 - z2;
 485         dataptr[6] = z11 + z4;
 486         dataptr[7] = z11 - z4;
 487
 488         pixels++;               // advance pointer to next column
 489         dataptr += DCTSIZE;
 490     }
 491 }
 492
 493 static int query_formats(AVFilterContext *ctx)
 494 {
 495     static const enum AVPixelFormat pix_fmts[] = {
 496         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 497         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 498         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 499         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 500         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 501         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
 502         AV_PIX_FMT_NONE
 503     };
 504     ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
 505     return 0;
 506 }
 507
 508 static int config_input(AVFilterLink *inlink)
 509 {
 510     AVFilterContext *ctx = inlink->dst;
 511     FSPPContext *fspp = ctx->priv;
 512     const int h = FFALIGN(inlink->h + 16, 16);
 513     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 514
 515     fspp->hsub = desc->log2_chroma_w;
 516     fspp->vsub = desc->log2_chroma_h;
 517
 518     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
 519     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
 520     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
 521
 522     if (!fspp->temp || !fspp->src)
 523         return AVERROR(ENOMEM);
 524
 525     if (!fspp->use_bframe_qp && !fspp->qp) {
 526         fspp->non_b_qp_alloc_size = FF_CEIL_RSHIFT(inlink->w, 4) * FF_CEIL_RSHIFT(inlink->h, 4);
 527         fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
 528         if (!fspp->non_b_qp_table)
 529             return AVERROR(ENOMEM);
 530     }
 531
 532     fspp->store_slice  = store_slice_c;
 533     fspp->store_slice2 = store_slice2_c;
 534     fspp->mul_thrmat   = mul_thrmat_c;
 535     fspp->column_fidct = column_fidct_c;
 536     fspp->row_idct     = row_idct_c;
 537     fspp->row_fdct     = row_fdct_c;
 538
 539     if (ARCH_X86)
 540         ff_fspp_init_x86(fspp);
 541
 542     return 0;
 543 }
 544
 545 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 546 {
 547     AVFilterContext *ctx = inlink->dst;
 548     FSPPContext *fspp = ctx->priv;
 549     AVFilterLink *outlink = ctx->outputs[0];
 550     AVFrame *out = in;
 551
 552     int qp_stride = 0;
 553     uint8_t *qp_table = NULL;
 554     int i, bias;
 555     int custom_threshold_m[64];
 556
 557     bias = (1 << 4) + fspp->strength;
 558
 559     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
 560         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 561
 562     for (i = 0; i < 8; i++) {
 563         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
 564                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
 565                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
 566                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
 567
 568         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
 569                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
 570                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
 571                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
 572     }
 573
 574     if (fspp->qp)
 575         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 576
 577     /* if we are not in a constant user quantizer mode and we don't want to use
 578      * the quantizers from the B-frames (B-frames often have a higher QP), we
 579      * need to save the qp table from the last non B-frame; this is what the
 580      * following code block does */
 581     if (!fspp->qp) {
 582         qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
 583
 584         if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 585             int w, h;
 586
 587             /* if the qp stride is not set, it means the QP are only defined on
 588              * a line basis */
 589            if (!qp_stride) {
 590                 w = FF_CEIL_RSHIFT(inlink->w, 4);
 591                 h = 1;
 592             } else {
 593                 w = qp_stride;
 594                 h = FF_CEIL_RSHIFT(inlink->h, 4);
 595             }
 596             if (w * h > fspp->non_b_qp_alloc_size) {
 597                 int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
 598                 if (ret < 0) {
 599                     fspp->non_b_qp_alloc_size = 0;
 600                     return ret;
 601                 }
 602                 fspp->non_b_qp_alloc_size = w * h;
 603             }
 604
 605             av_assert0(w * h <= fspp->non_b_qp_alloc_size);
 606             memcpy(fspp->non_b_qp_table, qp_table, w * h);
 607         }
 608     }
 609
 610     if (fspp->log2_count && !ctx->is_disabled) {
 611         if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
 612             qp_table = fspp->non_b_qp_table;
 613
 614         if (qp_table || fspp->qp) {
 615             const int cw = FF_CEIL_RSHIFT(inlink->w, fspp->hsub);
 616             const int ch = FF_CEIL_RSHIFT(inlink->h, fspp->vsub);
 617
 618             /* get a new frame if in-place is not possible or if the dimensions
 619              * are not multiple of 8 */
 620             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 621                 const int aligned_w = FFALIGN(inlink->w, 8);
 622                 const int aligned_h = FFALIGN(inlink->h, 8);
 623
 624                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 625                 if (!out) {
 626                     av_frame_free(&in);
 627                     return AVERROR(ENOMEM);
 628                 }
 629                 av_frame_copy_props(out, in);
 630                 out->width = in->width;
 631                 out->height = in->height;
 632             }
 633
 634             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
 635                    inlink->w, inlink->h, qp_table, qp_stride, 1);
 636             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
 637                    cw,        ch,        qp_table, qp_stride, 0);
 638             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
 639                    cw,        ch,        qp_table, qp_stride, 0);
 640             emms_c();
 641         }
 642     }
 643
 644     if (in != out) {
 645         if (in->data[3])
 646             av_image_copy_plane(out->data[3], out->linesize[3],
 647                                 in ->data[3], in ->linesize[3],
 648                                 inlink->w, inlink->h);
 649         av_frame_free(&in);
 650     }
 651     return ff_filter_frame(outlink, out);
 652 }
 653
 654 static av_cold void uninit(AVFilterContext *ctx)
 655 {
 656     FSPPContext *fspp = ctx->priv;
 657     av_freep(&fspp->temp);
 658     av_freep(&fspp->src);
 659     av_freep(&fspp->non_b_qp_table);
 660 }
 661
 662 static const AVFilterPad fspp_inputs[] = {
 663     {
 664         .name         = "default",
 665         .type         = AVMEDIA_TYPE_VIDEO,
 666         .config_props = config_input,
 667         .filter_frame = filter_frame,
 668     },
 669     { NULL }
 670 };
 671
 672 static const AVFilterPad fspp_outputs[] = {
 673     {
 674         .name = "default",
 675         .type = AVMEDIA_TYPE_VIDEO,
 676     },
 677     { NULL }
 678 };
 679
 680 AVFilter ff_vf_fspp = {
 681     .name            = "fspp",
 682     .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
 683     .priv_size       = sizeof(FSPPContext),
 684     .uninit          = uninit,
 685     .query_formats   = query_formats,
 686     .inputs          = fspp_inputs,
 687     .outputs         = fspp_outputs,
 688     .priv_class      = &fspp_class,
 689     .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
 690 };