git.sesse.net Git - ffmpeg/blob - libavfilter/vf_fspp.c

   1 /*
   2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
   4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  */
  22
  23 /**
  24  * @file
  25  * Fast Simple Post-processing filter
  26  * This implementation is based on an algorithm described in
  27  * "Aria Nosratinia Embedded Post-Processing for
  28  * Enhancement of Compressed Images (1999)"
  29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
  30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
  31  * them can be performed once per block, not per pixel. This allows for much
  32  * higher speed.
  33  *
  34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
  35  * project, and ported by Arwa Arif for FFmpeg.
  36  */
  37
  38 #include "libavutil/avassert.h"
  39 #include "libavutil/imgutils.h"
  40 #include "libavutil/opt.h"
  41 #include "libavutil/pixdesc.h"
  42 #include "internal.h"
  43 #include "vf_fspp.h"
  44
  45 #define OFFSET(x) offsetof(FSPPContext, x)
  46 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
  47 static const AVOption fspp_options[] = {
  48     { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
  49     { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
  50     { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
  51     { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_INT, {.i64 = 0},   0, 1,         FLAGS },
  52     { NULL }
  53 };
  54
  55 AVFILTER_DEFINE_CLASS(fspp);
  56
/*
 * 8x8 dither table; it contains every value from 0 to 63 exactly once.
 * store_slice_c() and store_slice2_c() pick the row by output line (dither[y])
 * and add d[pos] >> log2_scale to each accumulated sample before the final
 * right shift that produces the 8-bit pixel.
 */
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
    {  0,  48,  12,  60,   3,  51,  15,  63, },
    { 32,  16,  44,  28,  35,  19,  47,  31, },
    {  8,  56,   4,  52,  11,  59,   7,  55, },
    { 40,  24,  36,  20,  43,  27,  39,  23, },
    {  2,  50,  14,  62,   1,  49,  13,  61, },
    { 34,  18,  46,  30,  33,  17,  45,  29, },
    { 10,  58,   6,  54,   9,  57,   5,  53, },
    { 42,  26,  38,  22,  41,  25,  37,  21, },
};
  67
/*
 * Base threshold table (64 entries, laid out as 8 rows of 8).
 * filter_frame() scales every entry by (16 + strength) / 71.0, rounds it, and
 * packs the result into FSPPContext.threshold_mtx_noq.
 */
static const short custom_threshold[64] = {
// The values (max. 296) must not be too high:
// - large values make the result depend too strongly on the quantizer,
// - or may overflow (to be checked), which shows up as flashing.
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
  81
  82 //This func reads from 1 slice, 1 and clears 0 & 1
  83 static void store_slice_c(uint8_t *dst, int16_t *src,
  84                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
  85                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
  86 {
  87     int y, x;
  88 #define STORE(pos)                                                             \
  89     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
  90     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
  91     if (temp & 0x100) temp = ~(temp >> 31);                                    \
  92     dst[x + pos] = temp;
  93
  94     for (y = 0; y < height; y++) {
  95         const uint8_t *d = dither[y];
  96         for (x = 0; x < width; x += 8) {
  97             int temp;
  98             STORE(0);
  99             STORE(1);
 100             STORE(2);
 101             STORE(3);
 102             STORE(4);
 103             STORE(5);
 104             STORE(6);
 105             STORE(7);
 106         }
 107         src += src_stride;
 108         dst += dst_stride;
 109     }
 110 }
 111
 112 //This func reads from 2 slices, 0 & 2  and clears 2-nd
 113 static void store_slice2_c(uint8_t *dst, int16_t *src,
 114                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
 115                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 116 {
 117     int y, x;
 118 #define STORE2(pos)                                                                                       \
 119     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
 120     src[x + pos + 16 * src_stride] = 0;                                                                   \
 121     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
 122     dst[x + pos] = temp;
 123
 124     for (y = 0; y < height; y++) {
 125         const uint8_t *d = dither[y];
 126         for (x = 0; x < width; x += 8) {
 127             int temp;
 128             STORE2(0);
 129             STORE2(1);
 130             STORE2(2);
 131             STORE2(3);
 132             STORE2(4);
 133             STORE2(5);
 134             STORE2(6);
 135             STORE2(7);
 136         }
 137         src += src_stride;
 138         dst += dst_stride;
 139     }
 140 }
 141
 142 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 143 {
 144     int a;
 145     for (a = 0; a < 64; a++)
 146         thr_adr[a] = q * thr_adr_noq[a];
 147 }
 148
/*
 * Filter one plane.
 *
 * The plane is copied into p->src with an 8-pixel mirrored border, then
 * processed line by line: row_fdct() -> column_fidct() -> row_idct(), with the
 * results accumulated in p->temp. Finished bands of 8 lines are written to dst
 * through store_slice()/store_slice2().
 *
 * p                    filter context (scratch buffers, DSP function pointers, options)
 * dst, dst_stride      destination plane and its line size
 * src, src_stride      source plane and its line size
 * width, height        plane dimensions
 * qp_store, qp_stride  per-block quantizer table; only read when p->qp is 0
 * is_luma              non-zero for the luma plane; selects the scratch stride
 *                      and disables the chroma shift of the QP coordinates
 *
 * Returns immediately when src or dst is NULL.
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;

    const int stride = is_luma ? p->temp_stride : (width + 16);
    const int step = 6 - p->log2_count;         // vertical step between processed lines
    const int qpsh = 4 - p->hsub * !is_luma;    // shift from pixel x to QP table column
    const int qpsv = 4 - p->vsub * !is_luma;    // shift from pixel y to QP table row

    // One int32_t array split in two halves that are accessed as int16_t:
    // block receives row_fdct() output, block3 receives column_fidct() output.
    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *block  = (int16_t *)block_align;
    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);

    // Zeroes the first 4 * 8 * BLOCKSZ bytes of block3 (the count is in bytes).
    memset(block3, 0, 4 * 8 * BLOCKSZ);

    if (!src || !dst) return;

    // Copy the plane into p->src at offset (8, 8) and mirror 8 pixels to the
    // left and to the right of every line.
    for (y = 0; y < height; y++) {
        int index = 8 + 8 * stride + y * stride;
        memcpy(p->src + index, src + y * src_stride, width);
        for (x = 0; x < 8; x++) {
            p->src[index         - x - 1] = p->src[index +         x    ];
            p->src[index + width + x    ] = p->src[index + width - x - 1];
        }
    }

    // Mirror 8 lines above the first and below the last line.
    for (y = 0; y < 8; y++) {
        memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
        memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
    }
    //FIXME (try edge emu)

    // Clear lines 8..23 of the accumulator before the first pass.
    for (y = 8; y < 24; y++)
        memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));

    for (y = step; y < height + 8; y += step) {    //step= 1,2
        const int y1 = y - 8 + step;                 //l5-7  l4-6;
        qy = y - 4;

        // Clamp the line used for the QP lookup to the plane, then convert it
        // to a row offset into qp_store.
        if (qy > height - 1) qy = height - 1;
        if (qy < 0) qy = 0;

        qy = (qy >> qpsv) * qp_stride;
        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        // Process the line in chunks of 8 * (BLOCKSZ - 1) columns.
        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            if (p->qp)
                // Constant quantizer: one threshold matrix for the whole chunk.
                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                // Per-block quantizer: look it up every 8 columns and rebuild
                // the threshold matrix only when the value changes.
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same

                    if (t < 0) t = 0;                   //t always < width-2

                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            // Move the tail of both buffers to the front for the next chunk.
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // Remaining columns of this line (fewer than a full chunk).
        es = width + 8 - x0; //  8, ...
        if (es > 8)
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
        p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        // Every time y1 reaches a non-zero multiple of 8, a band of 8 output
        // lines is written; the two store functions alternate with bit 3 of y1.
        if (!(y1 & 7) && y1) {
            if (y1 & 8)
                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
                               dst_stride, stride, width, 8, 5 - p->log2_count);
            else
                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
                                dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    // Write the remaining (y & 7) lines that did not fill a whole band.
    if (y & 7) {  // height % 8 != 0
        if (y & 8)
            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
                           dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
}
 245
/*
 * Column pass: forward DCT, thresholding and inverse DCT in one go.
 *
 * For every column the 8 input values (spaced DCTSIZE apart in data) are
 * transformed, each coefficient is passed through THRESHOLD() with the
 * matching entry of the threshold matrix, and the inverse transform is
 * accumulated into output.
 *
 * thr_adr  threshold matrix; threshold[k * 8] is used for coefficient k
 *          (k = 0..7) and the pointer advances by one per column
 * data     input coefficients (written by row_fdct)
 * output   accumulator; rows 0..5 are added to, rows 6 and 7 are overwritten
 * cnt      amount of work; each outer iteration handles DCTSIZE columns and
 *          decrements cnt by 2
 *
 * NOTE(review): THRESHOLD, MULTIPLY16H and the FIX_* constants are defined
 * outside this file; THRESHOLD presumably zeroes coefficients whose magnitude
 * is below the threshold -- confirm in vf_fspp.h.
 */
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *dataptr;
    int16_t *wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        threshold = (int16_t *)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];

            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];

            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];

            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];

            // Even part of FDCT

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT
            // (tmp0..tmp3 are reused from here on for the thresholded coefficients)

            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            tmp0 += 2; // rounding offset for the >> 2 below
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT

            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            // The SIMD version takes a shortcut here when tmp5, tmp6 and tmp7 are all 0.
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) << 1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) << 1;

            tmp7  = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            // Rows 0..5 accumulate; rows 6 and 7 are plain stores.
            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr += 8; //skip each second start pos
        wsptr   += 8;
    }
}
 368
 369 static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
 370 {
 371     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 372     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 373     int_simd16_t z5, z10, z11, z12, z13;
 374     int16_t *outptr;
 375     int16_t *wsptr;
 376
 377     cnt *= 4;
 378     wsptr = workspace;
 379     outptr = output_adr;
 380     for (; cnt > 0; cnt--) {
 381         // Even part
 382         //Simd version reads 4x4 block and transposes it
 383         tmp10 = wsptr[2] +  wsptr[3];
 384         tmp11 = wsptr[2] -  wsptr[3];
 385
 386         tmp13 = wsptr[0] +  wsptr[1];
 387         tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
 388
 389         tmp0 = tmp10 + tmp13; //->temps
 390         tmp3 = tmp10 - tmp13; //->temps
 391         tmp1 = tmp11 + tmp12;
 392         tmp2 = tmp11 - tmp12;
 393
 394         // Odd part
 395         //Also transpose, with previous:
 396         // ---- ----      ||||
 397         // ---- ---- idct ||||
 398         // ---- ---- ---> ||||
 399         // ---- ----      ||||
 400         z13 = wsptr[4] + wsptr[5];
 401         z10 = wsptr[4] - wsptr[5];
 402         z11 = wsptr[6] + wsptr[7];
 403         z12 = wsptr[6] - wsptr[7];
 404
 405         tmp7 = z11 + z13;
 406         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
 407
 408         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
 409         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
 410         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
 411
 412         tmp6 = (tmp12 << 3) - tmp7;
 413         tmp5 = (tmp11 << 3) - tmp6;
 414         tmp4 = (tmp10 << 3) + tmp5;
 415
 416         // Final output stage: descale and write column
 417         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
 418         outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
 419         outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
 420         outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
 421         outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
 422         outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
 423         outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
 424         outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
 425         outptr++;
 426
 427         wsptr += DCTSIZE;       // advance pointer to next row
 428     }
 429 }
 430
 431 static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
 432 {
 433     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 434     int_simd16_t tmp10, tmp11, tmp12, tmp13;
 435     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
 436     int16_t *dataptr;
 437
 438     cnt *= 4;
 439     // Pass 1: process rows.
 440
 441     dataptr = data;
 442     for (; cnt > 0; cnt--) {
 443         tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
 444         tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
 445         tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
 446         tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
 447         tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
 448         tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
 449         tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
 450         tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
 451
 452         // Even part
 453
 454         tmp10 = tmp0 + tmp3;
 455         tmp13 = tmp0 - tmp3;
 456         tmp11 = tmp1 + tmp2;
 457         tmp12 = tmp1 - tmp2;
 458         //Even columns are written first, this leads to different order of columns
 459         //in column_fidct(), but they are processed independently, so all ok.
 460         //Later in the row_idct() columns readed at the same order.
 461         dataptr[2] = tmp10 + tmp11;
 462         dataptr[3] = tmp10 - tmp11;
 463
 464         z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
 465         dataptr[0] = tmp13 + z1;
 466         dataptr[1] = tmp13 - z1;
 467
 468         // Odd part
 469
 470         tmp10 = (tmp4 + tmp5) << 2;
 471         tmp11 = (tmp5 + tmp6) << 2;
 472         tmp12 = (tmp6 + tmp7) << 2;
 473
 474         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
 475         z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
 476         z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
 477         z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
 478
 479         z11 = tmp7 + z3;
 480         z13 = tmp7 - z3;
 481
 482         dataptr[4] = z13 + z2;
 483         dataptr[5] = z13 - z2;
 484         dataptr[6] = z11 + z4;
 485         dataptr[7] = z11 - z4;
 486
 487         pixels++;               // advance pointer to next column
 488         dataptr += DCTSIZE;
 489     }
 490 }
 491
 492 static int query_formats(AVFilterContext *ctx)
 493 {
 494     static const enum PixelFormat pix_fmts[] = {
 495         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 496         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 497         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 498         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 499         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 500         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
 501         AV_PIX_FMT_NONE
 502     };
 503     ff_set_common_formats(ctx, ff_make_format_list(pix_fmts));
 504     return 0;
 505 }
 506
 507 static int config_input(AVFilterLink *inlink)
 508 {
 509     AVFilterContext *ctx = inlink->dst;
 510     FSPPContext *fspp = ctx->priv;
 511     const int h = FFALIGN(inlink->h + 16, 16);
 512     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 513
 514     fspp->hsub = desc->log2_chroma_w;
 515     fspp->vsub = desc->log2_chroma_h;
 516
 517     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
 518     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
 519     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
 520
 521     if (!fspp->temp || !fspp->src)
 522         return AVERROR(ENOMEM);
 523
 524     if (!fspp->use_bframe_qp && !fspp->qp) {
 525         fspp->non_b_qp_alloc_size = FF_CEIL_RSHIFT(inlink->w, 4) * FF_CEIL_RSHIFT(inlink->h, 4);
 526         fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
 527         if (!fspp->non_b_qp_table)
 528             return AVERROR(ENOMEM);
 529     }
 530
 531     fspp->store_slice  = store_slice_c;
 532     fspp->store_slice2 = store_slice2_c;
 533     fspp->mul_thrmat   = mul_thrmat_c;
 534     fspp->column_fidct = column_fidct_c;
 535     fspp->row_idct     = row_idct_c;
 536     fspp->row_fdct     = row_fdct_c;
 537
 538     if (ARCH_X86)
 539         ff_fspp_init_x86(fspp);
 540
 541     return 0;
 542 }
 543
 544 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 545 {
 546     AVFilterContext *ctx = inlink->dst;
 547     FSPPContext *fspp = ctx->priv;
 548     AVFilterLink *outlink = ctx->outputs[0];
 549     AVFrame *out = in;
 550
 551     int qp_stride = 0;
 552     uint8_t *qp_table = NULL;
 553     int i, bias;
 554     int custom_threshold_m[64];
 555
 556     bias = (1 << 4) + fspp->strength;
 557
 558     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
 559         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 560
 561     for (i = 0; i < 8; i++) {
 562         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
 563                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
 564                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
 565                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
 566
 567         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
 568                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
 569                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
 570                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
 571     }
 572
 573     if (fspp->qp)
 574         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 575
 576     /* if we are not in a constant user quantizer mode and we don't want to use
 577      * the quantizers from the B-frames (B-frames often have a higher QP), we
 578      * need to save the qp table from the last non B-frame; this is what the
 579      * following code block does */
 580     if (!fspp->qp) {
 581         qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
 582
 583         if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 584             int w, h;
 585
 586             /* if the qp stride is not set, it means the QP are only defined on
 587              * a line basis */
 588            if (!qp_stride) {
 589                 w = FF_CEIL_RSHIFT(inlink->w, 4);
 590                 h = 1;
 591             } else {
 592                 w = qp_stride;
 593                 h = FF_CEIL_RSHIFT(inlink->h, 4);
 594             }
 595             if (w * h > fspp->non_b_qp_alloc_size) {
 596                 int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
 597                 if (ret < 0) {
 598                     fspp->non_b_qp_alloc_size = 0;
 599                     return ret;
 600                 }
 601                 fspp->non_b_qp_alloc_size = w * h;
 602             }
 603
 604             av_assert0(w * h <= fspp->non_b_qp_alloc_size);
 605             memcpy(fspp->non_b_qp_table, qp_table, w * h);
 606         }
 607     }
 608
 609     if (fspp->log2_count && !ctx->is_disabled) {
 610         if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
 611             qp_table = fspp->non_b_qp_table;
 612
 613         if (qp_table || fspp->qp) {
 614             const int cw = FF_CEIL_RSHIFT(inlink->w, fspp->hsub);
 615             const int ch = FF_CEIL_RSHIFT(inlink->h, fspp->vsub);
 616
 617             /* get a new frame if in-place is not possible or if the dimensions
 618              * are not multiple of 8 */
 619             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 620                 const int aligned_w = FFALIGN(inlink->w, 8);
 621                 const int aligned_h = FFALIGN(inlink->h, 8);
 622
 623                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 624                 if (!out) {
 625                     av_frame_free(&in);
 626                     return AVERROR(ENOMEM);
 627                 }
 628                 av_frame_copy_props(out, in);
 629             }
 630
 631             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
 632                    inlink->w, inlink->h, qp_table, qp_stride, 1);
 633             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
 634                    cw,        ch,        qp_table, qp_stride, 0);
 635             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
 636                    cw,        ch,        qp_table, qp_stride, 0);
 637             emms_c();
 638         }
 639     }
 640
 641     if (in != out) {
 642         if (in->data[3])
 643             av_image_copy_plane(out->data[3], out->linesize[3],
 644                                 in ->data[3], in ->linesize[3],
 645                                 inlink->w, inlink->h);
 646         av_frame_free(&in);
 647     }
 648     return ff_filter_frame(outlink, out);
 649 }
 650
 651 static av_cold void uninit(AVFilterContext *ctx)
 652 {
 653     FSPPContext *fspp = ctx->priv;
 654     av_freep(&fspp->temp);
 655     av_freep(&fspp->src);
 656     av_freep(&fspp->non_b_qp_table);
 657 }
 658
/*
 * Input pads: a single video pad. config_input() runs when the link is
 * configured and filter_frame() receives every incoming frame.
 */
static const AVFilterPad fspp_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
        .filter_frame = filter_frame,
    },
    { NULL }
};
 668
/* Output pads: a single video pad without callbacks. */
static const AVFilterPad fspp_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
    },
    { NULL }
};
 676
/*
 * Filter definition for "fspp".
 * The flag AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL goes together with the
 * ctx->is_disabled check in filter_frame(), which skips the filtering step
 * itself when the filter is disabled.
 */
AVFilter ff_vf_fspp = {
    .name            = "fspp",
    .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
    .priv_size       = sizeof(FSPPContext),
    .uninit          = uninit,
    .query_formats   = query_formats,
    .inputs          = fspp_inputs,
    .outputs         = fspp_outputs,
    .priv_class      = &fspp_class,
    .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
};