git.sesse.net Git - ffmpeg/blob - libavfilter/vf_fspp.c

   1 /*
   2  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   3  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
   4  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or modify
   9  * it under the terms of the GNU General Public License as published by
  10  * the Free Software Foundation; either version 2 of the License, or
  11  * (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16  * GNU General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU General Public License along
  19  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  20  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  21  */
  22
  23 /**
  24  * @file
  25  * Fast Simple Post-processing filter
  26  * This implementation is based on an algorithm described in
 * Aria Nosratinia, "Embedded Post-Processing for
 * Enhancement of Compressed Images" (1999)
  29  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
  30  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
  31  * them can be performed once per block, not per pixel. This allows for much
  32  * higher speed.
  33  *
  34  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
  35  * project, and ported by Arwa Arif for FFmpeg.
  36  */
  37
  38 #include "libavutil/avassert.h"
  39 #include "libavutil/imgutils.h"
  40 #include "libavutil/opt.h"
  41 #include "libavutil/pixdesc.h"
  42 #include "internal.h"
  43 #include "vf_fspp.h"
  44
/* Byte offset of field x inside FSPPContext; binds an option to its field. */
#define OFFSET(x) offsetof(FSPPContext, x)
/* Flags applied to every option in the table below. */
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
/*
 * User-settable options of the fspp filter.
 * Each row: name, help text, target field, type, default, minimum, maximum, flags.
 *  - quality       -> log2_count    (default 4, range 4..MAX_LEVEL)
 *  - qp            -> qp            (default 0, range 0..64; non-zero forces a constant quantizer)
 *  - strength      -> strength      (default 0, range -15..32)
 *  - use_bframe_qp -> use_bframe_qp (boolean, default off)
 */
static const AVOption fspp_options[] = {
    { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
    { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
    { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
    { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_BOOL,{.i64 = 0},   0, 1,         FLAGS },
    { NULL } /* table terminator */
};

/* Defines fspp_class (referenced by ff_vf_fspp.priv_class) from fspp_options. */
AVFILTER_DEFINE_CLASS(fspp);
  56
/*
 * 8x8 dither matrix with entries in the range 0..63.
 * store_slice_c() and store_slice2_c() pick the row by the line index within
 * the slice and the column by the position within each group of 8 samples;
 * the entry is shifted right by log2_scale and added before the final
 * down-shift to 8 bits.
 */
DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
    {  0,  48,  12,  60,   3,  51,  15,  63, },
    { 32,  16,  44,  28,  35,  19,  47,  31, },
    {  8,  56,   4,  52,  11,  59,   7,  55, },
    { 40,  24,  36,  20,  43,  27,  39,  23, },
    {  2,  50,  14,  62,   1,  49,  13,  61, },
    { 34,  18,  46,  30,  33,  17,  45,  29, },
    { 10,  58,   6,  54,   9,  57,   5,  53, },
    { 42,  26,  38,  22,  41,  25,  37,  21, },
};
  67
/*
 * Base threshold table (64 entries, laid out as 8 rows of 8).
 * filter_frame() scales every entry by (16 + strength) / 71.0 and packs the
 * result into threshold_mtx_noq.
 */
static const short custom_threshold[64] = {
// The values (the largest is 296) must not be too high:
// large values make the result depend too strongly on the quantizer,
// or may overflow (to be checked), which shows up as flashing.
     71, 296, 295, 237,  71,  40,  38,  19,
    245, 193, 185, 121, 102,  73,  53,  27,
    158, 129, 141, 107,  97,  73,  50,  26,
    102, 116, 109,  98,  82,  66,  45,  23,
     71,  94,  95,  81,  70,  56,  38,  20,
     56,  77,  74,  66,  56,  44,  30,  15,
     38,  53,  50,  45,  38,  30,  21,  11,
     20,  27,  26,  23,  20,  15,  11,   5
};
  81
  82 //This func reads from 1 slice, 1 and clears 0 & 1
  83 static void store_slice_c(uint8_t *dst, int16_t *src,
  84                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
  85                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
  86 {
  87     int y, x;
  88 #define STORE(pos)                                                             \
  89     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
  90     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
  91     if (temp & 0x100) temp = ~(temp >> 31);                                    \
  92     dst[x + pos] = temp;
  93
  94     for (y = 0; y < height; y++) {
  95         const uint8_t *d = dither[y];
  96         for (x = 0; x < width; x += 8) {
  97             int temp;
  98             STORE(0);
  99             STORE(1);
 100             STORE(2);
 101             STORE(3);
 102             STORE(4);
 103             STORE(5);
 104             STORE(6);
 105             STORE(7);
 106         }
 107         src += src_stride;
 108         dst += dst_stride;
 109     }
 110 }
 111
 112 //This func reads from 2 slices, 0 & 2  and clears 2-nd
 113 static void store_slice2_c(uint8_t *dst, int16_t *src,
 114                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
 115                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
 116 {
 117     int y, x;
 118 #define STORE2(pos)                                                                                       \
 119     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
 120     src[x + pos + 16 * src_stride] = 0;                                                                   \
 121     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
 122     dst[x + pos] = temp;
 123
 124     for (y = 0; y < height; y++) {
 125         const uint8_t *d = dither[y];
 126         for (x = 0; x < width; x += 8) {
 127             int temp;
 128             STORE2(0);
 129             STORE2(1);
 130             STORE2(2);
 131             STORE2(3);
 132             STORE2(4);
 133             STORE2(5);
 134             STORE2(6);
 135             STORE2(7);
 136         }
 137         src += src_stride;
 138         dst += dst_stride;
 139     }
 140 }
 141
 142 static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
 143 {
 144     int a;
 145     for (a = 0; a < 64; a++)
 146         thr_adr[a] = q * thr_adr_noq[a];
 147 }
 148
/**
 * Run the post-processing pass over one plane.
 *
 * The plane is copied into p->src with an 8-sample mirrored border on every
 * side. The picture is then walked in vertical steps of (6 - log2_count)
 * lines; for each position the forward transform (row_fdct), the combined
 * threshold/column transform (column_fidct) and the inverse transform
 * (row_idct) accumulate results into p->temp, which is used as a ring of
 * 16 lines (index y & 15). Every 8 finished lines are written to dst by
 * store_slice / store_slice2, alternating between the two.
 *
 * @param p          filter context (work buffers, function pointers, options)
 * @param dst        destination plane; nothing is done if NULL
 * @param src        source plane; nothing is done if NULL
 * @param dst_stride destination line size
 * @param src_stride source line size
 * @param width      plane width in samples
 * @param height     plane height in lines
 * @param qp_store   per-block quantizer table, only read when p->qp is 0
 * @param qp_stride  line size of qp_store
 * @param is_luma    non-zero for the first plane; selects the work-buffer
 *                   stride and disables the chroma shift of QP coordinates
 */
static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
                   int dst_stride, int src_stride,
                   int width, int height,
                   uint8_t *qp_store, int qp_stride, int is_luma)
{
    int x, x0, y, es, qy, t;

    // first plane uses the stride chosen in config_input(); other planes use their own width plus 16 padding samples
    const int stride = is_luma ? p->temp_stride : (width + 16);
    // vertical step between processed positions
    const int step = 6 - p->log2_count;
    // shifts that map sample coordinates to QP table coordinates (reduced by the chroma subsampling for non-luma planes)
    const int qpsh = 4 - p->hsub * !is_luma;
    const int qpsv = 4 - p->vsub * !is_luma;

    // one aligned buffer holding two int16_t work areas: block (transform input) and block3 (accumulated output)
    DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
    int16_t *block  = (int16_t *)block_align;
    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);

    // clears the first 4 * 8 * BLOCKSZ bytes of block3, i.e. only its leading part
    // NOTE(review): presumably sufficient because column_fidct assigns (not accumulates) its last two rows - confirm
    memset(block3, 0, 4 * 8 * BLOCKSZ);

    if (!src || !dst) return;

    // copy the plane into p->src at offset (8, 8) and mirror 8 samples to the left and right of every line
    for (y = 0; y < height; y++) {
        int index = 8 + 8 * stride + y * stride;
        memcpy(p->src + index, src + y * src_stride, width);
        for (x = 0; x < 8; x++) {
            p->src[index         - x - 1] = p->src[index +         x    ];
            p->src[index + width + x    ] = p->src[index + width - x - 1];
        }
    }

    // mirror 8 full lines above the first and below the last picture line
    for (y = 0; y < 8; y++) {
        memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
        memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
    }
    //FIXME (try edge emu)

    // reset accumulator lines 8..23 (width samples each, starting at column 8)
    for (y = 8; y < 24; y++)
        memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));

    for (y = step; y < height + 8; y += step) {    //step= 1,2
        const int y1 = y - 8 + step;                 //l5-7  l4-6;
        qy = y - 4;

        // clamp the QP line to the picture, then convert it to an offset into qp_store
        if (qy > height - 1) qy = height - 1;
        if (qy < 0) qy = 0;

        qy = (qy >> qpsv) * qp_stride;
        // prime the first work unit of this line; the horizontal phase alternates with y (2 - (y&1))
        p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);

        // process the line in strips of 8 * (BLOCKSZ - 1) samples
        for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));

            if (p->qp)
                // constant quantizer: thresholds are fixed, handle the whole strip in one call
                p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
            else
                // per-block quantizer: look up the QP for every 8-sample unit
                for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
                    t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same

                    if (t < 0) t = 0;                   //t always < width-2

                    t = qp_store[qy + (t >> qpsh)];
                    t = ff_norm_qscale(t, p->qscale_type);

                    // rebuild the scaled threshold matrix only when the quantizer changed
                    if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
                    p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
                }
            // accumulate the strip into the ring line (y & 15) of p->temp
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
            // move the trailing unit of both work areas to the front for the next strip
            memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
            memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
        }

        // remaining samples of the line that did not fill a whole strip
        es = width + 8 - x0; //  8, ...
        if (es > 8)
            p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);

        // the tail reuses whatever threshold matrix is current
        p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
        if (es > 3)
            p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);

        // every time y1 reaches a non-zero multiple of 8, write 8 finished lines to dst
        if (!(y1 & 7) && y1) {
            if (y1 & 8)
                p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
                               dst_stride, stride, width, 8, 5 - p->log2_count);
            else
                p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
                                dst_stride, stride, width, 8, 5 - p->log2_count);
        }
    }

    // write the remaining (y & 7) lines of the last, partial slice
    if (y & 7) {  // height % 8 != 0
        if (y & 8)
            p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
                           dst_stride, stride, width, y&7, 5 - p->log2_count);
        else
            p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
                            dst_stride, stride, width, y&7, 5 - p->log2_count);
    }
}
 246
/**
 * Combined column pass: forward DCT, coefficient thresholding and inverse
 * DCT, with the result accumulated into the output workspace.
 *
 * The outer loop consumes cnt in steps of 2. Each step handles DCTSIZE
 * adjacent columns and then advances data and output by another 8 entries,
 * so successive steps start 16 entries apart. Within a column the eight
 * inputs are read DCTSIZE entries apart.
 *
 * Output rows 0..5 are added to the existing contents of output, while rows
 * 6 and 7 are assigned, which overwrites whatever was stored there.
 *
 * @param thr_adr threshold matrix, indexed as threshold[k * 8] with k = 0..7
 *                and advanced by one entry per column
 * @param data    input coefficients (written by row_fdct)
 * @param output  accumulation workspace (later read by row_idct)
 * @param cnt     amount of work; the loop runs while cnt > 0, subtracting 2
 *                per iteration
 *
 * NOTE(review): THRESHOLD, MULTIPLY16H and the FIX_* constants are defined
 * outside this file; THRESHOLD presumably zeroes small coefficients - confirm
 * against vf_fspp.h.
 */
static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
    int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;

    int16_t *dataptr;
    int16_t *wsptr;
    int16_t *threshold;
    int ctr;

    dataptr = data;
    wsptr = output;

    for (; cnt > 0; cnt -= 2) { //start positions
        // restart at the first threshold column for every group of DCTSIZE columns
        threshold = (int16_t *)thr_adr;//threshold_mtx
        for (ctr = DCTSIZE; ctr > 0; ctr--) {
            // Process columns from input, add to output.
            // butterfly stage: sums feed the even part, differences the odd part
            tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
            tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];

            tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
            tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];

            tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
            tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];

            tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
            tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];

            // Even part of FDCT

            tmp10 = tmp0 + tmp3;
            tmp13 = tmp0 - tmp3;
            tmp11 = tmp1 + tmp2;
            tmp12 = tmp1 - tmp2;

            d0 = tmp10 + tmp11;
            d4 = tmp10 - tmp11;

            z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
            d2 = tmp13 + z1;
            d6 = tmp13 - z1;

            // Even part of IDCT

            // apply the per-coefficient thresholds to the even coefficients d0, d2, d4, d6
            THRESHOLD(tmp0, d0, threshold[0 * 8]);
            THRESHOLD(tmp1, d2, threshold[2 * 8]);
            THRESHOLD(tmp2, d4, threshold[4 * 8]);
            THRESHOLD(tmp3, d6, threshold[6 * 8]);
            // bias added before the >> 2 below
            tmp0 += 2;
            tmp10 = (tmp0 + tmp2) >> 2;
            tmp11 = (tmp0 - tmp2) >> 2;

            tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
            tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2

            tmp0 = tmp10 + tmp13; //->temps
            tmp3 = tmp10 - tmp13; //->temps
            tmp1 = tmp11 + tmp12; //->temps
            tmp2 = tmp11 - tmp12; //->temps

            // Odd part of FDCT

            tmp10 = tmp4 + tmp5;
            tmp11 = tmp5 + tmp6;
            tmp12 = tmp6 + tmp7;

            z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
            z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
            z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
            z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);

            z11 = tmp7 + z3;
            z13 = tmp7 - z3;

            d5 = z13 + z2;
            d3 = z13 - z2;
            d1 = z11 + z4;
            d7 = z11 - z4;

            // Odd part of IDCT

            // apply the per-coefficient thresholds to the odd coefficients d1, d3, d5, d7
            THRESHOLD(tmp4, d1, threshold[1 * 8]);
            THRESHOLD(tmp5, d3, threshold[3 * 8]);
            THRESHOLD(tmp6, d5, threshold[5 * 8]);
            THRESHOLD(tmp7, d7, threshold[7 * 8]);

            //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
            z13 = tmp6 + tmp5;
            z10 = (tmp6 - tmp5) << 1;
            z11 = tmp4 + tmp7;
            z12 = (tmp4 - tmp7) << 1;

            tmp7  = (z11 + z13) >> 2; //+2 !
            tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
            z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
            tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
            tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!

            tmp6 = tmp12 - tmp7;
            tmp5 = tmp11 - tmp6;
            tmp4 = tmp10 + tmp5;

            // rows 0..5 accumulate onto earlier results; rows 6 and 7 are overwritten
            wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
            wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
            wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
            wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
            wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
            wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
            wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
            wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
            //
            dataptr++; //next column
            wsptr++;
            threshold++;
        }
        dataptr += 8; //skip each second start pos
        wsptr   += 8;
    }
}
 369
/**
 * Inverse transform pass that turns workspace entries into accumulated
 * output samples.
 *
 * Runs cnt * 4 iterations. Each iteration reads DCTSIZE consecutive values
 * from the workspace and adds eight results to the output, spaced
 * output_stride entries apart; the output pointer then moves one entry
 * sideways and the workspace pointer moves on by DCTSIZE.
 *
 * The workspace is read in the permuted order that row_fdct_c() writes
 * (even coefficients in entries 0..3, odd ones in entries 4..7).
 *
 * @param workspace     input values (produced by column_fidct)
 * @param output_adr    first output sample to accumulate into
 * @param output_stride distance between two vertically adjacent output samples
 * @param cnt           number of groups of 4 output columns to produce
 *
 * NOTE(review): DESCALE, MULTIPLY16H and the FIX_* constants are defined
 * outside this file - confirm their rounding behaviour against vf_fspp.h.
 */
static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z5, z10, z11, z12, z13;
    int16_t *outptr;
    int16_t *wsptr;

    // cnt is given in groups of 4 output columns
    cnt *= 4;
    wsptr = workspace;
    outptr = output_adr;
    for (; cnt > 0; cnt--) {
        // Even part
        //Simd version reads 4x4 block and transposes it
        tmp10 = wsptr[2] +  wsptr[3];
        tmp11 = wsptr[2] -  wsptr[3];

        tmp13 = wsptr[0] +  wsptr[1];
        tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow

        tmp0 = tmp10 + tmp13; //->temps
        tmp3 = tmp10 - tmp13; //->temps
        tmp1 = tmp11 + tmp12;
        tmp2 = tmp11 - tmp12;

        // Odd part
        //Also transpose, with previous:
        // ---- ----      ||||
        // ---- ---- idct ||||
        // ---- ---- ---> ||||
        // ---- ----      ||||
        z13 = wsptr[4] + wsptr[5];
        z10 = wsptr[4] - wsptr[5];
        z11 = wsptr[6] + wsptr[7];
        z12 = wsptr[6] - wsptr[7];

        tmp7 = z11 + z13;
        tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);

        z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
        tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
        tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_

        tmp6 = (tmp12 << 3) - tmp7;
        tmp5 = (tmp11 << 3) - tmp6;
        tmp4 = (tmp10 << 3) + tmp5;

        // Final output stage: descale and write column
        // every row is accumulated (+=), including rows 6 and 7
        outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
        outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
        outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
        outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
        outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
        outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
        outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
        outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
        outptr++;

        wsptr += DCTSIZE;       // advance pointer to next row
    }
}
 431
/**
 * Forward transform pass from 8-bit pixels to 16-bit coefficients.
 *
 * Runs cnt * 4 iterations. Each iteration reads eight pixels spaced
 * line_size bytes apart, starting at the current pixel, and writes DCTSIZE
 * coefficients to data. The pixel pointer then moves one byte sideways and
 * the data pointer moves on by DCTSIZE.
 *
 * The coefficients are stored in a permuted order: the even-part results go
 * to entries 0..3 (in the order written below) and the odd-part results to
 * entries 4..7. column_fidct_c() and row_idct_c() rely on that same order.
 *
 * @param data      destination for cnt * 4 * DCTSIZE coefficients
 * @param pixels    top pixel of the first column to transform
 * @param line_size distance in bytes between vertically adjacent pixels
 * @param cnt       number of groups of 4 pixel columns to transform
 *
 * NOTE(review): MULTIPLY16H and the FIX_* constants are defined outside this
 * file - confirm their scaling against vf_fspp.h.
 */
static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
{
    int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    int_simd16_t tmp10, tmp11, tmp12, tmp13;
    int_simd16_t z1, z2, z3, z4, z5, z11, z13;
    int16_t *dataptr;

    // cnt is given in groups of 4 pixel columns
    cnt *= 4;
    // Pass 1: process rows.

    dataptr = data;
    for (; cnt > 0; cnt--) {
        // butterfly stage over the 8 vertically adjacent pixels
        tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
        tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
        tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
        tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
        tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
        tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
        tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
        tmp4 = pixels[line_size * 3] - pixels[line_size * 4];

        // Even part

        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;
        //Even columns are written first, this leads to different order of columns
        //in column_fidct(), but they are processed independently, so all ok.
        //Later in the row_idct() columns are read in the same order.
        dataptr[2] = tmp10 + tmp11;
        dataptr[3] = tmp10 - tmp11;

        z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
        dataptr[0] = tmp13 + z1;
        dataptr[1] = tmp13 - z1;

        // Odd part

        tmp10 = (tmp4 + tmp5) << 2;
        tmp11 = (tmp5 + tmp6) << 2;
        tmp12 = (tmp6 + tmp7) << 2;

        z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
        z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
        z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
        z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);

        z11 = tmp7 + z3;
        z13 = tmp7 - z3;

        dataptr[4] = z13 + z2;
        dataptr[5] = z13 - z2;
        dataptr[6] = z11 + z4;
        dataptr[7] = z11 - z4;

        pixels++;               // advance pointer to next column
        dataptr += DCTSIZE;
    }
}
 492
 493 static int query_formats(AVFilterContext *ctx)
 494 {
 495     static const enum AVPixelFormat pix_fmts[] = {
 496         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
 497         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
 498         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
 499         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
 500         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
 501         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
 502         AV_PIX_FMT_NONE
 503     };
 504
 505     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
 506     if (!fmts_list)
 507         return AVERROR(ENOMEM);
 508     return ff_set_common_formats(ctx, fmts_list);
 509 }
 510
 511 static int config_input(AVFilterLink *inlink)
 512 {
 513     AVFilterContext *ctx = inlink->dst;
 514     FSPPContext *fspp = ctx->priv;
 515     const int h = FFALIGN(inlink->h + 16, 16);
 516     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
 517
 518     fspp->hsub = desc->log2_chroma_w;
 519     fspp->vsub = desc->log2_chroma_h;
 520
 521     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
 522     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
 523     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
 524
 525     if (!fspp->temp || !fspp->src)
 526         return AVERROR(ENOMEM);
 527
 528     if (!fspp->use_bframe_qp && !fspp->qp) {
 529         fspp->non_b_qp_alloc_size = AV_CEIL_RSHIFT(inlink->w, 4) * AV_CEIL_RSHIFT(inlink->h, 4);
 530         fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
 531         if (!fspp->non_b_qp_table)
 532             return AVERROR(ENOMEM);
 533     }
 534
 535     fspp->store_slice  = store_slice_c;
 536     fspp->store_slice2 = store_slice2_c;
 537     fspp->mul_thrmat   = mul_thrmat_c;
 538     fspp->column_fidct = column_fidct_c;
 539     fspp->row_idct     = row_idct_c;
 540     fspp->row_fdct     = row_fdct_c;
 541
 542     if (ARCH_X86)
 543         ff_fspp_init_x86(fspp);
 544
 545     return 0;
 546 }
 547
 548 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 549 {
 550     AVFilterContext *ctx = inlink->dst;
 551     FSPPContext *fspp = ctx->priv;
 552     AVFilterLink *outlink = ctx->outputs[0];
 553     AVFrame *out = in;
 554
 555     int qp_stride = 0;
 556     uint8_t *qp_table = NULL;
 557     int i, bias;
 558     int custom_threshold_m[64];
 559
 560     bias = (1 << 4) + fspp->strength;
 561
 562     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
 563         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
 564
 565     for (i = 0; i < 8; i++) {
 566         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
 567                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
 568                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
 569                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
 570
 571         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
 572                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
 573                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
 574                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
 575     }
 576
 577     if (fspp->qp)
 578         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
 579
 580     /* if we are not in a constant user quantizer mode and we don't want to use
 581      * the quantizers from the B-frames (B-frames often have a higher QP), we
 582      * need to save the qp table from the last non B-frame; this is what the
 583      * following code block does */
 584     if (!fspp->qp) {
 585         qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
 586
 587         if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
 588             int w, h;
 589
 590             /* if the qp stride is not set, it means the QP are only defined on
 591              * a line basis */
 592            if (!qp_stride) {
 593                 w = AV_CEIL_RSHIFT(inlink->w, 4);
 594                 h = 1;
 595             } else {
 596                 w = qp_stride;
 597                 h = AV_CEIL_RSHIFT(inlink->h, 4);
 598             }
 599             if (w * h > fspp->non_b_qp_alloc_size) {
 600                 int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
 601                 if (ret < 0) {
 602                     fspp->non_b_qp_alloc_size = 0;
 603                     return ret;
 604                 }
 605                 fspp->non_b_qp_alloc_size = w * h;
 606             }
 607
 608             av_assert0(w * h <= fspp->non_b_qp_alloc_size);
 609             memcpy(fspp->non_b_qp_table, qp_table, w * h);
 610         }
 611     }
 612
 613     if (fspp->log2_count && !ctx->is_disabled) {
 614         if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
 615             qp_table = fspp->non_b_qp_table;
 616
 617         if (qp_table || fspp->qp) {
 618             const int cw = AV_CEIL_RSHIFT(inlink->w, fspp->hsub);
 619             const int ch = AV_CEIL_RSHIFT(inlink->h, fspp->vsub);
 620
 621             /* get a new frame if in-place is not possible or if the dimensions
 622              * are not multiple of 8 */
 623             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
 624                 const int aligned_w = FFALIGN(inlink->w, 8);
 625                 const int aligned_h = FFALIGN(inlink->h, 8);
 626
 627                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
 628                 if (!out) {
 629                     av_frame_free(&in);
 630                     return AVERROR(ENOMEM);
 631                 }
 632                 av_frame_copy_props(out, in);
 633                 out->width = in->width;
 634                 out->height = in->height;
 635             }
 636
 637             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
 638                    inlink->w, inlink->h, qp_table, qp_stride, 1);
 639             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
 640                    cw,        ch,        qp_table, qp_stride, 0);
 641             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
 642                    cw,        ch,        qp_table, qp_stride, 0);
 643             emms_c();
 644         }
 645     }
 646
 647     if (in != out) {
 648         if (in->data[3])
 649             av_image_copy_plane(out->data[3], out->linesize[3],
 650                                 in ->data[3], in ->linesize[3],
 651                                 inlink->w, inlink->h);
 652         av_frame_free(&in);
 653     }
 654     return ff_filter_frame(outlink, out);
 655 }
 656
 657 static av_cold void uninit(AVFilterContext *ctx)
 658 {
 659     FSPPContext *fspp = ctx->priv;
 660     av_freep(&fspp->temp);
 661     av_freep(&fspp->src);
 662     av_freep(&fspp->non_b_qp_table);
 663 }
 664
 665 static const AVFilterPad fspp_inputs[] = {
 666     {
 667         .name         = "default",
 668         .type         = AVMEDIA_TYPE_VIDEO,
 669         .config_props = config_input,
 670         .filter_frame = filter_frame,
 671     },
 672     { NULL }
 673 };
 674
 675 static const AVFilterPad fspp_outputs[] = {
 676     {
 677         .name = "default",
 678         .type = AVMEDIA_TYPE_VIDEO,
 679     },
 680     { NULL }
 681 };
 682
 683 AVFilter ff_vf_fspp = {
 684     .name            = "fspp",
 685     .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
 686     .priv_size       = sizeof(FSPPContext),
 687     .uninit          = uninit,
 688     .query_formats   = query_formats,
 689     .inputs          = fspp_inputs,
 690     .outputs         = fspp_outputs,
 691     .priv_class      = &fspp_class,
 692     .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
 693 };