git.sesse.net Git - ffmpeg/blob - libavfilter/vf_nnedi.c

   1 /*
   2  * Copyright (C) 2010-2011 Kevin Stone
   3  * Copyright (C) 2016 Paul B Mahol
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License along
  18  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 #include <float.h>
  23
  24 #include "libavutil/common.h"
  25 #include "libavutil/float_dsp.h"
  26 #include "libavutil/imgutils.h"
  27 #include "libavutil/mem_internal.h"
  28 #include "libavutil/opt.h"
  29 #include "libavutil/pixdesc.h"
  30 #include "avfilter.h"
  31 #include "formats.h"
  32 #include "internal.h"
  33 #include "video.h"
  34
  35 static const size_t NNEDI_WEIGHTS_SIZE = 13574928;
  36 static const uint8_t NNEDI_XDIM[] = { 8, 16, 32, 48, 8, 16, 32 };
  37 static const uint8_t NNEDI_YDIM[] = { 6, 6, 6, 6, 4, 4, 4 };
  38 static const uint16_t NNEDI_NNS[] = { 16, 32, 64, 128, 256 };
  39
  40 typedef struct PrescreenerCoefficients {
  41     DECLARE_ALIGNED(32, float, kernel_l0)[4][16 * 4];
  42     DECLARE_ALIGNED(32, float, bias_l0)[4];
  43
  44     DECLARE_ALIGNED(32, float, kernel_l1)[4][4];
  45     DECLARE_ALIGNED(32, float, bias_l1)[4];
  46
  47     DECLARE_ALIGNED(32, float, kernel_l2)[4][8];
  48     DECLARE_ALIGNED(32, float, bias_l2)[4];
  49 } PrescreenerCoefficients;
  50
  51 typedef struct PredictorCoefficients {
  52     int xdim, ydim, nns, nsize;
  53     float *data;
  54     float *softmax_q1;
  55     float *elliott_q1;
  56     float *softmax_bias_q1;
  57     float *elliott_bias_q1;
  58     float *softmax_q2;
  59     float *elliott_q2;
  60     float *softmax_bias_q2;
  61     float *elliott_bias_q2;
  62 } PredictorCoefficients;
  63
  64 typedef struct NNEDIContext {
  65     const AVClass *class;
  66
  67     char *weights_file;
  68
  69     AVFrame *src;
  70     AVFrame *second;
  71     AVFrame *dst;
  72     int eof;
  73     int64_t cur_pts;
  74
  75     AVFloatDSPContext *fdsp;
  76     int depth;
  77     int nb_planes;
  78     int nb_threads;
  79     int linesize[4];
  80     int planewidth[4];
  81     int planeheight[4];
  82     int field_n;
  83
  84     PrescreenerCoefficients prescreener[4];
  85     PredictorCoefficients coeffs[2][5][7];
  86
  87     float half;
  88     float in_scale;
  89     float out_scale;
  90
  91     // Parameters
  92     int deint;
  93     int field;
  94     int process_plane;
  95     int nsize;
  96     int nnsparam;
  97     int qual;
  98     int etype;
  99     int pscrn;
 100
 101     int input_size;
 102     uint8_t **prescreen_buf;
 103     float **input_buf;
 104     float **output_buf;
 105
 106     void (*read)(const uint8_t *src, float *dst,
 107                  int src_stride, int dst_stride,
 108                  int width, int height, float scale);
 109     void (*write)(const float *src, uint8_t *dst,
 110                   int src_stride, int dst_stride,
 111                   int width, int height, int depth, float scale);
 112     void (*prescreen[2])(AVFilterContext *ctx,
 113                          const void *src, ptrdiff_t src_stride,
 114                          uint8_t *prescreen, int N,
 115                          const PrescreenerCoefficients *const coeffs);
 116 } NNEDIContext;
 117
 118 #define OFFSET(x) offsetof(NNEDIContext, x)
 119 #define RFLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
 120 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
 121
 122 static const AVOption nnedi_options[] = {
 123     {"weights",  "set weights file", OFFSET(weights_file),  AV_OPT_TYPE_STRING, {.str="nnedi3_weights.bin"}, 0, 0, FLAGS },
 124     {"deint",         "set which frames to deinterlace", OFFSET(deint),         AV_OPT_TYPE_INT, {.i64=0}, 0, 1, RFLAGS, "deint" },
 125         {"all",        "deinterlace all frames",                       0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "deint" },
 126         {"interlaced", "only deinterlace frames marked as interlaced", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "deint" },
 127     {"field",  "set mode of operation", OFFSET(field),         AV_OPT_TYPE_INT, {.i64=-1}, -2, 3, RFLAGS, "field" },
 128         {"af", "use frame flags, both fields",  0, AV_OPT_TYPE_CONST, {.i64=-2}, 0, 0, RFLAGS, "field" },
 129         {"a",  "use frame flags, single field", 0, AV_OPT_TYPE_CONST, {.i64=-1}, 0, 0, RFLAGS, "field" },
 130         {"t",  "use top field only",            0, AV_OPT_TYPE_CONST, {.i64=0},  0, 0, RFLAGS, "field" },
 131         {"b",  "use bottom field only",         0, AV_OPT_TYPE_CONST, {.i64=1},  0, 0, RFLAGS, "field" },
 132         {"tf", "use both fields, top first",    0, AV_OPT_TYPE_CONST, {.i64=2},  0, 0, RFLAGS, "field" },
 133         {"bf", "use both fields, bottom first", 0, AV_OPT_TYPE_CONST, {.i64=3},  0, 0, RFLAGS, "field" },
 134     {"planes", "set which planes to process", OFFSET(process_plane), AV_OPT_TYPE_INT, {.i64=7}, 0, 15, RFLAGS },
 135     {"nsize",  "set size of local neighborhood around each pixel, used by the predictor neural network", OFFSET(nsize), AV_OPT_TYPE_INT, {.i64=6}, 0, 6, RFLAGS, "nsize" },
 136         {"s8x6",     NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "nsize" },
 137         {"s16x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "nsize" },
 138         {"s32x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "nsize" },
 139         {"s48x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "nsize" },
 140         {"s8x4",     NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "nsize" },
 141         {"s16x4",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=5}, 0, 0, RFLAGS, "nsize" },
 142         {"s32x4",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=6}, 0, 0, RFLAGS, "nsize" },
 143     {"nns",    "set number of neurons in predictor neural network", OFFSET(nnsparam), AV_OPT_TYPE_INT, {.i64=1}, 0, 4, RFLAGS, "nns" },
 144         {"n16",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "nns" },
 145         {"n32",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "nns" },
 146         {"n64",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "nns" },
 147         {"n128",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "nns" },
 148         {"n256",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "nns" },
 149     {"qual",  "set quality", OFFSET(qual), AV_OPT_TYPE_INT, {.i64=1}, 1, 2, RFLAGS, "qual" },
 150         {"fast", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "qual" },
 151         {"slow", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "qual" },
 152     {"etype", "set which set of weights to use in the predictor", OFFSET(etype), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, RFLAGS, "etype" },
 153         {"a",  "weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "etype" },
 154         {"abs","weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "etype" },
 155         {"s",  "weights trained to minimize squared error",  0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "etype" },
 156         {"mse","weights trained to minimize squared error",  0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "etype" },
 157     {"pscrn", "set prescreening", OFFSET(pscrn), AV_OPT_TYPE_INT, {.i64=2}, 0, 4, RFLAGS, "pscrn" },
 158         {"none",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "pscrn" },
 159         {"original",  NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "pscrn" },
 160         {"new",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "pscrn" },
 161         {"new2",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "pscrn" },
 162         {"new3",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "pscrn" },
 163     { NULL }
 164 };
 165
 166 AVFILTER_DEFINE_CLASS(nnedi);
 167
 168 static int config_output(AVFilterLink *outlink)
 169 {
 170     AVFilterContext *ctx = outlink->src;
 171
 172     outlink->time_base.num = ctx->inputs[0]->time_base.num;
 173     outlink->time_base.den = ctx->inputs[0]->time_base.den * 2;
 174     outlink->w             = ctx->inputs[0]->w;
 175     outlink->h             = ctx->inputs[0]->h;
 176
 177     outlink->frame_rate = av_mul_q(ctx->inputs[0]->frame_rate,
 178                                    (AVRational){2, 1});
 179
 180     return 0;
 181 }
 182
 183 static int query_formats(AVFilterContext *ctx)
 184 {
 185     static const enum AVPixelFormat pix_fmts[] = {
 186         AV_PIX_FMT_GRAY8,
 187         AV_PIX_FMT_GRAY9, AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, AV_PIX_FMT_GRAY14, AV_PIX_FMT_GRAY16,
 188         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
 189         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
 190         AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
 191         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
 192         AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
 193         AV_PIX_FMT_YUVJ411P,
 194         AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P,
 195         AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP,
 196         AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9,
 197         AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
 198         AV_PIX_FMT_YUV440P10,
 199         AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
 200         AV_PIX_FMT_YUV440P12,
 201         AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
 202         AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
 203         AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
 204         AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_YUVA444P16,
 205         AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P12, AV_PIX_FMT_YUVA422P16,
 206         AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16,
 207         AV_PIX_FMT_GBRAP10,   AV_PIX_FMT_GBRAP12,    AV_PIX_FMT_GBRAP16,
 208         AV_PIX_FMT_NONE
 209     };
 210
 211     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
 212     if (!fmts_list)
 213         return AVERROR(ENOMEM);
 214     return ff_set_common_formats(ctx, fmts_list);
 215 }
 216
 217 static float dot_dsp(const NNEDIContext *const s, const float *kernel, const float *input,
 218                      int n, float scale, float bias)
 219 {
 220     float sum;
 221
 222     sum = s->fdsp->scalarproduct_float(kernel, input, n);
 223
 224     return sum * scale + bias;
 225 }
 226
 227 static float elliott(float x)
 228 {
 229     return x / (1.0f + fabsf(x));
 230 }
 231
 232 static void transform_elliott(float *input, int size)
 233 {
 234     for (int i = 0; i < size; i++)
 235         input[i] = elliott(input[i]);
 236 }
 237
 238 static void process_old(AVFilterContext *ctx,
 239                         const void *src, ptrdiff_t src_stride,
 240                         uint8_t *prescreen, int N,
 241                         const PrescreenerCoefficients *const m_data)
 242 {
 243     NNEDIContext *s = ctx->priv;
 244     const float *src_p = src;
 245
 246     // Adjust source pointer to point to top-left of filter window.
 247     const float *window = src_p - 2 * src_stride - 5;
 248
 249     for (int j = 0; j < N; j++) {
 250         LOCAL_ALIGNED_32(float, input, [48]);
 251         float state[12];
 252
 253         for (int i = 0; i < 4; i++)
 254             memcpy(input + i * 12, window + i * src_stride + j, 12 * sizeof(float));
 255
 256         // Layer 0.
 257         for (int n = 0; n < 4; n++)
 258             state[n] = dot_dsp(s, m_data->kernel_l0[n], input, 48, 1.0f, m_data->bias_l0[n]);
 259         transform_elliott(state + 1, 3);
 260
 261         // Layer 1.
 262         for (int n = 0; n < 4; n++)
 263             state[n + 4] = dot_dsp(s, m_data->kernel_l1[n], state, 4, 1.0f, m_data->bias_l1[n]);
 264         transform_elliott(state + 4, 3);
 265
 266         // Layer 2.
 267         for (int n = 0; n < 4; n++)
 268             state[n + 8] = dot_dsp(s, m_data->kernel_l2[n], state, 8, 1.0f, m_data->bias_l2[n]);
 269
 270         prescreen[j] = FFMAX(state[10], state[11]) <= FFMAX(state[8], state[9]) ? 255 : 0;
 271     }
 272 }
 273
 274 static void process_new(AVFilterContext *ctx,
 275                         const void *src, ptrdiff_t src_stride,
 276                         uint8_t *prescreen, int N,
 277                         const PrescreenerCoefficients *const m_data)
 278 {
 279     NNEDIContext *s = ctx->priv;
 280     const float *src_p = src;
 281
 282     // Adjust source pointer to point to top-left of filter window.
 283     const float *window = src_p - 2 * src_stride - 6;
 284
 285     for (int j = 0; j < N; j += 4) {
 286         LOCAL_ALIGNED_32(float, input, [64]);
 287         float state[8];
 288
 289         for (int i = 0; i < 4; i++)
 290             memcpy(input + i * 16, window + i * src_stride + j, 16 * sizeof(float));
 291
 292         for (int n = 0; n < 4; n++)
 293             state[n] = dot_dsp(s, m_data->kernel_l0[n], input, 64, 1.0f, m_data->bias_l0[n]);
 294         transform_elliott(state, 4);
 295
 296         for (int n = 0; n < 4; n++)
 297             state[n + 4] = dot_dsp(s, m_data->kernel_l1[n], state, 4, 1.0f, m_data->bias_l1[n]);
 298
 299         for (int n = 0; n < 4; n++)
 300             prescreen[j + n] = state[n + 4] > 0.f;
 301     }
 302 }
 303
 304 static int filter_offset(int nn, const PredictorCoefficients *const model)
 305 {
 306     return nn * model->nsize;
 307 }
 308
 309 static const float *softmax_q1_filter(int nn,
 310                                       const PredictorCoefficients *const model)
 311 {
 312     return model->softmax_q1 + filter_offset(nn, model);
 313 }
 314
 315 static const float *elliott_q1_filter(int nn,
 316                                       const PredictorCoefficients *const model)
 317 {
 318     return model->elliott_q1 + filter_offset(nn, model);
 319 }
 320
 321 static const float *softmax_q2_filter(int nn,
 322                                       const PredictorCoefficients *const model)
 323 {
 324     return model->softmax_q2 + filter_offset(nn, model);
 325 }
 326
 327 static const float *elliott_q2_filter(int nn,
 328                                       const PredictorCoefficients *const model)
 329 {
 330     return model->elliott_q2 + filter_offset(nn, model);
 331 }
 332
 333 static void gather_input(const float *src, ptrdiff_t src_stride,
 334                          float *buf, float mstd[4],
 335                          const PredictorCoefficients *const model)
 336 {
 337     float sum = 0.f;
 338     float sum_sq = 0.f;
 339     float tmp;
 340
 341     for (int i = 0; i < model->ydim; i++) {
 342         memcpy(buf, src, model->xdim * sizeof(float));
 343
 344         for (int j = 0; j < model->xdim; j++) {
 345             const float val = src[j];
 346
 347             sum += val;
 348             sum_sq += val * val;
 349         }
 350
 351         src += src_stride;
 352         buf += model->xdim;
 353     }
 354
 355     mstd[0] = sum / model->nsize;
 356     mstd[3] = 0.f;
 357
 358     tmp = sum_sq / model->nsize - mstd[0] * mstd[0];
 359     if (tmp < FLT_EPSILON) {
 360         mstd[1] = 0.0f;
 361         mstd[2] = 0.0f;
 362     } else {
 363         mstd[1] = sqrtf(tmp);
 364         mstd[2] = 1.0f / mstd[1];
 365     }
 366 }
 367
 368 static float softmax_exp(float x)
 369 {
 370     return expf(av_clipf(x, -80.f, 80.f));
 371 }
 372
 373 static void transform_softmax_exp(float *input, int size)
 374 {
 375     for (int i = 0; i < size; i++)
 376         input[i] = softmax_exp(input[i]);
 377 }
 378
 379 static void wae5(const float *softmax, const float *el,
 380                  int n, float mstd[4])
 381 {
 382     float vsum = 0.0f, wsum = 0.0f;
 383
 384     for (int i = 0; i < n; i++) {
 385         vsum += softmax[i] * elliott(el[i]);
 386         wsum += softmax[i];
 387     }
 388
 389     if (wsum > 1e-10f)
 390         mstd[3] += (5.0f * vsum) / wsum * mstd[1] + mstd[0];
 391     else
 392         mstd[3] += mstd[0];
 393 }
 394
 395 static void predictor(AVFilterContext *ctx,
 396                       const void *src, ptrdiff_t src_stride, void *dst,
 397                       const uint8_t *prescreen, int N,
 398                       const PredictorCoefficients *const model, int use_q2)
 399 {
 400     const NNEDIContext *const s = ctx->priv;
 401     const float *src_p = src;
 402     float *dst_p = dst;
 403
 404     // Adjust source pointer to point to top-left of filter window.
 405     const float *window = src_p - (model->ydim / 2) * src_stride - (model->xdim / 2 - 1);
 406     const int filter_size = model->nsize;
 407     const int nns = model->nns;
 408
 409     for (int i = 0; i < N; i++) {
 410         LOCAL_ALIGNED_32(float, input, [48 * 6]);
 411         float activation[256 * 2];
 412         float mstd[4];
 413         float scale;
 414
 415         if (prescreen[i])
 416             continue;
 417
 418         gather_input(window + i, src_stride, input, mstd, model);
 419         scale = mstd[2];
 420
 421         for (int nn = 0; nn < nns; nn++)
 422             activation[nn] = dot_dsp(s, softmax_q1_filter(nn, model), input, filter_size, scale, model->softmax_bias_q1[nn]);
 423
 424         for (int nn = 0; nn < nns; nn++)
 425             activation[nns + nn] = dot_dsp(s, elliott_q1_filter(nn, model), input, filter_size, scale, model->elliott_bias_q1[nn]);
 426
 427         transform_softmax_exp(activation, nns);
 428         wae5(activation, activation + nns, nns, mstd);
 429
 430         if (use_q2) {
 431             for (int nn = 0; nn < nns; nn++)
 432                 activation[nn] = dot_dsp(s, softmax_q2_filter(nn, model), input, filter_size, scale, model->softmax_bias_q2[nn]);
 433
 434             for (int nn = 0; nn < nns; nn++)
 435                 activation[nns + nn] = dot_dsp(s, elliott_q2_filter(nn, model), input, filter_size, scale, model->elliott_bias_q2[nn]);
 436
 437             transform_softmax_exp(activation, nns);
 438             wae5(activation, activation + nns, nns, mstd);
 439         }
 440
 441         dst_p[i] = mstd[3] * (use_q2 ? 0.5f : 1.f);
 442     }
 443 }
 444
 445 static void read_bytes(const uint8_t *src, float *dst,
 446                        int src_stride, int dst_stride,
 447                        int width, int height, float scale)
 448 {
 449     for (int y = 0; y < height; y++) {
 450         for (int x = 0; x < 32; x++)
 451             dst[-x - 1] = src[x];
 452
 453         for (int x = 0; x < width; x++)
 454             dst[x] = src[x];
 455
 456         for (int x = 0; x < 32; x++)
 457             dst[width + x] = src[width - x - 1];
 458
 459         dst += dst_stride;
 460         src += src_stride;
 461     }
 462 }
 463
 464 static void read_words(const uint8_t *srcp, float *dst,
 465                        int src_stride, int dst_stride,
 466                        int width, int height, float scale)
 467 {
 468     const uint16_t *src = (const uint16_t *)srcp;
 469
 470     src_stride /= 2;
 471
 472     for (int y = 0; y < height; y++) {
 473         for (int x = 0; x < 32; x++)
 474             dst[-x - 1] = src[x] * scale;
 475
 476         for (int x = 0; x < width; x++)
 477             dst[x] = src[x] * scale;
 478
 479         for (int x = 0; x < 32; x++)
 480             dst[width + x] = src[width - x - 1] * scale;
 481
 482         dst += dst_stride;
 483         src += src_stride;
 484     }
 485 }
 486
 487 static void write_bytes(const float *src, uint8_t *dst,
 488                         int src_stride, int dst_stride,
 489                         int width, int height, int depth,
 490                         float scale)
 491 {
 492     for (int y = 0; y < height; y++) {
 493         for (int x = 0; x < width; x++)
 494             dst[x] = av_clip_uint8(src[x]);
 495
 496         dst += dst_stride;
 497         src += src_stride;
 498     }
 499 }
 500
 501 static void write_words(const float *src, uint8_t *dstp,
 502                         int src_stride, int dst_stride,
 503                         int width, int height, int depth,
 504                         float scale)
 505 {
 506     uint16_t *dst = (uint16_t *)dstp;
 507
 508     dst_stride /= 2;
 509
 510     for (int y = 0; y < height; y++) {
 511         for (int x = 0; x < width; x++)
 512             dst[x] = av_clip_uintp2_c(src[x] * scale, depth);
 513
 514         dst += dst_stride;
 515         src += src_stride;
 516     }
 517 }
 518
 519 static void interpolation(const void *src, ptrdiff_t src_stride,
 520                           void *dst, const uint8_t *prescreen, int n)
 521 {
 522     const float *src_p = src;
 523     float *dst_p = dst;
 524     const float *window = src_p - 2 * src_stride;
 525
 526     for (int i = 0; i < n; i++) {
 527         float accum = 0.0f;
 528
 529         if (!prescreen[i])
 530             continue;
 531
 532         accum += (-3.0f / 32.0f) * window[0 * src_stride + i];
 533         accum += (19.0f / 32.0f) * window[1 * src_stride + i];
 534         accum += (19.0f / 32.0f) * window[2 * src_stride + i];
 535         accum += (-3.0f / 32.0f) * window[3 * src_stride + i];
 536
 537         dst_p[i] = accum;
 538     }
 539 }
 540
 541 static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
 542 {
 543     const NNEDIContext *const s = ctx->priv;
 544     AVFrame *out = s->dst;
 545     AVFrame *in = s->src;
 546     const float in_scale = s->in_scale;
 547     const float out_scale = s->out_scale;
 548     const int depth = s->depth;
 549     const int interlaced = in->interlaced_frame;
 550     const int tff = s->field_n == (s->field < 0 ? interlaced ? in->top_field_first : 1 :
 551                                   (s->field & 1) ^ 1);
 552
 553
 554     for (int p = 0; p < s->nb_planes; p++) {
 555         const int height = s->planeheight[p];
 556         const int width = s->planewidth[p];
 557         const int slice_start = 2 * ((height / 2 * jobnr) / nb_jobs);
 558         const int slice_end = 2 * ((height / 2 * (jobnr+1)) / nb_jobs);
 559         const uint8_t *src_data = in->data[p];
 560         uint8_t *dst_data = out->data[p];
 561         uint8_t *dst = out->data[p] + slice_start * out->linesize[p];
 562         const int src_linesize = in->linesize[p];
 563         const int dst_linesize = out->linesize[p];
 564         uint8_t *prescreen_buf = s->prescreen_buf[jobnr];
 565         float *srcbuf = s->input_buf[jobnr];
 566         const int srcbuf_stride = width + 64;
 567         float *dstbuf = s->output_buf[jobnr];
 568         const int dstbuf_stride = width;
 569         const int slice_height = (slice_end - slice_start) / 2;
 570         const int last_slice = slice_end == height;
 571         const uint8_t *in_line;
 572         uint8_t *out_line;
 573         int y_out;
 574
 575         if (!(s->process_plane & (1 << p))) {
 576             av_image_copy_plane(dst, out->linesize[p],
 577                                 in->data[p] + slice_start * in->linesize[p],
 578                                 in->linesize[p],
 579                                 s->linesize[p], slice_end - slice_start);
 580             continue;
 581         }
 582
 583         y_out    = slice_start + (tff ^ (slice_start & 1));
 584         in_line  = src_data + (y_out * src_linesize);
 585         out_line = dst_data + (y_out * dst_linesize);
 586
 587         while (y_out < slice_end) {
 588             memcpy(out_line, in_line, s->linesize[p]);
 589             y_out += 2;
 590             in_line  += src_linesize * 2;
 591             out_line += dst_linesize * 2;
 592         }
 593
 594         y_out = slice_start + ((!tff) ^ (slice_start & 1));
 595
 596         s->read(src_data + FFMAX(y_out - 5, tff) * src_linesize,
 597                 srcbuf + 32,
 598                 src_linesize * 2, srcbuf_stride,
 599                 width, 1, in_scale);
 600         srcbuf += srcbuf_stride;
 601
 602         s->read(src_data + FFMAX(y_out - 3, tff) * src_linesize,
 603                 srcbuf + 32,
 604                 src_linesize * 2, srcbuf_stride,
 605                 width, 1, in_scale);
 606         srcbuf += srcbuf_stride;
 607
 608         s->read(src_data + FFMAX(y_out - 1, tff) * src_linesize,
 609                 srcbuf + 32,
 610                 src_linesize * 2, srcbuf_stride,
 611                 width, 1, in_scale);
 612         srcbuf += srcbuf_stride;
 613
 614         in_line  = src_data + FFMIN(y_out + 1, height - 1 - !tff) * src_linesize;
 615         out_line = dst_data + (y_out * dst_linesize);
 616
 617         s->read(in_line, srcbuf + 32, src_linesize * 2, srcbuf_stride,
 618                 width, slice_height - last_slice, in_scale);
 619
 620         y_out += (slice_height - last_slice) * 2;
 621
 622         s->read(src_data + FFMIN(y_out + 1, height - 1 - !tff) * src_linesize,
 623                 srcbuf + 32 + srcbuf_stride * (slice_height - last_slice),
 624                 src_linesize * 2, srcbuf_stride,
 625                 width, 1, in_scale);
 626
 627         s->read(src_data + FFMIN(y_out + 3, height - 1 - !tff) * src_linesize,
 628                 srcbuf + 32 + srcbuf_stride * (slice_height + 1 - last_slice),
 629                 src_linesize * 2, srcbuf_stride,
 630                 width, 1, in_scale);
 631
 632         s->read(src_data + FFMIN(y_out + 5, height - 1 - !tff) * src_linesize,
 633                 srcbuf + 32 + srcbuf_stride * (slice_height + 2 - last_slice),
 634                 src_linesize * 2, srcbuf_stride,
 635                 width, 1, in_scale);
 636
 637         for (int y = 0; y < slice_end - slice_start; y += 2) {
 638             if (s->prescreen > 0)
 639                 s->prescreen[s->pscrn > 1](ctx, srcbuf + (y / 2) * srcbuf_stride + 32,
 640                              srcbuf_stride, prescreen_buf, width,
 641                              &s->prescreener[s->pscrn - 1]);
 642
 643             predictor(ctx,
 644                       srcbuf + (y / 2) * srcbuf_stride + 32,
 645                       srcbuf_stride,
 646                       dstbuf + (y / 2) * dstbuf_stride,
 647                       prescreen_buf, width,
 648                       &s->coeffs[s->etype][s->nnsparam][s->nsize], s->qual == 2);
 649
 650             if (s->prescreen > 0)
 651                 interpolation(srcbuf + (y / 2) * srcbuf_stride + 32,
 652                               srcbuf_stride,
 653                               dstbuf + (y / 2) * dstbuf_stride,
 654                               prescreen_buf, width);
 655         }
 656
 657         s->write(dstbuf, out_line, dstbuf_stride, dst_linesize * 2,
 658                  width, slice_height, depth, out_scale);
 659     }
 660
 661     return 0;
 662 }
 663
 664 static int get_frame(AVFilterContext *ctx, int is_second)
 665 {
 666     NNEDIContext *s = ctx->priv;
 667     AVFilterLink *outlink = ctx->outputs[0];
 668     AVFrame *src = s->src;
 669
 670     s->dst = ff_get_video_buffer(outlink, outlink->w, outlink->h);
 671     if (!s->dst)
 672         return AVERROR(ENOMEM);
 673     av_frame_copy_props(s->dst, src);
 674     s->dst->interlaced_frame = 0;
 675
 676     ctx->internal->execute(ctx, filter_slice, NULL, NULL, FFMIN(s->planeheight[1] / 2, s->nb_threads));
 677
 678     if (s->field == -2 || s->field > 1)
 679         s->field_n = !s->field_n;
 680
 681     return 0;
 682 }
 683
 684 static int filter_frame(AVFilterLink *inlink, AVFrame *src)
 685 {
 686     AVFilterContext *ctx = inlink->dst;
 687     AVFilterLink *outlink = ctx->outputs[0];
 688     NNEDIContext *s = ctx->priv;
 689     int ret;
 690
 691     if ((s->field > 1 ||
 692          s->field == -2) && !s->second) {
 693         goto second;
 694     } else if (s->field > 1 ||
 695                s->field == -2) {
 696         AVFrame *dst;
 697
 698         s->src = s->second;
 699         ret = get_frame(ctx, 1);
 700         if (ret < 0) {
 701             av_frame_free(&s->dst);
 702             av_frame_free(&s->second);
 703             s->src = NULL;
 704             return ret;
 705         }
 706         dst = s->dst;
 707
 708         if (src->pts != AV_NOPTS_VALUE &&
 709             dst->pts != AV_NOPTS_VALUE)
 710             dst->pts += src->pts;
 711         else
 712             dst->pts = AV_NOPTS_VALUE;
 713
 714         ret = ff_filter_frame(outlink, dst);
 715         if (ret < 0)
 716             return ret;
 717         if (s->eof)
 718             return 0;
 719         s->cur_pts = s->second->pts;
 720         av_frame_free(&s->second);
 721 second:
 722         if ((s->deint && src->interlaced_frame &&
 723              !ctx->is_disabled) ||
 724             (!s->deint && !ctx->is_disabled)) {
 725             s->second = src;
 726         }
 727     }
 728
 729     if ((s->deint && !src->interlaced_frame) || ctx->is_disabled) {
 730         AVFrame *dst = av_frame_clone(src);
 731         if (!dst) {
 732             av_frame_free(&src);
 733             av_frame_free(&s->second);
 734             return AVERROR(ENOMEM);
 735         }
 736
 737         if (s->field > 1 || s->field == -2) {
 738             av_frame_free(&s->second);
 739             if ((s->deint && src->interlaced_frame) ||
 740                 (!s->deint))
 741                 s->second = src;
 742         } else {
 743             av_frame_free(&src);
 744         }
 745         if (dst->pts != AV_NOPTS_VALUE)
 746             dst->pts *= 2;
 747         return ff_filter_frame(outlink, dst);
 748     }
 749
 750     s->src = src;
 751     ret = get_frame(ctx, 0);
 752     if (ret < 0) {
 753         av_frame_free(&s->dst);
 754         av_frame_free(&s->src);
 755         av_frame_free(&s->second);
 756         return ret;
 757     }
 758
 759     if (src->pts != AV_NOPTS_VALUE)
 760         s->dst->pts = src->pts * 2;
 761     if (s->field <= 1 && s->field > -2) {
 762         av_frame_free(&src);
 763         s->src = NULL;
 764     }
 765
 766     return ff_filter_frame(outlink, s->dst);
 767 }
 768
 769 static int request_frame(AVFilterLink *link)
 770 {
 771     AVFilterContext *ctx = link->src;
 772     NNEDIContext *s = ctx->priv;
 773     int ret;
 774
 775     if (s->eof)
 776         return AVERROR_EOF;
 777
 778     ret  = ff_request_frame(ctx->inputs[0]);
 779
 780     if (ret == AVERROR_EOF && s->second) {
 781         AVFrame *next = av_frame_clone(s->second);
 782
 783         if (!next)
 784             return AVERROR(ENOMEM);
 785
 786         next->pts = s->second->pts * 2 - s->cur_pts;
 787         s->eof = 1;
 788
 789         filter_frame(ctx->inputs[0], next);
 790     } else if (ret < 0) {
 791         return ret;
 792     }
 793
 794     return 0;
 795 }
 796
 797 static void copy_weights(float *dst, int n, const float **data)
 798 {
 799     memcpy(dst, *data, n * sizeof(float));
 800     *data += n;
 801 }
 802
 803 static float *allocate(float **ptr, int size)
 804 {
 805     float *ret = *ptr;
 806
 807     *ptr += size;
 808
 809     return ret;
 810 }
 811
 812 static int allocate_model(PredictorCoefficients *coeffs, int xdim, int ydim, int nns)
 813 {
 814     int filter_size = nns * xdim * ydim;
 815     int bias_size = nns;
 816     float *data;
 817
 818     data = av_calloc(filter_size + bias_size, 4 * sizeof(float));
 819     if (!data)
 820         return AVERROR(ENOMEM);
 821
 822     coeffs->data = data;
 823     coeffs->xdim = xdim;
 824     coeffs->ydim = ydim;
 825     coeffs->nsize = xdim * ydim;
 826     coeffs->nns  = nns;
 827
 828     coeffs->softmax_q1 = allocate(&data, filter_size);
 829     coeffs->elliott_q1 = allocate(&data, filter_size);
 830     coeffs->softmax_bias_q1 = allocate(&data, bias_size);
 831     coeffs->elliott_bias_q1 = allocate(&data, bias_size);
 832
 833     coeffs->softmax_q2 = allocate(&data, filter_size);
 834     coeffs->elliott_q2 = allocate(&data, filter_size);
 835     coeffs->softmax_bias_q2 = allocate(&data, bias_size);
 836     coeffs->elliott_bias_q2 = allocate(&data, bias_size);
 837
 838     return 0;
 839 }
 840
 841 static int read_weights(AVFilterContext *ctx, const float *bdata)
 842 {
 843     NNEDIContext *s = ctx->priv;
 844     int ret;
 845
 846     copy_weights(&s->prescreener[0].kernel_l0[0][0], 4 * 48, &bdata);
 847     copy_weights(s->prescreener[0].bias_l0, 4, &bdata);
 848
 849     copy_weights(&s->prescreener[0].kernel_l1[0][0], 4 * 4, &bdata);
 850     copy_weights(s->prescreener[0].bias_l1, 4, &bdata);
 851
 852     copy_weights(&s->prescreener[0].kernel_l2[0][0], 4 * 8, &bdata);
 853     copy_weights(s->prescreener[0].bias_l2, 4, &bdata);
 854
 855     for (int i = 0; i < 3; i++) {
 856         PrescreenerCoefficients *data = &s->prescreener[i + 1];
 857         float kernel_l0_shuffled[4 * 64];
 858         float kernel_l1_shuffled[4 * 4];
 859
 860         copy_weights(kernel_l0_shuffled, 4 * 64, &bdata);
 861         copy_weights(data->bias_l0, 4, &bdata);
 862
 863         copy_weights(kernel_l1_shuffled, 4 * 4, &bdata);
 864         copy_weights(data->bias_l1, 4, &bdata);
 865
 866         for (int n = 0; n < 4; n++) {
 867             for (int k = 0; k < 64; k++)
 868                 data->kernel_l0[n][k] = kernel_l0_shuffled[(k / 8) * 32 + n * 8 + k % 8];
 869             for (int k = 0; k < 4; k++)
 870                 data->kernel_l1[n][k] = kernel_l1_shuffled[k * 4 + n];
 871         }
 872     }
 873
 874     for (int m = 0; m < 2; m++) {
 875         // Grouping by neuron count.
 876         for (int i = 0; i < 5; i++) {
 877             const int nns = NNEDI_NNS[i];
 878
 879             // Grouping by window size.
 880             for (int j = 0; j < 7; j++) {
 881                 PredictorCoefficients *model = &s->coeffs[m][i][j];
 882                 const int xdim = NNEDI_XDIM[j];
 883                 const int ydim = NNEDI_YDIM[j];
 884                 const int filter_size = xdim * ydim;
 885
 886                 ret = allocate_model(model, xdim, ydim, nns);
 887                 if (ret < 0)
 888                     return ret;
 889
 890                 // Quality 1 model. NNS[i] * (XDIM[j] * YDIM[j]) * 2 coefficients.
 891                 copy_weights(model->softmax_q1, nns * filter_size, &bdata);
 892                 copy_weights(model->elliott_q1, nns * filter_size, &bdata);
 893
 894                 // Quality 1 model bias. NNS[i] * 2 coefficients.
 895                 copy_weights(model->softmax_bias_q1, nns, &bdata);
 896                 copy_weights(model->elliott_bias_q1, nns, &bdata);
 897
 898                 // Quality 2 model. NNS[i] * (XDIM[j] * YDIM[j]) * 2 coefficients.
 899                 copy_weights(model->softmax_q2, nns * filter_size, &bdata);
 900                 copy_weights(model->elliott_q2, nns * filter_size, &bdata);
 901
 902                 // Quality 2 model bias. NNS[i] * 2 coefficients.
 903                 copy_weights(model->softmax_bias_q2, nns, &bdata);
 904                 copy_weights(model->elliott_bias_q2, nns, &bdata);
 905             }
 906         }
 907     }
 908
 909     return 0;
 910 }
 911
 912 static float mean(const float *input, int size)
 913 {
 914     float sum = 0.f;
 915
 916     for (int i = 0; i < size; i++)
 917         sum += input[i];
 918
 919     return sum / size;
 920 }
 921
 922 static void transform(float *input, int size, float mean, float half)
 923 {
 924     for (int i = 0; i < size; i++)
 925         input[i] = (input[i] - mean) / half;
 926 }
 927
 928 static void subtract_mean_old(PrescreenerCoefficients *coeffs, float half)
 929 {
 930     for (int n = 0; n < 4; n++) {
 931         float m = mean(coeffs->kernel_l0[n], 48);
 932
 933         transform(coeffs->kernel_l0[n], 48, m, half);
 934     }
 935 }
 936
 937 static void subtract_mean_new(PrescreenerCoefficients *coeffs, float half)
 938 {
 939     for (int n = 0; n < 4; n++) {
 940         float m = mean(coeffs->kernel_l0[n], 64);
 941
 942         transform(coeffs->kernel_l0[n], 64, m, half);
 943     }
 944 }
 945
 946 static void subtract_mean_predictor(PredictorCoefficients *model)
 947 {
 948     int filter_size = model->nsize;
 949     int nns = model->nns;
 950
 951     float softmax_means[256]; // Average of individual softmax filters.
 952     float elliott_means[256]; // Average of individual elliott filters.
 953     float mean_filter[48 * 6]; // Pointwise average of all softmax filters.
 954     float mean_bias;
 955
 956     // Quality 1.
 957     for (int nn = 0; nn < nns; nn++) {
 958         softmax_means[nn] = mean(model->softmax_q1 + nn * filter_size, filter_size);
 959         elliott_means[nn] = mean(model->elliott_q1 + nn * filter_size, filter_size);
 960
 961         for (int k = 0; k < filter_size; k++)
 962             mean_filter[k] += model->softmax_q1[nn * filter_size + k] - softmax_means[nn];
 963     }
 964
 965     for (int k = 0; k < filter_size; k++)
 966         mean_filter[k] /= nns;
 967
 968     mean_bias = mean(model->softmax_bias_q1, nns);
 969
 970     for (int nn = 0; nn < nns; nn++) {
 971         for (int k = 0; k < filter_size; k++) {
 972             model->softmax_q1[nn * filter_size + k] -= softmax_means[nn] + mean_filter[k];
 973             model->elliott_q1[nn * filter_size + k] -= elliott_means[nn];
 974         }
 975         model->softmax_bias_q1[nn] -= mean_bias;
 976     }
 977
 978     // Quality 2.
 979     memset(mean_filter, 0, 48 * 6 * sizeof(float));
 980
 981     for (int nn = 0; nn < nns; nn++) {
 982         softmax_means[nn] = mean(model->softmax_q2 + nn * filter_size, filter_size);
 983         elliott_means[nn] = mean(model->elliott_q2 + nn * filter_size, filter_size);
 984
 985         for (int k = 0; k < filter_size; k++) {
 986             mean_filter[k] += model->softmax_q2[nn * filter_size + k] - softmax_means[nn];
 987         }
 988     }
 989
 990     for (int k = 0; k < filter_size; k++)
 991         mean_filter[k] /= nns;
 992
 993     mean_bias = mean(model->softmax_bias_q2, nns);
 994
 995     for (int nn = 0; nn < nns; nn++) {
 996         for (int k = 0; k < filter_size; k++) {
 997             model->softmax_q2[nn * filter_size + k] -= softmax_means[nn] + mean_filter[k];
 998             model->elliott_q2[nn * filter_size + k] -= elliott_means[nn];
 999         }
1000
1001         model->softmax_bias_q2[nn] -= mean_bias;
1002     }
1003 }
1004
1005 static av_cold int init(AVFilterContext *ctx)
1006 {
1007     NNEDIContext *s = ctx->priv;
1008     FILE *weights_file = NULL;
1009     int64_t weights_size;
1010     float *bdata;
1011     size_t bytes_read;
1012     int ret = 0;
1013
1014     weights_file = av_fopen_utf8(s->weights_file, "rb");
1015     if (!weights_file) {
1016         av_log(ctx, AV_LOG_ERROR, "No weights file provided, aborting!\n");
1017         return AVERROR(EINVAL);
1018     }
1019
1020     if (fseek(weights_file, 0, SEEK_END)) {
1021         av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of weights file.\n");
1022         fclose(weights_file);
1023         return AVERROR(EINVAL);
1024     }
1025
1026     weights_size = ftell(weights_file);
1027
1028     if (weights_size == -1) {
1029         fclose(weights_file);
1030         av_log(ctx, AV_LOG_ERROR, "Couldn't get size of weights file.\n");
1031         return AVERROR(EINVAL);
1032     } else if (weights_size != NNEDI_WEIGHTS_SIZE) {
1033         fclose(weights_file);
1034         av_log(ctx, AV_LOG_ERROR, "Unexpected weights file size.\n");
1035         return AVERROR(EINVAL);
1036     }
1037
1038     if (fseek(weights_file, 0, SEEK_SET)) {
1039         fclose(weights_file);
1040         av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the start of weights file.\n");
1041         return AVERROR(EINVAL);
1042     }
1043
1044     bdata = av_malloc(NNEDI_WEIGHTS_SIZE);
1045     if (!bdata) {
1046         fclose(weights_file);
1047         return AVERROR(ENOMEM);
1048     }
1049
1050     bytes_read = fread(bdata, 1, NNEDI_WEIGHTS_SIZE, weights_file);
1051     if (bytes_read != NNEDI_WEIGHTS_SIZE) {
1052         fclose(weights_file);
1053         ret = AVERROR_INVALIDDATA;
1054         av_log(ctx, AV_LOG_ERROR, "Couldn't read weights file.\n");
1055         goto fail;
1056     }
1057
1058     fclose(weights_file);
1059
1060     s->fdsp = avpriv_float_dsp_alloc(0);
1061     if (!s->fdsp) {
1062         ret = AVERROR(ENOMEM);
1063         goto fail;
1064     }
1065
1066     ret = read_weights(ctx, bdata);
1067     if (ret < 0)
1068         goto fail;
1069
1070 fail:
1071     av_free(bdata);
1072     return ret;
1073 }
1074
1075 static int config_input(AVFilterLink *inlink)
1076 {
1077     AVFilterContext *ctx = inlink->dst;
1078     NNEDIContext *s = ctx->priv;
1079     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
1080     int ret;
1081
1082     s->depth = desc->comp[0].depth;
1083     s->nb_threads = ff_filter_get_nb_threads(ctx);
1084     s->nb_planes = av_pix_fmt_count_planes(inlink->format);
1085     if ((ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w)) < 0)
1086         return ret;
1087
1088     s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
1089     s->planewidth[0] = s->planewidth[3] = inlink->w;
1090     s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
1091     s->planeheight[0] = s->planeheight[3] = inlink->h;
1092
1093     s->half = ((1 << 8) - 1) / 2.f;
1094     s->out_scale = 1 << (s->depth - 8);
1095     s->in_scale = 1.f / s->out_scale;
1096
1097     switch (s->depth) {
1098     case 8:
1099         s->read  = read_bytes;
1100         s->write = write_bytes;
1101         break;
1102     default:
1103         s->read  = read_words;
1104         s->write = write_words;
1105         break;
1106     }
1107
1108     subtract_mean_old(&s->prescreener[0], s->half);
1109     subtract_mean_new(&s->prescreener[1], s->half);
1110     subtract_mean_new(&s->prescreener[2], s->half);
1111     subtract_mean_new(&s->prescreener[3], s->half);
1112
1113     s->prescreen[0] = process_old;
1114     s->prescreen[1] = process_new;
1115
1116     for (int i = 0; i < 2; i++) {
1117         for (int j = 0; j < 5; j++) {
1118             for (int k = 0; k < 7; k++)
1119                 subtract_mean_predictor(&s->coeffs[i][j][k]);
1120         }
1121     }
1122
1123     s->input_size = (s->planewidth[0] + 64) * (s->planeheight[0] + 6);
1124     s->input_buf = av_calloc(s->nb_threads, sizeof(*s->input_buf));
1125     if (!s->input_buf)
1126         return AVERROR(ENOMEM);
1127
1128     for (int i = 0; i < s->nb_threads; i++) {
1129         s->input_buf[i] = av_calloc(s->input_size, sizeof(**s->input_buf));
1130         if (!s->input_buf[i])
1131             return AVERROR(ENOMEM);
1132     }
1133
1134     s->output_buf = av_calloc(s->nb_threads, sizeof(*s->output_buf));
1135     if (!s->output_buf)
1136         return AVERROR(ENOMEM);
1137
1138     for (int i = 0; i < s->nb_threads; i++) {
1139         s->output_buf[i] = av_calloc(s->input_size, sizeof(**s->output_buf));
1140         if (!s->output_buf[i])
1141             return AVERROR(ENOMEM);
1142     }
1143
1144     s->prescreen_buf = av_calloc(s->nb_threads, sizeof(*s->prescreen_buf));
1145     if (!s->prescreen_buf)
1146         return AVERROR(ENOMEM);
1147
1148     for (int i = 0; i < s->nb_threads; i++) {
1149         s->prescreen_buf[i] = av_calloc(s->planewidth[0], sizeof(**s->prescreen_buf));
1150         if (!s->prescreen_buf[i])
1151             return AVERROR(ENOMEM);
1152     }
1153
1154     return 0;
1155 }
1156
1157 static av_cold void uninit(AVFilterContext *ctx)
1158 {
1159     NNEDIContext *s = ctx->priv;
1160
1161     for (int i = 0; i < s->nb_threads && s->prescreen_buf; i++)
1162         av_freep(&s->prescreen_buf[i]);
1163
1164     av_freep(&s->prescreen_buf);
1165
1166     for (int i = 0; i < s->nb_threads && s->input_buf; i++)
1167         av_freep(&s->input_buf[i]);
1168
1169     av_freep(&s->input_buf);
1170
1171     for (int i = 0; i < s->nb_threads && s->output_buf; i++)
1172         av_freep(&s->output_buf[i]);
1173
1174     av_freep(&s->output_buf);
1175     av_freep(&s->fdsp);
1176
1177     for (int i = 0; i < 2; i++) {
1178         for (int j = 0; j < 5; j++) {
1179             for (int k = 0; k < 7; k++) {
1180                 av_freep(&s->coeffs[i][j][k].data);
1181             }
1182         }
1183     }
1184
1185     av_frame_free(&s->second);
1186 }
1187
1188 static const AVFilterPad inputs[] = {
1189     {
1190         .name          = "default",
1191         .type          = AVMEDIA_TYPE_VIDEO,
1192         .filter_frame  = filter_frame,
1193         .config_props  = config_input,
1194     },
1195     { NULL }
1196 };
1197
1198 static const AVFilterPad outputs[] = {
1199     {
1200         .name          = "default",
1201         .type          = AVMEDIA_TYPE_VIDEO,
1202         .config_props  = config_output,
1203         .request_frame = request_frame,
1204     },
1205     { NULL }
1206 };
1207
1208 AVFilter ff_vf_nnedi = {
1209     .name          = "nnedi",
1210     .description   = NULL_IF_CONFIG_SMALL("Apply neural network edge directed interpolation intra-only deinterlacer."),
1211     .priv_size     = sizeof(NNEDIContext),
1212     .priv_class    = &nnedi_class,
1213     .init          = init,
1214     .uninit        = uninit,
1215     .query_formats = query_formats,
1216     .inputs        = inputs,
1217     .outputs       = outputs,
1218     .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | AVFILTER_FLAG_SLICE_THREADS,
1219     .process_command = ff_filter_process_command,
1220 };