git.sesse.net Git - ffmpeg/blob - libavfilter/vf_nnedi.c

   1 /*
   2  * Copyright (C) 2010-2011 Kevin Stone
   3  * Copyright (C) 2016 Paul B Mahol
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License along
  18  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
  19  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  20  */
  21
  22 #include <float.h>
  23
  24 #include "libavutil/common.h"
  25 #include "libavutil/float_dsp.h"
  26 #include "libavutil/imgutils.h"
  27 #include "libavutil/mem_internal.h"
  28 #include "libavutil/opt.h"
  29 #include "libavutil/pixdesc.h"
  30 #include "avfilter.h"
  31 #include "formats.h"
  32 #include "internal.h"
  33 #include "video.h"
  34
  35 static const size_t NNEDI_WEIGHTS_SIZE = 13574928;
  36 static const uint8_t NNEDI_XDIM[] = { 8, 16, 32, 48, 8, 16, 32 };
  37 static const uint8_t NNEDI_YDIM[] = { 6, 6, 6, 6, 4, 4, 4 };
  38 static const uint16_t NNEDI_NNS[] = { 16, 32, 64, 128, 256 };
  39
  40 typedef struct PrescreenerOldCoefficients {
  41     DECLARE_ALIGNED(32, float, kernel_l0)[4][14 * 4];
  42     DECLARE_ALIGNED(32, float, bias_l0)[4];
  43
  44     DECLARE_ALIGNED(32, float, kernel_l1)[4][4];
  45     DECLARE_ALIGNED(32, float, bias_l1)[4];
  46
  47     DECLARE_ALIGNED(32, float, kernel_l2)[4][8];
  48     DECLARE_ALIGNED(32, float, bias_l2)[4];
  49 } PrescreenerOldCoefficients;
  50
  51 typedef struct PrescreenerNewCoefficients {
  52     DECLARE_ALIGNED(32, float, kernel_l0)[4][16 * 4];
  53     DECLARE_ALIGNED(32, float, bias_l0)[4];
  54
  55     DECLARE_ALIGNED(32, float, kernel_l1)[4][4];
  56     DECLARE_ALIGNED(32, float, bias_l1)[4];
  57 } PrescreenerNewCoefficients;
  58
  59 typedef struct PredictorCoefficients {
  60     int xdim, ydim, nns, nsize;
  61     float *data;
  62     float *softmax_q1;
  63     float *elliott_q1;
  64     float *softmax_bias_q1;
  65     float *elliott_bias_q1;
  66     float *softmax_q2;
  67     float *elliott_q2;
  68     float *softmax_bias_q2;
  69     float *elliott_bias_q2;
  70 } PredictorCoefficients;
  71
  72 typedef struct NNEDIContext {
  73     const AVClass *class;
  74
  75     char *weights_file;
  76
  77     AVFrame *src;
  78     AVFrame *second;
  79     AVFrame *dst;
  80     int eof;
  81     int64_t cur_pts;
  82
  83     AVFloatDSPContext *fdsp;
  84     int depth;
  85     int nb_planes;
  86     int nb_threads;
  87     int linesize[4];
  88     int planewidth[4];
  89     int planeheight[4];
  90     int field_n;
  91
  92     PrescreenerOldCoefficients prescreener_old;
  93     PrescreenerNewCoefficients prescreener_new[3];
  94     PredictorCoefficients coeffs[2][5][7];
  95
  96     float half;
  97     float in_scale;
  98     float out_scale;
  99
 100     // Parameters
 101     int deint;
 102     int field;
 103     int process_plane;
 104     int nsize;
 105     int nnsparam;
 106     int qual;
 107     int etype;
 108     int pscrn;
 109
 110     int input_size;
 111     uint8_t *prescreen_buf;
 112     float *input_buf;
 113     float *output_buf;
 114
 115     void (*read)(const uint8_t *src, float *dst,
 116                  int src_stride, int dst_stride,
 117                  int width, int height, float scale);
 118     void (*write)(const float *src, uint8_t *dst,
 119                   int src_stride, int dst_stride,
 120                   int width, int height, int depth, float scale);
 121     void (*prescreen[2])(AVFilterContext *ctx,
 122                          const void *src, ptrdiff_t src_stride,
 123                          uint8_t *prescreen, int N, void *data);
 124 } NNEDIContext;
 125
 126 #define OFFSET(x) offsetof(NNEDIContext, x)
 127 #define RFLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
 128 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
 129
 130 static const AVOption nnedi_options[] = {
 131     {"weights",  "set weights file", OFFSET(weights_file),  AV_OPT_TYPE_STRING, {.str="nnedi3_weights.bin"}, 0, 0, FLAGS },
 132     {"deint",         "set which frames to deinterlace", OFFSET(deint),         AV_OPT_TYPE_INT, {.i64=0}, 0, 1, RFLAGS, "deint" },
 133         {"all",        "deinterlace all frames",                       0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "deint" },
 134         {"interlaced", "only deinterlace frames marked as interlaced", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "deint" },
 135     {"field",  "set mode of operation", OFFSET(field),         AV_OPT_TYPE_INT, {.i64=-1}, -2, 3, RFLAGS, "field" },
 136         {"af", "use frame flags, both fields",  0, AV_OPT_TYPE_CONST, {.i64=-2}, 0, 0, RFLAGS, "field" },
 137         {"a",  "use frame flags, single field", 0, AV_OPT_TYPE_CONST, {.i64=-1}, 0, 0, RFLAGS, "field" },
 138         {"t",  "use top field only",            0, AV_OPT_TYPE_CONST, {.i64=0},  0, 0, RFLAGS, "field" },
 139         {"b",  "use bottom field only",         0, AV_OPT_TYPE_CONST, {.i64=1},  0, 0, RFLAGS, "field" },
 140         {"tf", "use both fields, top first",    0, AV_OPT_TYPE_CONST, {.i64=2},  0, 0, RFLAGS, "field" },
 141         {"bf", "use both fields, bottom first", 0, AV_OPT_TYPE_CONST, {.i64=3},  0, 0, RFLAGS, "field" },
 142     {"planes", "set which planes to process", OFFSET(process_plane), AV_OPT_TYPE_INT, {.i64=7}, 0, 15, RFLAGS },
 143     {"nsize",  "set size of local neighborhood around each pixel, used by the predictor neural network", OFFSET(nsize), AV_OPT_TYPE_INT, {.i64=6}, 0, 6, RFLAGS, "nsize" },
 144         {"s8x6",     NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "nsize" },
 145         {"s16x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "nsize" },
 146         {"s32x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "nsize" },
 147         {"s48x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "nsize" },
 148         {"s8x4",     NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "nsize" },
 149         {"s16x4",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=5}, 0, 0, RFLAGS, "nsize" },
 150         {"s32x4",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=6}, 0, 0, RFLAGS, "nsize" },
 151     {"nns",    "set number of neurons in predictor neural network", OFFSET(nnsparam), AV_OPT_TYPE_INT, {.i64=1}, 0, 4, RFLAGS, "nns" },
 152         {"n16",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "nns" },
 153         {"n32",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "nns" },
 154         {"n64",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "nns" },
 155         {"n128",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "nns" },
 156         {"n256",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "nns" },
 157     {"qual",  "set quality", OFFSET(qual), AV_OPT_TYPE_INT, {.i64=1}, 1, 2, RFLAGS, "qual" },
 158         {"fast", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "qual" },
 159         {"slow", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "qual" },
 160     {"etype", "set which set of weights to use in the predictor", OFFSET(etype), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, RFLAGS, "etype" },
 161         {"a",  "weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "etype" },
 162         {"abs","weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "etype" },
 163         {"s",  "weights trained to minimize squared error",  0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "etype" },
 164         {"mse","weights trained to minimize squared error",  0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "etype" },
 165     {"pscrn", "set prescreening", OFFSET(pscrn), AV_OPT_TYPE_INT, {.i64=2}, 0, 4, RFLAGS, "pscrn" },
 166         {"none",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "pscrn" },
 167         {"original",  NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "pscrn" },
 168         {"new",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "pscrn" },
 169         {"new2",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "pscrn" },
 170         {"new3",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "pscrn" },
 171     { NULL }
 172 };
 173
 174 AVFILTER_DEFINE_CLASS(nnedi);
 175
 176 static int config_output(AVFilterLink *outlink)
 177 {
 178     AVFilterContext *ctx = outlink->src;
 179
 180     outlink->time_base.num = ctx->inputs[0]->time_base.num;
 181     outlink->time_base.den = ctx->inputs[0]->time_base.den * 2;
 182     outlink->w             = ctx->inputs[0]->w;
 183     outlink->h             = ctx->inputs[0]->h;
 184
 185     outlink->frame_rate = av_mul_q(ctx->inputs[0]->frame_rate,
 186                                    (AVRational){2, 1});
 187
 188     return 0;
 189 }
 190
 191 static int query_formats(AVFilterContext *ctx)
 192 {
 193     static const enum AVPixelFormat pix_fmts[] = {
 194         AV_PIX_FMT_GRAY8,
 195         AV_PIX_FMT_GRAY9, AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, AV_PIX_FMT_GRAY14, AV_PIX_FMT_GRAY16,
 196         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
 197         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
 198         AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
 199         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
 200         AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
 201         AV_PIX_FMT_YUVJ411P,
 202         AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P,
 203         AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP,
 204         AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9,
 205         AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
 206         AV_PIX_FMT_YUV440P10,
 207         AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
 208         AV_PIX_FMT_YUV440P12,
 209         AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
 210         AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
 211         AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
 212         AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_YUVA444P16,
 213         AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P12, AV_PIX_FMT_YUVA422P16,
 214         AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16,
 215         AV_PIX_FMT_GBRAP10,   AV_PIX_FMT_GBRAP12,    AV_PIX_FMT_GBRAP16,
 216         AV_PIX_FMT_NONE
 217     };
 218
 219     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
 220     if (!fmts_list)
 221         return AVERROR(ENOMEM);
 222     return ff_set_common_formats(ctx, fmts_list);
 223 }
 224
 225 static float dot_dsp(NNEDIContext *s, const float *kernel, const float *input,
 226                      int n, float scale, float bias)
 227 {
 228     float sum;
 229
 230     sum = s->fdsp->scalarproduct_float(kernel, input, n);
 231
 232     return sum * scale + bias;
 233 }
 234
 235 static float elliott(float x)
 236 {
 237     return x / (1.0f + fabsf(x));
 238 }
 239
 240 static void transform_elliott(float *input, int size)
 241 {
 242     for (int i = 0; i < size; i++)
 243         input[i] = elliott(input[i]);
 244 }
 245
 246 static void process_old(AVFilterContext *ctx,
 247                         const void *src, ptrdiff_t src_stride,
 248                         uint8_t *prescreen, int N,
 249                         void *data)
 250 {
 251     NNEDIContext *s = ctx->priv;
 252     const PrescreenerOldCoefficients *const m_data = data;
 253     const float *src_p = src;
 254
 255     // Adjust source pointer to point to top-left of filter window.
 256     const float *window = src_p - 2 * src_stride - 5;
 257
 258     for (int j = 0; j < N; j++) {
 259         LOCAL_ALIGNED_32(float, input, [48]);
 260         float state[12];
 261
 262         for (int i = 0; i < 4; i++)
 263             memcpy(input + i * 12, window + i * src_stride + j, 12 * sizeof(float));
 264
 265         // Layer 0.
 266         for (int n = 0; n < 4; n++)
 267             state[n] = dot_dsp(s, m_data->kernel_l0[n], input, 48, 1.0f, m_data->bias_l0[n]);
 268         transform_elliott(state + 1, 3);
 269
 270         // Layer 1.
 271         for (int n = 0; n < 4; n++)
 272             state[n + 4] = dot_dsp(s, m_data->kernel_l1[n], state, 4, 1.0f, m_data->bias_l1[n]);
 273         transform_elliott(state + 4, 3);
 274
 275         // Layer 2.
 276         for (int n = 0; n < 4; n++)
 277             state[n + 8] = dot_dsp(s, m_data->kernel_l2[n], state, 8, 1.0f, m_data->bias_l2[n]);
 278
 279         prescreen[j] = FFMAX(state[10], state[11]) <= FFMAX(state[8], state[9]) ? 255 : 0;
 280     }
 281 }
 282
 283 static void process_new(AVFilterContext *ctx,
 284                         const void *src, ptrdiff_t src_stride,
 285                         uint8_t *prescreen, int N,
 286                         void *data)
 287 {
 288     NNEDIContext *s = ctx->priv;
 289     const PrescreenerNewCoefficients *const m_data = data;
 290     const float *src_p = src;
 291
 292     // Adjust source pointer to point to top-left of filter window.
 293     const float *window = src_p - 2 * src_stride - 6;
 294
 295     for (int j = 0; j < N; j += 4) {
 296         LOCAL_ALIGNED_32(float, input, [64]);
 297         float state[8];
 298
 299         for (int i = 0; i < 4; i++)
 300             memcpy(input + i * 16, window + i * src_stride + j, 16 * sizeof(float));
 301
 302         for (int n = 0; n < 4; n++)
 303             state[n] = dot_dsp(s, m_data->kernel_l0[n], input, 64, 1.0f, m_data->bias_l0[n]);
 304         transform_elliott(state, 4);
 305
 306         for (int n = 0; n < 4; n++)
 307             state[n + 4] = dot_dsp(s, m_data->kernel_l1[n], state, 4, 1.0f, m_data->bias_l1[n]);
 308
 309         for (int n = 0; n < 4; n++)
 310             prescreen[j + n] = state[n + 4] > 0.f;
 311     }
 312 }
 313
 314 static int filter_offset(int nn, const PredictorCoefficients *const model)
 315 {
 316     return nn * model->nsize;
 317 }
 318
 319 static const float *softmax_q1_filter(int nn,
 320                                       const PredictorCoefficients *const model)
 321 {
 322     return model->softmax_q1 + filter_offset(nn, model);
 323 }
 324
 325 static const float *elliott_q1_filter(int nn,
 326                                       const PredictorCoefficients *const model)
 327 {
 328     return model->elliott_q1 + filter_offset(nn, model);
 329 }
 330
 331 static const float *softmax_q2_filter(int nn,
 332                                       const PredictorCoefficients *const model)
 333 {
 334     return model->softmax_q2 + filter_offset(nn, model);
 335 }
 336
 337 static const float *elliott_q2_filter(int nn,
 338                                       const PredictorCoefficients *const model)
 339 {
 340     return model->elliott_q2 + filter_offset(nn, model);
 341 }
 342
 343 static void gather_input(const float *src, ptrdiff_t src_stride,
 344                          float *buf, float mstd[4],
 345                          const PredictorCoefficients *const model)
 346 {
 347     float sum = 0;
 348     float sum_sq = 0;
 349     float tmp;
 350
 351     for (int i = 0; i < model->ydim; i++) {
 352         memcpy(buf, src, model->xdim * sizeof(float));
 353
 354         for (int j = 0; j < model->xdim; j++) {
 355             const float val = src[j];
 356
 357             sum += val;
 358             sum_sq += val * val;
 359         }
 360
 361         src += src_stride;
 362         buf += model->xdim;
 363     }
 364
 365     mstd[0] = sum / model->nsize;
 366     mstd[3] = 0.f;
 367
 368     tmp = sum_sq / model->nsize - mstd[0] * mstd[0];
 369     if (tmp < FLT_EPSILON) {
 370         mstd[1] = 0.0f;
 371         mstd[2] = 0.0f;
 372     } else {
 373         mstd[1] = sqrtf(tmp);
 374         mstd[2] = 1.0f / mstd[1];
 375     }
 376 }
 377
 378 static float softmax_exp(float x)
 379 {
 380     return expf(av_clipf(x, -80.f, 80.f));
 381 }
 382
 383 static void transform_softmax_exp(float *input, int size)
 384 {
 385     for (int i = 0; i < size; i++)
 386         input[i] = softmax_exp(input[i]);
 387 }
 388
 389 static void wae5(const float *softmax, const float *el,
 390                  int n, float mstd[4])
 391 {
 392     float vsum = 0.0f, wsum = 0.0f;
 393
 394     for (int i = 0; i < n; i++) {
 395         vsum += softmax[i] * elliott(el[i]);
 396         wsum += softmax[i];
 397     }
 398
 399     if (wsum > 1e-10f)
 400         mstd[3] += (5.0f * vsum) / wsum * mstd[1] + mstd[0];
 401     else
 402         mstd[3] += mstd[0];
 403 }
 404
 405 static void predictor(AVFilterContext *ctx,
 406                       const void *src, ptrdiff_t src_stride, void *dst,
 407                       const uint8_t *prescreen, int N,
 408                       void *data, int use_q2)
 409 {
 410     NNEDIContext *s = ctx->priv;
 411     const PredictorCoefficients *const model = data;
 412     const float *src_p = src;
 413     float *dst_p = dst;
 414
 415     // Adjust source pointer to point to top-left of filter window.
 416     const float *window = src_p - (model->ydim / 2) * src_stride - (model->xdim / 2 - 1);
 417     int filter_size = model->nsize;
 418     int nns = model->nns;
 419
 420     for (int i = 0; i < N; i++) {
 421         LOCAL_ALIGNED_32(float, input, [48 * 6]);
 422         float activation[256 * 2];
 423         float mstd[4];
 424         float scale;
 425
 426         if (prescreen[i])
 427             continue;
 428
 429         gather_input(window + i, src_stride, input, mstd, model);
 430         scale = mstd[2];
 431
 432         for (int nn = 0; nn < nns; nn++)
 433             activation[nn] = dot_dsp(s, softmax_q1_filter(nn, model), input, filter_size, scale, model->softmax_bias_q1[nn]);
 434
 435         for (int nn = 0; nn < nns; nn++)
 436             activation[model->nns + nn] = dot_dsp(s, elliott_q1_filter(nn, model), input, filter_size, scale, model->elliott_bias_q1[nn]);
 437
 438         transform_softmax_exp(activation, nns);
 439         wae5(activation, activation + nns, nns, mstd);
 440
 441         if (use_q2) {
 442             for (int nn = 0; nn < nns; nn++)
 443                 activation[nn] = dot_dsp(s, softmax_q2_filter(nn, model), input, filter_size, scale, model->softmax_bias_q2[nn]);
 444
 445             for (int nn = 0; nn < nns; nn++)
 446                 activation[nns + nn] = dot_dsp(s, elliott_q2_filter(nn, model), input, filter_size, scale, model->elliott_bias_q2[nn]);
 447
 448             transform_softmax_exp(activation, nns);
 449             wae5(activation, activation + nns, nns, mstd);
 450         }
 451
 452         dst_p[i] = mstd[3] / (use_q2 ? 2 : 1);
 453     }
 454 }
 455
 456 static void read_bytes(const uint8_t *src, float *dst,
 457                        int src_stride, int dst_stride,
 458                        int width, int height, float scale)
 459 {
 460     for (int y = 0; y < height; y++) {
 461         for (int x = 0; x < 32; x++)
 462             dst[-x - 1] = src[x];
 463
 464         for (int x = 0; x < width; x++)
 465             dst[x] = src[x];
 466
 467         for (int x = 0; x < 32; x++)
 468             dst[width + x] = src[width - x - 1];
 469
 470         dst += dst_stride;
 471         src += src_stride;
 472     }
 473 }
 474
 475 static void read_words(const uint8_t *srcp, float *dst,
 476                        int src_stride, int dst_stride,
 477                        int width, int height, float scale)
 478 {
 479     const uint16_t *src = (const uint16_t *)srcp;
 480
 481     src_stride /= 2;
 482
 483     for (int y = 0; y < height; y++) {
 484         for (int x = 0; x < 32; x++)
 485             dst[-x - 1] = src[x] * scale;
 486
 487         for (int x = 0; x < width; x++)
 488             dst[x] = src[x] * scale;
 489
 490         for (int x = 0; x < 32; x++)
 491             dst[width + x] = src[width - x - 1] * scale;
 492
 493         dst += dst_stride;
 494         src += src_stride;
 495     }
 496 }
 497
 498 static void write_bytes(const float *src, uint8_t *dst,
 499                         int src_stride, int dst_stride,
 500                         int width, int height, int depth,
 501                         float scale)
 502 {
 503     for (int y = 0; y < height; y++) {
 504         for (int x = 0; x < width; x++)
 505             dst[x] = av_clip_uint8(src[x]);
 506
 507         dst += dst_stride;
 508         src += src_stride;
 509     }
 510 }
 511
 512 static void write_words(const float *src, uint8_t *dstp,
 513                         int src_stride, int dst_stride,
 514                         int width, int height, int depth,
 515                         float scale)
 516 {
 517     uint16_t *dst = (uint16_t *)dstp;
 518
 519     dst_stride /= 2;
 520
 521     for (int y = 0; y < height; y++) {
 522         for (int x = 0; x < width; x++)
 523             dst[x] = av_clip_uintp2_c(src[x] * scale, depth);
 524
 525         dst += dst_stride;
 526         src += src_stride;
 527     }
 528 }
 529
 530 static void interpolation(const void *src, ptrdiff_t src_stride,
 531                           void *dst, const uint8_t *prescreen, int n)
 532 {
 533     const float *src_p = src;
 534     float *dst_p = dst;
 535     const float *window = src_p - 2 * src_stride;
 536
 537     for (int i = 0; i < n; i++) {
 538         float accum = 0.0f;
 539
 540         if (!prescreen[i])
 541             continue;
 542
 543         accum += (-3.0f / 32.0f) * window[0 * src_stride + i];
 544         accum += (19.0f / 32.0f) * window[1 * src_stride + i];
 545         accum += (19.0f / 32.0f) * window[2 * src_stride + i];
 546         accum += (-3.0f / 32.0f) * window[3 * src_stride + i];
 547
 548         dst_p[i] = accum;
 549     }
 550 }
 551
 552 static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
 553 {
 554     NNEDIContext *s = ctx->priv;
 555     AVFrame *out = s->dst;
 556     AVFrame *in = s->src;
 557     const float in_scale = s->in_scale;
 558     const float out_scale = s->out_scale;
 559     const int depth = s->depth;
 560     const int interlaced = in->interlaced_frame;
 561     const int tff = s->field_n == (s->field < 0 ? interlaced ? in->top_field_first : 1 :
 562                                   (s->field & 1) ^ 1);
 563
 564
 565     for (int p = 0; p < s->nb_planes; p++) {
 566         const int height = s->planeheight[p];
 567         const int width = s->planewidth[p];
 568         const int slice_start = 2 * ((height / 2 * jobnr) / nb_jobs);
 569         const int slice_end = 2 * ((height / 2 * (jobnr+1)) / nb_jobs);
 570         const uint8_t *src_data = in->data[p];
 571         uint8_t *dst_data = out->data[p];
 572         uint8_t *dst = out->data[p] + slice_start * out->linesize[p];
 573         const int src_linesize = in->linesize[p];
 574         const int dst_linesize = out->linesize[p];
 575         uint8_t *prescreen_buf = s->prescreen_buf + s->planewidth[0] * jobnr;
 576         float *srcbuf = s->input_buf + s->input_size * jobnr;
 577         const int srcbuf_stride = width + 64;
 578         float *dstbuf = s->output_buf + s->input_size * jobnr;
 579         const int dstbuf_stride = width;
 580         const int slice_height = (slice_end - slice_start) / 2;
 581         const int last_slice = slice_end == height;
 582         const uint8_t *in_line;
 583         uint8_t *out_line;
 584         int y_out;
 585
 586         if (!(s->process_plane & (1 << p))) {
 587             av_image_copy_plane(dst, out->linesize[p],
 588                                 in->data[p] + slice_start * in->linesize[p],
 589                                 in->linesize[p],
 590                                 s->linesize[p], slice_end - slice_start);
 591             continue;
 592         }
 593
 594         y_out    = slice_start + (tff ^ (slice_start & 1));
 595         in_line  = src_data + (y_out * src_linesize);
 596         out_line = dst_data + (y_out * dst_linesize);
 597
 598         while (y_out < slice_end) {
 599             memcpy(out_line, in_line, s->linesize[p]);
 600             y_out += 2;
 601             in_line  += src_linesize * 2;
 602             out_line += dst_linesize * 2;
 603         }
 604
 605         y_out = slice_start + ((!tff) ^ (slice_start & 1));
 606
 607         s->read(src_data + FFMAX(y_out - 5, tff) * src_linesize,
 608                 srcbuf + 32,
 609                 src_linesize * 2, srcbuf_stride,
 610                 width, 1, in_scale);
 611         srcbuf += srcbuf_stride;
 612
 613         s->read(src_data + FFMAX(y_out - 3, tff) * src_linesize,
 614                 srcbuf + 32,
 615                 src_linesize * 2, srcbuf_stride,
 616                 width, 1, in_scale);
 617         srcbuf += srcbuf_stride;
 618
 619         s->read(src_data + FFMAX(y_out - 1, tff) * src_linesize,
 620                 srcbuf + 32,
 621                 src_linesize * 2, srcbuf_stride,
 622                 width, 1, in_scale);
 623         srcbuf += srcbuf_stride;
 624
 625         in_line  = src_data + FFMIN(y_out + 1, height - 1 - !tff) * src_linesize;
 626         out_line = dst_data + (y_out * dst_linesize);
 627
 628         s->read(in_line, srcbuf + 32, src_linesize * 2, srcbuf_stride,
 629                 width, slice_height - last_slice, in_scale);
 630
 631         y_out += (slice_height - last_slice) * 2;
 632
 633         s->read(src_data + FFMIN(y_out + 1, height - 1 - !tff) * src_linesize,
 634                 srcbuf + 32 + srcbuf_stride * (slice_height - last_slice),
 635                 src_linesize * 2, srcbuf_stride,
 636                 width, 1, in_scale);
 637
 638         s->read(src_data + FFMIN(y_out + 3, height - 1 - !tff) * src_linesize,
 639                 srcbuf + 32 + srcbuf_stride * (slice_height + 1 - last_slice),
 640                 src_linesize * 2, srcbuf_stride,
 641                 width, 1, in_scale);
 642
 643         s->read(src_data + FFMIN(y_out + 5, height - 1 - !tff) * src_linesize,
 644                 srcbuf + 32 + srcbuf_stride * (slice_height + 2 - last_slice),
 645                 src_linesize * 2, srcbuf_stride,
 646                 width, 1, in_scale);
 647
 648         for (int y = 0; y < slice_end - slice_start; y += 2) {
 649             if (s->pscrn > 1) {
 650                 s->prescreen[1](ctx, srcbuf + (y / 2) * srcbuf_stride + 32,
 651                                 srcbuf_stride, prescreen_buf, width,
 652                                 &s->prescreener_new[s->pscrn - 2]);
 653             } else if (s->pscrn == 1) {
 654                 s->prescreen[0](ctx, srcbuf + (y / 2) * srcbuf_stride + 32,
 655                                 srcbuf_stride, prescreen_buf, width,
 656                                 &s->prescreener_old);
 657             }
 658
 659             predictor(ctx,
 660                       srcbuf + (y / 2) * srcbuf_stride + 32,
 661                       srcbuf_stride,
 662                       dstbuf + (y / 2) * dstbuf_stride,
 663                       prescreen_buf, width,
 664                       &s->coeffs[s->etype][s->nnsparam][s->nsize], s->qual == 2);
 665
 666             if (s->prescreen > 0)
 667                 interpolation(srcbuf + (y / 2) * srcbuf_stride + 32,
 668                               srcbuf_stride,
 669                               dstbuf + (y / 2) * dstbuf_stride,
 670                               prescreen_buf, width);
 671         }
 672
 673         s->write(dstbuf, out_line, dstbuf_stride, dst_linesize * 2,
 674                  width, slice_height, depth, out_scale);
 675     }
 676
 677     return 0;
 678 }
 679
 680 static int get_frame(AVFilterContext *ctx, int is_second)
 681 {
 682     NNEDIContext *s = ctx->priv;
 683     AVFilterLink *outlink = ctx->outputs[0];
 684     AVFrame *src = s->src;
 685
 686     s->dst = ff_get_video_buffer(outlink, outlink->w, outlink->h);
 687     if (!s->dst)
 688         return AVERROR(ENOMEM);
 689     av_frame_copy_props(s->dst, src);
 690     s->dst->interlaced_frame = 0;
 691
 692     ctx->internal->execute(ctx, filter_slice, NULL, NULL, FFMIN(s->planeheight[1] / 2, s->nb_threads));
 693
 694     if (s->field == -2 || s->field > 1)
 695         s->field_n = !s->field_n;
 696
 697     return 0;
 698 }
 699
 700 static int filter_frame(AVFilterLink *inlink, AVFrame *src)
 701 {
 702     AVFilterContext *ctx = inlink->dst;
 703     AVFilterLink *outlink = ctx->outputs[0];
 704     NNEDIContext *s = ctx->priv;
 705     int ret;
 706
 707     if ((s->field > 1 ||
 708          s->field == -2) && !s->second) {
 709         goto second;
 710     } else if (s->field > 1 ||
 711                s->field == -2) {
 712         AVFrame *dst;
 713
 714         s->src = s->second;
 715         ret = get_frame(ctx, 1);
 716         if (ret < 0) {
 717             av_frame_free(&s->dst);
 718             av_frame_free(&s->second);
 719             s->src = NULL;
 720             return ret;
 721         }
 722         dst = s->dst;
 723
 724         if (src->pts != AV_NOPTS_VALUE &&
 725             dst->pts != AV_NOPTS_VALUE)
 726             dst->pts += src->pts;
 727         else
 728             dst->pts = AV_NOPTS_VALUE;
 729
 730         ret = ff_filter_frame(outlink, dst);
 731         if (ret < 0)
 732             return ret;
 733         if (s->eof)
 734             return 0;
 735         s->cur_pts = s->second->pts;
 736         av_frame_free(&s->second);
 737 second:
 738         if ((s->deint && src->interlaced_frame &&
 739              !ctx->is_disabled) ||
 740             (!s->deint && !ctx->is_disabled)) {
 741             s->second = src;
 742         }
 743     }
 744
 745     if ((s->deint && !src->interlaced_frame) || ctx->is_disabled) {
 746         AVFrame *dst = av_frame_clone(src);
 747         if (!dst) {
 748             av_frame_free(&src);
 749             av_frame_free(&s->second);
 750             return AVERROR(ENOMEM);
 751         }
 752
 753         if (s->field > 1 || s->field == -2) {
 754             av_frame_free(&s->second);
 755             if ((s->deint && src->interlaced_frame) ||
 756                 (!s->deint))
 757                 s->second = src;
 758         } else {
 759             av_frame_free(&src);
 760         }
 761         if (dst->pts != AV_NOPTS_VALUE)
 762             dst->pts *= 2;
 763         return ff_filter_frame(outlink, dst);
 764     }
 765
 766     s->src = src;
 767     ret = get_frame(ctx, 0);
 768     if (ret < 0) {
 769         av_frame_free(&s->dst);
 770         av_frame_free(&s->src);
 771         av_frame_free(&s->second);
 772         return ret;
 773     }
 774
 775     if (src->pts != AV_NOPTS_VALUE)
 776         s->dst->pts = src->pts * 2;
 777     if (s->field <= 1 && s->field > -2) {
 778         av_frame_free(&src);
 779         s->src = NULL;
 780     }
 781
 782     return ff_filter_frame(outlink, s->dst);
 783 }
 784
 785 static int request_frame(AVFilterLink *link)
 786 {
 787     AVFilterContext *ctx = link->src;
 788     NNEDIContext *s = ctx->priv;
 789     int ret;
 790
 791     if (s->eof)
 792         return AVERROR_EOF;
 793
 794     ret  = ff_request_frame(ctx->inputs[0]);
 795
 796     if (ret == AVERROR_EOF && s->second) {
 797         AVFrame *next = av_frame_clone(s->second);
 798
 799         if (!next)
 800             return AVERROR(ENOMEM);
 801
 802         next->pts = s->second->pts * 2 - s->cur_pts;
 803         s->eof = 1;
 804
 805         filter_frame(ctx->inputs[0], next);
 806     } else if (ret < 0) {
 807         return ret;
 808     }
 809
 810     return 0;
 811 }
 812
 813 static void copy_weights(float *dst, int n, const float **data)
 814 {
 815     memcpy(dst, *data, n * sizeof(float));
 816     *data += n;
 817 }
 818
 819 static float *allocate(float **ptr, int size)
 820 {
 821     float *ret = *ptr;
 822
 823     *ptr += size;
 824
 825     return ret;
 826 }
 827
 828 static int allocate_model(PredictorCoefficients *coeffs, int xdim, int ydim, int nns)
 829 {
 830     int filter_size = nns * xdim * ydim;
 831     int bias_size = nns;
 832     float *data;
 833
 834     data = av_malloc_array(filter_size + bias_size, 4 * sizeof(float));
 835     if (!data)
 836         return AVERROR(ENOMEM);
 837
 838     coeffs->data = data;
 839     coeffs->xdim = xdim;
 840     coeffs->ydim = ydim;
 841     coeffs->nsize = xdim * ydim;
 842     coeffs->nns  = nns;
 843
 844     coeffs->softmax_q1 = allocate(&data, filter_size);
 845     coeffs->elliott_q1 = allocate(&data, filter_size);
 846     coeffs->softmax_bias_q1 = allocate(&data, bias_size);
 847     coeffs->elliott_bias_q1 = allocate(&data, bias_size);
 848
 849     coeffs->softmax_q2 = allocate(&data, filter_size);
 850     coeffs->elliott_q2 = allocate(&data, filter_size);
 851     coeffs->softmax_bias_q2 = allocate(&data, bias_size);
 852     coeffs->elliott_bias_q2 = allocate(&data, bias_size);
 853
 854     return 0;
 855 }
 856
 857 static int read_weights(AVFilterContext *ctx, const float *bdata)
 858 {
 859     NNEDIContext *s = ctx->priv;
 860     int ret;
 861
 862     copy_weights(&s->prescreener_old.kernel_l0[0][0], 4 * 48, &bdata);
 863     copy_weights(s->prescreener_old.bias_l0, 4, &bdata);
 864
 865     copy_weights(&s->prescreener_old.kernel_l1[0][0], 4 * 4, &bdata);
 866     copy_weights(s->prescreener_old.bias_l1, 4, &bdata);
 867
 868     copy_weights(&s->prescreener_old.kernel_l2[0][0], 4 * 8, &bdata);
 869     copy_weights(s->prescreener_old.bias_l2, 4, &bdata);
 870
 871     for (int i = 0; i < 3; i++) {
 872         PrescreenerNewCoefficients *data = &s->prescreener_new[i];
 873         float kernel_l0_shuffled[4 * 64];
 874         float kernel_l1_shuffled[4 * 4];
 875
 876         copy_weights(kernel_l0_shuffled, 4 * 64, &bdata);
 877         copy_weights(data->bias_l0, 4, &bdata);
 878
 879         copy_weights(kernel_l1_shuffled, 4 * 4, &bdata);
 880         copy_weights(data->bias_l1, 4, &bdata);
 881
 882         for (int n = 0; n < 4; n++) {
 883             for (int k = 0; k < 64; k++)
 884                 data->kernel_l0[n][k] = kernel_l0_shuffled[(k / 8) * 32 + n * 8 + k % 8];
 885             for (int k = 0; k < 4; k++)
 886                 data->kernel_l1[n][k] = kernel_l1_shuffled[k * 4 + n];
 887         }
 888     }
 889
 890     for (int m = 0; m < 2; m++) {
 891         // Grouping by neuron count.
 892         for (int i = 0; i < 5; i++) {
 893             int nns = NNEDI_NNS[i];
 894
 895             // Grouping by window size.
 896             for (int j = 0; j < 7; j++) {
 897                 PredictorCoefficients *model = &s->coeffs[m][i][j];
 898                 int xdim = NNEDI_XDIM[j];
 899                 int ydim = NNEDI_YDIM[j];
 900                 int filter_size = xdim * ydim;
 901
 902                 ret = allocate_model(model, xdim, ydim, nns);
 903                 if (ret < 0)
 904                     return ret;
 905
 906                 // Quality 1 model. NNS[i] * (XDIM[j] * YDIM[j]) * 2 coefficients.
 907                 copy_weights(model->softmax_q1, nns * filter_size, &bdata);
 908                 copy_weights(model->elliott_q1, nns * filter_size, &bdata);
 909
 910                 // Quality 1 model bias. NNS[i] * 2 coefficients.
 911                 copy_weights(model->softmax_bias_q1, nns, &bdata);
 912                 copy_weights(model->elliott_bias_q1, nns, &bdata);
 913
 914                 // Quality 2 model. NNS[i] * (XDIM[j] * YDIM[j]) * 2 coefficients.
 915                 copy_weights(model->softmax_q2, nns * filter_size, &bdata);
 916                 copy_weights(model->elliott_q2, nns * filter_size, &bdata);
 917
 918                 // Quality 2 model bias. NNS[i] * 2 coefficients.
 919                 copy_weights(model->softmax_bias_q2, nns, &bdata);
 920                 copy_weights(model->elliott_bias_q2, nns, &bdata);
 921             }
 922         }
 923     }
 924
 925     return 0;
 926 }
 927
 928 static float mean(const float *input, int size)
 929 {
 930     float sum = 0.;
 931
 932     for (int i = 0; i < size; i++)
 933         sum += input[i];
 934
 935     return sum / size;
 936 }
 937
 938 static void transform(float *input, int size, float mean, float half)
 939 {
 940     for (int i = 0; i < size; i++)
 941         input[i] = (input[i] - mean) / half;
 942 }
 943
 944 static void subtract_mean_old(PrescreenerOldCoefficients *coeffs, float half)
 945 {
 946     for (int n = 0; n < 4; n++) {
 947         float m = mean(coeffs->kernel_l0[n], 48);
 948
 949         transform(coeffs->kernel_l0[n], 48, m, half);
 950     }
 951 }
 952
 953 static void subtract_mean_new(PrescreenerNewCoefficients *coeffs, float half)
 954 {
 955     for (int n = 0; n < 4; n++) {
 956         float m = mean(coeffs->kernel_l0[n], 64);
 957
 958         transform(coeffs->kernel_l0[n], 64, m, half);
 959     }
 960 }
 961
 962 static void subtract_mean_predictor(PredictorCoefficients *model)
 963 {
 964     int filter_size = model->nsize;
 965     int nns = model->nns;
 966
 967     float softmax_means[256]; // Average of individual softmax filters.
 968     float elliott_means[256]; // Average of individual elliott filters.
 969     float mean_filter[48 * 6]; // Pointwise average of all softmax filters.
 970     float mean_bias;
 971
 972     // Quality 1.
 973     for (int nn = 0; nn < nns; nn++) {
 974         softmax_means[nn] = mean(model->softmax_q1 + nn * filter_size, filter_size);
 975         elliott_means[nn] = mean(model->elliott_q1 + nn * filter_size, filter_size);
 976
 977         for (int k = 0; k < filter_size; k++)
 978             mean_filter[k] += model->softmax_q1[nn * filter_size + k] - softmax_means[nn];
 979     }
 980
 981     for (int k = 0; k < filter_size; k++)
 982         mean_filter[k] /= nns;
 983
 984     mean_bias = mean(model->softmax_bias_q1, nns);
 985
 986     for (int nn = 0; nn < nns; nn++) {
 987         for (int k = 0; k < filter_size; k++) {
 988             model->softmax_q1[nn * filter_size + k] -= softmax_means[nn] + mean_filter[k];
 989             model->elliott_q1[nn * filter_size + k] -= elliott_means[nn];
 990         }
 991         model->softmax_bias_q1[nn] -= mean_bias;
 992     }
 993
 994     // Quality 2.
 995     memset(mean_filter, 0, 48 * 6 * sizeof(float));
 996
 997     for (int nn = 0; nn < nns; nn++) {
 998         softmax_means[nn] = mean(model->softmax_q2 + nn * filter_size, filter_size);
 999         elliott_means[nn] = mean(model->elliott_q2 + nn * filter_size, filter_size);
1000
1001         for (int k = 0; k < filter_size; k++) {
1002             mean_filter[k] += model->softmax_q2[nn * filter_size + k] - softmax_means[nn];
1003         }
1004     }
1005
1006     for (int k = 0; k < filter_size; k++)
1007         mean_filter[k] /= nns;
1008
1009     mean_bias = mean(model->softmax_bias_q2, nns);
1010
1011     for (int nn = 0; nn < nns; nn++) {
1012         for (int k = 0; k < filter_size; k++) {
1013             model->softmax_q2[nn * filter_size + k] -= softmax_means[nn] + mean_filter[k];
1014             model->elliott_q2[nn * filter_size + k] -= elliott_means[nn];
1015         }
1016
1017         model->softmax_bias_q2[nn] -= mean_bias;
1018     }
1019 }
1020
1021 static av_cold int init(AVFilterContext *ctx)
1022 {
1023     NNEDIContext *s = ctx->priv;
1024     FILE *weights_file = NULL;
1025     int64_t weights_size;
1026     float *bdata;
1027     size_t bytes_read;
1028     int ret = 0;
1029
1030     weights_file = av_fopen_utf8(s->weights_file, "rb");
1031     if (!weights_file) {
1032         av_log(ctx, AV_LOG_ERROR, "No weights file provided, aborting!\n");
1033         return AVERROR(EINVAL);
1034     }
1035
1036     if (fseek(weights_file, 0, SEEK_END)) {
1037         av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of weights file.\n");
1038         fclose(weights_file);
1039         return AVERROR(EINVAL);
1040     }
1041
1042     weights_size = ftell(weights_file);
1043
1044     if (weights_size == -1) {
1045         fclose(weights_file);
1046         av_log(ctx, AV_LOG_ERROR, "Couldn't get size of weights file.\n");
1047         return AVERROR(EINVAL);
1048     } else if (weights_size != NNEDI_WEIGHTS_SIZE) {
1049         fclose(weights_file);
1050         av_log(ctx, AV_LOG_ERROR, "Unexpected weights file size.\n");
1051         return AVERROR(EINVAL);
1052     }
1053
1054     if (fseek(weights_file, 0, SEEK_SET)) {
1055         fclose(weights_file);
1056         av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the start of weights file.\n");
1057         return AVERROR(EINVAL);
1058     }
1059
1060     bdata = av_malloc(NNEDI_WEIGHTS_SIZE);
1061     if (!bdata) {
1062         fclose(weights_file);
1063         return AVERROR(ENOMEM);
1064     }
1065
1066     bytes_read = fread(bdata, 1, NNEDI_WEIGHTS_SIZE, weights_file);
1067     if (bytes_read != NNEDI_WEIGHTS_SIZE) {
1068         fclose(weights_file);
1069         ret = AVERROR_INVALIDDATA;
1070         av_log(ctx, AV_LOG_ERROR, "Couldn't read weights file.\n");
1071         goto fail;
1072     }
1073
1074     fclose(weights_file);
1075
1076     s->fdsp = avpriv_float_dsp_alloc(0);
1077     if (!s->fdsp) {
1078         ret = AVERROR(ENOMEM);
1079         goto fail;
1080     }
1081
1082     ret = read_weights(ctx, bdata);
1083     if (ret < 0)
1084         goto fail;
1085
1086 fail:
1087     av_free(bdata);
1088     return ret;
1089 }
1090
1091 static int config_input(AVFilterLink *inlink)
1092 {
1093     AVFilterContext *ctx = inlink->dst;
1094     NNEDIContext *s = ctx->priv;
1095     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
1096     int ret;
1097
1098     s->depth = desc->comp[0].depth;
1099     s->nb_threads = ff_filter_get_nb_threads(ctx);
1100     s->nb_planes = av_pix_fmt_count_planes(inlink->format);
1101     if ((ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w)) < 0)
1102         return ret;
1103
1104     s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
1105     s->planewidth[0] = s->planewidth[3] = inlink->w;
1106     s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
1107     s->planeheight[0] = s->planeheight[3] = inlink->h;
1108
1109     s->half = ((1 << 8) - 1) / 2.f;
1110     s->out_scale = 1 << (s->depth - 8);
1111     s->in_scale = 1.f / s->out_scale;
1112
1113     switch (s->depth) {
1114     case 8:
1115         s->read  = read_bytes;
1116         s->write = write_bytes;
1117         break;
1118     default:
1119         s->read  = read_words;
1120         s->write = write_words;
1121         break;
1122     }
1123
1124     subtract_mean_old(&s->prescreener_old, s->half);
1125     subtract_mean_new(&s->prescreener_new[0], s->half);
1126     subtract_mean_new(&s->prescreener_new[1], s->half);
1127     subtract_mean_new(&s->prescreener_new[2], s->half);
1128
1129     s->prescreen[0] = process_old;
1130     s->prescreen[1] = process_new;
1131
1132     for (int i = 0; i < 2; i++) {
1133         for (int j = 0; j < 5; j++) {
1134             for (int k = 0; k < 7; k++)
1135                 subtract_mean_predictor(&s->coeffs[i][j][k]);
1136         }
1137     }
1138
1139     s->prescreen_buf = av_calloc(s->nb_threads * s->planewidth[0], sizeof(*s->prescreen_buf));
1140     if (!s->prescreen_buf)
1141         return AVERROR(ENOMEM);
1142
1143     s->input_size = (s->planewidth[0] + 64) * (s->planeheight[0] + 6);
1144     s->input_buf = av_calloc(s->nb_threads * s->input_size, sizeof(*s->input_buf));
1145     if (!s->input_buf)
1146         return AVERROR(ENOMEM);
1147
1148     s->output_buf = av_calloc(s->nb_threads * s->input_size, sizeof(*s->output_buf));
1149     if (!s->output_buf)
1150         return AVERROR(ENOMEM);
1151
1152     return 0;
1153 }
1154
1155 static av_cold void uninit(AVFilterContext *ctx)
1156 {
1157     NNEDIContext *s = ctx->priv;
1158
1159     av_freep(&s->prescreen_buf);
1160     av_freep(&s->input_buf);
1161     av_freep(&s->output_buf);
1162     av_freep(&s->fdsp);
1163
1164     for (int i = 0; i < 2; i++) {
1165         for (int j = 0; j < 5; j++) {
1166             for (int k = 0; k < 7; k++) {
1167                 av_freep(&s->coeffs[i][j][k].data);
1168             }
1169         }
1170     }
1171
1172     av_frame_free(&s->second);
1173 }
1174
1175 static const AVFilterPad inputs[] = {
1176     {
1177         .name          = "default",
1178         .type          = AVMEDIA_TYPE_VIDEO,
1179         .filter_frame  = filter_frame,
1180         .config_props  = config_input,
1181     },
1182     { NULL }
1183 };
1184
1185 static const AVFilterPad outputs[] = {
1186     {
1187         .name          = "default",
1188         .type          = AVMEDIA_TYPE_VIDEO,
1189         .config_props  = config_output,
1190         .request_frame = request_frame,
1191     },
1192     { NULL }
1193 };
1194
1195 AVFilter ff_vf_nnedi = {
1196     .name          = "nnedi",
1197     .description   = NULL_IF_CONFIG_SMALL("Apply neural network edge directed interpolation intra-only deinterlacer."),
1198     .priv_size     = sizeof(NNEDIContext),
1199     .priv_class    = &nnedi_class,
1200     .init          = init,
1201     .uninit        = uninit,
1202     .query_formats = query_formats,
1203     .inputs        = inputs,
1204     .outputs       = outputs,
1205     .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | AVFILTER_FLAG_SLICE_THREADS,
1206     .process_command = ff_filter_process_command,
1207 };