2 * Copyright (C) 2010-2011 Kevin Stone
3 * Copyright (C) 2016 Paul B Mahol
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 #include "libavutil/avassert.h"
25 #include "libavutil/common.h"
26 #include "libavutil/float_dsp.h"
27 #include "libavutil/imgutils.h"
28 #include "libavutil/mem_internal.h"
29 #include "libavutil/opt.h"
30 #include "libavutil/pixdesc.h"
/* Exact size in bytes of the weights file: 3393732 floats of 4 bytes each
 * (1092 prescreener coefficients followed by 3392640 predictor ones). */
static const size_t NNEDI_WEIGHTS_SIZE = 13574928;
/* Predictor window width and height, indexed by the "nsize" option value. */
static const uint8_t NNEDI_XDIM[] = { 8, 16, 32, 48, 8, 16, 32 };
static const uint8_t NNEDI_YDIM[] = { 6, 6, 6, 6, 4, 4, 4 };
/* Predictor neuron count, indexed by the "nns" option value. */
static const uint16_t NNEDI_NNS[] = { 16, 32, 64, 128, 256 };
41 typedef struct PrescreenerCoefficients {
42 DECLARE_ALIGNED(32, float, kernel_l0)[4][16 * 4];
43 DECLARE_ALIGNED(32, float, bias_l0)[4];
45 DECLARE_ALIGNED(32, float, kernel_l1)[4][4];
46 DECLARE_ALIGNED(32, float, bias_l1)[4];
48 DECLARE_ALIGNED(32, float, kernel_l2)[4][8];
49 DECLARE_ALIGNED(32, float, bias_l2)[4];
50 } PrescreenerCoefficients;
/* Weights of one predictor model (one error type / neuron count / window size
 * combination). NOTE(review): other members (data, softmax_q1, elliott_q1,
 * softmax_q2, elliott_q2) are referenced elsewhere in the file but their
 * declarations are not visible here. */
52 typedef struct PredictorCoefficients {
/* Window width/height, neuron count, and nsize = xdim * ydim taps per filter. */
53 int xdim, ydim, nns, nsize;
/* Per-neuron biases (nns entries each) of the first-quality model. */
57 float *softmax_bias_q1;
58 float *elliott_bias_q1;
/* Per-neuron biases (nns entries each) of the second-quality model. */
61 float *softmax_bias_q2;
62 float *elliott_bias_q2;
63 } PredictorCoefficients;
/* Private state of the nnedi filter. NOTE(review): most members used by the
 * rest of the file (option fields, frame pointers, plane geometry, work
 * buffers) are not visible in this part of the definition. */
65 typedef struct NNEDIContext {
/* Float DSP helpers; scalarproduct_float is used for every dot product. */
76 AVFloatDSPContext *fdsp;
/* Index 0 is the old-style prescreener, 1..3 are the new-style ones. */
85 PrescreenerCoefficients prescreener[4];
/* Predictor models indexed as [etype][nns index][nsize index]. */
86 PredictorCoefficients coeffs[2][5][7];
/* One prescreen mask row per worker thread. */
103 uint8_t **prescreen_buf;
/* Pixel-to-float row reader chosen by bit depth; pads 32 samples per side. */
107 void (*read)(const uint8_t *src, float *dst,
108 int src_stride, int dst_stride,
109 int width, int height, float scale);
/* Float-to-pixel row writer chosen by bit depth. */
110 void (*write)(const float *src, uint8_t *dst,
111 int src_stride, int dst_stride,
112 int width, int height, int depth, float scale);
/* Prescreener implementations: [0] = process_old, [1] = process_new. */
113 void (*prescreen[2])(AVFilterContext *ctx,
114 const void *src, ptrdiff_t src_stride,
115 uint8_t *prescreen, int N,
116 const PrescreenerCoefficients *const coeffs);
/* Byte offset of an option field inside the filter's private context. */
#define OFFSET(x) offsetof(NNEDIContext, x)
/* Option flag sets; parenthesized so they expand safely inside any expression.
 * RFLAGS additionally marks an option as changeable at runtime. */
#define RFLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM)
#define FLAGS (AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM)
/* User-visible options of the nnedi filter. Every option except "weights"
 * carries RFLAGS; named constants share the unit string of their option. */
123 static const AVOption nnedi_options[] = {
124 {"weights", "set weights file", OFFSET(weights_file), AV_OPT_TYPE_STRING, {.str="nnedi3_weights.bin"}, 0, 0, FLAGS },
125 {"deint", "set which frames to deinterlace", OFFSET(deint), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, RFLAGS, "deint" },
126 {"all", "deinterlace all frames", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "deint" },
127 {"interlaced", "only deinterlace frames marked as interlaced", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "deint" },
/* Negative values take the field order from the frame flags; values > 1 and
 * -2 produce one output frame per field. */
128 {"field", "set mode of operation", OFFSET(field), AV_OPT_TYPE_INT, {.i64=-1}, -2, 3, RFLAGS, "field" },
129 {"af", "use frame flags, both fields", 0, AV_OPT_TYPE_CONST, {.i64=-2}, 0, 0, RFLAGS, "field" },
130 {"a", "use frame flags, single field", 0, AV_OPT_TYPE_CONST, {.i64=-1}, 0, 0, RFLAGS, "field" },
131 {"t", "use top field only", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "field" },
132 {"b", "use bottom field only", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "field" },
133 {"tf", "use both fields, top first", 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "field" },
134 {"bf", "use both fields, bottom first", 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "field" },
/* Bitmask: bit p enables processing of plane p. */
135 {"planes", "set which planes to process", OFFSET(process_plane), AV_OPT_TYPE_INT, {.i64=7}, 0, 15, RFLAGS },
/* Value is an index into NNEDI_XDIM / NNEDI_YDIM. */
136 {"nsize", "set size of local neighborhood around each pixel, used by the predictor neural network", OFFSET(nsize), AV_OPT_TYPE_INT, {.i64=6}, 0, 6, RFLAGS, "nsize" },
137 {"s8x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "nsize" },
138 {"s16x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "nsize" },
139 {"s32x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "nsize" },
140 {"s48x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "nsize" },
141 {"s8x4", NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "nsize" },
142 {"s16x4", NULL, 0, AV_OPT_TYPE_CONST, {.i64=5}, 0, 0, RFLAGS, "nsize" },
143 {"s32x4", NULL, 0, AV_OPT_TYPE_CONST, {.i64=6}, 0, 0, RFLAGS, "nsize" },
/* Value is an index into NNEDI_NNS. */
144 {"nns", "set number of neurons in predictor neural network", OFFSET(nnsparam), AV_OPT_TYPE_INT, {.i64=1}, 0, 4, RFLAGS, "nns" },
145 {"n16", NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "nns" },
146 {"n32", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "nns" },
147 {"n64", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "nns" },
148 {"n128", NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "nns" },
149 {"n256", NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "nns" },
/* qual == 2 makes the predictor evaluate a second model and average. */
150 {"qual", "set quality", OFFSET(qual), AV_OPT_TYPE_INT, {.i64=1}, 1, 2, RFLAGS, "qual" },
151 {"fast", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "qual" },
152 {"slow", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "qual" },
/* "a"/"abs" and "s"/"mse" are aliases for the same values. */
153 {"etype", "set which set of weights to use in the predictor", OFFSET(etype), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, RFLAGS, "etype" },
154 {"a", "weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "etype" },
155 {"abs","weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "etype" },
156 {"s", "weights trained to minimize squared error", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "etype" },
157 {"mse","weights trained to minimize squared error", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "etype" },
/* 0 disables prescreening; 1 selects the old network, 2..4 the new ones. */
158 {"pscrn", "set prescreening", OFFSET(pscrn), AV_OPT_TYPE_INT, {.i64=2}, 0, 4, RFLAGS, "pscrn" },
159 {"none", NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, RFLAGS, "pscrn" },
160 {"original", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, RFLAGS, "pscrn" },
161 {"new", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, RFLAGS, "pscrn" },
162 {"new2", NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, RFLAGS, "pscrn" },
163 {"new3", NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, RFLAGS, "pscrn" },
/* Defines the AVClass (nnedi_class) that exposes the table above. */
167 AVFILTER_DEFINE_CLASS(nnedi);
/* Configure the output link from the input link: same dimensions, time base
 * with a doubled denominator (so timestamps can address individual fields). */
169 static int config_output(AVFilterLink *outlink)
171 AVFilterContext *ctx = outlink->src;
173 outlink->time_base.num = ctx->inputs[0]->time_base.num;
174 outlink->time_base.den = ctx->inputs[0]->time_base.den * 2;
175 outlink->w = ctx->inputs[0]->w;
176 outlink->h = ctx->inputs[0]->h;
/* NOTE(review): the multiplier applied to the input frame rate is not visible here. */
178 outlink->frame_rate = av_mul_q(ctx->inputs[0]->frame_rate,
/* Advertise the supported pixel formats: planar gray, YUV(A) and GBR(A) at
 * various bit depths. Returns AVERROR(ENOMEM) or the result of
 * ff_set_common_formats(). */
184 static int query_formats(AVFilterContext *ctx)
186 static const enum AVPixelFormat pix_fmts[] = {
188 AV_PIX_FMT_GRAY9, AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12, AV_PIX_FMT_GRAY14, AV_PIX_FMT_GRAY16,
189 AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
190 AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
191 AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
192 AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
193 AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
195 AV_PIX_FMT_YUVA420P, AV_PIX_FMT_YUVA422P, AV_PIX_FMT_YUVA444P,
196 AV_PIX_FMT_GBRP, AV_PIX_FMT_GBRAP,
197 AV_PIX_FMT_YUV420P9, AV_PIX_FMT_YUV422P9, AV_PIX_FMT_YUV444P9,
198 AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV444P10,
199 AV_PIX_FMT_YUV440P10,
200 AV_PIX_FMT_YUV420P12, AV_PIX_FMT_YUV422P12, AV_PIX_FMT_YUV444P12,
201 AV_PIX_FMT_YUV440P12,
202 AV_PIX_FMT_YUV420P14, AV_PIX_FMT_YUV422P14, AV_PIX_FMT_YUV444P14,
203 AV_PIX_FMT_YUV420P16, AV_PIX_FMT_YUV422P16, AV_PIX_FMT_YUV444P16,
204 AV_PIX_FMT_GBRP9, AV_PIX_FMT_GBRP10, AV_PIX_FMT_GBRP12, AV_PIX_FMT_GBRP14, AV_PIX_FMT_GBRP16,
205 AV_PIX_FMT_YUVA444P9, AV_PIX_FMT_YUVA444P10, AV_PIX_FMT_YUVA444P12, AV_PIX_FMT_YUVA444P16,
206 AV_PIX_FMT_YUVA422P9, AV_PIX_FMT_YUVA422P10, AV_PIX_FMT_YUVA422P12, AV_PIX_FMT_YUVA422P16,
207 AV_PIX_FMT_YUVA420P9, AV_PIX_FMT_YUVA420P10, AV_PIX_FMT_YUVA420P16,
208 AV_PIX_FMT_GBRAP10, AV_PIX_FMT_GBRAP12, AV_PIX_FMT_GBRAP16,
/* Build the format list and hand it to the framework for all links. */
212 AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
214 return AVERROR(ENOMEM);
215 return ff_set_common_formats(ctx, fmts_list);
/* Affine neuron response: dot(kernel, input) over n taps, multiplied by
 * scale, plus bias. The dot product goes through the float DSP context. */
218 static float dot_dsp(const NNEDIContext *const s, const float *kernel, const float *input,
219 int n, float scale, float bias)
223 sum = s->fdsp->scalarproduct_float(kernel, input, n);
/* NOTE(review): the purpose of the tiny 1e-20f offset is not stated in the code. */
225 y = sum * scale + bias + 1e-20f;
/* Elliott activation: maps x into (-1, 1) as x / (1 + |x|). */
static float elliott(float x)
{
    const float magnitude = fabsf(x);

    return x / (1.0f + magnitude);
}
/* Apply the Elliott activation in place to the first `size` values of input. */
static void transform_elliott(float *input, int size)
{
    float *const end = input + size;

    for (float *p = input; p < end; p++)
        *p = elliott(*p);
}
/* Old-style prescreener: for each of the N pixels of a row, run a small
 * three-layer network on a 12x4 window and store 255 or 0 in prescreen[j].
 * src is treated as a float row pointer; src_stride is in floats. */
241 static void process_old(AVFilterContext *ctx,
242 const void *src, ptrdiff_t src_stride,
243 uint8_t *prescreen, int N,
244 const PrescreenerCoefficients *const m_data)
246 NNEDIContext *s = ctx->priv;
247 const float *src_p = src;
249 // Adjust source pointer to point to top-left of filter window.
250 const float *window = src_p - 2 * src_stride - 5;
252 for (int j = 0; j < N; j++) {
253 LOCAL_ALIGNED_32(float, input, [48]);
/* Gather 4 rows of 12 samples into one contiguous 48-tap input vector. */
256 for (int i = 0; i < 4; i++)
257 memcpy(input + i * 12, window + i * src_stride + j, 12 * sizeof(float));
/* Layer 0: four 48-tap neurons; the activation skips state[0]. */
260 for (int n = 0; n < 4; n++)
261 state[n] = dot_dsp(s, m_data->kernel_l0[n], input, 48, 1.0f, m_data->bias_l0[n]);
262 transform_elliott(state + 1, 3);
/* Layer 1: four neurons over state[0..3]; the activation skips state[7]. */
265 for (int n = 0; n < 4; n++)
266 state[n + 4] = dot_dsp(s, m_data->kernel_l1[n], state, 4, 1.0f, m_data->bias_l1[n]);
267 transform_elliott(state + 4, 3);
/* Layer 2: four neurons over state[0..7]. */
270 for (int n = 0; n < 4; n++)
271 state[n + 8] = dot_dsp(s, m_data->kernel_l2[n], state, 8, 1.0f, m_data->bias_l2[n]);
/* Mark the pixel when the first output pair dominates the second pair. */
273 prescreen[j] = FFMAX(state[10], state[11]) <= FFMAX(state[8], state[9]) ? 255 : 0;
/* New-style prescreener: handles four pixels per iteration with a two-layer
 * network on a 16x4 window; prescreen[j + n] becomes 1 when output n is
 * positive, else 0. src is treated as a float row pointer; src_stride is in
 * floats. */
277 static void process_new(AVFilterContext *ctx,
278 const void *src, ptrdiff_t src_stride,
279 uint8_t *prescreen, int N,
280 const PrescreenerCoefficients *const m_data)
282 NNEDIContext *s = ctx->priv;
283 const float *src_p = src;
285 // Adjust source pointer to point to top-left of filter window.
286 const float *window = src_p - 2 * src_stride - 6;
/* NOTE(review): stepping by 4 writes prescreen[j..j+3], i.e. up to 3 entries
 * past N when N is not a multiple of 4 - confirm the buffer has that slack. */
288 for (int j = 0; j < N; j += 4) {
289 LOCAL_ALIGNED_32(float, input, [64]);
/* Gather 4 rows of 16 samples into one contiguous 64-tap input vector. */
292 for (int i = 0; i < 4; i++)
293 memcpy(input + i * 16, window + i * src_stride + j, 16 * sizeof(float));
/* Layer 0: four 64-tap neurons, all passed through the Elliott activation. */
295 for (int n = 0; n < 4; n++)
296 state[n] = dot_dsp(s, m_data->kernel_l0[n], input, 64, 1.0f, m_data->bias_l0[n]);
297 transform_elliott(state, 4);
/* Layer 1: one output per pixel of the group. */
299 for (int n = 0; n < 4; n++)
300 state[n + 4] = dot_dsp(s, m_data->kernel_l1[n], state, 4, 1.0f, m_data->bias_l1[n]);
302 for (int n = 0; n < 4; n++)
303 prescreen[j + n] = state[n + 4] > 0.f;
307 static int filter_offset(int nn, const PredictorCoefficients *const model)
309 return nn * model->nsize;
312 static const float *softmax_q1_filter(int nn,
313 const PredictorCoefficients *const model)
315 return model->softmax_q1 + filter_offset(nn, model);
318 static const float *elliott_q1_filter(int nn,
319 const PredictorCoefficients *const model)
321 return model->elliott_q1 + filter_offset(nn, model);
324 static const float *softmax_q2_filter(int nn,
325 const PredictorCoefficients *const model)
327 return model->softmax_q2 + filter_offset(nn, model);
330 static const float *elliott_q2_filter(int nn,
331 const PredictorCoefficients *const model)
333 return model->elliott_q2 + filter_offset(nn, model);
/* Copy the xdim x ydim window starting at src into buf and derive window
 * statistics into mstd: mstd[0] is the mean, mstd[1] the standard deviation
 * and mstd[2] its reciprocal. NOTE(review): the accumulation statements and
 * the branch taken for near-zero variance are not visible here. */
336 static void gather_input(const float *src, ptrdiff_t src_stride,
337 float *buf, float mstd[4],
338 const PredictorCoefficients *const model)
/* Reciprocal of the number of samples in the window. */
340 const float scale = 1.f / model->nsize;
345 for (int i = 0; i < model->ydim; i++) {
346 memcpy(buf, src, model->xdim * sizeof(float));
348 for (int j = 0; j < model->xdim; j++) {
349 const float val = src[j];
359 mstd[0] = sum * scale;
/* Variance as E[x^2] - E[x]^2. */
362 tmp = sum_sq * scale - mstd[0] * mstd[0];
363 if (tmp < FLT_EPSILON) {
367 mstd[1] = sqrtf(tmp);
368 mstd[2] = 1.0f / mstd[1];
/* exp(x) with x clamped to [-80, 80] so the result stays finite in float. */
static float softmax_exp(float x)
{
    const float clamped = av_clipf(x, -80.f, 80.f);

    return expf(clamped);
}
/* Replace each of the first `size` values of input by its clamped exponential. */
static void transform_softmax_exp(float *input, int size)
{
    float *const end = input + size;

    for (float *p = input; p < end; p++)
        *p = softmax_exp(*p);
}
/* Combine n neuron outputs into a prediction accumulated in mstd[3]: the
 * Elliott-activated values in el are weighted by the softmax values, the
 * ratio is scaled by 5 and by the window deviation mstd[1], then offset by
 * the window mean mstd[0]. NOTE(review): the accumulation of wsum and the
 * guard around the division are not visible here. */
383 static void wae5(const float *softmax, const float *el,
384 int n, float mstd[4])
386 float vsum = 0.0f, wsum = 0.0f;
388 for (int i = 0; i < n; i++) {
389 vsum += softmax[i] * elliott(el[i]);
394 mstd[3] += (5.0f * vsum) / wsum * mstd[1] + mstd[0];
/* Predict the missing row: for each of the N pixels, gather the model window,
 * evaluate the softmax and elliott neuron banks of the first-quality model,
 * optionally those of the second-quality model, and store the combined value
 * in dst. NOTE(review): how the prescreen mask is consulted and how `scale`
 * is derived are not visible here. */
399 static void predictor(AVFilterContext *ctx,
400 const void *src, ptrdiff_t src_stride, void *dst,
401 const uint8_t *prescreen, int N,
402 const PredictorCoefficients *const model, int use_q2)
404 const NNEDIContext *const s = ctx->priv;
405 const float *src_p = src;
408 // Adjust source pointer to point to top-left of filter window.
409 const float *window = src_p - (model->ydim / 2) * src_stride - (model->xdim / 2 - 1);
410 const int filter_size = model->nsize;
411 const int nns = model->nns;
413 for (int i = 0; i < N; i++) {
/* Sized for the largest window (48x6) and the largest neuron count (256). */
414 LOCAL_ALIGNED_32(float, input, [48 * 6]);
415 float activation[256 * 2];
422 gather_input(window + i, src_stride, input, mstd, model);
/* First-quality model: softmax responses in activation[0..nns), elliott
 * responses in activation[nns..2*nns). */
425 for (int nn = 0; nn < nns; nn++)
426 activation[nn] = dot_dsp(s, softmax_q1_filter(nn, model), input, filter_size, scale, model->softmax_bias_q1[nn]);
428 for (int nn = 0; nn < nns; nn++)
429 activation[nns + nn] = dot_dsp(s, elliott_q1_filter(nn, model), input, filter_size, scale, model->elliott_bias_q1[nn]);
431 transform_softmax_exp(activation, nns);
432 wae5(activation, activation + nns, nns, mstd);
/* Second-quality model, same layout. */
435 for (int nn = 0; nn < nns; nn++)
436 activation[nn] = dot_dsp(s, softmax_q2_filter(nn, model), input, filter_size, scale, model->softmax_bias_q2[nn]);
438 for (int nn = 0; nn < nns; nn++)
439 activation[nns + nn] = dot_dsp(s, elliott_q2_filter(nn, model), input, filter_size, scale, model->elliott_bias_q2[nn]);
441 transform_softmax_exp(activation, nns);
442 wae5(activation, activation + nns, nns, mstd);
/* With two models mstd[3] holds the sum of both predictions, hence the 0.5. */
445 dst_p[i] = mstd[3] * (use_q2 ? 0.5f : 1.f);
/* Convert rows of 8-bit samples to float and pad each row with 32 mirrored
 * samples on both sides. The scale argument is not used by the visible
 * statements. NOTE(review): the padding loops read src[0..31] and
 * src[width-32..width-1], which presumes width >= 32 - confirm with callers. */
449 static void read_bytes(const uint8_t *src, float *dst,
450 int src_stride, int dst_stride,
451 int width, int height, float scale)
453 for (int y = 0; y < height; y++) {
/* Left padding: mirror of the first 32 samples. */
454 for (int x = 0; x < 32; x++)
455 dst[-x - 1] = src[x];
457 for (int x = 0; x < width; x++)
/* Right padding: mirror of the last 32 samples. */
460 for (int x = 0; x < 32; x++)
461 dst[width + x] = src[width - x - 1];
/* Convert rows of 16-bit samples to float, multiplying by scale, and pad each
 * row with 32 mirrored samples on both sides. NOTE(review): the padding loops
 * presume width >= 32 - confirm with callers. */
468 static void read_words(const uint8_t *srcp, float *dst,
469 int src_stride, int dst_stride,
470 int width, int height, float scale)
472 const uint16_t *src = (const uint16_t *)srcp;
476 for (int y = 0; y < height; y++) {
/* Left padding: mirror of the first 32 samples. */
477 for (int x = 0; x < 32; x++)
478 dst[-x - 1] = src[x] * scale;
480 for (int x = 0; x < width; x++)
481 dst[x] = src[x] * scale;
/* Right padding: mirror of the last 32 samples. */
483 for (int x = 0; x < 32; x++)
484 dst[width + x] = src[width - x - 1] * scale;
/* Store rows of float samples as 8-bit pixels, clipping each value to the
 * 0..255 range. depth is not used by the visible statements. */
491 static void write_bytes(const float *src, uint8_t *dst,
492 int src_stride, int dst_stride,
493 int width, int height, int depth,
496 for (int y = 0; y < height; y++) {
497 for (int x = 0; x < width; x++)
498 dst[x] = av_clip_uint8(src[x]);
/* Store rows of float samples as 16-bit pixels: each value is multiplied by
 * scale and clipped to `depth` unsigned bits. */
505 static void write_words(const float *src, uint8_t *dstp,
506 int src_stride, int dst_stride,
507 int width, int height, int depth,
510 uint16_t *dst = (uint16_t *)dstp;
514 for (int y = 0; y < height; y++) {
515 for (int x = 0; x < width; x++)
516 dst[x] = av_clip_uintp2_c(src[x] * scale, depth);
/* Four-tap vertical interpolation over the rows around the missing one, with
 * weights -3/32, 19/32, 19/32, -3/32 (they sum to 1). NOTE(review): how the
 * prescreen mask selects pixels and where accum is stored is not visible here. */
523 static void interpolation(const void *src, ptrdiff_t src_stride,
524 void *dst, const uint8_t *prescreen, int n)
526 const float *src_p = src;
/* First of the four rows of the vertical window. */
528 const float *window = src_p - 2 * src_stride;
530 for (int i = 0; i < n; i++) {
536 accum += (-3.0f / 32.0f) * window[0 * src_stride + i];
537 accum += (19.0f / 32.0f) * window[1 * src_stride + i];
538 accum += (19.0f / 32.0f) * window[2 * src_stride + i];
539 accum += (-3.0f / 32.0f) * window[3 * src_stride + i];
/* Slice worker: for every plane, copy the kept field's rows of this job's
 * slice to the output, then rebuild the other field's rows with the
 * prescreener, the predictor and the interpolation fallback.
 * jobnr selects both the row range and the per-thread scratch buffers. */
545 static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
547 const NNEDIContext *const s = ctx->priv;
548 AVFrame *out = s->dst;
549 AVFrame *in = s->src;
550 const float in_scale = s->in_scale;
551 const float out_scale = s->out_scale;
552 const int depth = s->depth;
553 const int interlaced = in->interlaced_frame;
554 const int tff = s->field_n == (s->field < 0 ? interlaced ? in->top_field_first : 1 :
558 for (int p = 0; p < s->nb_planes; p++) {
559 const int height = s->planeheight[p];
560 const int width = s->planewidth[p];
/* Slice bounds are kept even so each slice holds whole row pairs. */
561 const int slice_start = 2 * ((height / 2 * jobnr) / nb_jobs);
562 const int slice_end = 2 * ((height / 2 * (jobnr+1)) / nb_jobs);
563 const uint8_t *src_data = in->data[p];
564 uint8_t *dst_data = out->data[p];
565 uint8_t *dst = out->data[p] + slice_start * out->linesize[p];
566 const int src_linesize = in->linesize[p];
567 const int dst_linesize = out->linesize[p];
568 uint8_t *prescreen_buf = s->prescreen_buf[jobnr];
569 float *srcbuf = s->input_buf[jobnr];
/* Each float row carries 32 padding samples on both sides. */
570 const int srcbuf_stride = width + 64;
571 float *dstbuf = s->output_buf[jobnr];
572 const int dstbuf_stride = width;
573 const int slice_height = (slice_end - slice_start) / 2;
574 const int last_slice = slice_end == height;
575 const uint8_t *in_line;
/* Planes excluded by the "planes" mask are copied through unchanged. */
579 if (!(s->process_plane & (1 << p))) {
580 av_image_copy_plane(dst, out->linesize[p],
581 in->data[p] + slice_start * in->linesize[p],
583 s->linesize[p], slice_end - slice_start);
/* Copy the rows of the field that is kept as is. */
587 y_out = slice_start + (tff ^ (slice_start & 1));
588 in_line = src_data + (y_out * src_linesize);
589 out_line = dst_data + (y_out * dst_linesize);
591 while (y_out < slice_end) {
592 memcpy(out_line, in_line, s->linesize[p]);
594 in_line += src_linesize * 2;
595 out_line += dst_linesize * 2;
/* First row of the field to reconstruct. */
598 y_out = slice_start + ((!tff) ^ (slice_start & 1));
/* Three context rows above the slice, clamped at the top of the plane. */
600 s->read(src_data + FFMAX(y_out - 5, tff) * src_linesize,
602 src_linesize * 2, srcbuf_stride,
604 srcbuf += srcbuf_stride;
606 s->read(src_data + FFMAX(y_out - 3, tff) * src_linesize,
608 src_linesize * 2, srcbuf_stride,
610 srcbuf += srcbuf_stride;
612 s->read(src_data + FFMAX(y_out - 1, tff) * src_linesize,
614 src_linesize * 2, srcbuf_stride,
616 srcbuf += srcbuf_stride;
/* Rows of the kept field that lie inside the slice. */
618 in_line = src_data + FFMIN(y_out + 1, height - 1 - !tff) * src_linesize;
619 out_line = dst_data + (y_out * dst_linesize);
621 s->read(in_line, srcbuf + 32, src_linesize * 2, srcbuf_stride,
622 width, slice_height - last_slice, in_scale);
624 y_out += (slice_height - last_slice) * 2;
/* Three context rows below the slice, clamped at the bottom of the plane. */
626 s->read(src_data + FFMIN(y_out + 1, height - 1 - !tff) * src_linesize,
627 srcbuf + 32 + srcbuf_stride * (slice_height - last_slice),
628 src_linesize * 2, srcbuf_stride,
631 s->read(src_data + FFMIN(y_out + 3, height - 1 - !tff) * src_linesize,
632 srcbuf + 32 + srcbuf_stride * (slice_height + 1 - last_slice),
633 src_linesize * 2, srcbuf_stride,
636 s->read(src_data + FFMIN(y_out + 5, height - 1 - !tff) * src_linesize,
637 srcbuf + 32 + srcbuf_stride * (slice_height + 2 - last_slice),
638 src_linesize * 2, srcbuf_stride,
641 for (int y = 0; y < slice_end - slice_start; y += 2) {
/* Test the pscrn option, not the prescreen function-pointer array (which is
 * never null): with pscrn == 0 no prescreener may run, otherwise
 * prescreener[pscrn - 1] would be indexed at -1. */
642 if (s->pscrn > 0)
643 s->prescreen[s->pscrn > 1](ctx, srcbuf + (y / 2) * srcbuf_stride + 32,
644 srcbuf_stride, prescreen_buf, width,
645 &s->prescreener[s->pscrn - 1]);
648 srcbuf + (y / 2) * srcbuf_stride + 32,
650 dstbuf + (y / 2) * dstbuf_stride,
651 prescreen_buf, width,
652 &s->coeffs[s->etype][s->nnsparam][s->nsize], s->qual == 2);
/* The interpolation fallback is only meaningful when prescreening ran. */
654 if (s->pscrn > 0)
655 interpolation(srcbuf + (y / 2) * srcbuf_stride + 32,
657 dstbuf + (y / 2) * dstbuf_stride,
658 prescreen_buf, width);
/* Convert the reconstructed rows back to pixels, every other output row. */
661 s->write(dstbuf, out_line, dstbuf_stride, dst_linesize * 2,
662 width, slice_height, depth, out_scale);
/* Allocate an output frame, copy the source frame's properties, mark it
 * progressive and run filter_slice over it with the slice-threading executor.
 * Returns AVERROR(ENOMEM) on allocation failure. is_second is not used by
 * the visible statements. */
668 static int get_frame(AVFilterContext *ctx, int is_second)
670 NNEDIContext *s = ctx->priv;
671 AVFilterLink *outlink = ctx->outputs[0];
672 AVFrame *src = s->src;
674 s->dst = ff_get_video_buffer(outlink, outlink->w, outlink->h);
676 return AVERROR(ENOMEM);
677 av_frame_copy_props(s->dst, src);
678 s->dst->interlaced_frame = 0;
/* Job count is capped by half the height of plane 1 and by the thread count. */
680 ctx->internal->execute(ctx, filter_slice, NULL, NULL, FFMIN(s->planeheight[1] / 2, s->nb_threads));
/* In the two-frames-per-input modes, alternate the field for the next call. */
682 if (s->field == -2 || s->field > 1)
683 s->field_n = !s->field_n;
/* Input pad callback: receives a source frame and pushes the deinterlaced
 * output frame(s) downstream, or a clone of the input when deinterlacing is
 * skipped for it. NOTE(review): a large part of the control flow is not
 * visible here; the comments below only describe the visible statements. */
688 static int filter_frame(AVFilterLink *inlink, AVFrame *src)
690 AVFilterContext *ctx = inlink->dst;
691 AVFilterLink *outlink = ctx->outputs[0];
692 NNEDIContext *s = ctx->priv;
696 s->field == -2) && !s->second) {
698 } else if (s->field > 1 ||
/* Second output frame for the previously stored input. */
703 ret = get_frame(ctx, 1);
705 av_frame_free(&s->dst);
706 av_frame_free(&s->second);
712 if (src->pts != AV_NOPTS_VALUE &&
713 dst->pts != AV_NOPTS_VALUE)
714 dst->pts += src->pts;
716 dst->pts = AV_NOPTS_VALUE;
718 ret = ff_filter_frame(outlink, dst);
723 s->cur_pts = s->second->pts;
724 av_frame_free(&s->second);
726 if ((s->deint && src->interlaced_frame &&
727 !ctx->is_disabled) ||
728 (!s->deint && !ctx->is_disabled)) {
/* Pass-through path: frame not flagged interlaced while deint=interlaced,
 * or the filter is disabled through the timeline. */
733 if ((s->deint && !src->interlaced_frame) || ctx->is_disabled) {
734 AVFrame *dst = av_frame_clone(src);
737 av_frame_free(&s->second);
738 return AVERROR(ENOMEM);
741 if (s->field > 1 || s->field == -2) {
742 av_frame_free(&s->second);
743 if ((s->deint && src->interlaced_frame) ||
749 if (dst->pts != AV_NOPTS_VALUE)
751 return ff_filter_frame(outlink, dst);
/* First output frame for the current input. */
755 ret = get_frame(ctx, 0);
757 av_frame_free(&s->dst);
758 av_frame_free(&s->src);
759 av_frame_free(&s->second);
/* The output time base has twice the input denominator, so pts doubles. */
763 if (src->pts != AV_NOPTS_VALUE)
764 s->dst->pts = src->pts * 2;
765 if (s->field <= 1 && s->field > -2) {
770 return ff_filter_frame(outlink, s->dst);
/* Output pad callback: request a frame from the input; when the input hits
 * EOF while a frame is still stored in s->second, feed a clone of it through
 * filter_frame once more so the last field is emitted. */
773 static int request_frame(AVFilterLink *link)
775 AVFilterContext *ctx = link->src;
776 NNEDIContext *s = ctx->priv;
782 ret = ff_request_frame(ctx->inputs[0]);
784 if (ret == AVERROR_EOF && s->second) {
785 AVFrame *next = av_frame_clone(s->second);
788 return AVERROR(ENOMEM);
/* Extrapolate the timestamp by one frame interval past the stored frame. */
790 next->pts = s->second->pts * 2 - s->cur_pts;
/* Propagate a failure of the final flush instead of silently dropping it. */
793 return filter_frame(ctx->inputs[0], next);
794 } else if (ret < 0) {
/**
 * Copy the next n floats from a read cursor into dst and advance the cursor.
 *
 * @param dst  destination, at least n floats
 * @param n    number of floats to copy
 * @param data in/out read cursor; moved forward by n so consecutive calls
 *             consume consecutive parts of the same buffer
 */
static void copy_weights(float *dst, int n, const float **data)
{
    memcpy(dst, *data, n * sizeof(float));
    *data += n;
}
/* Helper used by allocate_model() to hand out consecutive `size`-float
 * sub-arrays from the block that *ptr points into.
 * NOTE(review): the body is not visible here; presumably it returns the
 * current *ptr and advances it by size - confirm. */
807 static float *allocate(float **ptr, int size)
/* Allocate one zeroed block for all weight arrays of a predictor model and
 * carve the individual arrays out of it with allocate(). Returns
 * AVERROR(ENOMEM) when the allocation fails. */
816 static int allocate_model(PredictorCoefficients *coeffs, int xdim, int ydim, int nns)
/* Floats in one bank of nns filters of xdim * ydim taps each. */
818 int filter_size = nns * xdim * ydim;
/* Four filter banks plus four bias arrays: 4 * (filter_size + bias_size) floats. */
822 data = av_calloc(filter_size + bias_size, 4 * sizeof(float));
824 return AVERROR(ENOMEM);
829 coeffs->nsize = xdim * ydim;
/* First-quality model. */
832 coeffs->softmax_q1 = allocate(&data, filter_size);
833 coeffs->elliott_q1 = allocate(&data, filter_size);
834 coeffs->softmax_bias_q1 = allocate(&data, bias_size);
835 coeffs->elliott_bias_q1 = allocate(&data, bias_size);
/* Second-quality model. */
837 coeffs->softmax_q2 = allocate(&data, filter_size);
838 coeffs->elliott_q2 = allocate(&data, filter_size);
839 coeffs->softmax_bias_q2 = allocate(&data, bias_size);
840 coeffs->elliott_bias_q2 = allocate(&data, bias_size);
845 static int read_weights(AVFilterContext *ctx, const float *bdata)
847 NNEDIContext *s = ctx->priv;
850 copy_weights(&s->prescreener[0].kernel_l0[0][0], 4 * 48, &bdata);
851 copy_weights(s->prescreener[0].bias_l0, 4, &bdata);
853 copy_weights(&s->prescreener[0].kernel_l1[0][0], 4 * 4, &bdata);
854 copy_weights(s->prescreener[0].bias_l1, 4, &bdata);
856 copy_weights(&s->prescreener[0].kernel_l2[0][0], 4 * 8, &bdata);
857 copy_weights(s->prescreener[0].bias_l2, 4, &bdata);
859 for (int i = 0; i < 3; i++) {
860 PrescreenerCoefficients *data = &s->prescreener[i + 1];
861 float kernel_l0_shuffled[4 * 64];
862 float kernel_l1_shuffled[4 * 4];
864 copy_weights(kernel_l0_shuffled, 4 * 64, &bdata);
865 copy_weights(data->bias_l0, 4, &bdata);
867 copy_weights(kernel_l1_shuffled, 4 * 4, &bdata);
868 copy_weights(data->bias_l1, 4, &bdata);
870 for (int n = 0; n < 4; n++) {
871 for (int k = 0; k < 64; k++)
872 data->kernel_l0[n][k] = kernel_l0_shuffled[(k / 8) * 32 + n * 8 + k % 8];
873 for (int k = 0; k < 4; k++)
874 data->kernel_l1[n][k] = kernel_l1_shuffled[k * 4 + n];
878 for (int m = 0; m < 2; m++) {
879 // Grouping by neuron count.
880 for (int i = 0; i < 5; i++) {
881 const int nns = NNEDI_NNS[i];
883 // Grouping by window size.
884 for (int j = 0; j < 7; j++) {
885 PredictorCoefficients *model = &s->coeffs[m][i][j];
886 const int xdim = NNEDI_XDIM[j];
887 const int ydim = NNEDI_YDIM[j];
888 const int filter_size = xdim * ydim;
890 ret = allocate_model(model, xdim, ydim, nns);
894 // Quality 1 model. NNS[i] * (XDIM[j] * YDIM[j]) * 2 coefficients.
895 copy_weights(model->softmax_q1, nns * filter_size, &bdata);
896 copy_weights(model->elliott_q1, nns * filter_size, &bdata);
898 // Quality 1 model bias. NNS[i] * 2 coefficients.
899 copy_weights(model->softmax_bias_q1, nns, &bdata);
900 copy_weights(model->elliott_bias_q1, nns, &bdata);
902 // Quality 2 model. NNS[i] * (XDIM[j] * YDIM[j]) * 2 coefficients.
903 copy_weights(model->softmax_q2, nns * filter_size, &bdata);
904 copy_weights(model->elliott_q2, nns * filter_size, &bdata);
906 // Quality 2 model bias. NNS[i] * 2 coefficients.
907 copy_weights(model->softmax_bias_q2, nns, &bdata);
908 copy_weights(model->elliott_bias_q2, nns, &bdata);
/* Average of the first `size` values of input, as used by the subtract_mean_*
 * helpers. NOTE(review): the accumulation and return statements are not
 * visible here. */
916 static float mean(const float *input, int size)
920 for (int i = 0; i < size; i++)
/* Normalize in place: every one of the first `size` values becomes
 * (value - mean) / half. */
static void transform(float *input, int size, float mean, float half)
{
    float *const end = input + size;

    for (float *p = input; p < end; p++)
        *p = (*p - mean) / half;
}
932 static void subtract_mean_old(PrescreenerCoefficients *coeffs, float half)
934 for (int n = 0; n < 4; n++) {
935 float m = mean(coeffs->kernel_l0[n], 48);
937 transform(coeffs->kernel_l0[n], 48, m, half);
941 static void subtract_mean_new(PrescreenerCoefficients *coeffs, float half)
943 for (int n = 0; n < 4; n++) {
944 float m = mean(coeffs->kernel_l0[n], 64);
946 transform(coeffs->kernel_l0[n], 64, m, half);
/* Center the weights of one predictor model, for both quality levels:
 * every filter loses its own mean, softmax filters additionally lose the
 * pointwise average of all (centered) softmax filters, and softmax biases
 * lose their average. */
950 static void subtract_mean_predictor(PredictorCoefficients *model)
952 const int filter_size = model->nsize;
953 const int nns = model->nns;
954 const float scale = 1.f / nns;
956 double softmax_means[256]; // Average of individual softmax filters.
957 double elliott_means[256]; // Average of individual elliott filters.
/* Zero-initialized: the first pass below accumulates into it with +=. */
958 double mean_filter[48 * 6] = { 0 }; // Pointwise average of all softmax filters.
/* First-quality model: gather per-filter means and the summed centered filter. */
962 for (int nn = 0; nn < nns; nn++) {
963 softmax_means[nn] = mean(model->softmax_q1 + nn * filter_size, filter_size);
964 elliott_means[nn] = mean(model->elliott_q1 + nn * filter_size, filter_size);
966 for (int k = 0; k < filter_size; k++)
967 mean_filter[k] += model->softmax_q1[nn * filter_size + k] - softmax_means[nn];
/* Turn the sum over neurons into an average. */
970 for (int k = 0; k < filter_size; k++)
971 mean_filter[k] *= scale;
973 mean_bias = mean(model->softmax_bias_q1, nns);
975 for (int nn = 0; nn < nns; nn++) {
976 for (int k = 0; k < filter_size; k++) {
977 model->softmax_q1[nn * filter_size + k] -= softmax_means[nn] + mean_filter[k];
978 model->elliott_q1[nn * filter_size + k] -= elliott_means[nn];
980 model->softmax_bias_q1[nn] -= mean_bias;
/* Second-quality model: same procedure on a freshly cleared accumulator. */
984 memset(mean_filter, 0, sizeof(mean_filter));
986 for (int nn = 0; nn < nns; nn++) {
987 softmax_means[nn] = mean(model->softmax_q2 + nn * filter_size, filter_size);
988 elliott_means[nn] = mean(model->elliott_q2 + nn * filter_size, filter_size);
990 for (int k = 0; k < filter_size; k++) {
991 mean_filter[k] += model->softmax_q2[nn * filter_size + k] - softmax_means[nn];
995 for (int k = 0; k < filter_size; k++)
996 mean_filter[k] *= scale;
998 mean_bias = mean(model->softmax_bias_q2, nns);
1000 for (int nn = 0; nn < nns; nn++) {
1001 for (int k = 0; k < filter_size; k++) {
1002 model->softmax_q2[nn * filter_size + k] -= softmax_means[nn] + mean_filter[k];
1003 model->elliott_q2[nn * filter_size + k] -= elliott_means[nn];
1006 model->softmax_bias_q2[nn] -= mean_bias;
/* Filter init: open the weights file named by the "weights" option, verify
 * that it is exactly NNEDI_WEIGHTS_SIZE bytes long, load it into memory,
 * allocate the float DSP context and parse the blob with read_weights().
 * The file is closed on every visible exit path. */
1010 static av_cold int init(AVFilterContext *ctx)
1012 NNEDIContext *s = ctx->priv;
1013 FILE *weights_file = NULL;
1014 int64_t weights_size;
1019 weights_file = av_fopen_utf8(s->weights_file, "rb");
/* NOTE(review): this branch is taken whenever the open fails, although the
 * message only speaks of a missing file name. */
1020 if (!weights_file) {
1021 av_log(ctx, AV_LOG_ERROR, "No weights file provided, aborting!\n");
1022 return AVERROR(EINVAL);
/* Determine the file size by seeking to the end. */
1025 if (fseek(weights_file, 0, SEEK_END)) {
1026 av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of weights file.\n");
1027 fclose(weights_file);
1028 return AVERROR(EINVAL);
1031 weights_size = ftell(weights_file);
1033 if (weights_size == -1) {
1034 fclose(weights_file);
1035 av_log(ctx, AV_LOG_ERROR, "Couldn't get size of weights file.\n");
1036 return AVERROR(EINVAL);
1037 } else if (weights_size != NNEDI_WEIGHTS_SIZE) {
1038 fclose(weights_file);
1039 av_log(ctx, AV_LOG_ERROR, "Unexpected weights file size.\n");
1040 return AVERROR(EINVAL);
/* Rewind before reading the payload. */
1043 if (fseek(weights_file, 0, SEEK_SET)) {
1044 fclose(weights_file);
1045 av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the start of weights file.\n");
1046 return AVERROR(EINVAL);
1049 bdata = av_malloc(NNEDI_WEIGHTS_SIZE);
1051 fclose(weights_file);
1052 return AVERROR(ENOMEM);
/* A short read is reported as invalid data. */
1055 bytes_read = fread(bdata, 1, NNEDI_WEIGHTS_SIZE, weights_file);
1056 if (bytes_read != NNEDI_WEIGHTS_SIZE) {
1057 fclose(weights_file);
1058 ret = AVERROR_INVALIDDATA;
1059 av_log(ctx, AV_LOG_ERROR, "Couldn't read weights file.\n");
1063 fclose(weights_file);
1065 s->fdsp = avpriv_float_dsp_alloc(0);
1067 ret = AVERROR(ENOMEM);
1071 ret = read_weights(ctx, bdata);
/* Input link configuration: derive bit depth, plane geometry and sample
 * scaling from the negotiated pixel format, pick the reader/writer pair,
 * center the network weights and allocate the per-thread work buffers.
 * Returns AVERROR(ENOMEM) when a buffer cannot be allocated.
 * NOTE(review): the weights are normalized here rather than where they are
 * loaded - confirm this function cannot run more than once per instance. */
1080 static int config_input(AVFilterLink *inlink)
1082 AVFilterContext *ctx = inlink->dst;
1083 NNEDIContext *s = ctx->priv;
1084 const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
1087 s->depth = desc->comp[0].depth;
1088 s->nb_threads = ff_filter_get_nb_threads(ctx);
1089 s->nb_planes = av_pix_fmt_count_planes(inlink->format);
1090 if ((ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w)) < 0)
/* Planes 1 and 2 use the chroma-subsampled size, planes 0 and 3 the full size. */
1093 s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
1094 s->planewidth[0] = s->planewidth[3] = inlink->w;
1095 s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
1096 s->planeheight[0] = s->planeheight[3] = inlink->h;
/* half = 127.5; out_scale = 2^(depth - 8) and in_scale is its reciprocal. */
1098 s->half = ((1 << 8) - 1) / 2.f;
1099 s->out_scale = 1 << (s->depth - 8);
1100 s->in_scale = 1.f / s->out_scale;
/* Byte or word accessors; the selecting condition is not visible here. */
1104 s->read = read_bytes;
1105 s->write = write_bytes;
1108 s->read = read_words;
1109 s->write = write_words;
1113 subtract_mean_old(&s->prescreener[0], s->half);
1114 subtract_mean_new(&s->prescreener[1], s->half);
1115 subtract_mean_new(&s->prescreener[2], s->half);
1116 subtract_mean_new(&s->prescreener[3], s->half);
1118 s->prescreen[0] = process_old;
1119 s->prescreen[1] = process_new;
1121 for (int i = 0; i < 2; i++) {
1122 for (int j = 0; j < 5; j++) {
1123 for (int k = 0; k < 7; k++)
1124 subtract_mean_predictor(&s->coeffs[i][j][k]);
/* Float work buffers: 64 padding columns and 6 extra rows around plane 0. */
1128 s->input_size = (s->planewidth[0] + 64) * (s->planeheight[0] + 6);
1129 s->input_buf = av_calloc(s->nb_threads, sizeof(*s->input_buf));
1131 return AVERROR(ENOMEM);
1133 for (int i = 0; i < s->nb_threads; i++) {
1134 s->input_buf[i] = av_calloc(s->input_size, sizeof(**s->input_buf));
1135 if (!s->input_buf[i])
1136 return AVERROR(ENOMEM);
1139 s->output_buf = av_calloc(s->nb_threads, sizeof(*s->output_buf));
1141 return AVERROR(ENOMEM);
1143 for (int i = 0; i < s->nb_threads; i++) {
1144 s->output_buf[i] = av_calloc(s->input_size, sizeof(**s->output_buf));
1145 if (!s->output_buf[i])
1146 return AVERROR(ENOMEM);
/* One prescreen mask row per thread, planewidth[0] bytes each. */
1149 s->prescreen_buf = av_calloc(s->nb_threads, sizeof(*s->prescreen_buf));
1150 if (!s->prescreen_buf)
1151 return AVERROR(ENOMEM);
1153 for (int i = 0; i < s->nb_threads; i++) {
1154 s->prescreen_buf[i] = av_calloc(s->planewidth[0], sizeof(**s->prescreen_buf));
1155 if (!s->prescreen_buf[i])
1156 return AVERROR(ENOMEM);
/* Release the per-thread buffers, the predictor weight blocks and the
 * pending frame. Each loop also checks the outer array pointer, so a
 * partially completed configuration is handled. */
1162 static av_cold void uninit(AVFilterContext *ctx)
1164 NNEDIContext *s = ctx->priv;
1166 for (int i = 0; i < s->nb_threads && s->prescreen_buf; i++)
1167 av_freep(&s->prescreen_buf[i]);
1169 av_freep(&s->prescreen_buf);
1171 for (int i = 0; i < s->nb_threads && s->input_buf; i++)
1172 av_freep(&s->input_buf[i]);
1174 av_freep(&s->input_buf);
1176 for (int i = 0; i < s->nb_threads && s->output_buf; i++)
1177 av_freep(&s->output_buf[i]);
1179 av_freep(&s->output_buf);
/* Free the data block of every predictor model. */
1182 for (int i = 0; i < 2; i++) {
1183 for (int j = 0; j < 5; j++) {
1184 for (int k = 0; k < 7; k++) {
1185 av_freep(&s->coeffs[i][j][k].data);
1190 av_frame_free(&s->second);
/* Input pad: video frames are delivered to filter_frame(); config_input()
 * runs when the link is configured. */
1193 static const AVFilterPad inputs[] = {
1196 .type = AVMEDIA_TYPE_VIDEO,
1197 .filter_frame = filter_frame,
1198 .config_props = config_input,
/* Output pad: config_output() sets the link properties and request_frame()
 * pulls frames from the input on demand. */
1203 static const AVFilterPad outputs[] = {
1206 .type = AVMEDIA_TYPE_VIDEO,
1207 .config_props = config_output,
1208 .request_frame = request_frame,
/* Filter definition: slice-threaded, handles timeline enable/disable
 * internally, and accepts runtime option changes through the generic
 * ff_filter_process_command(). */
1213 AVFilter ff_vf_nnedi = {
1215 .description = NULL_IF_CONFIG_SMALL("Apply neural network edge directed interpolation intra-only deinterlacer."),
1216 .priv_size = sizeof(NNEDIContext),
1217 .priv_class = &nnedi_class,
1220 .query_formats = query_formats,
1223 .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL | AVFILTER_FLAG_SLICE_THREADS,
1224 .process_command = ff_filter_process_command,