2 * Copyright (C) 2010-2011 Kevin Stone
3 * Copyright (C) 2016 Paul B Mahol
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 #include "libavutil/common.h"
25 #include "libavutil/float_dsp.h"
26 #include "libavutil/imgutils.h"
27 #include "libavutil/opt.h"
28 #include "libavutil/pixdesc.h"
34 typedef struct FrameData {
50 typedef struct NNEDIContext {
61 AVFloatDSPContext *fdsp;
86 void (*copy_pad)(const AVFrame *, FrameData *, struct NNEDIContext *, int);
87 void (*evalfunc_0)(struct NNEDIContext *, FrameData *);
88 void (*evalfunc_1)(struct NNEDIContext *, FrameData *);
90 // Functions used in evalfunc_0
91 void (*readpixels)(const uint8_t *, const int, float *);
92 void (*compute_network0)(struct NNEDIContext *s, const float *, const float *, uint8_t *);
93 int32_t (*process_line0)(const uint8_t *, int, uint8_t *, const uint8_t *, const int, const int, const int);
95 // Functions used in evalfunc_1
96 void (*extract)(const uint8_t *, const int, const int, const int, float *, float *);
97 void (*dot_prod)(struct NNEDIContext *, const float *, const float *, float *, const int, const int, const float *);
98 void (*expfunc)(float *, const int);
99 void (*wae5)(const float *, const int, float *);
101 FrameData frame_data;
104 #define OFFSET(x) offsetof(NNEDIContext, x)
105 #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
107 static const AVOption nnedi_options[] = {
108 {"weights", "set weights file", OFFSET(weights_file), AV_OPT_TYPE_STRING, {.str="nnedi3_weights.bin"}, 0, 0, FLAGS },
109 {"deint", "set which frames to deinterlace", OFFSET(deint), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS, "deint" },
110 {"all", "deinterlace all frames", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "deint" },
111 {"interlaced", "only deinterlace frames marked as interlaced", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "deint" },
112 {"field", "set mode of operation", OFFSET(field), AV_OPT_TYPE_INT, {.i64=-1}, -2, 3, FLAGS, "field" },
113 {"af", "use frame flags, both fields", 0, AV_OPT_TYPE_CONST, {.i64=-2}, 0, 0, FLAGS, "field" },
114 {"a", "use frame flags, single field", 0, AV_OPT_TYPE_CONST, {.i64=-1}, 0, 0, FLAGS, "field" },
115 {"t", "use top field only", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "field" },
116 {"b", "use bottom field only", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "field" },
117 {"tf", "use both fields, top first", 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "field" },
118 {"bf", "use both fields, bottom first", 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, FLAGS, "field" },
119 {"planes", "set which planes to process", OFFSET(process_plane), AV_OPT_TYPE_INT, {.i64=7}, 0, 7, FLAGS },
120 {"nsize", "set size of local neighborhood around each pixel, used by the predictor neural network", OFFSET(nsize), AV_OPT_TYPE_INT, {.i64=6}, 0, 6, FLAGS, "nsize" },
121 {"s8x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "nsize" },
122 {"s16x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "nsize" },
123 {"s32x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "nsize" },
124 {"s48x6", NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, FLAGS, "nsize" },
125 {"s8x4", NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, FLAGS, "nsize" },
126 {"s16x4", NULL, 0, AV_OPT_TYPE_CONST, {.i64=5}, 0, 0, FLAGS, "nsize" },
127 {"s32x4", NULL, 0, AV_OPT_TYPE_CONST, {.i64=6}, 0, 0, FLAGS, "nsize" },
128 {"nns", "set number of neurons in predictor neural network", OFFSET(nnsparam), AV_OPT_TYPE_INT, {.i64=1}, 0, 4, FLAGS, "nns" },
129 {"n16", NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "nns" },
130 {"n32", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "nns" },
131 {"n64", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "nns" },
132 {"n128", NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, FLAGS, "nns" },
133 {"n256", NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, FLAGS, "nns" },
134 {"qual", "set quality", OFFSET(qual), AV_OPT_TYPE_INT, {.i64=1}, 1, 2, FLAGS, "qual" },
135 {"fast", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "qual" },
136 {"slow", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "qual" },
137 {"etype", "set which set of weights to use in the predictor", OFFSET(etype), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS, "etype" },
138 {"a", "weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "etype" },
139 {"s", "weights trained to minimize squared error", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "etype" },
140 {"pscrn", "set prescreening", OFFSET(pscrn), AV_OPT_TYPE_INT, {.i64=2}, 0, 2, FLAGS, "pscrn" },
141 {"none", NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "pscrn" },
142 {"original", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "pscrn" },
143 {"new", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "pscrn" },
144 {"fapprox", NULL, OFFSET(fapprox), AV_OPT_TYPE_INT, {.i64=0}, 0, 3, FLAGS },
148 AVFILTER_DEFINE_CLASS(nnedi);
150 static int config_input(AVFilterLink *inlink)
152 AVFilterContext *ctx = inlink->dst;
153 NNEDIContext *s = ctx->priv;
154 const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
157 s->nb_planes = av_pix_fmt_count_planes(inlink->format);
158 if ((ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w)) < 0)
161 s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
162 s->planeheight[0] = s->planeheight[3] = inlink->h;
167 static int config_output(AVFilterLink *outlink)
169 AVFilterContext *ctx = outlink->src;
170 NNEDIContext *s = ctx->priv;
172 outlink->time_base.num = ctx->inputs[0]->time_base.num;
173 outlink->time_base.den = ctx->inputs[0]->time_base.den * 2;
174 outlink->w = ctx->inputs[0]->w;
175 outlink->h = ctx->inputs[0]->h;
177 if (s->field > 1 || s->field == -2)
178 outlink->frame_rate = av_mul_q(ctx->inputs[0]->frame_rate,
184 static int query_formats(AVFilterContext *ctx)
186 static const enum AVPixelFormat pix_fmts[] = {
187 AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
188 AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
189 AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
190 AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
191 AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
198 AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
200 return AVERROR(ENOMEM);
201 return ff_set_common_formats(ctx, fmts_list);
204 static void copy_pad(const AVFrame *src, FrameData *frame_data, NNEDIContext *s, int fn)
206 const int off = 1 - fn;
209 for (plane = 0; plane < s->nb_planes; plane++) {
210 const uint8_t *srcp = (const uint8_t *)src->data[plane];
211 uint8_t *dstp = (uint8_t *)frame_data->paddedp[plane];
213 const int src_stride = src->linesize[plane];
214 const int dst_stride = frame_data->padded_stride[plane];
216 const int src_height = s->planeheight[plane];
217 const int dst_height = frame_data->padded_height[plane];
219 const int src_width = s->linesize[plane];
220 const int dst_width = frame_data->padded_width[plane];
224 if (!(s->process_plane & (1 << plane)))
228 for (y = off; y < src_height; y += 2)
229 memcpy(dstp + 32 + (6 + y) * dst_stride,
230 srcp + y * src_stride,
231 src_width * sizeof(uint8_t));
234 dstp += (6 + off) * dst_stride;
235 for (y = 6 + off; y < dst_height - 6; y += 2) {
238 for (x = 0; x < 32; x++)
239 dstp[x] = dstp[64 - x];
241 for (x = dst_width - 32; x < dst_width; x++, c += 2)
242 dstp[x] = dstp[x - c];
244 dstp += dst_stride * 2;
247 dstp = (uint8_t *)frame_data->paddedp[plane];
248 for (y = off; y < 6; y += 2)
249 memcpy(dstp + y * dst_stride,
250 dstp + (12 + 2 * off - y) * dst_stride,
251 dst_width * sizeof(uint8_t));
253 for (y = dst_height - 6 + off; y < dst_height; y += 2, c += 4)
254 memcpy(dstp + y * dst_stride,
255 dstp + (y - c) * dst_stride,
256 dst_width * sizeof(uint8_t));
260 static void elliott(float *data, const int n)
264 for (i = 0; i < n; i++)
265 data[i] = data[i] / (1.0f + FFABS(data[i]));
268 static void dot_prod(NNEDIContext *s, const float *data, const float *weights, float *vals, const int n, const int len, const float *scale)
272 for (i = 0; i < n; i++) {
275 sum = s->fdsp->scalarproduct_float(data, &weights[i * len], len);
277 vals[i] = sum * scale[0] + weights[n * len + i];
281 static void dot_prods(NNEDIContext *s, const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *scale)
283 const int16_t *data = (int16_t *)dataf;
284 const int16_t *weights = (int16_t *)weightsf;
285 const float *wf = (float *)&weights[n * len];
288 for (i = 0; i < n; i++) {
289 int sum = 0, off = ((i >> 2) << 3) + (i & 3);
290 for (j = 0; j < len; j++)
291 sum += data[j] * weights[i * len + j];
293 vals[i] = sum * wf[off] * scale[0] + wf[off + 4];
297 static void compute_network0(NNEDIContext *s, const float *input, const float *weights, uint8_t *d)
299 float t, temp[12], scale = 1.0f;
301 dot_prod(s, input, weights, temp, 4, 48, &scale);
305 dot_prod(s, temp, weights + 4 * 49, temp + 4, 4, 4, &scale);
306 elliott(temp + 4, 4);
307 dot_prod(s, temp, weights + 4 * 49 + 4 * 5, temp + 8, 4, 8, &scale);
308 if (FFMAX(temp[10], temp[11]) <= FFMAX(temp[8], temp[9]))
314 static void compute_network0_i16(NNEDIContext *s, const float *inputf, const float *weightsf, uint8_t *d)
316 const float *wf = weightsf + 2 * 48;
317 float t, temp[12], scale = 1.0f;
319 dot_prods(s, inputf, weightsf, temp, 4, 48, &scale);
323 dot_prod(s, temp, wf + 8, temp + 4, 4, 4, &scale);
324 elliott(temp + 4, 4);
325 dot_prod(s, temp, wf + 8 + 4 * 5, temp + 8, 4, 8, &scale);
326 if (FFMAX(temp[10], temp[11]) <= FFMAX(temp[8], temp[9]))
332 static void pixel2float48(const uint8_t *t8, const int pitch, float *p)
334 const uint8_t *t = (const uint8_t *)t8;
337 for (y = 0; y < 4; y++)
338 for (x = 0; x < 12; x++)
339 p[y * 12 + x] = t[y * pitch * 2 + x];
342 static void byte2word48(const uint8_t *t, const int pitch, float *pf)
344 int16_t *p = (int16_t *)pf;
347 for (y = 0; y < 4; y++)
348 for (x = 0; x < 12; x++)
349 p[y * 12 + x] = t[y * pitch * 2 + x];
352 static int32_t process_line0(const uint8_t *tempu, int width, uint8_t *dstp8, const uint8_t *src3p8, const int src_pitch, const int max_value, const int chroma)
354 uint8_t *dstp = (uint8_t *)dstp8;
355 const uint8_t *src3p = (const uint8_t *)src3p8;
357 int maximum = max_value - 1; // Technically the -1 is only needed for 8 and 16 bit input.
359 for (x = 0; x < width; x++) {
361 int tmp = 19 * (src3p[x + src_pitch * 2] + src3p[x + src_pitch * 4]) - 3 * (src3p[x] + src3p[x + src_pitch * 6]);
363 dstp[x] = FFMAX(FFMIN(tmp, maximum), minimum);
372 // new prescreener functions
373 static void byte2word64(const uint8_t *t, const int pitch, float *p)
375 int16_t *ps = (int16_t *)p;
378 for (y = 0; y < 4; y++)
379 for (x = 0; x < 16; x++)
380 ps[y * 16 + x] = t[y * pitch * 2 + x];
383 static void compute_network0new(NNEDIContext *s, const float *datai, const float *weights, uint8_t *d)
385 int16_t *data = (int16_t *)datai;
386 int16_t *ws = (int16_t *)weights;
387 float *wf = (float *)&ws[4 * 64];
391 for (i = 0; i < 4; i++) {
395 for (j = 0; j < 64; j++)
396 sum += data[j] * ws[(i << 3) + ((j >> 3) << 5) + (j & 7)];
397 t = sum * wf[i] + wf[4 + i];
398 vals[i] = t / (1.0f + FFABS(t));
401 for (i = 0; i < 4; i++) {
404 for (j = 0; j < 4; j++)
405 sum += vals[j] * wf[8 + i + (j << 2)];
406 vals[4 + i] = sum + wf[8 + 16 + i];
410 for (i = 0; i < 4; i++) {
411 if (vals[4 + i] > 0.0f)
412 mask |= (0x1 << (i << 3));
415 ((int *)d)[0] = mask;
418 static void evalfunc_0(NNEDIContext *s, FrameData *frame_data)
420 float *input = frame_data->input;
421 const float *weights0 = s->weights0;
422 float *temp = frame_data->temp;
423 uint8_t *tempu = (uint8_t *)temp;
426 // And now the actual work.
427 for (plane = 0; plane < s->nb_planes; plane++) {
428 const uint8_t *srcp = (const uint8_t *)frame_data->paddedp[plane];
429 const int src_stride = frame_data->padded_stride[plane] / sizeof(uint8_t);
431 const int width = frame_data->padded_width[plane];
432 const int height = frame_data->padded_height[plane];
434 uint8_t *dstp = (uint8_t *)frame_data->dstp[plane];
435 const int dst_stride = frame_data->dst_stride[plane] / sizeof(uint8_t);
436 const uint8_t *src3p;
440 if (!(s->process_plane & (1 << plane)))
443 for (y = 1 - frame_data->field[plane]; y < height - 12; y += 2) {
444 memcpy(dstp + y * dst_stride,
445 srcp + 32 + (6 + y) * src_stride,
446 (width - 64) * sizeof(uint8_t));
450 ystart = 6 + frame_data->field[plane];
452 srcp += ystart * src_stride;
453 dstp += (ystart - 6) * dst_stride - 32;
454 src3p = srcp - src_stride * 3;
455 lcount = frame_data->lcount[plane] - 6;
457 if (s->pscrn == 1) { // original
458 for (y = ystart; y < ystop; y += 2) {
459 for (x = 32; x < width - 32; x++) {
460 s->readpixels((const uint8_t *)(src3p + x - 5), src_stride, input);
461 s->compute_network0(s, input, weights0, tempu+x);
463 lcount[y] += s->process_line0(tempu + 32, width - 64, (uint8_t *)(dstp + 32), (const uint8_t *)(src3p + 32), src_stride, s->max_value, plane);
464 src3p += src_stride * 2;
465 dstp += dst_stride * 2;
467 } else if (s->pscrn > 1) { // new
468 for (y = ystart; y < ystop; y += 2) {
469 for (x = 32; x < width - 32; x += 4) {
470 s->readpixels((const uint8_t *)(src3p + x - 6), src_stride, input);
471 s->compute_network0(s, input, weights0, tempu + x);
473 lcount[y] += s->process_line0(tempu + 32, width - 64, (uint8_t *)(dstp + 32), (const uint8_t *)(src3p + 32), src_stride, s->max_value, plane);
474 src3p += src_stride * 2;
475 dstp += dst_stride * 2;
477 } else { // no prescreening
478 for (y = ystart; y < ystop; y += 2) {
479 memset(dstp + 32, 255, (width - 64) * sizeof(uint8_t));
480 lcount[y] += width - 64;
481 dstp += dst_stride * 2;
487 static void extract_m8(const uint8_t *srcp8, const int stride, const int xdia, const int ydia, float *mstd, float *input)
489 // uint8_t or uint16_t or float
490 const uint8_t *srcp = (const uint8_t *)srcp8;
494 // int32_t or int64_t or double
495 int64_t sum = 0, sumsq = 0;
498 for (y = 0; y < ydia; y++) {
499 const uint8_t *srcpT = srcp + y * stride * 2;
501 for (x = 0; x < xdia; x++) {
503 sumsq += (uint32_t)srcpT[x] * (uint32_t)srcpT[x];
508 scale = 1.0f / (xdia * ydia);
509 mstd[0] = sum * scale;
510 tmp = (double)sumsq * scale - (double)mstd[0] * mstd[0];
512 if (tmp <= FLT_EPSILON)
513 mstd[1] = mstd[2] = 0.0f;
516 mstd[2] = 1.0f / mstd[1];
520 static void extract_m8_i16(const uint8_t *srcp, const int stride, const int xdia, const int ydia, float *mstd, float *inputf)
522 int16_t *input = (int16_t *)inputf;
524 int sum = 0, sumsq = 0;
527 for (y = 0; y < ydia; y++) {
528 const uint8_t *srcpT = srcp + y * stride * 2;
529 for (x = 0; x < xdia; x++) {
531 sumsq += srcpT[x] * srcpT[x];
536 scale = 1.0f / (float)(xdia * ydia);
537 mstd[0] = sum * scale;
538 mstd[1] = sumsq * scale - mstd[0] * mstd[0];
540 if (mstd[1] <= FLT_EPSILON)
541 mstd[1] = mstd[2] = 0.0f;
543 mstd[1] = sqrt(mstd[1]);
544 mstd[2] = 1.0f / mstd[1];
549 static const float exp_lo = -80.0f;
550 static const float exp_hi = +80.0f;
552 static void e2_m16(float *s, const int n)
556 for (i = 0; i < n; i++)
557 s[i] = exp(av_clipf(s[i], exp_lo, exp_hi));
560 const float min_weight_sum = 1e-10f;
562 static void weighted_avg_elliott_mul5_m16(const float *w, const int n, float *mstd)
564 float vsum = 0.0f, wsum = 0.0f;
567 for (i = 0; i < n; i++) {
568 vsum += w[i] * (w[n + i] / (1.0f + FFABS(w[n + i])));
571 if (wsum > min_weight_sum)
572 mstd[3] += ((5.0f * vsum) / wsum) * mstd[1] + mstd[0];
578 static void evalfunc_1(NNEDIContext *s, FrameData *frame_data)
580 float *input = frame_data->input;
581 float *temp = frame_data->temp;
582 float **weights1 = s->weights1;
583 const int qual = s->qual;
584 const int asize = s->asize;
585 const int nns = s->nns;
586 const int xdia = s->xdia;
587 const int xdiad2m1 = (xdia / 2) - 1;
588 const int ydia = s->ydia;
589 const float scale = 1.0f / (float)qual;
592 for (plane = 0; plane < s->nb_planes; plane++) {
593 const uint8_t *srcp = (const uint8_t *)frame_data->paddedp[plane];
594 const int src_stride = frame_data->padded_stride[plane] / sizeof(uint8_t);
596 const int width = frame_data->padded_width[plane];
597 const int height = frame_data->padded_height[plane];
599 uint8_t *dstp = (uint8_t *)frame_data->dstp[plane];
600 const int dst_stride = frame_data->dst_stride[plane] / sizeof(uint8_t);
602 const int ystart = frame_data->field[plane];
603 const int ystop = height - 12;
604 const uint8_t *srcpp;
606 if (!(s->process_plane & (1 << plane)))
609 srcp += (ystart + 6) * src_stride;
610 dstp += ystart * dst_stride - 32;
611 srcpp = srcp - (ydia - 1) * src_stride - xdiad2m1;
613 for (y = ystart; y < ystop; y += 2) {
614 for (x = 32; x < width - 32; x++) {
620 s->extract((const uint8_t *)(srcpp + x), src_stride, xdia, ydia, mstd, input);
621 for (i = 0; i < qual; i++) {
622 s->dot_prod(s, input, weights1[i], temp, nns * 2, asize, mstd + 2);
623 s->expfunc(temp, nns);
624 s->wae5(temp, nns, mstd);
627 dstp[x] = FFMIN(FFMAX((int)(mstd[3] * scale + 0.5f), 0), s->max_value);
629 srcpp += src_stride * 2;
630 dstp += dst_stride * 2;
638 static int roundds(const double f)
640 if (f - floor(f) >= 0.5)
641 return FFMIN((int)ceil(f), 32767);
642 return FFMAX((int)floor(f), -32768);
645 static void select_functions(NNEDIContext *s)
647 s->copy_pad = copy_pad;
648 s->evalfunc_0 = evalfunc_0;
649 s->evalfunc_1 = evalfunc_1;
652 s->process_line0 = process_line0;
654 if (s->pscrn < 2) { // original prescreener
655 if (s->fapprox & 1) { // int16 dot products
656 s->readpixels = byte2word48;
657 s->compute_network0 = compute_network0_i16;
659 s->readpixels = pixel2float48;
660 s->compute_network0 = compute_network0;
662 } else { // new prescreener
663 // only int16 dot products
664 s->readpixels = byte2word64;
665 s->compute_network0 = compute_network0new;
669 s->wae5 = weighted_avg_elliott_mul5_m16;
671 if (s->fapprox & 2) { // use int16 dot products
672 s->extract = extract_m8_i16;
673 s->dot_prod = dot_prods;
674 } else { // use float dot products
675 s->extract = extract_m8;
676 s->dot_prod = dot_prod;
682 static int modnpf(const int m, const int n)
686 return m + n - (m % n);
689 static int get_frame(AVFilterContext *ctx, int is_second)
691 NNEDIContext *s = ctx->priv;
692 AVFilterLink *outlink = ctx->outputs[0];
693 AVFrame *src = s->src;
694 FrameData *frame_data;
695 int effective_field = s->field;
700 if (effective_field > 1)
701 effective_field -= 2;
702 else if (effective_field < 0)
703 effective_field += 2;
705 if (s->field < 0 && src->interlaced_frame && src->top_field_first == 0)
707 else if (s->field < 0 && src->interlaced_frame && src->top_field_first == 1)
710 effective_field = !effective_field;
712 if (s->field > 1 || s->field == -2) {
714 field_n = (effective_field == 0);
716 field_n = (effective_field == 1);
719 field_n = effective_field;
722 s->dst = ff_get_video_buffer(outlink, outlink->w, outlink->h);
724 return AVERROR(ENOMEM);
725 av_frame_copy_props(s->dst, src);
726 s->dst->interlaced_frame = 0;
728 frame_data = &s->frame_data;
730 for (plane = 0; plane < s->nb_planes; plane++) {
731 int dst_height = s->planeheight[plane];
732 int dst_width = s->linesize[plane];
734 const int min_alignment = 16;
735 const int min_pad = 10;
737 if (!(s->process_plane & (1 << plane))) {
738 av_image_copy_plane(s->dst->data[plane], s->dst->linesize[plane],
739 src->data[plane], src->linesize[plane],
741 s->planeheight[plane]);
745 frame_data->padded_width[plane] = dst_width + 64;
746 frame_data->padded_height[plane] = dst_height + 12;
747 frame_data->padded_stride[plane] = modnpf(frame_data->padded_width[plane] + min_pad, min_alignment); // TODO: maybe min_pad is in pixels too?
748 if (!frame_data->paddedp[plane]) {
749 frame_data->paddedp[plane] = av_malloc_array(frame_data->padded_stride[plane], frame_data->padded_height[plane]);
750 if (!frame_data->paddedp[plane])
751 return AVERROR(ENOMEM);
754 frame_data->dstp[plane] = s->dst->data[plane];
755 frame_data->dst_stride[plane] = s->dst->linesize[plane];
757 if (!frame_data->lcount[plane]) {
758 frame_data->lcount[plane] = av_calloc(dst_height, sizeof(int32_t) * 16);
759 if (!frame_data->lcount[plane])
760 return AVERROR(ENOMEM);
762 memset(frame_data->lcount[plane], 0, dst_height * sizeof(int32_t) * 16);
765 frame_data->field[plane] = field_n;
768 if (!frame_data->input) {
769 frame_data->input = av_malloc(512 * sizeof(float));
770 if (!frame_data->input)
771 return AVERROR(ENOMEM);
773 // evalfunc_0 requires at least padded_width[0] bytes.
774 // evalfunc_1 requires at least 512 floats.
775 if (!frame_data->temp) {
776 temp_size = FFMAX(frame_data->padded_width[0], 512 * sizeof(float));
777 frame_data->temp = av_malloc(temp_size);
778 if (!frame_data->temp)
779 return AVERROR(ENOMEM);
782 // Copy src to a padded "frame" in frame_data and mirror the edges.
783 s->copy_pad(src, frame_data, s, field_n);
785 // Handles prescreening and the cubic interpolation.
786 s->evalfunc_0(s, frame_data);
789 s->evalfunc_1(s, frame_data);
794 static int filter_frame(AVFilterLink *inlink, AVFrame *src)
796 AVFilterContext *ctx = inlink->dst;
797 AVFilterLink *outlink = ctx->outputs[0];
798 NNEDIContext *s = ctx->priv;
802 s->field == -2) && !s->second) {
804 } else if (s->field > 1 ||
809 ret = get_frame(ctx, 1);
811 av_frame_free(&s->dst);
812 av_frame_free(&s->src);
813 av_frame_free(&s->second);
818 if (src->pts != AV_NOPTS_VALUE &&
819 dst->pts != AV_NOPTS_VALUE)
820 dst->pts += src->pts;
822 dst->pts = AV_NOPTS_VALUE;
824 ret = ff_filter_frame(outlink, dst);
829 s->cur_pts = s->second->pts;
830 av_frame_free(&s->second);
832 if ((s->deint && src->interlaced_frame &&
833 !ctx->is_disabled) ||
834 (!s->deint && !ctx->is_disabled)) {
839 if ((s->deint && !src->interlaced_frame) || ctx->is_disabled) {
840 AVFrame *dst = av_frame_clone(src);
843 av_frame_free(&s->second);
844 return AVERROR(ENOMEM);
847 if (s->field > 1 || s->field == -2) {
848 av_frame_free(&s->second);
849 if ((s->deint && src->interlaced_frame) ||
855 if (dst->pts != AV_NOPTS_VALUE)
857 return ff_filter_frame(outlink, dst);
861 ret = get_frame(ctx, 0);
863 av_frame_free(&s->dst);
864 av_frame_free(&s->src);
865 av_frame_free(&s->second);
869 if (src->pts != AV_NOPTS_VALUE)
870 s->dst->pts = src->pts * 2;
871 if (s->field <= 1 && s->field > -2) {
876 return ff_filter_frame(outlink, s->dst);
879 static int request_frame(AVFilterLink *link)
881 AVFilterContext *ctx = link->src;
882 NNEDIContext *s = ctx->priv;
888 ret = ff_request_frame(ctx->inputs[0]);
890 if (ret == AVERROR_EOF && s->second) {
891 AVFrame *next = av_frame_clone(s->second);
894 return AVERROR(ENOMEM);
896 next->pts = s->second->pts * 2 - s->cur_pts;
899 filter_frame(ctx->inputs[0], next);
900 } else if (ret < 0) {
907 static av_cold int init(AVFilterContext *ctx)
909 NNEDIContext *s = ctx->priv;
910 FILE *weights_file = NULL;
911 int64_t expected_size = 13574928;
912 int64_t weights_size;
915 const int xdia_table[NUM_NSIZE] = { 8, 16, 32, 48, 8, 16, 32 };
916 const int ydia_table[NUM_NSIZE] = { 6, 6, 6, 6, 4, 4, 4 };
917 const int nns_table[NUM_NNS] = { 16, 32, 64, 128, 256 };
918 const int dims0 = 49 * 4 + 5 * 4 + 9 * 4;
919 const int dims0new = 4 * 65 + 4 * 5;
920 const int dims1 = nns_table[s->nnsparam] * 2 * (xdia_table[s->nsize] * ydia_table[s->nsize] + 1);
923 int ret = 0, i, j, k;
925 weights_file = fopen(s->weights_file, "rb");
927 av_log(ctx, AV_LOG_ERROR, "No weights file provided, aborting!\n");
928 return AVERROR(EINVAL);
931 if (fseek(weights_file, 0, SEEK_END)) {
932 av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of weights file.\n");
933 fclose(weights_file);
934 return AVERROR(EINVAL);
937 weights_size = ftell(weights_file);
939 if (weights_size == -1) {
940 fclose(weights_file);
941 av_log(ctx, AV_LOG_ERROR, "Couldn't get size of weights file.\n");
942 return AVERROR(EINVAL);
943 } else if (weights_size != expected_size) {
944 fclose(weights_file);
945 av_log(ctx, AV_LOG_ERROR, "Unexpected weights file size.\n");
946 return AVERROR(EINVAL);
949 if (fseek(weights_file, 0, SEEK_SET)) {
950 fclose(weights_file);
951 av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the start of weights file.\n");
952 return AVERROR(EINVAL);
955 bdata = (float *)av_malloc(expected_size);
957 fclose(weights_file);
958 return AVERROR(ENOMEM);
961 bytes_read = fread(bdata, 1, expected_size, weights_file);
963 if (bytes_read != (size_t)expected_size) {
964 fclose(weights_file);
965 ret = AVERROR_INVALIDDATA;
966 av_log(ctx, AV_LOG_ERROR, "Couldn't read weights file.\n");
970 fclose(weights_file);
972 for (j = 0; j < NUM_NNS; j++) {
973 for (i = 0; i < NUM_NSIZE; i++) {
974 if (i == s->nsize && j == s->nnsparam)
975 dims1offset = dims1tsize;
976 dims1tsize += nns_table[j] * 2 * (xdia_table[i] * ydia_table[i] + 1) * 2;
980 s->weights0 = av_malloc_array(FFMAX(dims0, dims0new), sizeof(float));
982 ret = AVERROR(ENOMEM);
986 for (i = 0; i < 2; i++) {
987 s->weights1[i] = av_malloc_array(dims1, sizeof(float));
988 if (!s->weights1[i]) {
989 ret = AVERROR(ENOMEM);
994 // Adjust prescreener weights
995 if (s->pscrn >= 2) {// using new prescreener
999 double mean[4] = { 0.0, 0.0, 0.0, 0.0 };
1000 int *offt = av_calloc(4 * 64, sizeof(int));
1003 ret = AVERROR(ENOMEM);
1007 for (j = 0; j < 4; j++)
1008 for (k = 0; k < 64; k++)
1009 offt[j * 64 + k] = ((k >> 3) << 5) + ((j & 3) << 3) + (k & 7);
1011 bdw = bdata + dims0 + dims0new * (s->pscrn - 2);
1012 ws = (int16_t *)s->weights0;
1013 wf = (float *)&ws[4 * 64];
1014 // Calculate mean weight of each first layer neuron
1015 for (j = 0; j < 4; j++) {
1017 for (k = 0; k < 64; k++)
1018 cmean += bdw[offt[j * 64 + k]];
1019 mean[j] = cmean / 64.0;
1021 // Factor mean removal and 1.0/127.5 scaling
1022 // into first layer weights. scale to int16 range
1023 for (j = 0; j < 4; j++) {
1024 double scale, mval = 0.0;
1026 for (k = 0; k < 64; k++)
1027 mval = FFMAX(mval, FFABS((bdw[offt[j * 64 + k]] - mean[j]) / 127.5));
1028 scale = 32767.0 / mval;
1029 for (k = 0; k < 64; k++)
1030 ws[offt[j * 64 + k]] = roundds(((bdw[offt[j * 64 + k]] - mean[j]) / 127.5) * scale);
1031 wf[j] = (float)(mval / 32767.0);
1033 memcpy(wf + 4, bdw + 4 * 64, (dims0new - 4 * 64) * sizeof(float));
1035 } else { // using old prescreener
1036 double mean[4] = { 0.0, 0.0, 0.0, 0.0 };
1037 // Calculate mean weight of each first layer neuron
1038 for (j = 0; j < 4; j++) {
1040 for (k = 0; k < 48; k++)
1041 cmean += bdata[j * 48 + k];
1042 mean[j] = cmean / 48.0;
1044 if (s->fapprox & 1) {// use int16 dot products in first layer
1045 int16_t *ws = (int16_t *)s->weights0;
1046 float *wf = (float *)&ws[4 * 48];
1047 // Factor mean removal and 1.0/127.5 scaling
1048 // into first layer weights. scale to int16 range
1049 for (j = 0; j < 4; j++) {
1050 double scale, mval = 0.0;
1051 for (k = 0; k < 48; k++)
1052 mval = FFMAX(mval, FFABS((bdata[j * 48 + k] - mean[j]) / 127.5));
1053 scale = 32767.0 / mval;
1054 for (k = 0; k < 48; k++)
1055 ws[j * 48 + k] = roundds(((bdata[j * 48 + k] - mean[j]) / 127.5) * scale);
1056 wf[j] = (float)(mval / 32767.0);
1058 memcpy(wf + 4, bdata + 4 * 48, (dims0 - 4 * 48) * sizeof(float));
1059 } else {// use float dot products in first layer
1060 double half = (1 << 8) - 1;
1064 // Factor mean removal and 1.0/half scaling
1065 // into first layer weights.
1066 for (j = 0; j < 4; j++)
1067 for (k = 0; k < 48; k++)
1068 s->weights0[j * 48 + k] = (float)((bdata[j * 48 + k] - mean[j]) / half);
1069 memcpy(s->weights0 + 4 * 48, bdata + 4 * 48, (dims0 - 4 * 48) * sizeof(float));
1073 // Adjust prediction weights
1074 for (i = 0; i < 2; i++) {
1075 const float *bdataT = bdata + dims0 + dims0new * 3 + dims1tsize * s->etype + dims1offset + i * dims1;
1076 const int nnst = nns_table[s->nnsparam];
1077 const int asize = xdia_table[s->nsize] * ydia_table[s->nsize];
1078 const int boff = nnst * 2 * asize;
1079 double *mean = (double *)av_calloc(asize + 1 + nnst * 2, sizeof(double));
1082 ret = AVERROR(ENOMEM);
1086 // Calculate mean weight of each neuron (ignore bias)
1087 for (j = 0; j < nnst * 2; j++) {
1089 for (k = 0; k < asize; k++)
1090 cmean += bdataT[j * asize + k];
1091 mean[asize + 1 + j] = cmean / (double)asize;
1093 // Calculate mean softmax neuron
1094 for (j = 0; j < nnst; j++) {
1095 for (k = 0; k < asize; k++)
1096 mean[k] += bdataT[j * asize + k] - mean[asize + 1 + j];
1097 mean[asize] += bdataT[boff + j];
1099 for (j = 0; j < asize + 1; j++)
1100 mean[j] /= (double)(nnst);
1102 if (s->fapprox & 2) { // use int16 dot products
1103 int16_t *ws = (int16_t *)s->weights1[i];
1104 float *wf = (float *)&ws[nnst * 2 * asize];
1105 // Factor mean removal into weights, remove global offset from
1106 // softmax neurons, and scale weights to int16 range.
1107 for (j = 0; j < nnst; j++) { // softmax neurons
1108 double scale, mval = 0.0;
1109 for (k = 0; k < asize; k++)
1110 mval = FFMAX(mval, FFABS(bdataT[j * asize + k] - mean[asize + 1 + j] - mean[k]));
1111 scale = 32767.0 / mval;
1112 for (k = 0; k < asize; k++)
1113 ws[j * asize + k] = roundds((bdataT[j * asize + k] - mean[asize + 1 + j] - mean[k]) * scale);
1114 wf[(j >> 2) * 8 + (j & 3)] = (float)(mval / 32767.0);
1115 wf[(j >> 2) * 8 + (j & 3) + 4] = (float)(bdataT[boff + j] - mean[asize]);
1117 for (j = nnst; j < nnst * 2; j++) { // elliott neurons
1118 double scale, mval = 0.0;
1119 for (k = 0; k < asize; k++)
1120 mval = FFMAX(mval, FFABS(bdataT[j * asize + k] - mean[asize + 1 + j]));
1121 scale = 32767.0 / mval;
1122 for (k = 0; k < asize; k++)
1123 ws[j * asize + k] = roundds((bdataT[j * asize + k] - mean[asize + 1 + j]) * scale);
1124 wf[(j >> 2) * 8 + (j & 3)] = (float)(mval / 32767.0);
1125 wf[(j >> 2) * 8 + (j & 3) + 4] = bdataT[boff + j];
1127 } else { // use float dot products
1128 // Factor mean removal into weights, and remove global
1129 // offset from softmax neurons.
1130 for (j = 0; j < nnst * 2; j++) {
1131 for (k = 0; k < asize; k++) {
1132 const double q = j < nnst ? mean[k] : 0.0;
1133 s->weights1[i][j * asize + k] = (float)(bdataT[j * asize + k] - mean[asize + 1 + j] - q);
1135 s->weights1[i][boff + j] = (float)(bdataT[boff + j] - (j < nnst ? mean[asize] : 0.0));
1141 s->nns = nns_table[s->nnsparam];
1142 s->xdia = xdia_table[s->nsize];
1143 s->ydia = ydia_table[s->nsize];
1144 s->asize = xdia_table[s->nsize] * ydia_table[s->nsize];
1146 s->max_value = 65535 >> 8;
1148 select_functions(s);
1150 s->fdsp = avpriv_float_dsp_alloc(0);
1152 ret = AVERROR(ENOMEM);
1159 static av_cold void uninit(AVFilterContext *ctx)
1161 NNEDIContext *s = ctx->priv;
1164 av_freep(&s->weights0);
1166 for (i = 0; i < 2; i++)
1167 av_freep(&s->weights1[i]);
1169 for (i = 0; i < s->nb_planes; i++) {
1170 av_freep(&s->frame_data.paddedp[i]);
1171 av_freep(&s->frame_data.lcount[i]);
1174 av_freep(&s->frame_data.input);
1175 av_freep(&s->frame_data.temp);
1177 av_frame_free(&s->second);
1180 static const AVFilterPad inputs[] = {
1183 .type = AVMEDIA_TYPE_VIDEO,
1184 .filter_frame = filter_frame,
1185 .config_props = config_input,
1190 static const AVFilterPad outputs[] = {
1193 .type = AVMEDIA_TYPE_VIDEO,
1194 .config_props = config_output,
1195 .request_frame = request_frame,
1200 AVFilter ff_vf_nnedi = {
1202 .description = NULL_IF_CONFIG_SMALL("Apply neural network edge directed interpolation intra-only deinterlacer."),
1203 .priv_size = sizeof(NNEDIContext),
1204 .priv_class = &nnedi_class,
1207 .query_formats = query_formats,
1210 .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,