git.sesse.net Git - ffmpeg/blob - libavfilter/vf_dnn_processing.c

   1 /*
   2  * Copyright (c) 2019 Guo Yejun
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /**
  22  * @file
  23  * implementing a generic image processing filter using deep learning networks.
  24  */
  25
  26 #include "libavformat/avio.h"
  27 #include "libavutil/opt.h"
  28 #include "libavutil/pixdesc.h"
  29 #include "libavutil/avassert.h"
  30 #include "libavutil/imgutils.h"
  31 #include "filters.h"
  32 #include "dnn_filter_common.h"
  33 #include "formats.h"
  34 #include "internal.h"
  35 #include "libswscale/swscale.h"
  36 #include "libavutil/time.h"
  37
  38 typedef struct DnnProcessingContext {
  39     const AVClass *class;
  40     DnnContext dnnctx;
  41     struct SwsContext *sws_uv_scale;
  42     int sws_uv_height;
  43 } DnnProcessingContext;
  44
  45 #define OFFSET(x) offsetof(DnnProcessingContext, dnnctx.x)
  46 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
  47 static const AVOption dnn_processing_options[] = {
  48     { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = 0 },    INT_MIN, INT_MAX, FLAGS, "backend" },
  49     { "native",      "native backend flag",        0,                        AV_OPT_TYPE_CONST,     { .i64 = 0 },    0, 0, FLAGS, "backend" },
  50 #if (CONFIG_LIBTENSORFLOW == 1)
  51     { "tensorflow",  "tensorflow backend flag",    0,                        AV_OPT_TYPE_CONST,     { .i64 = 1 },    0, 0, FLAGS, "backend" },
  52 #endif
  53 #if (CONFIG_LIBOPENVINO == 1)
  54     { "openvino",    "openvino backend flag",      0,                        AV_OPT_TYPE_CONST,     { .i64 = 2 },    0, 0, FLAGS, "backend" },
  55 #endif
  56     DNN_COMMON_OPTIONS
  57     { NULL }
  58 };
  59
  60 AVFILTER_DEFINE_CLASS(dnn_processing);
  61
  62 static av_cold int init(AVFilterContext *context)
  63 {
  64     DnnProcessingContext *ctx = context->priv;
  65     return ff_dnn_init(&ctx->dnnctx, DFT_PROCESS_FRAME, context);
  66 }
  67
  68 static int query_formats(AVFilterContext *context)
  69 {
  70     static const enum AVPixelFormat pix_fmts[] = {
  71         AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
  72         AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
  73         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
  74         AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
  75         AV_PIX_FMT_NV12,
  76         AV_PIX_FMT_NONE
  77     };
  78     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
  79     return ff_set_common_formats(context, fmts_list);
  80 }
  81
  82 #define LOG_FORMAT_CHANNEL_MISMATCH()                       \
  83     av_log(ctx, AV_LOG_ERROR,                               \
  84            "the frame's format %s does not match "          \
  85            "the model input channel %d\n",                  \
  86            av_get_pix_fmt_name(fmt),                        \
  87            model_input->channels);
  88
  89 static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLink *inlink)
  90 {
  91     AVFilterContext *ctx   = inlink->dst;
  92     enum AVPixelFormat fmt = inlink->format;
  93
  94     // the design is to add explicit scale filter before this filter
  95     if (model_input->height != -1 && model_input->height != inlink->h) {
  96         av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but got %d\n",
  97                                    model_input->height, inlink->h);
  98         return AVERROR(EIO);
  99     }
 100     if (model_input->width != -1 && model_input->width != inlink->w) {
 101         av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but got %d\n",
 102                                    model_input->width, inlink->w);
 103         return AVERROR(EIO);
 104     }
 105     if (model_input->dt != DNN_FLOAT) {
 106         avpriv_report_missing_feature(ctx, "data type rather than DNN_FLOAT");
 107         return AVERROR(EIO);
 108     }
 109
 110     switch (fmt) {
 111     case AV_PIX_FMT_RGB24:
 112     case AV_PIX_FMT_BGR24:
 113         if (model_input->channels != 3) {
 114             LOG_FORMAT_CHANNEL_MISMATCH();
 115             return AVERROR(EIO);
 116         }
 117         return 0;
 118     case AV_PIX_FMT_GRAYF32:
 119     case AV_PIX_FMT_YUV420P:
 120     case AV_PIX_FMT_YUV422P:
 121     case AV_PIX_FMT_YUV444P:
 122     case AV_PIX_FMT_YUV410P:
 123     case AV_PIX_FMT_YUV411P:
 124     case AV_PIX_FMT_NV12:
 125         if (model_input->channels != 1) {
 126             LOG_FORMAT_CHANNEL_MISMATCH();
 127             return AVERROR(EIO);
 128         }
 129         return 0;
 130     default:
 131         avpriv_report_missing_feature(ctx, "%s", av_get_pix_fmt_name(fmt));
 132         return AVERROR(EIO);
 133     }
 134
 135     return 0;
 136 }
 137
 138 static int config_input(AVFilterLink *inlink)
 139 {
 140     AVFilterContext *context     = inlink->dst;
 141     DnnProcessingContext *ctx = context->priv;
 142     DNNReturnType result;
 143     DNNData model_input;
 144     int check;
 145
 146     result = ff_dnn_get_input(&ctx->dnnctx, &model_input);
 147     if (result != DNN_SUCCESS) {
 148         av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
 149         return AVERROR(EIO);
 150     }
 151
 152     check = check_modelinput_inlink(&model_input, inlink);
 153     if (check != 0) {
 154         return check;
 155     }
 156
 157     return 0;
 158 }
 159
 160 static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
 161 {
 162     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
 163     av_assert0(desc);
 164     return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components == 3;
 165 }
 166
 167 static int prepare_uv_scale(AVFilterLink *outlink)
 168 {
 169     AVFilterContext *context = outlink->src;
 170     DnnProcessingContext *ctx = context->priv;
 171     AVFilterLink *inlink = context->inputs[0];
 172     enum AVPixelFormat fmt = inlink->format;
 173
 174     if (isPlanarYUV(fmt)) {
 175         if (inlink->w != outlink->w || inlink->h != outlink->h) {
 176             if (fmt == AV_PIX_FMT_NV12) {
 177                 ctx->sws_uv_scale = sws_getContext(inlink->w >> 1, inlink->h >> 1, AV_PIX_FMT_YA8,
 178                                                    outlink->w >> 1, outlink->h >> 1, AV_PIX_FMT_YA8,
 179                                                    SWS_BICUBIC, NULL, NULL, NULL);
 180                 ctx->sws_uv_height = inlink->h >> 1;
 181             } else {
 182                 const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
 183                 int sws_src_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
 184                 int sws_src_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
 185                 int sws_dst_h = AV_CEIL_RSHIFT(outlink->h, desc->log2_chroma_h);
 186                 int sws_dst_w = AV_CEIL_RSHIFT(outlink->w, desc->log2_chroma_w);
 187                 ctx->sws_uv_scale = sws_getContext(sws_src_w, sws_src_h, AV_PIX_FMT_GRAY8,
 188                                                    sws_dst_w, sws_dst_h, AV_PIX_FMT_GRAY8,
 189                                                    SWS_BICUBIC, NULL, NULL, NULL);
 190                 ctx->sws_uv_height = sws_src_h;
 191             }
 192         }
 193     }
 194
 195     return 0;
 196 }
 197
 198 static int config_output(AVFilterLink *outlink)
 199 {
 200     AVFilterContext *context = outlink->src;
 201     DnnProcessingContext *ctx = context->priv;
 202     DNNReturnType result;
 203     AVFilterLink *inlink = context->inputs[0];
 204
 205     // have a try run in case that the dnn model resize the frame
 206     result = ff_dnn_get_output(&ctx->dnnctx, inlink->w, inlink->h, &outlink->w, &outlink->h);
 207     if (result != DNN_SUCCESS) {
 208         av_log(ctx, AV_LOG_ERROR, "could not get output from the model\n");
 209         return AVERROR(EIO);
 210     }
 211
 212     prepare_uv_scale(outlink);
 213
 214     return 0;
 215 }
 216
 217 static int copy_uv_planes(DnnProcessingContext *ctx, AVFrame *out, const AVFrame *in)
 218 {
 219     const AVPixFmtDescriptor *desc;
 220     int uv_height;
 221
 222     if (!ctx->sws_uv_scale) {
 223         av_assert0(in->height == out->height && in->width == out->width);
 224         desc = av_pix_fmt_desc_get(in->format);
 225         uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
 226         for (int i = 1; i < 3; ++i) {
 227             int bytewidth = av_image_get_linesize(in->format, in->width, i);
 228             av_image_copy_plane(out->data[i], out->linesize[i],
 229                                 in->data[i], in->linesize[i],
 230                                 bytewidth, uv_height);
 231         }
 232     } else if (in->format == AV_PIX_FMT_NV12) {
 233         sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
 234                   0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
 235     } else {
 236         sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
 237                   0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
 238         sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 2), in->linesize + 2,
 239                   0, ctx->sws_uv_height, out->data + 2, out->linesize + 2);
 240     }
 241
 242     return 0;
 243 }
 244
 245 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 246 {
 247     AVFilterContext *context  = inlink->dst;
 248     AVFilterLink *outlink = context->outputs[0];
 249     DnnProcessingContext *ctx = context->priv;
 250     DNNReturnType dnn_result;
 251     AVFrame *out;
 252
 253     out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
 254     if (!out) {
 255         av_frame_free(&in);
 256         return AVERROR(ENOMEM);
 257     }
 258     av_frame_copy_props(out, in);
 259
 260     dnn_result = ff_dnn_execute_model(&ctx->dnnctx, in, out);
 261     if (dnn_result != DNN_SUCCESS){
 262         av_log(ctx, AV_LOG_ERROR, "failed to execute model\n");
 263         av_frame_free(&in);
 264         av_frame_free(&out);
 265         return AVERROR(EIO);
 266     }
 267
 268     if (isPlanarYUV(in->format))
 269         copy_uv_planes(ctx, out, in);
 270
 271     av_frame_free(&in);
 272     return ff_filter_frame(outlink, out);
 273 }
 274
 275 static int activate_sync(AVFilterContext *filter_ctx)
 276 {
 277     AVFilterLink *inlink = filter_ctx->inputs[0];
 278     AVFilterLink *outlink = filter_ctx->outputs[0];
 279     AVFrame *in = NULL;
 280     int64_t pts;
 281     int ret, status;
 282     int got_frame = 0;
 283
 284     FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
 285
 286     do {
 287         // drain all input frames
 288         ret = ff_inlink_consume_frame(inlink, &in);
 289         if (ret < 0)
 290             return ret;
 291         if (ret > 0) {
 292             ret = filter_frame(inlink, in);
 293             if (ret < 0)
 294                 return ret;
 295             got_frame = 1;
 296         }
 297     } while (ret > 0);
 298
 299     // if frame got, schedule to next filter
 300     if (got_frame)
 301         return 0;
 302
 303     if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
 304         if (status == AVERROR_EOF) {
 305             ff_outlink_set_status(outlink, status, pts);
 306             return ret;
 307         }
 308     }
 309
 310     FF_FILTER_FORWARD_WANTED(outlink, inlink);
 311
 312     return FFERROR_NOT_READY;
 313 }
 314
 315 static int flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
 316 {
 317     DnnProcessingContext *ctx = outlink->src->priv;
 318     int ret;
 319     DNNAsyncStatusType async_state;
 320
 321     ret = ff_dnn_flush(&ctx->dnnctx);
 322     if (ret != DNN_SUCCESS) {
 323         return -1;
 324     }
 325
 326     do {
 327         AVFrame *in_frame = NULL;
 328         AVFrame *out_frame = NULL;
 329         async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
 330         if (out_frame) {
 331             if (isPlanarYUV(in_frame->format))
 332                 copy_uv_planes(ctx, out_frame, in_frame);
 333             av_frame_free(&in_frame);
 334             ret = ff_filter_frame(outlink, out_frame);
 335             if (ret < 0)
 336                 return ret;
 337             if (out_pts)
 338                 *out_pts = out_frame->pts + pts;
 339         }
 340         av_usleep(5000);
 341     } while (async_state >= DAST_NOT_READY);
 342
 343     return 0;
 344 }
 345
 346 static int activate_async(AVFilterContext *filter_ctx)
 347 {
 348     AVFilterLink *inlink = filter_ctx->inputs[0];
 349     AVFilterLink *outlink = filter_ctx->outputs[0];
 350     DnnProcessingContext *ctx = filter_ctx->priv;
 351     AVFrame *in = NULL, *out = NULL;
 352     int64_t pts;
 353     int ret, status;
 354     int got_frame = 0;
 355     int async_state;
 356
 357     FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
 358
 359     do {
 360         // drain all input frames
 361         ret = ff_inlink_consume_frame(inlink, &in);
 362         if (ret < 0)
 363             return ret;
 364         if (ret > 0) {
 365             out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
 366             if (!out) {
 367                 av_frame_free(&in);
 368                 return AVERROR(ENOMEM);
 369             }
 370             av_frame_copy_props(out, in);
 371             if (ff_dnn_execute_model_async(&ctx->dnnctx, in, out) != DNN_SUCCESS) {
 372                 return AVERROR(EIO);
 373             }
 374         }
 375     } while (ret > 0);
 376
 377     // drain all processed frames
 378     do {
 379         AVFrame *in_frame = NULL;
 380         AVFrame *out_frame = NULL;
 381         async_state = ff_dnn_get_async_result(&ctx->dnnctx, &in_frame, &out_frame);
 382         if (out_frame) {
 383             if (isPlanarYUV(in_frame->format))
 384                 copy_uv_planes(ctx, out_frame, in_frame);
 385             av_frame_free(&in_frame);
 386             ret = ff_filter_frame(outlink, out_frame);
 387             if (ret < 0)
 388                 return ret;
 389             got_frame = 1;
 390         }
 391     } while (async_state == DAST_SUCCESS);
 392
 393     // if frame got, schedule to next filter
 394     if (got_frame)
 395         return 0;
 396
 397     if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
 398         if (status == AVERROR_EOF) {
 399             int64_t out_pts = pts;
 400             ret = flush_frame(outlink, pts, &out_pts);
 401             ff_outlink_set_status(outlink, status, out_pts);
 402             return ret;
 403         }
 404     }
 405
 406     FF_FILTER_FORWARD_WANTED(outlink, inlink);
 407
 408     return 0;
 409 }
 410
 411 static int activate(AVFilterContext *filter_ctx)
 412 {
 413     DnnProcessingContext *ctx = filter_ctx->priv;
 414
 415     if (ctx->dnnctx.async)
 416         return activate_async(filter_ctx);
 417     else
 418         return activate_sync(filter_ctx);
 419 }
 420
 421 static av_cold void uninit(AVFilterContext *ctx)
 422 {
 423     DnnProcessingContext *context = ctx->priv;
 424
 425     sws_freeContext(context->sws_uv_scale);
 426     ff_dnn_uninit(&context->dnnctx);
 427 }
 428
 429 static const AVFilterPad dnn_processing_inputs[] = {
 430     {
 431         .name         = "default",
 432         .type         = AVMEDIA_TYPE_VIDEO,
 433         .config_props = config_input,
 434     },
 435     { NULL }
 436 };
 437
 438 static const AVFilterPad dnn_processing_outputs[] = {
 439     {
 440         .name = "default",
 441         .type = AVMEDIA_TYPE_VIDEO,
 442         .config_props  = config_output,
 443     },
 444     { NULL }
 445 };
 446
 447 AVFilter ff_vf_dnn_processing = {
 448     .name          = "dnn_processing",
 449     .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing filter to the input."),
 450     .priv_size     = sizeof(DnnProcessingContext),
 451     .init          = init,
 452     .uninit        = uninit,
 453     .query_formats = query_formats,
 454     .inputs        = dnn_processing_inputs,
 455     .outputs       = dnn_processing_outputs,
 456     .priv_class    = &dnn_processing_class,
 457     .activate      = activate,
 458 };