/*
 * Copyright (c) 2020 Yaroslav Pogrebnyak <yyyaroslav@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file
 * Overlay one video on top of another using CUDA hardware acceleration
 */
26 #include "libavutil/log.h"
27 #include "libavutil/mem.h"
28 #include "libavutil/opt.h"
29 #include "libavutil/pixdesc.h"
30 #include "libavutil/hwcontext.h"
31 #include "libavutil/hwcontext_cuda_internal.h"
32 #include "libavutil/cuda_check.h"
35 #include "framesync.h"
/*
 * Run a CUDA driver call through FF_CUDA_CHECK_DL.
 * A variable named `ctx` must be in scope at the call site: it is handed to
 * the checker as-is and also supplies the CUDA function table through
 * ctx->hwctx->internal->cuda_dl.
 */
#define CHECK_CU(x) \
    FF_CUDA_CHECK_DL(ctx, ctx->hwctx->internal->cuda_dl, x)

/* Integer quotient a / b rounded up; intended for positive operands. */
#define DIV_UP(a, b) (((a) + (b) - 1) / (b))
44 static const enum AVPixelFormat supported_main_formats[] = {
50 static const enum AVPixelFormat supported_overlay_formats[] = {
60 typedef struct OverlayCUDAContext {
63 enum AVPixelFormat in_format_overlay;
64 enum AVPixelFormat in_format_main;
66 AVBufferRef *hw_device_ctx;
67 AVCUDADeviceContext *hwctx;
82 * Helper to find out if provided format is supported by filter
84 static int format_is_supported(const enum AVPixelFormat formats[], enum AVPixelFormat fmt)
86 for (int i = 0; formats[i] != AV_PIX_FMT_NONE; i++)
87 if (formats[i] == fmt)
93 * Helper checks if we can process main and overlay pixel formats
95 static int formats_match(const enum AVPixelFormat format_main, const enum AVPixelFormat format_overlay) {
98 return format_overlay == AV_PIX_FMT_NV12;
99 case AV_PIX_FMT_YUV420P:
100 return format_overlay == AV_PIX_FMT_YUV420P ||
101 format_overlay == AV_PIX_FMT_YUVA420P;
108 * Call overlay kernell for a plane
110 static int overlay_cuda_call_kernel(
111 OverlayCUDAContext *ctx,
112 int x_position, int y_position,
113 uint8_t* main_data, int main_linesize,
114 int main_width, int main_height,
115 uint8_t* overlay_data, int overlay_linesize,
116 int overlay_width, int overlay_height,
117 uint8_t* alpha_data, int alpha_linesize,
118 int alpha_adj_x, int alpha_adj_y) {
120 CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
122 void* kernel_args[] = {
123 &x_position, &y_position,
124 &main_data, &main_linesize,
125 &overlay_data, &overlay_linesize,
126 &overlay_width, &overlay_height,
127 &alpha_data, &alpha_linesize,
128 &alpha_adj_x, &alpha_adj_y,
131 return CHECK_CU(cu->cuLaunchKernel(
133 DIV_UP(main_width, BLOCK_X), DIV_UP(main_height, BLOCK_Y), 1,
135 0, ctx->cu_stream, kernel_args, NULL));
139 * Perform blend overlay picture over main picture
141 static int overlay_cuda_blend(FFFrameSync *fs)
145 AVFilterContext *avctx = fs->parent;
146 OverlayCUDAContext *ctx = avctx->priv;
147 AVFilterLink *outlink = avctx->outputs[0];
149 CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
150 CUcontext dummy, cuda_ctx = ctx->hwctx->cuda_ctx;
152 AVFrame *input_main, *input_overlay;
154 ctx->cu_ctx = cuda_ctx;
156 // read main and overlay frames from inputs
157 ret = ff_framesync_dualinput_get(fs, &input_main, &input_overlay);
165 return ff_filter_frame(outlink, input_main);
167 ret = av_frame_make_writable(input_main);
169 av_frame_free(&input_main);
175 ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
177 av_frame_free(&input_main);
181 // overlay first plane
183 overlay_cuda_call_kernel(ctx,
184 ctx->x_position, ctx->y_position,
185 input_main->data[0], input_main->linesize[0],
186 input_main->width, input_main->height,
187 input_overlay->data[0], input_overlay->linesize[0],
188 input_overlay->width, input_overlay->height,
189 input_overlay->data[3], input_overlay->linesize[3], 1, 1);
191 // overlay rest planes depending on pixel format
193 switch(ctx->in_format_overlay) {
194 case AV_PIX_FMT_NV12:
195 overlay_cuda_call_kernel(ctx,
196 ctx->x_position, ctx->y_position / 2,
197 input_main->data[1], input_main->linesize[1],
198 input_main->width, input_main->height / 2,
199 input_overlay->data[1], input_overlay->linesize[1],
200 input_overlay->width, input_overlay->height / 2,
203 case AV_PIX_FMT_YUV420P:
204 case AV_PIX_FMT_YUVA420P:
205 overlay_cuda_call_kernel(ctx,
206 ctx->x_position / 2 , ctx->y_position / 2,
207 input_main->data[1], input_main->linesize[1],
208 input_main->width / 2, input_main->height / 2,
209 input_overlay->data[1], input_overlay->linesize[1],
210 input_overlay->width / 2, input_overlay->height / 2,
211 input_overlay->data[3], input_overlay->linesize[3], 2, 2);
213 overlay_cuda_call_kernel(ctx,
214 ctx->x_position / 2 , ctx->y_position / 2,
215 input_main->data[2], input_main->linesize[2],
216 input_main->width / 2, input_main->height / 2,
217 input_overlay->data[2], input_overlay->linesize[2],
218 input_overlay->width / 2, input_overlay->height / 2,
219 input_overlay->data[3], input_overlay->linesize[3], 2, 2);
222 av_log(ctx, AV_LOG_ERROR, "Passed unsupported overlay pixel format\n");
223 av_frame_free(&input_main);
224 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
228 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
230 return ff_filter_frame(outlink, input_main);
234 * Initialize overlay_cuda
236 static av_cold int overlay_cuda_init(AVFilterContext *avctx)
238 OverlayCUDAContext* ctx = avctx->priv;
239 ctx->fs.on_event = &overlay_cuda_blend;
245 * Uninitialize overlay_cuda
247 static av_cold void overlay_cuda_uninit(AVFilterContext *avctx)
249 OverlayCUDAContext* ctx = avctx->priv;
251 ff_framesync_uninit(&ctx->fs);
253 if (ctx->hwctx && ctx->cu_module) {
255 CudaFunctions *cu = ctx->hwctx->internal->cuda_dl;
256 CHECK_CU(cu->cuCtxPushCurrent(ctx->cu_ctx));
257 CHECK_CU(cu->cuModuleUnload(ctx->cu_module));
258 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
261 av_buffer_unref(&ctx->hw_device_ctx);
266 * Activate overlay_cuda
268 static int overlay_cuda_activate(AVFilterContext *avctx)
270 OverlayCUDAContext *ctx = avctx->priv;
272 return ff_framesync_activate(&ctx->fs);
278 static int overlay_cuda_query_formats(AVFilterContext *avctx)
280 static const enum AVPixelFormat pixel_formats[] = {
281 AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE,
284 AVFilterFormats *pix_fmts = ff_make_format_list(pixel_formats);
286 return ff_set_common_formats(avctx, pix_fmts);
292 static int overlay_cuda_config_output(AVFilterLink *outlink)
295 extern char vf_overlay_cuda_ptx[];
298 AVFilterContext* avctx = outlink->src;
299 OverlayCUDAContext* ctx = avctx->priv;
301 AVFilterLink *inlink = avctx->inputs[0];
302 AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
304 AVFilterLink *inlink_overlay = avctx->inputs[1];
305 AVHWFramesContext *frames_ctx_overlay = (AVHWFramesContext*)inlink_overlay->hw_frames_ctx->data;
307 CUcontext dummy, cuda_ctx;
310 // check main input formats
313 av_log(ctx, AV_LOG_ERROR, "No hw context provided on main input\n");
314 return AVERROR(EINVAL);
317 ctx->in_format_main = frames_ctx->sw_format;
318 if (!format_is_supported(supported_main_formats, ctx->in_format_main)) {
319 av_log(ctx, AV_LOG_ERROR, "Unsupported main input format: %s\n",
320 av_get_pix_fmt_name(ctx->in_format_main));
321 return AVERROR(ENOSYS);
324 // check overlay input formats
326 if (!frames_ctx_overlay) {
327 av_log(ctx, AV_LOG_ERROR, "No hw context provided on overlay input\n");
328 return AVERROR(EINVAL);
331 ctx->in_format_overlay = frames_ctx_overlay->sw_format;
332 if (!format_is_supported(supported_overlay_formats, ctx->in_format_overlay)) {
333 av_log(ctx, AV_LOG_ERROR, "Unsupported overlay input format: %s\n",
334 av_get_pix_fmt_name(ctx->in_format_overlay));
335 return AVERROR(ENOSYS);
338 // check we can overlay pictures with those pixel formats
340 if (!formats_match(ctx->in_format_main, ctx->in_format_overlay)) {
341 av_log(ctx, AV_LOG_ERROR, "Can't overlay %s on %s \n",
342 av_get_pix_fmt_name(ctx->in_format_overlay), av_get_pix_fmt_name(ctx->in_format_main));
343 return AVERROR(EINVAL);
348 ctx->hw_device_ctx = av_buffer_ref(frames_ctx->device_ref);
349 if (!ctx->hw_device_ctx)
350 return AVERROR(ENOMEM);
351 ctx->hwctx = ((AVHWDeviceContext*)ctx->hw_device_ctx->data)->hwctx;
353 cuda_ctx = ctx->hwctx->cuda_ctx;
354 ctx->fs.time_base = inlink->time_base;
356 ctx->cu_stream = ctx->hwctx->stream;
358 outlink->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
359 if (!outlink->hw_frames_ctx)
360 return AVERROR(ENOMEM);
364 cu = ctx->hwctx->internal->cuda_dl;
366 err = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx));
371 err = CHECK_CU(cu->cuModuleLoadData(&ctx->cu_module, vf_overlay_cuda_ptx));
373 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
377 err = CHECK_CU(cu->cuModuleGetFunction(&ctx->cu_func, ctx->cu_module, "Overlay_Cuda"));
379 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
383 CHECK_CU(cu->cuCtxPopCurrent(&dummy));
387 err = ff_framesync_init_dualinput(&ctx->fs, avctx);
392 return ff_framesync_configure(&ctx->fs);
396 #define OFFSET(x) offsetof(OverlayCUDAContext, x)
397 #define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
399 static const AVOption overlay_cuda_options[] = {
400 { "x", "Overlay x position",
401 OFFSET(x_position), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, .flags = FLAGS },
402 { "y", "Overlay y position",
403 OFFSET(y_position), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, .flags = FLAGS },
404 { "eof_action", "Action to take when encountering EOF from secondary input ",
405 OFFSET(fs.opt_eof_action), AV_OPT_TYPE_INT, { .i64 = EOF_ACTION_REPEAT },
406 EOF_ACTION_REPEAT, EOF_ACTION_PASS, .flags = FLAGS, "eof_action" },
407 { "repeat", "Repeat the previous frame.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_REPEAT }, .flags = FLAGS, "eof_action" },
408 { "endall", "End both streams.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_ENDALL }, .flags = FLAGS, "eof_action" },
409 { "pass", "Pass through the main input.", 0, AV_OPT_TYPE_CONST, { .i64 = EOF_ACTION_PASS }, .flags = FLAGS, "eof_action" },
410 { "shortest", "force termination when the shortest input terminates", OFFSET(fs.opt_shortest), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, FLAGS },
411 { "repeatlast", "repeat overlay of the last overlay frame", OFFSET(fs.opt_repeatlast), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS },
415 FRAMESYNC_DEFINE_CLASS(overlay_cuda, OverlayCUDAContext, fs);
417 static const AVFilterPad overlay_cuda_inputs[] = {
420 .type = AVMEDIA_TYPE_VIDEO,
424 .type = AVMEDIA_TYPE_VIDEO,
429 static const AVFilterPad overlay_cuda_outputs[] = {
432 .type = AVMEDIA_TYPE_VIDEO,
433 .config_props = &overlay_cuda_config_output,
438 const AVFilter ff_vf_overlay_cuda = {
439 .name = "overlay_cuda",
440 .description = NULL_IF_CONFIG_SMALL("Overlay one video on top of another using CUDA"),
441 .priv_size = sizeof(OverlayCUDAContext),
442 .priv_class = &overlay_cuda_class,
443 .init = &overlay_cuda_init,
444 .uninit = &overlay_cuda_uninit,
445 .activate = &overlay_cuda_activate,
446 .query_formats = &overlay_cuda_query_formats,
447 .inputs = overlay_cuda_inputs,
448 .outputs = overlay_cuda_outputs,
449 .preinit = overlay_cuda_framesync_preinit,
450 .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,