X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fvf_scale_cuda.c;h=c97a802ddc6d9380768c9e28c92fb2b4d964af75;hb=2544c7ea67ca9521c5de36396bc9ac7058223742;hp=23ac27a7dc4c0176d4e975721033a6bf92bfe7b6;hpb=ce265b0bf5d0c77a092a1f5fbeb652c7cdea5fc7;p=ffmpeg diff --git a/libavfilter/vf_scale_cuda.c b/libavfilter/vf_scale_cuda.c index 23ac27a7dc4..c97a802ddc6 100644 --- a/libavfilter/vf_scale_cuda.c +++ b/libavfilter/vf_scale_cuda.c @@ -20,7 +20,6 @@ * DEALINGS IN THE SOFTWARE. */ -#include #include #include @@ -28,6 +27,7 @@ #include "libavutil/common.h" #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" #include "libavutil/internal.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" @@ -52,8 +52,13 @@ static const enum AVPixelFormat supported_formats[] = { #define BLOCKX 32 #define BLOCKY 16 +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + typedef struct CUDAScaleContext { const AVClass *class; + + AVCUDADeviceContext *hwctx; + enum AVPixelFormat in_fmt; enum AVPixelFormat out_fmt; @@ -77,7 +82,6 @@ typedef struct CUDAScaleContext { char *h_expr; ///< height expression string CUcontext cu_ctx; - CUevent cu_event; CUmodule cu_module; CUfunction cu_func_uchar; CUfunction cu_func_uchar2; @@ -85,12 +89,7 @@ typedef struct CUDAScaleContext { CUfunction cu_func_ushort; CUfunction cu_func_ushort2; CUfunction cu_func_ushort4; - CUtexref cu_tex_uchar; - CUtexref cu_tex_uchar2; - CUtexref cu_tex_uchar4; - CUtexref cu_tex_ushort; - CUtexref cu_tex_ushort2; - CUtexref cu_tex_ushort4; + CUstream cu_stream; CUdeviceptr srcBuffer; CUdeviceptr dstBuffer; @@ -255,55 +254,49 @@ static av_cold int cudascale_config_props(AVFilterLink *outlink) AVHWFramesContext *frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; - CUresult err; + CudaFunctions *cu = device_hwctx->internal->cuda_dl; int w, h; int ret; extern char vf_scale_cuda_ptx[]; - err = cuCtxPushCurrent(cuda_ctx); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error pushing cuda context\n"); - ret = AVERROR_UNKNOWN; + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; + + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) goto fail; - } - err = cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error loading module data\n"); - ret = AVERROR_UNKNOWN; + ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx)); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar")); + if (ret < 0) goto fail; - } - cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Subsample_Bilinear_uchar"); - cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2"); - cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4"); - cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort"); - cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"); - cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"); - - cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"); - cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"); - cuModuleGetTexRef(&s->cu_tex_uchar4, s->cu_module, "uchar4_tex"); - cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"); - cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"); - cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"); - - cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_uchar4, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER); - - cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_uchar4, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR); - - cuCtxPopCurrent(&dummy); + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Subsample_Bilinear_uchar2")); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar4, s->cu_module, "Subsample_Bilinear_uchar4")); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Subsample_Bilinear_ushort")); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2")); + if (ret < 0) + goto fail; + + CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4")); + if (ret < 0) + goto fail; + + + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); if ((ret = ff_scale_eval_dimensions(s, s->w_expr, s->h_expr, @@ -339,29 +332,48 @@ fail: return ret; } -static int call_resize_kernel(CUDAScaleContext *s, CUfunction func, CUtexref tex, int channels, +static int call_resize_kernel(AVFilterContext *ctx, CUfunction func, int channels, uint8_t *src_dptr, int src_width, int src_height, int src_pitch, uint8_t *dst_dptr, int dst_width, int dst_height, int dst_pitch, int pixel_size) { - CUdeviceptr src_devptr = (CUdeviceptr)src_dptr; + CUDAScaleContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; CUdeviceptr dst_devptr = (CUdeviceptr)dst_dptr; - void *args_uchar[] = { &dst_devptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; - CUDA_ARRAY_DESCRIPTOR desc; - - desc.Width = src_width; - desc.Height = src_height; - desc.NumChannels = channels; - if (pixel_size == 1) { - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - } else { - desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - } + CUtexObject tex = 0; + void *args_uchar[] = { &tex, &dst_devptr, &dst_width, &dst_height, &dst_pitch, &src_width, &src_height }; + int ret; - cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch * pixel_size); - cuLaunchKernel(func, DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args_uchar, NULL); + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_LINEAR, + .flags = CU_TRSF_READ_AS_INTEGER, + }; - return 0; + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(dst_width, BLOCKX), DIV_UP(dst_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args_uchar, NULL)); + +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; } static int scalecuda_resize(AVFilterContext *ctx, @@ -372,59 +384,59 @@ static int scalecuda_resize(AVFilterContext *ctx, switch (in_frames_ctx->sw_format) { case AV_PIX_FMT_YUV420P: - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0], in->width, in->height, in->linesize[0], out->data[0], out->width, out->height, out->linesize[0], 1); - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0]+in->linesize[0]*in->height, in->width/2, in->height/2, in->linesize[0]/2, out->data[0]+out->linesize[0]*out->height, out->width/2, out->height/2, out->linesize[0]/2, 1); - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0]+ ALIGN_UP((in->linesize[0]*in->height*5)/4, s->tex_alignment), in->width/2, in->height/2, in->linesize[0]/2, out->data[0]+(out->linesize[0]*out->height*5)/4, out->width/2, out->height/2, out->linesize[0]/2, 1); break; case AV_PIX_FMT_YUV444P: - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0], in->width, in->height, in->linesize[0], out->data[0], out->width, out->height, out->linesize[0], 1); - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0]+in->linesize[0]*in->height, in->width, in->height, in->linesize[0], out->data[0]+out->linesize[0]*out->height, out->width, out->height, out->linesize[0], 1); - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0]+in->linesize[0]*in->height*2, in->width, in->height, in->linesize[0], out->data[0]+out->linesize[0]*out->height*2, out->width, out->height, out->linesize[0], 1); break; case AV_PIX_FMT_NV12: - call_resize_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + call_resize_kernel(ctx, s->cu_func_uchar, 1, in->data[0], in->width, in->height, in->linesize[0], out->data[0], out->width, out->height, out->linesize[0], 1); - call_resize_kernel(s, s->cu_func_uchar2, s->cu_tex_uchar2, 2, + call_resize_kernel(ctx, s->cu_func_uchar2, 2, in->data[1], in->width/2, in->height/2, in->linesize[1], out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width/2, out->height/2, out->linesize[1]/2, 1); break; case AV_PIX_FMT_P010LE: - call_resize_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, + call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[0], in->width, in->height, in->linesize[0]/2, out->data[0], out->width, out->height, out->linesize[0]/2, 2); - call_resize_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, + call_resize_kernel(ctx, s->cu_func_ushort2, 2, in->data[1], in->width / 2, in->height / 2, in->linesize[1]/2, out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, 2); break; case AV_PIX_FMT_P016LE: - call_resize_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, + call_resize_kernel(ctx, s->cu_func_ushort, 1, in->data[0], in->width, in->height, in->linesize[0] / 2, out->data[0], out->width, out->height, out->linesize[0] / 2, 2); - call_resize_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, + call_resize_kernel(ctx, s->cu_func_ushort2, 2, in->data[1], in->width / 2, in->height / 2, in->linesize[1] / 2, out->data[0] + out->linesize[0] * ((out->height + 31) & ~0x1f), out->width / 2, out->height / 2, out->linesize[1] / 4, 2); @@ -463,14 +475,12 @@ static int cudascale_scale(AVFilterContext *ctx, AVFrame *out, AVFrame *in) static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) { - AVFilterContext *ctx = link->dst; - CUDAScaleContext *s = ctx->priv; - AVFilterLink *outlink = ctx->outputs[0]; - AVHWFramesContext *frames_ctx = (AVHWFramesContext*)s->frames_ctx->data; - AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx; + AVFilterContext *ctx = link->dst; + CUDAScaleContext *s = ctx->priv; + AVFilterLink *outlink = ctx->outputs[0]; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; AVFrame *out = NULL; - CUresult err; CUcontext dummy; int ret = 0; @@ -480,15 +490,13 @@ static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in) goto fail; } - err = cuCtxPushCurrent(device_hwctx->cuda_ctx); - if (err != CUDA_SUCCESS) { - ret = AVERROR_UNKNOWN; + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) goto fail; - } ret = cudascale_scale(ctx, out, in); - cuCtxPopCurrent(&dummy); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); if (ret < 0) goto fail;