X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fvf_thumbnail_cuda.c;h=0c06815643756647b648793f7d42434e190abb77;hb=ef6a9e5e311f09fa8032974fa4d0c1e166a959bb;hp=09377ca7f400c1a12972750f9a27ef78098d8f82;hpb=b4ca32414ea28ad29b4bd387c298f5a676dace2a;p=ffmpeg diff --git a/libavfilter/vf_thumbnail_cuda.c b/libavfilter/vf_thumbnail_cuda.c index 09377ca7f40..0c068156437 100644 --- a/libavfilter/vf_thumbnail_cuda.c +++ b/libavfilter/vf_thumbnail_cuda.c @@ -20,16 +20,17 @@ * DEALINGS IN THE SOFTWARE. */ -#include - #include "libavutil/hwcontext.h" #include "libavutil/hwcontext_cuda_internal.h" +#include "libavutil/cuda_check.h" #include "libavutil/opt.h" #include "libavutil/pixdesc.h" #include "avfilter.h" #include "internal.h" +#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, s->hwctx->internal->cuda_dl, x) + #define HIST_SIZE (3*256) #define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) ) #define BLOCKX 32 @@ -57,6 +58,7 @@ typedef struct ThumbnailCudaContext { AVRational tb; ///< copy of the input timebase to ease access AVBufferRef *hw_frames_ctx; + AVCUDADeviceContext *hwctx; CUmodule cu_module; @@ -64,12 +66,10 @@ typedef struct ThumbnailCudaContext { CUfunction cu_func_uchar2; CUfunction cu_func_ushort; CUfunction cu_func_ushort2; - CUtexref cu_tex_uchar; - CUtexref cu_tex_uchar2; - CUtexref cu_tex_ushort; - CUtexref cu_tex_ushort2; + CUstream cu_stream; CUdeviceptr data; + } ThumbnailCudaContext; #define OFFSET(x) offsetof(ThumbnailCudaContext, x) @@ -154,27 +154,44 @@ static AVFrame *get_best_frame(AVFilterContext *ctx) return picref; } -static int thumbnail_kernel(ThumbnailCudaContext *s, CUfunction func, CUtexref tex, int channels, +static int thumbnail_kernel(AVFilterContext *ctx, CUfunction func, int channels, int *histogram, uint8_t *src_dptr, int src_width, int src_height, int src_pitch, int pixel_size) { - CUdeviceptr src_devptr = (CUdeviceptr)src_dptr; - void *args[] = { &histogram, &src_width, &src_height }; - CUDA_ARRAY_DESCRIPTOR desc; - - desc.Width = src_width; - desc.Height = src_height; - desc.NumChannels = channels; - if (pixel_size == 1) { - desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - } - else { - desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - } + int ret; + ThumbnailCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; + CUtexObject tex = 0; + void *args[] = { &tex, &histogram, &src_width, &src_height }; - cuTexRefSetAddress2D_v3(tex, &desc, src_devptr, src_pitch); - cuLaunchKernel(func, DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, BLOCKX, BLOCKY, 1, 0, 0, args, NULL); + CUDA_TEXTURE_DESC tex_desc = { + .filterMode = CU_TR_FILTER_MODE_LINEAR, + .flags = CU_TRSF_READ_AS_INTEGER, + }; - return 0; + CUDA_RESOURCE_DESC res_desc = { + .resType = CU_RESOURCE_TYPE_PITCH2D, + .res.pitch2D.format = pixel_size == 1 ? + CU_AD_FORMAT_UNSIGNED_INT8 : + CU_AD_FORMAT_UNSIGNED_INT16, + .res.pitch2D.numChannels = channels, + .res.pitch2D.width = src_width, + .res.pitch2D.height = src_height, + .res.pitch2D.pitchInBytes = src_pitch, + .res.pitch2D.devPtr = (CUdeviceptr)src_dptr, + }; + + ret = CHECK_CU(cu->cuTexObjectCreate(&tex, &res_desc, &tex_desc, NULL)); + if (ret < 0) + goto exit; + + ret = CHECK_CU(cu->cuLaunchKernel(func, + DIV_UP(src_width, BLOCKX), DIV_UP(src_height, BLOCKY), 1, + BLOCKX, BLOCKY, 1, 0, s->cu_stream, args, NULL)); +exit: + if (tex) + CHECK_CU(cu->cuTexObjectDestroy(tex)); + + return ret; } static int thumbnail(AVFilterContext *ctx, int *histogram, AVFrame *in) @@ -184,40 +201,40 @@ static int thumbnail(AVFilterContext *ctx, int *histogram, AVFrame *in) switch (in_frames_ctx->sw_format) { case AV_PIX_FMT_NV12: - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 1); - thumbnail_kernel(s, s->cu_func_uchar2, s->cu_tex_uchar2, 2, + thumbnail_kernel(ctx, s->cu_func_uchar2, 2, histogram + 256, in->data[1], in->width / 2, in->height / 2, in->linesize[1], 1); break; case AV_PIX_FMT_YUV420P: - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 256, in->data[1], in->width / 2, in->height / 2, in->linesize[1], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 512, in->data[2], in->width / 2, in->height / 2, in->linesize[2], 1); break; case AV_PIX_FMT_YUV444P: - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 256, in->data[1], in->width, in->height, in->linesize[1], 1); - thumbnail_kernel(s, s->cu_func_uchar, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_uchar, 1, histogram + 512, in->data[2], in->width, in->height, in->linesize[2], 1); break; case AV_PIX_FMT_P010LE: case AV_PIX_FMT_P016LE: - thumbnail_kernel(s, s->cu_func_ushort, s->cu_tex_ushort, 1, + thumbnail_kernel(ctx, s->cu_func_ushort, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 2); - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_ushort2, 2, + thumbnail_kernel(ctx, s->cu_func_ushort2, 2, histogram + 256, in->data[1], in->width / 2, in->height / 2, in->linesize[1], 2); break; case AV_PIX_FMT_YUV444P16: - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_ushort2, 1, histogram, in->data[0], in->width, in->height, in->linesize[0], 2); - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_ushort2, 1, histogram + 256, in->data[1], in->width, in->height, in->linesize[1], 2); - thumbnail_kernel(s, s->cu_func_ushort2, s->cu_tex_uchar, 1, + thumbnail_kernel(ctx, s->cu_func_ushort2, 1, histogram + 512, in->data[2], in->width, in->height, in->linesize[2], 2); break; default: @@ -231,11 +248,10 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) { AVFilterContext *ctx = inlink->dst; ThumbnailCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; AVFilterLink *outlink = ctx->outputs[0]; int *hist = s->frames[s->n].histogram; AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)s->hw_frames_ctx->data; - AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx; - CUresult err; CUcontext dummy; CUDA_MEMCPY2D cpy = { 0 }; int ret = 0; @@ -243,11 +259,11 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) // keep a reference of each frame s->frames[s->n].buf = frame; - err = cuCtxPushCurrent(device_hwctx->cuda_ctx); - if (err != CUDA_SUCCESS) - return AVERROR_UNKNOWN; + ret = CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx)); + if (ret < 0) + return ret; - cuMemsetD8(s->data, 0, HIST_SIZE * sizeof(int)); + CHECK_CU(cu->cuMemsetD8Async(s->data, 0, HIST_SIZE * sizeof(int), s->cu_stream)); thumbnail(ctx, (int*)s->data, frame); @@ -260,11 +276,9 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) cpy.WidthInBytes = HIST_SIZE * sizeof(int); cpy.Height = 1; - err = cuMemcpy2D(&cpy); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); - return AVERROR_UNKNOWN; - } + ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, s->cu_stream)); + if (ret < 0) + return ret; if (hw_frames_ctx->sw_format == AV_PIX_FMT_NV12 || hw_frames_ctx->sw_format == AV_PIX_FMT_YUV420P || hw_frames_ctx->sw_format == AV_PIX_FMT_P010LE || hw_frames_ctx->sw_format == AV_PIX_FMT_P016LE) @@ -274,7 +288,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame) hist[i] = 4 * hist[i]; } - cuCtxPopCurrent(&dummy); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); if (ret < 0) return ret; @@ -290,14 +304,15 @@ static av_cold void uninit(AVFilterContext *ctx) { int i; ThumbnailCudaContext *s = ctx->priv; + CudaFunctions *cu = s->hwctx->internal->cuda_dl; if (s->data) { - cuMemFree(s->data); + CHECK_CU(cu->cuMemFree(s->data)); s->data = 0; } if (s->cu_module) { - cuModuleUnload(s->cu_module); + CHECK_CU(cu->cuModuleUnload(s->cu_module)); s->cu_module = NULL; } @@ -340,49 +355,43 @@ static int config_props(AVFilterLink *inlink) AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data; AVCUDADeviceContext *device_hwctx = hw_frames_ctx->device_ctx->hwctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; - CUresult err; + CudaFunctions *cu = device_hwctx->internal->cuda_dl; + int ret; extern char vf_thumbnail_cuda_ptx[]; - err = cuCtxPushCurrent(cuda_ctx); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error pushing cuda context\n"); - return AVERROR_UNKNOWN; - } + s->hwctx = device_hwctx; + s->cu_stream = s->hwctx->stream; - err = cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error loading module data\n"); - return AVERROR_UNKNOWN; - } + ret = CHECK_CU(cu->cuCtxPushCurrent(cuda_ctx)); + if (ret < 0) + return ret; - cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Thumbnail_uchar"); - cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Thumbnail_uchar2"); - cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Thumbnail_ushort"); - cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Thumbnail_ushort2"); - - cuModuleGetTexRef(&s->cu_tex_uchar, s->cu_module, "uchar_tex"); - cuModuleGetTexRef(&s->cu_tex_uchar2, s->cu_module, "uchar2_tex"); - cuModuleGetTexRef(&s->cu_tex_ushort, s->cu_module, "ushort_tex"); - cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"); - - cuTexRefSetFlags(s->cu_tex_uchar, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_uchar2, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_ushort, CU_TRSF_READ_AS_INTEGER); - cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER); - - cuTexRefSetFilterMode(s->cu_tex_uchar, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_uchar2, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_ushort, CU_TR_FILTER_MODE_LINEAR); - cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR); - - err = cuMemAlloc(&s->data, HIST_SIZE * sizeof(int)); - if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error allocating cuda memory\n"); - return AVERROR_UNKNOWN; - } + ret = CHECK_CU(cu->cuModuleLoadData(&s->cu_module, vf_thumbnail_cuda_ptx)); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "Thumbnail_uchar")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "Thumbnail_uchar2")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort, s->cu_module, "Thumbnail_ushort")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Thumbnail_ushort2")); + if (ret < 0) + return ret; + + ret = CHECK_CU(cu->cuMemAlloc(&s->data, HIST_SIZE * sizeof(int))); + if (ret < 0) + return ret; - cuCtxPopCurrent(&dummy); + CHECK_CU(cu->cuCtxPopCurrent(&dummy)); s->hw_frames_ctx = ctx->inputs[0]->hw_frames_ctx;