X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavutil%2Fhwcontext_cuda.c;h=3b1d53e7995463f8bbcfed307720961de37b05e6;hb=be0b77e6e83b61c2da338201b5ddfae1c9acedc5;hp=37827a770c3a621234e75da9d8a07b7893f6ae4f;hpb=2f96190732d15510ba29471fa45d66841c0c3df1;p=ffmpeg diff --git a/libavutil/hwcontext_cuda.c b/libavutil/hwcontext_cuda.c index 37827a770c3..3b1d53e7995 100644 --- a/libavutil/hwcontext_cuda.c +++ b/libavutil/hwcontext_cuda.c @@ -24,6 +24,7 @@ #include "mem.h" #include "pixdesc.h" #include "pixfmt.h" +#include "imgutils.h" #define CUDA_FRAME_ALIGNMENT 256 @@ -38,6 +39,8 @@ static const enum AVPixelFormat supported_formats[] = { AV_PIX_FMT_P010, AV_PIX_FMT_P016, AV_PIX_FMT_YUV444P16, + AV_PIX_FMT_0RGB32, + AV_PIX_FMT_0BGR32, }; static int cuda_frames_get_constraints(AVHWDeviceContext *ctx, @@ -115,7 +118,6 @@ fail: static int cuda_frames_init(AVHWFramesContext *ctx) { CUDAFramesContext *priv = ctx->internal->priv; - int aligned_width = FFALIGN(ctx->width, CUDA_FRAME_ALIGNMENT); int i; for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) { @@ -131,25 +133,9 @@ static int cuda_frames_init(AVHWFramesContext *ctx) av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height); if (!ctx->pool) { - int size; - - switch (ctx->sw_format) { - case AV_PIX_FMT_NV12: - case AV_PIX_FMT_YUV420P: - size = aligned_width * ctx->height * 3 / 2; - break; - case AV_PIX_FMT_YUV444P: - case AV_PIX_FMT_P010: - case AV_PIX_FMT_P016: - size = aligned_width * ctx->height * 3; - break; - case AV_PIX_FMT_YUV444P16: - size = aligned_width * ctx->height * 6; - break; - default: - av_log(ctx, AV_LOG_ERROR, "BUG: Pixel format missing from size calculation."); - return AVERROR_BUG; - } + int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT); + if (size < 0) + return size; ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL); if (!ctx->internal->pool_internal) @@ -161,49 +147,23 @@ static int cuda_frames_init(AVHWFramesContext *ctx) static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) { - int aligned_width; - int width_in_bytes = ctx->width; - - if (ctx->sw_format == AV_PIX_FMT_P010 || - ctx->sw_format == AV_PIX_FMT_P016 || - ctx->sw_format == AV_PIX_FMT_YUV444P16) { - width_in_bytes *= 2; - } - aligned_width = FFALIGN(width_in_bytes, CUDA_FRAME_ALIGNMENT); + int res; frame->buf[0] = av_buffer_pool_get(ctx->pool); if (!frame->buf[0]) return AVERROR(ENOMEM); - switch (ctx->sw_format) { - case AV_PIX_FMT_NV12: - case AV_PIX_FMT_P010: - case AV_PIX_FMT_P016: - frame->data[0] = frame->buf[0]->data; - frame->data[1] = frame->data[0] + aligned_width * ctx->height; - frame->linesize[0] = aligned_width; - frame->linesize[1] = aligned_width; - break; - case AV_PIX_FMT_YUV420P: - frame->data[0] = frame->buf[0]->data; - frame->data[2] = frame->data[0] + aligned_width * ctx->height; - frame->data[1] = frame->data[2] + aligned_width * ctx->height / 4; - frame->linesize[0] = aligned_width; - frame->linesize[1] = aligned_width / 2; - frame->linesize[2] = aligned_width / 2; - break; - case AV_PIX_FMT_YUV444P: - case AV_PIX_FMT_YUV444P16: - frame->data[0] = frame->buf[0]->data; - frame->data[1] = frame->data[0] + aligned_width * ctx->height; - frame->data[2] = frame->data[1] + aligned_width * ctx->height; - frame->linesize[0] = aligned_width; - frame->linesize[1] = aligned_width; - frame->linesize[2] = aligned_width; - break; - default: - av_frame_unref(frame); - return AVERROR_BUG; + res = 
av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data, + ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT); + if (res < 0) + return res; + + // YUV420P is a special case. + // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned + if (ctx->sw_format == AV_PIX_FMT_YUV420P) { + frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2; + frame->data[2] = frame->data[1]; + frame->data[1] = frame->data[2] + frame->linesize[2] * ctx->height / 2; } frame->format = AV_PIX_FMT_CUDA; @@ -258,13 +218,19 @@ static int cuda_transfer_data_from(AVHWFramesContext *ctx, AVFrame *dst, .Height = src->height >> (i ? priv->shift_height : 0), }; - err = cu->cuMemcpy2D(&cpy); + err = cu->cuMemcpy2DAsync(&cpy, device_hwctx->stream); if (err != CUDA_SUCCESS) { av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); return AVERROR_UNKNOWN; } } + err = cu->cuStreamSynchronize(device_hwctx->stream); + if (err != CUDA_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Error synchronizing CUDA stream\n"); + return AVERROR_UNKNOWN; + } + cu->cuCtxPopCurrent(&dummy); return 0; @@ -297,13 +263,19 @@ static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst, .Height = src->height >> (i ? priv->shift_height : 0), }; - err = cu->cuMemcpy2D(&cpy); + err = cu->cuMemcpy2DAsync(&cpy, device_hwctx->stream); if (err != CUDA_SUCCESS) { - av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); + av_log(ctx, AV_LOG_ERROR, "Error transferring the data to the CUDA frame\n"); return AVERROR_UNKNOWN; } } + err = cu->cuStreamSynchronize(device_hwctx->stream); + if (err != CUDA_SUCCESS) { + av_log(ctx, AV_LOG_ERROR, "Error synchronizing CUDA stream\n"); + return AVERROR_UNKNOWN; + } + cu->cuCtxPopCurrent(&dummy); return 0; @@ -386,6 +358,9 @@ static int cuda_device_create(AVHWDeviceContext *ctx, const char *device, goto error; } + // Setting stream to NULL will make functions automatically use the default CUstream + hwctx->stream = NULL; + cu->cuCtxPopCurrent(&dummy); hwctx->internal->is_allocated = 1;
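
As a reference (not part of the patch): a minimal sketch of how a caller reaches the code paths this change touches. av_hwframe_ctx_init() ends up in cuda_frames_init(), whose pool size now comes from av_image_get_buffer_size(); av_hwframe_get_buffer() ends up in cuda_get_buffer(), which now lays out the planes with av_image_fill_arrays(); and av_hwframe_transfer_data() ends up in cuda_transfer_data_from()/_to(), which now issue cuMemcpy2DAsync() on hwctx->stream and synchronize before returning. The function name, resolution and sw_format below are illustrative only.

#include <libavutil/error.h>
#include <libavutil/frame.h>
#include <libavutil/hwcontext.h>
#include <libavutil/pixfmt.h>

int cuda_upload_download(void)
{
    AVBufferRef *device_ref = NULL, *frames_ref = NULL;
    AVFrame *hw = NULL, *sw = NULL;
    int err;

    /* cuda_device_create(): hwctx->stream is left at NULL, i.e. the default CUDA stream. */
    err = av_hwdevice_ctx_create(&device_ref, AV_HWDEVICE_TYPE_CUDA, NULL, NULL, 0);
    if (err < 0)
        goto fail;

    frames_ref = av_hwframe_ctx_alloc(device_ref);
    hw = av_frame_alloc();
    sw = av_frame_alloc();
    if (!frames_ref || !hw || !sw) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    {
        AVHWFramesContext *frames = (AVHWFramesContext *)frames_ref->data;
        frames->format    = AV_PIX_FMT_CUDA;
        frames->sw_format = AV_PIX_FMT_NV12;  /* any entry of supported_formats[] */
        frames->width     = 1920;             /* illustrative resolution */
        frames->height    = 1080;
    }

    /* cuda_frames_init(): internal pool size now computed by av_image_get_buffer_size(). */
    err = av_hwframe_ctx_init(frames_ref);
    if (err < 0)
        goto fail;

    /* cuda_get_buffer(): plane pointers and linesizes now filled by av_image_fill_arrays(). */
    err = av_hwframe_get_buffer(frames_ref, hw, 0);
    if (err < 0)
        goto fail;

    /* cuda_transfer_data_from(): cuMemcpy2DAsync() on hwctx->stream, then cuStreamSynchronize(). */
    sw->format = AV_PIX_FMT_NV12;
    err = av_hwframe_transfer_data(sw, hw, 0);

fail:
    av_frame_free(&sw);
    av_frame_free(&hw);
    av_buffer_unref(&frames_ref);
    av_buffer_unref(&device_ref);
    return err;
}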