X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fvf_scale_cuda.cu;h=44eef535fdaf814c0284261234486477a902047f;hb=6f34f031908b8f16482e951ee5232116fb42b46a;hp=3f3f40546d4bca8aad298398f9b99ffcf1c57ce7;hpb=79025da3f2e7ab047c8f3c0c817952a98480b26b;p=ffmpeg diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index 3f3f40546d4..44eef535fda 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -20,12 +20,14 @@ * DEALINGS IN THE SOFTWARE. */ -extern "C" { +#include "cuda/vector_helpers.cuh" -__global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex, - unsigned char *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) +template +__device__ inline void Subsample_Nearest(cudaTextureObject_t tex, + T *dst, + int dst_width, int dst_height, int dst_pitch, + int src_width, int src_height, + int bit_depth) { int xo = blockIdx.x * blockDim.x + threadIdx.x; int yo = blockIdx.y * blockDim.y + threadIdx.y; @@ -36,55 +38,17 @@ __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex, float vscale = (float)src_height / (float)dst_height; float xi = (xo + 0.5f) * hscale; float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - int y0 = tex2D(uchar_tex, xi-dx, yi-dy); - int y1 = tex2D(uchar_tex, xi+dx, yi-dy); - int y2 = tex2D(uchar_tex, xi-dx, yi+dy); - int y3 = tex2D(uchar_tex, xi+dx, yi+dy); - dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2); - } -} - -__global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex, - uchar2 *dst, - int dst_width, int dst_height, int dst_pitch2, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - uchar2 c0 = tex2D(uchar2_tex, xi-dx, yi-dy); - uchar2 c1 = tex2D(uchar2_tex, xi+dx, yi-dy); - uchar2 c2 = tex2D(uchar2_tex, xi-dx, yi+dy); - uchar2 c3 = tex2D(uchar2_tex, xi+dx, yi+dy); - int2 uv; - uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y); + dst[yo*dst_pitch+xo] = tex2D(tex, xi, yi); } } -__global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex, - uchar4 *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) +template +__device__ inline void Subsample_Bilinear(cudaTextureObject_t tex, + T *dst, + int dst_width, int dst_height, int dst_pitch, + int src_width, int src_height, + int bit_depth) { int xo = blockIdx.x * blockDim.x + threadIdx.x; int yo = blockIdx.y * blockDim.y + threadIdx.y; @@ -101,111 +65,59 @@ __global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex, // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} float dx = wh / (0.5f + wh); float dy = wv / (0.5f + wv); - uchar4 c0 = tex2D(uchar4_tex, xi-dx, yi-dy); - uchar4 c1 = tex2D(uchar4_tex, xi+dx, yi-dy); - uchar4 c2 = tex2D(uchar4_tex, xi-dx, yi+dy); - uchar4 c3 = tex2D(uchar4_tex, xi+dx, yi+dy); - int4 res; - res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2; - res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2; - dst[yo*dst_pitch+xo] = make_uchar4( - (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w); + + intT r = { 0 }; + vec_set_scalar(r, 2); + r += tex2D(tex, xi - dx, yi - dy); + r += tex2D(tex, xi + dx, yi - dy); + r += tex2D(tex, xi - dx, yi + dy); + r += tex2D(tex, xi + dx, yi + dy); + vec_set(dst[yo*dst_pitch+xo], r >> 2); } } -__global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex, - unsigned short *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +extern "C" { - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - int y0 = tex2D(ushort_tex, xi-dx, yi-dy); - int y1 = tex2D(ushort_tex, xi+dx, yi-dy); - int y2 = tex2D(ushort_tex, xi-dx, yi+dy); - int y3 = tex2D(ushort_tex, xi+dx, yi+dy); - dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2); +#define NEAREST_KERNEL(T) \ + __global__ void Subsample_Nearest_ ## T(cudaTextureObject_t src_tex, \ + T *dst, \ + int dst_width, int dst_height, int dst_pitch, \ + int src_width, int src_height, \ + int bit_depth) \ + { \ + Subsample_Nearest(src_tex, dst, \ + dst_width, dst_height, dst_pitch, \ + src_width, src_height, \ + bit_depth); \ } -} -__global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex, - ushort2 *dst, - int dst_width, int dst_height, int dst_pitch2, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +NEAREST_KERNEL(uchar) +NEAREST_KERNEL(uchar2) +NEAREST_KERNEL(uchar4) - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - ushort2 c0 = tex2D(ushort2_tex, xi-dx, yi-dy); - ushort2 c1 = tex2D(ushort2_tex, xi+dx, yi-dy); - ushort2 c2 = tex2D(ushort2_tex, xi-dx, yi+dy); - ushort2 c3 = tex2D(ushort2_tex, xi+dx, yi+dy); - int2 uv; - uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y); +NEAREST_KERNEL(ushort) +NEAREST_KERNEL(ushort2) +NEAREST_KERNEL(ushort4) + +#define BILINEAR_KERNEL(T) \ + __global__ void Subsample_Bilinear_ ## T(cudaTextureObject_t src_tex, \ + T *dst, \ + int dst_width, int dst_height, int dst_pitch, \ + int src_width, int src_height, \ + int bit_depth) \ + { \ + Subsample_Bilinear(src_tex, dst, \ + dst_width, dst_height, dst_pitch, \ + src_width, src_height, \ + bit_depth); \ } -} -__global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex, - ushort4 *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +BILINEAR_KERNEL(uchar) +BILINEAR_KERNEL(uchar2) +BILINEAR_KERNEL(uchar4) - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - ushort4 c0 = tex2D(ushort4_tex, xi-dx, yi-dy); - ushort4 c1 = tex2D(ushort4_tex, xi+dx, yi-dy); - ushort4 c2 = tex2D(ushort4_tex, xi-dx, yi+dy); - ushort4 c3 = tex2D(ushort4_tex, xi+dx, yi+dy); - int4 res; - res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2; - res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2; - dst[yo*dst_pitch+xo] = make_ushort4( - (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w); - } -} +BILINEAR_KERNEL(ushort) +BILINEAR_KERNEL(ushort2) +BILINEAR_KERNEL(ushort4) }