X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fvf_scale_cuda.cu;h=44eef535fdaf814c0284261234486477a902047f;hb=d7e0d428faaa04e2fd850eca82f314ca2ad3dfe5;hp=5f5ec81989a6162afc79f84acd0ec43006c00f3e;hpb=185aa5e896e15ae96145609944bfc6bbb239bc64;p=ffmpeg diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu index 5f5ec81989a..44eef535fda 100644 --- a/libavfilter/vf_scale_cuda.cu +++ b/libavfilter/vf_scale_cuda.cu @@ -20,18 +20,14 @@ * DEALINGS IN THE SOFTWARE. */ -extern "C" { - -texture uchar_tex; -texture uchar2_tex; -texture uchar4_tex; -texture ushort_tex; -texture ushort2_tex; -texture ushort4_tex; +#include "cuda/vector_helpers.cuh" -__global__ void Subsample_Bilinear_uchar(unsigned char *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) +template +__device__ inline void Subsample_Nearest(cudaTextureObject_t tex, + T *dst, + int dst_width, int dst_height, int dst_pitch, + int src_width, int src_height, + int bit_depth) { int xo = blockIdx.x * blockDim.x + threadIdx.x; int yo = blockIdx.y * blockDim.y + threadIdx.y; @@ -42,53 +38,17 @@ __global__ void Subsample_Bilinear_uchar(unsigned char *dst, float vscale = (float)src_height / (float)dst_height; float xi = (xo + 0.5f) * hscale; float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - int y0 = tex2D(uchar_tex, xi-dx, yi-dy); - int y1 = tex2D(uchar_tex, xi+dx, yi-dy); - int y2 = tex2D(uchar_tex, xi-dx, yi+dy); - int y3 = tex2D(uchar_tex, xi+dx, yi+dy); - dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2); - } -} -__global__ void Subsample_Bilinear_uchar2(uchar2 *dst, - int dst_width, int dst_height, int dst_pitch2, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; - - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - uchar2 c0 = tex2D(uchar2_tex, xi-dx, yi-dy); - uchar2 c1 = tex2D(uchar2_tex, xi+dx, yi-dy); - uchar2 c2 = tex2D(uchar2_tex, xi-dx, yi+dy); - uchar2 c3 = tex2D(uchar2_tex, xi+dx, yi+dy); - int2 uv; - uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y); + dst[yo*dst_pitch+xo] = tex2D(tex, xi, yi); } } -__global__ void Subsample_Bilinear_uchar4(uchar4 *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) +template +__device__ inline void Subsample_Bilinear(cudaTextureObject_t tex, + T *dst, + int dst_width, int dst_height, int dst_pitch, + int src_width, int src_height, + int bit_depth) { int xo = blockIdx.x * blockDim.x + threadIdx.x; int yo = blockIdx.y * blockDim.y + threadIdx.y; @@ -105,108 +65,59 @@ __global__ void Subsample_Bilinear_uchar4(uchar4 *dst, // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} float dx = wh / (0.5f + wh); float dy = wv / (0.5f + wv); - uchar4 c0 = tex2D(uchar4_tex, xi-dx, yi-dy); - uchar4 c1 = tex2D(uchar4_tex, xi+dx, yi-dy); - uchar4 c2 = tex2D(uchar4_tex, xi-dx, yi+dy); - uchar4 c3 = tex2D(uchar4_tex, xi+dx, yi+dy); - int4 res; - res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2; - res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2; - dst[yo*dst_pitch+xo] = make_uchar4( - (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w); + + intT r = { 0 }; + vec_set_scalar(r, 2); + r += tex2D(tex, xi - dx, yi - dy); + r += tex2D(tex, xi + dx, yi - dy); + r += tex2D(tex, xi - dx, yi + dy); + r += tex2D(tex, xi + dx, yi + dy); + vec_set(dst[yo*dst_pitch+xo], r >> 2); } } -__global__ void Subsample_Bilinear_ushort(unsigned short *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +extern "C" { - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - int y0 = tex2D(ushort_tex, xi-dx, yi-dy); - int y1 = tex2D(ushort_tex, xi+dx, yi-dy); - int y2 = tex2D(ushort_tex, xi-dx, yi+dy); - int y3 = tex2D(ushort_tex, xi+dx, yi+dy); - dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2); +#define NEAREST_KERNEL(T) \ + __global__ void Subsample_Nearest_ ## T(cudaTextureObject_t src_tex, \ + T *dst, \ + int dst_width, int dst_height, int dst_pitch, \ + int src_width, int src_height, \ + int bit_depth) \ + { \ + Subsample_Nearest(src_tex, dst, \ + dst_width, dst_height, dst_pitch, \ + src_width, src_height, \ + bit_depth); \ } -} -__global__ void Subsample_Bilinear_ushort2(ushort2 *dst, - int dst_width, int dst_height, int dst_pitch2, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +NEAREST_KERNEL(uchar) +NEAREST_KERNEL(uchar2) +NEAREST_KERNEL(uchar4) - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - ushort2 c0 = tex2D(ushort2_tex, xi-dx, yi-dy); - ushort2 c1 = tex2D(ushort2_tex, xi+dx, yi-dy); - ushort2 c2 = tex2D(ushort2_tex, xi-dx, yi+dy); - ushort2 c3 = tex2D(ushort2_tex, xi+dx, yi+dy); - int2 uv; - uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y); +NEAREST_KERNEL(ushort) +NEAREST_KERNEL(ushort2) +NEAREST_KERNEL(ushort4) + +#define BILINEAR_KERNEL(T) \ + __global__ void Subsample_Bilinear_ ## T(cudaTextureObject_t src_tex, \ + T *dst, \ + int dst_width, int dst_height, int dst_pitch, \ + int src_width, int src_height, \ + int bit_depth) \ + { \ + Subsample_Bilinear(src_tex, dst, \ + dst_width, dst_height, dst_pitch, \ + src_width, src_height, \ + bit_depth); \ } -} -__global__ void Subsample_Bilinear_ushort4(ushort4 *dst, - int dst_width, int dst_height, int dst_pitch, - int src_width, int src_height) -{ - int xo = blockIdx.x * blockDim.x + threadIdx.x; - int yo = blockIdx.y * blockDim.y + threadIdx.y; +BILINEAR_KERNEL(uchar) +BILINEAR_KERNEL(uchar2) +BILINEAR_KERNEL(uchar4) - if (yo < dst_height && xo < dst_width) - { - float hscale = (float)src_width / (float)dst_width; - float vscale = (float)src_height / (float)dst_height; - float xi = (xo + 0.5f) * hscale; - float yi = (yo + 0.5f) * vscale; - // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv} - float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f); - float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f); - // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh} - float dx = wh / (0.5f + wh); - float dy = wv / (0.5f + wv); - ushort4 c0 = tex2D(ushort4_tex, xi-dx, yi-dy); - ushort4 c1 = tex2D(ushort4_tex, xi+dx, yi-dy); - ushort4 c2 = tex2D(ushort4_tex, xi-dx, yi+dy); - ushort4 c3 = tex2D(ushort4_tex, xi+dx, yi+dy); - int4 res; - res.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2; - res.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2; - res.z = ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2; - res.w = ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2; - dst[yo*dst_pitch+xo] = make_ushort4( - (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w); - } -} +BILINEAR_KERNEL(ushort) +BILINEAR_KERNEL(ushort2) +BILINEAR_KERNEL(ushort4) }