X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Fvf_scale_cuda.cu;h=44eef535fdaf814c0284261234486477a902047f;hb=6f34f031908b8f16482e951ee5232116fb42b46a;hp=3f3f40546d4bca8aad298398f9b99ffcf1c57ce7;hpb=79025da3f2e7ab047c8f3c0c817952a98480b26b;p=ffmpeg

diff --git a/libavfilter/vf_scale_cuda.cu b/libavfilter/vf_scale_cuda.cu
index 3f3f40546d4..44eef535fda 100644
--- a/libavfilter/vf_scale_cuda.cu
+++ b/libavfilter/vf_scale_cuda.cu
@@ -20,12 +20,14 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-extern "C" {
+#include "cuda/vector_helpers.cuh"
 
-__global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
-                                    unsigned char *dst,
-                                    int dst_width, int dst_height, int dst_pitch,
-                                    int src_width, int src_height)
+template<typename T>
+__device__ inline void Subsample_Nearest(cudaTextureObject_t tex,
+                                         T *dst,
+                                         int dst_width, int dst_height, int dst_pitch,
+                                         int src_width, int src_height,
+                                         int bit_depth)
 {
     int xo = blockIdx.x * blockDim.x + threadIdx.x;
     int yo = blockIdx.y * blockDim.y + threadIdx.y;
@@ -36,55 +38,17 @@ __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
         float vscale = (float)src_height / (float)dst_height;
         float xi = (xo + 0.5f) * hscale;
         float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        int y0 = tex2D<unsigned char>(uchar_tex, xi-dx, yi-dy);
-        int y1 = tex2D<unsigned char>(uchar_tex, xi+dx, yi-dy);
-        int y2 = tex2D<unsigned char>(uchar_tex, xi-dx, yi+dy);
-        int y3 = tex2D<unsigned char>(uchar_tex, xi+dx, yi+dy);
-        dst[yo*dst_pitch+xo] = (unsigned char)((y0+y1+y2+y3+2) >> 2);
-    }
-}
-
-__global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex,
-                                    uchar2 *dst,
-                                    int dst_width, int dst_height, int dst_pitch2,
-                                    int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        uchar2 c0 = tex2D<uchar2>(uchar2_tex, xi-dx, yi-dy);
-        uchar2 c1 = tex2D<uchar2>(uchar2_tex, xi+dx, yi-dy);
-        uchar2 c2 = tex2D<uchar2>(uchar2_tex, xi-dx, yi+dy);
-        uchar2 c3 = tex2D<uchar2>(uchar2_tex, xi+dx, yi+dy);
-        int2 uv;
-        uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        dst[yo*dst_pitch2+xo] = make_uchar2((unsigned char)uv.x, (unsigned char)uv.y);
+        dst[yo*dst_pitch+xo] = tex2D<T>(tex, xi, yi);
     }
 }
 
-__global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
-                                    uchar4 *dst,
-                                    int dst_width, int dst_height, int dst_pitch,
-                                    int src_width, int src_height)
+template<typename T>
+__device__ inline void Subsample_Bilinear(cudaTextureObject_t tex,
+                                          T *dst,
+                                          int dst_width, int dst_height, int dst_pitch,
+                                          int src_width, int src_height,
+                                          int bit_depth)
 {
     int xo = blockIdx.x * blockDim.x + threadIdx.x;
     int yo = blockIdx.y * blockDim.y + threadIdx.y;
@@ -101,111 +65,59 @@ __global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
         // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
         float dx = wh / (0.5f + wh);
         float dy = wv / (0.5f + wv);
-        uchar4 c0 = tex2D<uchar4>(uchar4_tex, xi-dx, yi-dy);
-        uchar4 c1 = tex2D<uchar4>(uchar4_tex, xi+dx, yi-dy);
-        uchar4 c2 = tex2D<uchar4>(uchar4_tex, xi-dx, yi+dy);
-        uchar4 c3 = tex2D<uchar4>(uchar4_tex, xi+dx, yi+dy);
-        int4 res;
-        res.x =  ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        res.y =  ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        res.z =  ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
-        res.w =  ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
-        dst[yo*dst_pitch+xo] = make_uchar4(
-            (unsigned char)res.x, (unsigned char)res.y, (unsigned char)res.z, (unsigned char)res.w);
+
+        intT r = { 0 };
+        vec_set_scalar(r, 2);
+        r += tex2D<T>(tex, xi - dx, yi - dy);
+        r += tex2D<T>(tex, xi + dx, yi - dy);
+        r += tex2D<T>(tex, xi - dx, yi + dy);
+        r += tex2D<T>(tex, xi + dx, yi + dy);
+        vec_set(dst[yo*dst_pitch+xo], r >> 2);
     }
 }
 
-__global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex,
-                                    unsigned short *dst,
-                                    int dst_width, int dst_height, int dst_pitch,
-                                    int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+extern "C" {
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        int y0 = tex2D<unsigned short>(ushort_tex, xi-dx, yi-dy);
-        int y1 = tex2D<unsigned short>(ushort_tex, xi+dx, yi-dy);
-        int y2 = tex2D<unsigned short>(ushort_tex, xi-dx, yi+dy);
-        int y3 = tex2D<unsigned short>(ushort_tex, xi+dx, yi+dy);
-        dst[yo*dst_pitch+xo] = (unsigned short)((y0+y1+y2+y3+2) >> 2);
+#define NEAREST_KERNEL(T) \
+    __global__ void Subsample_Nearest_ ## T(cudaTextureObject_t src_tex,                  \
+                                            T *dst,                                       \
+                                            int dst_width, int dst_height, int dst_pitch, \
+                                            int src_width, int src_height,                \
+                                            int bit_depth)                                \
+    {                                                                                     \
+        Subsample_Nearest<T>(src_tex, dst,                                                \
+                              dst_width, dst_height, dst_pitch,                           \
+                              src_width, src_height,                                      \
+                              bit_depth);                                                 \
     }
-}
 
-__global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex,
-                                    ushort2 *dst,
-                                    int dst_width, int dst_height, int dst_pitch2,
-                                    int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+NEAREST_KERNEL(uchar)
+NEAREST_KERNEL(uchar2)
+NEAREST_KERNEL(uchar4)
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        ushort2 c0 = tex2D<ushort2>(ushort2_tex, xi-dx, yi-dy);
-        ushort2 c1 = tex2D<ushort2>(ushort2_tex, xi+dx, yi-dy);
-        ushort2 c2 = tex2D<ushort2>(ushort2_tex, xi-dx, yi+dy);
-        ushort2 c3 = tex2D<ushort2>(ushort2_tex, xi+dx, yi+dy);
-        int2 uv;
-        uv.x = ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        uv.y = ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        dst[yo*dst_pitch2+xo] = make_ushort2((unsigned short)uv.x, (unsigned short)uv.y);
+NEAREST_KERNEL(ushort)
+NEAREST_KERNEL(ushort2)
+NEAREST_KERNEL(ushort4)
+
+#define BILINEAR_KERNEL(T) \
+    __global__ void Subsample_Bilinear_ ## T(cudaTextureObject_t src_tex,                  \
+                                             T *dst,                                       \
+                                             int dst_width, int dst_height, int dst_pitch, \
+                                             int src_width, int src_height,                \
+                                             int bit_depth)                                \
+    {                                                                                      \
+        Subsample_Bilinear<T>(src_tex, dst,                                                \
+                              dst_width, dst_height, dst_pitch,                            \
+                              src_width, src_height,                                       \
+                              bit_depth);                                                  \
     }
-}
 
-__global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex,
-                                    ushort4 *dst,
-                                    int dst_width, int dst_height, int dst_pitch,
-                                    int src_width, int src_height)
-{
-    int xo = blockIdx.x * blockDim.x + threadIdx.x;
-    int yo = blockIdx.y * blockDim.y + threadIdx.y;
+BILINEAR_KERNEL(uchar)
+BILINEAR_KERNEL(uchar2)
+BILINEAR_KERNEL(uchar4)
 
-    if (yo < dst_height && xo < dst_width)
-    {
-        float hscale = (float)src_width / (float)dst_width;
-        float vscale = (float)src_height / (float)dst_height;
-        float xi = (xo + 0.5f) * hscale;
-        float yi = (yo + 0.5f) * vscale;
-        // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
-        float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
-        float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
-        // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
-        float dx = wh / (0.5f + wh);
-        float dy = wv / (0.5f + wv);
-        ushort4 c0 = tex2D<ushort4>(ushort4_tex, xi-dx, yi-dy);
-        ushort4 c1 = tex2D<ushort4>(ushort4_tex, xi+dx, yi-dy);
-        ushort4 c2 = tex2D<ushort4>(ushort4_tex, xi-dx, yi+dy);
-        ushort4 c3 = tex2D<ushort4>(ushort4_tex, xi+dx, yi+dy);
-        int4 res;
-        res.x =  ((int)c0.x+(int)c1.x+(int)c2.x+(int)c3.x+2) >> 2;
-        res.y =  ((int)c0.y+(int)c1.y+(int)c2.y+(int)c3.y+2) >> 2;
-        res.z =  ((int)c0.z+(int)c1.z+(int)c2.z+(int)c3.z+2) >> 2;
-        res.w =  ((int)c0.w+(int)c1.w+(int)c2.w+(int)c3.w+2) >> 2;
-        dst[yo*dst_pitch+xo] = make_ushort4(
-            (unsigned short)res.x, (unsigned short)res.y, (unsigned short)res.z, (unsigned short)res.w);
-    }
-}
+BILINEAR_KERNEL(ushort)
+BILINEAR_KERNEL(ushort2)
+BILINEAR_KERNEL(ushort4)
 
 }