git.sesse.net Git - ffmpeg/blob - libavfilter/vf_scale_cuda.cu

   1 /*
   2  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20  * DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 #include "cuda/vector_helpers.cuh"
  24
  25 template<typename T>
  26 __device__ inline void Subsample_Nearest(cudaTextureObject_t tex,
  27                                          T *dst,
  28                                          int dst_width, int dst_height, int dst_pitch,
  29                                          int src_width, int src_height,
  30                                          int bit_depth)
  31 {
  32     int xo = blockIdx.x * blockDim.x + threadIdx.x;
  33     int yo = blockIdx.y * blockDim.y + threadIdx.y;
  34
  35     if (yo < dst_height && xo < dst_width)
  36     {
  37         float hscale = (float)src_width / (float)dst_width;
  38         float vscale = (float)src_height / (float)dst_height;
  39         float xi = (xo + 0.5f) * hscale;
  40         float yi = (yo + 0.5f) * vscale;
  41
  42         dst[yo*dst_pitch+xo] = tex2D<T>(tex, xi, yi);
  43     }
  44 }
  45
  46 template<typename T>
  47 __device__ inline void Subsample_Bilinear(cudaTextureObject_t tex,
  48                                           T *dst,
  49                                           int dst_width, int dst_height, int dst_pitch,
  50                                           int src_width, int src_height,
  51                                           int bit_depth)
  52 {
  53     int xo = blockIdx.x * blockDim.x + threadIdx.x;
  54     int yo = blockIdx.y * blockDim.y + threadIdx.y;
  55
  56     if (yo < dst_height && xo < dst_width)
  57     {
  58         float hscale = (float)src_width / (float)dst_width;
  59         float vscale = (float)src_height / (float)dst_height;
  60         float xi = (xo + 0.5f) * hscale;
  61         float yi = (yo + 0.5f) * vscale;
  62         // 3-tap filter weights are {wh,1.0,wh} and {wv,1.0,wv}
  63         float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
  64         float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);
  65         // Convert weights to two bilinear weights -> {wh,1.0,wh} -> {wh,0.5,0} + {0,0.5,wh}
  66         float dx = wh / (0.5f + wh);
  67         float dy = wv / (0.5f + wv);
  68
  69         intT r = { 0 };
  70         vec_set_scalar(r, 2);
  71         r += tex2D<T>(tex, xi - dx, yi - dy);
  72         r += tex2D<T>(tex, xi + dx, yi - dy);
  73         r += tex2D<T>(tex, xi - dx, yi + dy);
  74         r += tex2D<T>(tex, xi + dx, yi + dy);
  75         vec_set(dst[yo*dst_pitch+xo], r >> 2);
  76     }
  77 }
  78
  79 extern "C" {
  80
  81 #define NEAREST_KERNEL(T) \
  82     __global__ void Subsample_Nearest_ ## T(cudaTextureObject_t src_tex,                  \
  83                                             T *dst,                                       \
  84                                             int dst_width, int dst_height, int dst_pitch, \
  85                                             int src_width, int src_height,                \
  86                                             int bit_depth)                                \
  87     {                                                                                     \
  88         Subsample_Nearest<T>(src_tex, dst,                                                \
  89                               dst_width, dst_height, dst_pitch,                           \
  90                               src_width, src_height,                                      \
  91                               bit_depth);                                                 \
  92     }
  93
  94 NEAREST_KERNEL(uchar)
  95 NEAREST_KERNEL(uchar2)
  96 NEAREST_KERNEL(uchar4)
  97
  98 NEAREST_KERNEL(ushort)
  99 NEAREST_KERNEL(ushort2)
 100 NEAREST_KERNEL(ushort4)
 101
 102 #define BILINEAR_KERNEL(T) \
 103     __global__ void Subsample_Bilinear_ ## T(cudaTextureObject_t src_tex,                  \
 104                                              T *dst,                                       \
 105                                              int dst_width, int dst_height, int dst_pitch, \
 106                                              int src_width, int src_height,                \
 107                                              int bit_depth)                                \
 108     {                                                                                      \
 109         Subsample_Bilinear<T>(src_tex, dst,                                                \
 110                               dst_width, dst_height, dst_pitch,                            \
 111                               src_width, src_height,                                       \
 112                               bit_depth);                                                  \
 113     }
 114
 115 BILINEAR_KERNEL(uchar)
 116 BILINEAR_KERNEL(uchar2)
 117 BILINEAR_KERNEL(uchar4)
 118
 119 BILINEAR_KERNEL(ushort)
 120 BILINEAR_KERNEL(ushort2)
 121 BILINEAR_KERNEL(ushort4)
 122
 123 }