git.sesse.net Git - ffmpeg/blob - libavfilter/vf_scale_cuda.cu

   1 /*
   2  * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  20  * DEALINGS IN THE SOFTWARE.
  21  */
  22
  23 extern "C" {
  24
  /*
   * Downscale a single-channel 8-bit plane. Each thread of a 2D launch
   * produces one destination pixel (x index -> column, y index -> row);
   * threads that land outside dst_width x dst_height do nothing.
   *
   * uchar_tex   texture object the source samples are fetched from
   * dst         destination plane
   * dst_width   destination width in pixels
   * dst_height  destination height in pixels
   * dst_pitch   distance between destination rows, counted in dst elements
   * src_width   source width in pixels
   * src_height  source height in pixels
   */
  __global__ void Subsample_Bilinear_uchar(cudaTextureObject_t uchar_tex,
                                           unsigned char *dst,
                                           int dst_width, int dst_height, int dst_pitch,
                                           int src_width, int src_height)
  {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;

      // Threads in the overhang of the launch grid have no pixel to write.
      if (row >= dst_height || col >= dst_width)
          return;

      // Source/destination size ratio along each axis.
      const float ratio_x = (float)src_width / (float)dst_width;
      const float ratio_y = (float)src_height / (float)dst_height;

      // Centre of this destination pixel, expressed in source coordinates.
      const float centre_x = (col + 0.5f) * ratio_x;
      const float centre_y = (row + 0.5f) * ratio_y;

      // Outer weights of the {w, 1.0, w} 3-tap filter, clamped to [0, 1].
      // They are 0 whenever the ratio is <= 1.
      const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
      const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);

      // {w, 1.0, w} is split into {w, 0.5, 0} + {0, 0.5, w}; each half is
      // covered by one fetch displaced by w / (0.5 + w) from the centre.
      const float shift_x = tap_x / (0.5f + tap_x);
      const float shift_y = tap_y / (0.5f + tap_y);

      const float left   = centre_x - shift_x;
      const float right  = centre_x + shift_x;
      const float top    = centre_y - shift_y;
      const float bottom = centre_y + shift_y;

      // Start at 2 so that the final >> 2 rounds to nearest instead of
      // truncating the four-sample average.
      int acc = 2;
      acc += tex2D<unsigned char>(uchar_tex, left,  top);
      acc += tex2D<unsigned char>(uchar_tex, right, top);
      acc += tex2D<unsigned char>(uchar_tex, left,  bottom);
      acc += tex2D<unsigned char>(uchar_tex, right, bottom);

      dst[row * dst_pitch + col] = (unsigned char)(acc >> 2);
  }
  52
  /*
   * Downscale a two-channel 8-bit plane. Each thread of a 2D launch
   * produces one destination uchar2 element (x index -> column,
   * y index -> row); threads outside dst_width x dst_height do nothing.
   *
   * uchar2_tex  texture object the source samples are fetched from
   * dst         destination plane
   * dst_width   destination width in uchar2 elements
   * dst_height  destination height in rows
   * dst_pitch2  distance between destination rows, counted in uchar2 elements
   * src_width   source width
   * src_height  source height
   */
  __global__ void Subsample_Bilinear_uchar2(cudaTextureObject_t uchar2_tex,
                                            uchar2 *dst,
                                            int dst_width, int dst_height, int dst_pitch2,
                                            int src_width, int src_height)
  {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;

      // Nothing to do for threads past the right or bottom edge.
      if (row >= dst_height || col >= dst_width)
          return;

      // Source/destination size ratio along each axis.
      const float ratio_x = (float)src_width / (float)dst_width;
      const float ratio_y = (float)src_height / (float)dst_height;

      // Centre of this destination element in source coordinates.
      const float centre_x = (col + 0.5f) * ratio_x;
      const float centre_y = (row + 0.5f) * ratio_y;

      // Outer weights of the {w, 1.0, w} 3-tap filter, clamped to [0, 1].
      const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
      const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);

      // Two-fetch form of the 3-tap filter:
      // {w, 1.0, w} -> {w, 0.5, 0} + {0, 0.5, w}, offset w / (0.5 + w).
      const float shift_x = tap_x / (0.5f + tap_x);
      const float shift_y = tap_y / (0.5f + tap_y);

      const float left   = centre_x - shift_x;
      const float right  = centre_x + shift_x;
      const float top    = centre_y - shift_y;
      const float bottom = centre_y + shift_y;

      const uchar2 tl = tex2D<uchar2>(uchar2_tex, left,  top);
      const uchar2 tr = tex2D<uchar2>(uchar2_tex, right, top);
      const uchar2 bl = tex2D<uchar2>(uchar2_tex, left,  bottom);
      const uchar2 br = tex2D<uchar2>(uchar2_tex, right, bottom);

      // Per-channel sums in int; the +2 makes the >> 2 round to nearest.
      const int sum_x = (int)tl.x + (int)tr.x + (int)bl.x + (int)br.x + 2;
      const int sum_y = (int)tl.y + (int)tr.y + (int)bl.y + (int)br.y + 2;

      dst[row * dst_pitch2 + col] = make_uchar2((unsigned char)(sum_x >> 2),
                                                (unsigned char)(sum_y >> 2));
  }
  83
  /*
   * Downscale a four-channel 8-bit plane. Each thread of a 2D launch
   * produces one destination uchar4 element (x index -> column,
   * y index -> row); threads outside dst_width x dst_height do nothing.
   *
   * uchar4_tex  texture object the source samples are fetched from
   * dst         destination plane
   * dst_width   destination width in uchar4 elements
   * dst_height  destination height in rows
   * dst_pitch   distance between destination rows, counted in uchar4 elements
   * src_width   source width
   * src_height  source height
   */
  __global__ void Subsample_Bilinear_uchar4(cudaTextureObject_t uchar4_tex,
                                            uchar4 *dst,
                                            int dst_width, int dst_height, int dst_pitch,
                                            int src_width, int src_height)
  {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;

      // Nothing to do for threads past the right or bottom edge.
      if (row >= dst_height || col >= dst_width)
          return;

      // Source/destination size ratio along each axis.
      const float ratio_x = (float)src_width / (float)dst_width;
      const float ratio_y = (float)src_height / (float)dst_height;

      // Centre of this destination element in source coordinates.
      const float centre_x = (col + 0.5f) * ratio_x;
      const float centre_y = (row + 0.5f) * ratio_y;

      // Outer weights of the {w, 1.0, w} 3-tap filter, clamped to [0, 1].
      const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
      const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);

      // Two-fetch form of the 3-tap filter:
      // {w, 1.0, w} -> {w, 0.5, 0} + {0, 0.5, w}, offset w / (0.5 + w).
      const float shift_x = tap_x / (0.5f + tap_x);
      const float shift_y = tap_y / (0.5f + tap_y);

      const float left   = centre_x - shift_x;
      const float right  = centre_x + shift_x;
      const float top    = centre_y - shift_y;
      const float bottom = centre_y + shift_y;

      const uchar4 tl = tex2D<uchar4>(uchar4_tex, left,  top);
      const uchar4 tr = tex2D<uchar4>(uchar4_tex, right, top);
      const uchar4 bl = tex2D<uchar4>(uchar4_tex, left,  bottom);
      const uchar4 br = tex2D<uchar4>(uchar4_tex, right, bottom);

      // Per-channel sums in int; the +2 makes the >> 2 round to nearest.
      const int sum_x = (int)tl.x + (int)tr.x + (int)bl.x + (int)br.x + 2;
      const int sum_y = (int)tl.y + (int)tr.y + (int)bl.y + (int)br.y + 2;
      const int sum_z = (int)tl.z + (int)tr.z + (int)bl.z + (int)br.z + 2;
      const int sum_w = (int)tl.w + (int)tr.w + (int)bl.w + (int)br.w + 2;

      dst[row * dst_pitch + col] = make_uchar4((unsigned char)(sum_x >> 2),
                                               (unsigned char)(sum_y >> 2),
                                               (unsigned char)(sum_z >> 2),
                                               (unsigned char)(sum_w >> 2));
  }
 117
 /*
  * Downscale a single-channel 16-bit plane. Each thread of a 2D launch
  * produces one destination pixel (x index -> column, y index -> row);
  * threads that land outside dst_width x dst_height do nothing.
  *
  * ushort_tex  texture object the source samples are fetched from
  * dst         destination plane
  * dst_width   destination width in pixels
  * dst_height  destination height in pixels
  * dst_pitch   distance between destination rows, counted in dst elements
  * src_width   source width in pixels
  * src_height  source height in pixels
  */
 __global__ void Subsample_Bilinear_ushort(cudaTextureObject_t ushort_tex,
                                           unsigned short *dst,
                                           int dst_width, int dst_height, int dst_pitch,
                                           int src_width, int src_height)
 {
     const int col = blockIdx.x * blockDim.x + threadIdx.x;
     const int row = blockIdx.y * blockDim.y + threadIdx.y;

     // Threads in the overhang of the launch grid have no pixel to write.
     if (row >= dst_height || col >= dst_width)
         return;

     // Source/destination size ratio along each axis.
     const float ratio_x = (float)src_width / (float)dst_width;
     const float ratio_y = (float)src_height / (float)dst_height;

     // Centre of this destination pixel, expressed in source coordinates.
     const float centre_x = (col + 0.5f) * ratio_x;
     const float centre_y = (row + 0.5f) * ratio_y;

     // Outer weights of the {w, 1.0, w} 3-tap filter, clamped to [0, 1].
     // They are 0 whenever the ratio is <= 1.
     const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
     const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);

     // {w, 1.0, w} is split into {w, 0.5, 0} + {0, 0.5, w}; each half is
     // covered by one fetch displaced by w / (0.5 + w) from the centre.
     const float shift_x = tap_x / (0.5f + tap_x);
     const float shift_y = tap_y / (0.5f + tap_y);

     const float left   = centre_x - shift_x;
     const float right  = centre_x + shift_x;
     const float top    = centre_y - shift_y;
     const float bottom = centre_y + shift_y;

     // Start at 2 so that the final >> 2 rounds to nearest instead of
     // truncating the four-sample average.
     int acc = 2;
     acc += tex2D<unsigned short>(ushort_tex, left,  top);
     acc += tex2D<unsigned short>(ushort_tex, right, top);
     acc += tex2D<unsigned short>(ushort_tex, left,  bottom);
     acc += tex2D<unsigned short>(ushort_tex, right, bottom);

     dst[row * dst_pitch + col] = (unsigned short)(acc >> 2);
 }
 145
 /*
  * Downscale a two-channel 16-bit plane. Each thread of a 2D launch
  * produces one destination ushort2 element (x index -> column,
  * y index -> row); threads outside dst_width x dst_height do nothing.
  *
  * ushort2_tex  texture object the source samples are fetched from
  * dst          destination plane
  * dst_width    destination width in ushort2 elements
  * dst_height   destination height in rows
  * dst_pitch2   distance between destination rows, counted in ushort2 elements
  * src_width    source width
  * src_height   source height
  */
 __global__ void Subsample_Bilinear_ushort2(cudaTextureObject_t ushort2_tex,
                                            ushort2 *dst,
                                            int dst_width, int dst_height, int dst_pitch2,
                                            int src_width, int src_height)
 {
     const int col = blockIdx.x * blockDim.x + threadIdx.x;
     const int row = blockIdx.y * blockDim.y + threadIdx.y;

     // Nothing to do for threads past the right or bottom edge.
     if (row >= dst_height || col >= dst_width)
         return;

     // Source/destination size ratio along each axis.
     const float ratio_x = (float)src_width / (float)dst_width;
     const float ratio_y = (float)src_height / (float)dst_height;

     // Centre of this destination element in source coordinates.
     const float centre_x = (col + 0.5f) * ratio_x;
     const float centre_y = (row + 0.5f) * ratio_y;

     // Outer weights of the {w, 1.0, w} 3-tap filter, clamped to [0, 1].
     const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
     const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);

     // Two-fetch form of the 3-tap filter:
     // {w, 1.0, w} -> {w, 0.5, 0} + {0, 0.5, w}, offset w / (0.5 + w).
     const float shift_x = tap_x / (0.5f + tap_x);
     const float shift_y = tap_y / (0.5f + tap_y);

     const float left   = centre_x - shift_x;
     const float right  = centre_x + shift_x;
     const float top    = centre_y - shift_y;
     const float bottom = centre_y + shift_y;

     const ushort2 tl = tex2D<ushort2>(ushort2_tex, left,  top);
     const ushort2 tr = tex2D<ushort2>(ushort2_tex, right, top);
     const ushort2 bl = tex2D<ushort2>(ushort2_tex, left,  bottom);
     const ushort2 br = tex2D<ushort2>(ushort2_tex, right, bottom);

     // Per-channel sums in int; the +2 makes the >> 2 round to nearest.
     const int sum_x = (int)tl.x + (int)tr.x + (int)bl.x + (int)br.x + 2;
     const int sum_y = (int)tl.y + (int)tr.y + (int)bl.y + (int)br.y + 2;

     dst[row * dst_pitch2 + col] = make_ushort2((unsigned short)(sum_x >> 2),
                                                (unsigned short)(sum_y >> 2));
 }
 176
 /*
  * Downscale a four-channel 16-bit plane. Each thread of a 2D launch
  * produces one destination ushort4 element (x index -> column,
  * y index -> row); threads outside dst_width x dst_height do nothing.
  *
  * ushort4_tex  texture object the source samples are fetched from
  * dst          destination plane
  * dst_width    destination width in ushort4 elements
  * dst_height   destination height in rows
  * dst_pitch    distance between destination rows, counted in ushort4 elements
  * src_width    source width
  * src_height   source height
  */
 __global__ void Subsample_Bilinear_ushort4(cudaTextureObject_t ushort4_tex,
                                            ushort4 *dst,
                                            int dst_width, int dst_height, int dst_pitch,
                                            int src_width, int src_height)
 {
     const int col = blockIdx.x * blockDim.x + threadIdx.x;
     const int row = blockIdx.y * blockDim.y + threadIdx.y;

     // Nothing to do for threads past the right or bottom edge.
     if (row >= dst_height || col >= dst_width)
         return;

     // Source/destination size ratio along each axis.
     const float ratio_x = (float)src_width / (float)dst_width;
     const float ratio_y = (float)src_height / (float)dst_height;

     // Centre of this destination element in source coordinates.
     const float centre_x = (col + 0.5f) * ratio_x;
     const float centre_y = (row + 0.5f) * ratio_y;

     // Outer weights of the {w, 1.0, w} 3-tap filter, clamped to [0, 1].
     const float tap_x = min(max(0.5f * (ratio_x - 1.0f), 0.0f), 1.0f);
     const float tap_y = min(max(0.5f * (ratio_y - 1.0f), 0.0f), 1.0f);

     // Two-fetch form of the 3-tap filter:
     // {w, 1.0, w} -> {w, 0.5, 0} + {0, 0.5, w}, offset w / (0.5 + w).
     const float shift_x = tap_x / (0.5f + tap_x);
     const float shift_y = tap_y / (0.5f + tap_y);

     const float left   = centre_x - shift_x;
     const float right  = centre_x + shift_x;
     const float top    = centre_y - shift_y;
     const float bottom = centre_y + shift_y;

     const ushort4 tl = tex2D<ushort4>(ushort4_tex, left,  top);
     const ushort4 tr = tex2D<ushort4>(ushort4_tex, right, top);
     const ushort4 bl = tex2D<ushort4>(ushort4_tex, left,  bottom);
     const ushort4 br = tex2D<ushort4>(ushort4_tex, right, bottom);

     // Per-channel sums in int; the +2 makes the >> 2 round to nearest.
     const int sum_x = (int)tl.x + (int)tr.x + (int)bl.x + (int)br.x + 2;
     const int sum_y = (int)tl.y + (int)tr.y + (int)bl.y + (int)br.y + 2;
     const int sum_z = (int)tl.z + (int)tr.z + (int)bl.z + (int)br.z + 2;
     const int sum_w = (int)tl.w + (int)tr.w + (int)bl.w + (int)br.w + 2;

     dst[row * dst_pitch + col] = make_ushort4((unsigned short)(sum_x >> 2),
                                               (unsigned short)(sum_y >> 2),
                                               (unsigned short)(sum_z >> 2),
                                               (unsigned short)(sum_w >> 2));
 }
 210
 211 }