2 * Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
// 2D texture references sampled by the Subsample_Bilinear_* kernels in this
// file, one per pixel element type. The read mode is spelled out explicitly;
// cudaReadModeElementType is also the template default, so tex2D() returns
// values of the declared element type.
// NOTE(review): binding, addressing and filter mode are presumably set up by
// host code that is not part of this file - confirm there.
// NOTE(review): texture references are a legacy CUDA API; moving to texture
// objects would change the kernel signatures, so it is not attempted here.

// 8-bit element textures: 1, 2 and 4 channels.
texture<unsigned char, 2, cudaReadModeElementType> uchar_tex;
texture<uchar2, 2, cudaReadModeElementType> uchar2_tex;
texture<uchar4, 2, cudaReadModeElementType> uchar4_tex;

// 16-bit element textures: 1, 2 and 4 channels.
texture<unsigned short, 2, cudaReadModeElementType> ushort_tex;
texture<ushort2, 2, cudaReadModeElementType> ushort2_tex;
texture<ushort4, 2, cudaReadModeElementType> ushort4_tex;
// Resamples the image bound to uchar_tex into a single-channel 8-bit
// destination buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread produces the one output
// element at (x, y) = (blockIdx * blockDim + threadIdx). Threads outside the
// dst_width x dst_height rectangle do nothing.
//
//   dst         output buffer
//   dst_width   output width in elements
//   dst_height  output height in rows
//   dst_pitch   distance between output rows, in elements of dst
//   src_width   source width used to derive the horizontal scale factor
//   src_height  source height used to derive the vertical scale factor
//
// NOTE(review): the sampling scheme assumes uchar_tex is bound with linear
// filtering and unnormalized coordinates by host code not shown here.
__global__ void Subsample_Bilinear_uchar(unsigned char *dst,
                                         int dst_width, int dst_height, int dst_pitch,
                                         int src_width, int src_height)
{
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to do for threads that land outside the destination image.
    if (out_y >= dst_height || out_x >= dst_width)
        return;

    // Number of source pixels per destination pixel along each axis.
    const float hscale = (float)src_width / (float)dst_width;
    const float vscale = (float)src_height / (float)dst_height;

    // Centre of this destination pixel expressed in source coordinates.
    const float xi = (out_x + 0.5f) * hscale;
    const float yi = (out_y + 0.5f) * vscale;

    // Outer weights of the 3-tap filters {wh,1.0,wh} / {wv,1.0,wv},
    // clamped to the range [0, 1].
    const float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
    const float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);

    // Each 3-tap filter is split into two halves, {w,0.5,0} + {0,0.5,w};
    // a half corresponds to one sample displaced by w / (0.5 + w) from the
    // centre.
    const float dx = wh / (0.5f + wh);
    const float dy = wv / (0.5f + wv);

    const float x_lo = xi - dx;
    const float x_hi = xi + dx;
    const float y_lo = yi - dy;
    const float y_hi = yi + dy;

    const int s00 = tex2D(uchar_tex, x_lo, y_lo);
    const int s10 = tex2D(uchar_tex, x_hi, y_lo);
    const int s01 = tex2D(uchar_tex, x_lo, y_hi);
    const int s11 = tex2D(uchar_tex, x_hi, y_hi);

    // Average of the four samples; the +2 rounds to nearest.
    const int total = s00 + s10 + s01 + s11 + 2;
    dst[out_y * dst_pitch + out_x] = (unsigned char)(total >> 2);
}
// Resamples the image bound to uchar2_tex into a two-channel 8-bit
// destination buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread produces the one uchar2
// output element at (x, y) = (blockIdx * blockDim + threadIdx). Threads
// outside the dst_width x dst_height rectangle do nothing.
//
//   dst         output buffer of uchar2 elements
//   dst_width   output width in uchar2 elements
//   dst_height  output height in rows
//   dst_pitch2  distance between output rows, in uchar2 elements
//   src_width   source width used to derive the horizontal scale factor
//   src_height  source height used to derive the vertical scale factor
//
// NOTE(review): the sampling scheme assumes uchar2_tex is bound with linear
// filtering and unnormalized coordinates by host code not shown here.
__global__ void Subsample_Bilinear_uchar2(uchar2 *dst,
                                          int dst_width, int dst_height, int dst_pitch2,
                                          int src_width, int src_height)
{
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to do for threads that land outside the destination image.
    if (out_y >= dst_height || out_x >= dst_width)
        return;

    // Number of source pixels per destination pixel along each axis.
    const float hscale = (float)src_width / (float)dst_width;
    const float vscale = (float)src_height / (float)dst_height;

    // Centre of this destination pixel expressed in source coordinates.
    const float xi = (out_x + 0.5f) * hscale;
    const float yi = (out_y + 0.5f) * vscale;

    // Outer weights of the 3-tap filters {wh,1.0,wh} / {wv,1.0,wv},
    // clamped to the range [0, 1].
    const float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
    const float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);

    // Each 3-tap filter is split into two halves, {w,0.5,0} + {0,0.5,w};
    // a half corresponds to one sample displaced by w / (0.5 + w) from the
    // centre.
    const float dx = wh / (0.5f + wh);
    const float dy = wv / (0.5f + wv);

    const float x_lo = xi - dx;
    const float x_hi = xi + dx;
    const float y_lo = yi - dy;
    const float y_hi = yi + dy;

    const uchar2 s00 = tex2D(uchar2_tex, x_lo, y_lo);
    const uchar2 s10 = tex2D(uchar2_tex, x_hi, y_lo);
    const uchar2 s01 = tex2D(uchar2_tex, x_lo, y_hi);
    const uchar2 s11 = tex2D(uchar2_tex, x_hi, y_hi);

    // Per-channel average of the four samples, accumulated as int;
    // the +2 rounds to nearest.
    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;

    dst[out_y * dst_pitch2 + out_x] =
        make_uchar2((unsigned char)avg_x, (unsigned char)avg_y);
}
// Resamples the image bound to uchar4_tex into a four-channel 8-bit
// destination buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread produces the one uchar4
// output element at (x, y) = (blockIdx * blockDim + threadIdx). Threads
// outside the dst_width x dst_height rectangle do nothing.
//
//   dst         output buffer of uchar4 elements
//   dst_width   output width in uchar4 elements
//   dst_height  output height in rows
//   dst_pitch   distance between output rows, in uchar4 elements
//   src_width   source width used to derive the horizontal scale factor
//   src_height  source height used to derive the vertical scale factor
//
// NOTE(review): the sampling scheme assumes uchar4_tex is bound with linear
// filtering and unnormalized coordinates by host code not shown here.
__global__ void Subsample_Bilinear_uchar4(uchar4 *dst,
                                          int dst_width, int dst_height, int dst_pitch,
                                          int src_width, int src_height)
{
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to do for threads that land outside the destination image.
    if (out_y >= dst_height || out_x >= dst_width)
        return;

    // Number of source pixels per destination pixel along each axis.
    const float hscale = (float)src_width / (float)dst_width;
    const float vscale = (float)src_height / (float)dst_height;

    // Centre of this destination pixel expressed in source coordinates.
    const float xi = (out_x + 0.5f) * hscale;
    const float yi = (out_y + 0.5f) * vscale;

    // Outer weights of the 3-tap filters {wh,1.0,wh} / {wv,1.0,wv},
    // clamped to the range [0, 1].
    const float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
    const float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);

    // Each 3-tap filter is split into two halves, {w,0.5,0} + {0,0.5,w};
    // a half corresponds to one sample displaced by w / (0.5 + w) from the
    // centre.
    const float dx = wh / (0.5f + wh);
    const float dy = wv / (0.5f + wv);

    const float x_lo = xi - dx;
    const float x_hi = xi + dx;
    const float y_lo = yi - dy;
    const float y_hi = yi + dy;

    const uchar4 s00 = tex2D(uchar4_tex, x_lo, y_lo);
    const uchar4 s10 = tex2D(uchar4_tex, x_hi, y_lo);
    const uchar4 s01 = tex2D(uchar4_tex, x_lo, y_hi);
    const uchar4 s11 = tex2D(uchar4_tex, x_hi, y_hi);

    // Per-channel average of the four samples, accumulated as int;
    // the +2 rounds to nearest.
    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;
    const int avg_z = ((int)s00.z + (int)s10.z + (int)s01.z + (int)s11.z + 2) >> 2;
    const int avg_w = ((int)s00.w + (int)s10.w + (int)s01.w + (int)s11.w + 2) >> 2;

    dst[out_y * dst_pitch + out_x] =
        make_uchar4((unsigned char)avg_x, (unsigned char)avg_y,
                    (unsigned char)avg_z, (unsigned char)avg_w);
}
// Resamples the image bound to ushort_tex into a single-channel 16-bit
// destination buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread produces the one output
// element at (x, y) = (blockIdx * blockDim + threadIdx). Threads outside the
// dst_width x dst_height rectangle do nothing.
//
//   dst         output buffer of unsigned short elements
//   dst_width   output width in elements
//   dst_height  output height in rows
//   dst_pitch   distance between output rows, in elements of dst
//   src_width   source width used to derive the horizontal scale factor
//   src_height  source height used to derive the vertical scale factor
//
// NOTE(review): the sampling scheme assumes ushort_tex is bound with linear
// filtering and unnormalized coordinates by host code not shown here.
__global__ void Subsample_Bilinear_ushort(unsigned short *dst,
                                          int dst_width, int dst_height, int dst_pitch,
                                          int src_width, int src_height)
{
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to do for threads that land outside the destination image.
    if (out_y >= dst_height || out_x >= dst_width)
        return;

    // Number of source pixels per destination pixel along each axis.
    const float hscale = (float)src_width / (float)dst_width;
    const float vscale = (float)src_height / (float)dst_height;

    // Centre of this destination pixel expressed in source coordinates.
    const float xi = (out_x + 0.5f) * hscale;
    const float yi = (out_y + 0.5f) * vscale;

    // Outer weights of the 3-tap filters {wh,1.0,wh} / {wv,1.0,wv},
    // clamped to the range [0, 1].
    const float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
    const float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);

    // Each 3-tap filter is split into two halves, {w,0.5,0} + {0,0.5,w};
    // a half corresponds to one sample displaced by w / (0.5 + w) from the
    // centre.
    const float dx = wh / (0.5f + wh);
    const float dy = wv / (0.5f + wv);

    const float x_lo = xi - dx;
    const float x_hi = xi + dx;
    const float y_lo = yi - dy;
    const float y_hi = yi + dy;

    const int s00 = tex2D(ushort_tex, x_lo, y_lo);
    const int s10 = tex2D(ushort_tex, x_hi, y_lo);
    const int s01 = tex2D(ushort_tex, x_lo, y_hi);
    const int s11 = tex2D(ushort_tex, x_hi, y_hi);

    // Average of the four samples; the +2 rounds to nearest.
    const int total = s00 + s10 + s01 + s11 + 2;
    dst[out_y * dst_pitch + out_x] = (unsigned short)(total >> 2);
}
// Resamples the image bound to ushort2_tex into a two-channel 16-bit
// destination buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread produces the one ushort2
// output element at (x, y) = (blockIdx * blockDim + threadIdx). Threads
// outside the dst_width x dst_height rectangle do nothing.
//
//   dst         output buffer of ushort2 elements
//   dst_width   output width in ushort2 elements
//   dst_height  output height in rows
//   dst_pitch2  distance between output rows, in ushort2 elements
//   src_width   source width used to derive the horizontal scale factor
//   src_height  source height used to derive the vertical scale factor
//
// NOTE(review): the sampling scheme assumes ushort2_tex is bound with linear
// filtering and unnormalized coordinates by host code not shown here.
__global__ void Subsample_Bilinear_ushort2(ushort2 *dst,
                                           int dst_width, int dst_height, int dst_pitch2,
                                           int src_width, int src_height)
{
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to do for threads that land outside the destination image.
    if (out_y >= dst_height || out_x >= dst_width)
        return;

    // Number of source pixels per destination pixel along each axis.
    const float hscale = (float)src_width / (float)dst_width;
    const float vscale = (float)src_height / (float)dst_height;

    // Centre of this destination pixel expressed in source coordinates.
    const float xi = (out_x + 0.5f) * hscale;
    const float yi = (out_y + 0.5f) * vscale;

    // Outer weights of the 3-tap filters {wh,1.0,wh} / {wv,1.0,wv},
    // clamped to the range [0, 1].
    const float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
    const float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);

    // Each 3-tap filter is split into two halves, {w,0.5,0} + {0,0.5,w};
    // a half corresponds to one sample displaced by w / (0.5 + w) from the
    // centre.
    const float dx = wh / (0.5f + wh);
    const float dy = wv / (0.5f + wv);

    const float x_lo = xi - dx;
    const float x_hi = xi + dx;
    const float y_lo = yi - dy;
    const float y_hi = yi + dy;

    const ushort2 s00 = tex2D(ushort2_tex, x_lo, y_lo);
    const ushort2 s10 = tex2D(ushort2_tex, x_hi, y_lo);
    const ushort2 s01 = tex2D(ushort2_tex, x_lo, y_hi);
    const ushort2 s11 = tex2D(ushort2_tex, x_hi, y_hi);

    // Per-channel average of the four samples, accumulated as int;
    // the +2 rounds to nearest.
    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;

    dst[out_y * dst_pitch2 + out_x] =
        make_ushort2((unsigned short)avg_x, (unsigned short)avg_y);
}
// Resamples the image bound to ushort4_tex into a four-channel 16-bit
// destination buffer.
//
// Launch layout: 2D grid of 2D blocks; each thread produces the one ushort4
// output element at (x, y) = (blockIdx * blockDim + threadIdx). Threads
// outside the dst_width x dst_height rectangle do nothing.
//
//   dst         output buffer of ushort4 elements
//   dst_width   output width in ushort4 elements
//   dst_height  output height in rows
//   dst_pitch   distance between output rows, in ushort4 elements
//   src_width   source width used to derive the horizontal scale factor
//   src_height  source height used to derive the vertical scale factor
//
// NOTE(review): the sampling scheme assumes ushort4_tex is bound with linear
// filtering and unnormalized coordinates by host code not shown here.
__global__ void Subsample_Bilinear_ushort4(ushort4 *dst,
                                           int dst_width, int dst_height, int dst_pitch,
                                           int src_width, int src_height)
{
    const int out_x = blockIdx.x * blockDim.x + threadIdx.x;
    const int out_y = blockIdx.y * blockDim.y + threadIdx.y;

    // Nothing to do for threads that land outside the destination image.
    if (out_y >= dst_height || out_x >= dst_width)
        return;

    // Number of source pixels per destination pixel along each axis.
    const float hscale = (float)src_width / (float)dst_width;
    const float vscale = (float)src_height / (float)dst_height;

    // Centre of this destination pixel expressed in source coordinates.
    const float xi = (out_x + 0.5f) * hscale;
    const float yi = (out_y + 0.5f) * vscale;

    // Outer weights of the 3-tap filters {wh,1.0,wh} / {wv,1.0,wv},
    // clamped to the range [0, 1].
    const float wh = min(max(0.5f * (hscale - 1.0f), 0.0f), 1.0f);
    const float wv = min(max(0.5f * (vscale - 1.0f), 0.0f), 1.0f);

    // Each 3-tap filter is split into two halves, {w,0.5,0} + {0,0.5,w};
    // a half corresponds to one sample displaced by w / (0.5 + w) from the
    // centre.
    const float dx = wh / (0.5f + wh);
    const float dy = wv / (0.5f + wv);

    const float x_lo = xi - dx;
    const float x_hi = xi + dx;
    const float y_lo = yi - dy;
    const float y_hi = yi + dy;

    const ushort4 s00 = tex2D(ushort4_tex, x_lo, y_lo);
    const ushort4 s10 = tex2D(ushort4_tex, x_hi, y_lo);
    const ushort4 s01 = tex2D(ushort4_tex, x_lo, y_hi);
    const ushort4 s11 = tex2D(ushort4_tex, x_hi, y_hi);

    // Per-channel average of the four samples, accumulated as int;
    // the +2 rounds to nearest.
    const int avg_x = ((int)s00.x + (int)s10.x + (int)s01.x + (int)s11.x + 2) >> 2;
    const int avg_y = ((int)s00.y + (int)s10.y + (int)s01.y + (int)s11.y + 2) >> 2;
    const int avg_z = ((int)s00.z + (int)s10.z + (int)s01.z + (int)s11.z + 2) >> 2;
    const int avg_w = ((int)s00.w + (int)s10.w + (int)s01.w + (int)s11.w + 2) >> 2;

    dst[out_y * dst_pitch + out_x] =
        make_ushort4((unsigned short)avg_x, (unsigned short)avg_y,
                     (unsigned short)avg_z, (unsigned short)avg_w);
}