2 // uniform sampler2D PREFIX(sample_tex_horizontal);
3 // uniform sampler2D PREFIX(sample_tex_vertical);
4 // uniform int PREFIX(output_width);
5 // uniform int PREFIX(output_height);
6 // uniform int PREFIX(num_horizontal_samples);
7 // uniform int PREFIX(num_vertical_samples);
8 // uniform int PREFIX(vertical_int_radius);
9 // uniform int PREFIX(num_horizontal_filters);
10 // uniform int PREFIX(num_vertical_filters);
11 // uniform int PREFIX(vertical_whole_pixel_offset);
12 // uniform float PREFIX(inv_vertical_scaling_factor);
13 // uniform float PREFIX(slice_height);
14 // uniform float PREFIX(horizontal_whole_pixel_offset);
15 // uniform float PREFIX(inv_input_height);
16 // uniform float PREFIX(input_texcoord_y_adjust);
18 // Number of samples we have room for between the stages. If this gets
19 // higher, we will have bigger but fewer blocks, which has a few pros and cons:
21 // + More inter-block parallelism (too low means you cannot fill a warp)
22 // + Fewer pixels being wasted to overlap (less work being done overall)
23 // - More risk of overflowing the L1 cache
24 // - Less overall GPU parallelism (too few blocks means some GPU cores will
25 // not have a block to work on)
27 // The current value is a tradeoff; some source/destination sizes will
28 // benefit from different values.
// This is also the local workgroup height (see the layout() declaration) and
// the size of the shared column[] buffer below.
29 #define NUM_TEMP_SAMPLES 128
31 // Number of horizontal resampling coefficients (sample positions and
32 // weight) we store in shared memory; generally, as long as we have less
33 // than 25x downscaling or so, this will be enough.
// Each stored coefficient is a vec2: .x = weight, .y = source texcoord
// (see horizontal_coeffs[] and get_horizontal_coeff() below).
34 #define NUM_STORED_HORIZONTAL_COEFFS 128
36 // Packing the intermediate results into fp16 saves a bit of shared memory,
37 // but more importantly, seems to reduce the number of bank conflicts
38 // (stalls when multiple members of a warp try to access the same bank
39 // at the same time, where bank = address & 0x3c). The actual effect will
40 // naturally depend a lot on the scaling factor, but we're talking about
41 // 5–15% improvement for NVIDIA in our microbenchmarks (although some are
42 // neutral and occasionally one might even be negative). Intel appears to be
43 // less sensitive, although that might be because more time overall goes
44 // towards the texture sampling.
46 // NVIDIA has an extension GL_NV_gpu_shader5 that gives native half types,
47 // but it doesn't buy us any speed, just less ugly syntax for packing/unpacking.
48 #ifdef GL_ARB_shading_language_packing
54 // In theory, these could conflict with another effect since we don't use
55 // PREFIX(), but there can only be one instance of each compute shader,
56 // and having PREFIX() everywhere on non-uniforms becomes increasingly unreadable.
// Horizontal filter taps cached in shared memory once per workgroup:
// .x = weight, .y = source x texcoord (see get_horizontal_coeff()).
58 shared vec2 horizontal_coeffs[NUM_STORED_HORIZONTAL_COEFFS];
// Intermediate column of horizontally-resampled pixels handed from the
// horizontal stage to the vertical stage. The uvec2 form packs the four
// channels as two fp16 pairs (packHalf2x16); the vec4 form stores them
// unpacked.
// NOTE(review): both declarations appear back-to-back with no #else/#endif
// visible between them -- presumably the uvec2 variant belongs to the
// GL_ARB_shading_language_packing branch above and the vec4 variant to its
// #else; confirm against the full file.
60 shared uvec2 column[NUM_TEMP_SAMPLES]; // this is more fixed-ish, see below
62 shared vec4 column[NUM_TEMP_SAMPLES]; // this is more fixed-ish, see below
// One invocation per intermediate row; gl_GlobalInvocationID.x selects the
// output column (see the OUTPUT() call in the main body).
65 layout(local_size_x = 1, local_size_y = NUM_TEMP_SAMPLES) in;
67 // Find out where the C++ code placed the center of the filter (this is exactly the same calculation,
68 // just with whole_pixel_offset added in).
//
// Maps an output row index to the input row at the center of its vertical
// filter kernel; roundEven() keeps the GPU rounding identical to the C++
// setup code, so both sides agree on filter placement.
// NOTE(review): the function's braces are elided from this view.
69 int center_input_from_output(uint output_y)
71 return int(roundEven((output_y + 0.5f) * PREFIX(inv_vertical_scaling_factor) - 0.5f)) + PREFIX(vertical_whole_pixel_offset);
// Converts an integer input row (min_input_y + invocation_index) into the
// normalized texture coordinate used when sampling the input texture
// (scaled by inv_input_height, shifted by input_texcoord_y_adjust).
// NOTE(review): the function's braces are elided from this view.
74 float normalized_input_y(int min_input_y, int invocation_index)
76 return (min_input_y + invocation_index) * PREFIX(inv_input_height) + PREFIX(input_texcoord_y_adjust);
// Fetches the x-th horizontal filter tap for this invocation's output column:
// .r (weight) and .g (source x texcoord), with .g shifted into the slice of
// the input this column belongs to.
// NOTE(review): the declaration of sample_tc (presumably an ivec2 with
// sample_tc.x = x) and the final "return s;" are elided from this view --
// confirm against the full file.
79 vec2 get_horizontal_coeff(int x)
// The filter set repeats every num_horizontal_filters output columns.
83 sample_tc.y = int(gl_GlobalInvocationID.x) % PREFIX(num_horizontal_filters);
84 vec2 s = texelFetch(PREFIX(sample_tex_horizontal), sample_tc, 0).rg;
// Offset the sample position by this column's slice, plus the global
// whole-pixel offset.
85 s.g += ((int(gl_GlobalInvocationID.x) / PREFIX(num_horizontal_filters)) * PREFIX(slice_height) + PREFIX(horizontal_whole_pixel_offset));
// Computes one horizontally-resampled pixel on input row input_y: a weighted
// sum over the horizontal taps (s.y = source x texcoord, s.x = weight).
// NOTE(review): the declaration/initialization of sum (presumably
// vec4 sum = vec4(0.0);), the else between the two paths, the closing braces
// and the return statement are elided from this view -- confirm against the
// full file.
89 vec4 do_horizontal_sampling(float input_y)
// Fast path: every tap was preloaded into shared memory.
92 if (PREFIX(num_horizontal_samples) <= NUM_STORED_HORIZONTAL_COEFFS) {
93 for (int i = 0; i < PREFIX(num_horizontal_samples); ++i) {
94 vec2 s = horizontal_coeffs[i];
95 sum += INPUT(vec2(s.y, input_y)) * s.x;
98 // Not enough shared memory available to hold the horizontal resampling coefficients,
99 // so load the remaining ones as we go. This is generally somewhat slower, even though
100 // all elements of the warp will be loading the same texture sample, so it is definitely
101 // a slow path that we will only see in extreme downsampling (which is, unfortunately,
102 // the case that's the most hurt by loading coefficients on-the-fly).
104 // Other strategies would be possible, including loading coefficients in
105 // multiple phases (or skipping the shared memory altogether if there's no room),
106 // but this is the simplest and not too slow.
// Use the cached taps first...
107 for (int i = 0; i < NUM_STORED_HORIZONTAL_COEFFS; ++i) {
108 vec2 s = horizontal_coeffs[i];
109 sum += INPUT(vec2(s.y, input_y)) * s.x;
// ...then fetch the remainder straight from the coefficient texture.
111 for (int i = NUM_STORED_HORIZONTAL_COEFFS; i < PREFIX(num_horizontal_samples); ++i) {
112 vec2 s = get_horizontal_coeff(i);
113 sum += INPUT(vec2(s.y, input_y)) * s.x;
120 // This is a bit tricky: The x and y workgroup IDs are in the _output_ texture,
121 // but when doing horizontal sampling, the local y invocation ID is in the _input_
// texture's row space.
// NOTE(review): the sentence above continues on a line elided from this view
// (as is the enclosing "void main()" header); confirm against the full file.
//
// Output rows this workgroup is responsible for: [min_output_y, max_output_y).
123 uint min_output_y = uint(PREFIX(output_samples_per_block) * int(gl_WorkGroupID.y));
124 uint max_output_y = min(min_output_y + uint(PREFIX(output_samples_per_block)), uint(PREFIX(output_size).y)); // Exclusive.
// Corresponding input rows needed, padded by the vertical filter radius on
// both ends.
126 int min_input_y = center_input_from_output(min_output_y) - PREFIX(vertical_int_radius);
127 int max_input_y = center_input_from_output(max_output_y - 1u) + PREFIX(vertical_int_radius); // Inclusive.
129 // Load coefficients for the horizontal resampling.
// Each invocation loads at most one tap into shared horizontal_coeffs[].
130 if (gl_LocalInvocationID.y < uint(PREFIX(num_horizontal_samples))) {
131 horizontal_coeffs[gl_LocalInvocationID.y] = get_horizontal_coeff(int(gl_LocalInvocationID.y));
// NOTE(review): a barrier() is required between the shared-memory writes
// above and the reads in do_horizontal_sampling() below; it is presumably on
// lines elided from this view -- confirm against the full file.
137 // Do the actual horizontal sampling for this row.
138 if (min_input_y + int(gl_LocalInvocationID.y) <= max_input_y) {
139 float input_y = normalized_input_y(min_input_y, int(gl_LocalInvocationID.y));
140 vec4 val = do_horizontal_sampling(input_y);
// Stash the result in the shared column buffer: packed to 2x fp16 pairs, or
// as a plain vec4.
// NOTE(review): both stores appear here with no conditional visible between
// them -- presumably the two arms of an elided
// #ifdef GL_ARB_shading_language_packing / #else; confirm.
142 column[gl_LocalInvocationID.y] = uvec2(packHalf2x16(val.xy), packHalf2x16(val.zw));
144 column[gl_LocalInvocationID.y] = val;
151 // Vertical resampling. For downscaling, we'll only have one iteration
152 // through this loop, but for upscaling, we may have several.
154 // FIXME: if NUM_TEMP_SAMPLES is too small, we need yet more stuff
// Each iteration produces one output pixel: a weighted sum over
// num_vertical_samples rows of the shared column buffer.
155 for (uint output_y = min_output_y + gl_LocalInvocationID.y; output_y < max_output_y; output_y += uint(NUM_TEMP_SAMPLES)) {
// First column-buffer row touched by this output row's filter window.
156 int base_idx = center_input_from_output(output_y) - PREFIX(vertical_int_radius) - min_input_y;
// Which precomputed vertical filter applies to this output row.
157 int sample_y = int(output_y) % PREFIX(num_vertical_filters);
159 vec4 sum = vec4(0.0);
160 for (int i = 0; i < PREFIX(num_vertical_samples); ++i) {
161 float weight = texelFetch(PREFIX(sample_tex_vertical), ivec2(i, sample_y), 0).r;
// NOTE(review): the packed (fp16) read below and the plain vec4 read after
// it are presumably the two arms of an elided #ifdef as well; the
// declaration of val and the packed-path accumulate into sum are likewise
// not visible in this view -- confirm against the full file.
163 uvec2 packed_val = column[base_idx + i];
165 val.xy = unpackHalf2x16(packed_val.x);
166 val.zw = unpackHalf2x16(packed_val.y);
169 sum += column[base_idx + i] * weight;
172 OUTPUT(uvec2(gl_GlobalInvocationID.x, output_y), sum);
// Clean up file-local macros so they don't leak into any shader source
// concatenated after this file.
177 #undef NUM_TEMP_SAMPLES
178 #undef NUM_STORED_HORIZONTAL_COEFFS