// Implicit uniforms:
// uniform sampler2D PREFIX(sample_tex_horizontal);
// uniform sampler2D PREFIX(sample_tex_vertical);
// uniform int PREFIX(output_width);
// uniform int PREFIX(output_height);
// uniform int PREFIX(num_horizontal_samples);
// uniform int PREFIX(num_vertical_samples);
// uniform int PREFIX(vertical_int_radius);
// uniform int PREFIX(num_horizontal_filters);
// uniform int PREFIX(num_vertical_filters);
// uniform int PREFIX(vertical_whole_pixel_offset);
// uniform float PREFIX(inv_vertical_scaling_factor);
// uniform float PREFIX(slice_height);
// uniform float PREFIX(horizontal_whole_pixel_offset);
// uniform float PREFIX(inv_input_height);
// uniform float PREFIX(input_texcoord_y_adjust);

// Number of samples we have room for between the stages. If this gets
// higher, we will have bigger but fewer blocks, which has a few pros and cons:
//
//  + More inter-block parallelism (too low means you cannot fill a warp)
//  + Fewer pixels wasted on overlap (less work being done overall)
//  - More risk of overflowing the L1 cache
//  - Less overall GPU parallelism (too few blocks means some GPU cores will
//    not have a block to work on)
//
// The current value is a tradeoff; some source/destination sizes will
// benefit from different values.
#define NUM_TEMP_SAMPLES 128

// Number of horizontal resampling coefficients (sample positions and
// weights) we store in shared memory; generally, as long as we have less
// than 25x downscaling or so, this will be enough.
#define NUM_STORED_HORIZONTAL_COEFFS 128

// Packing the intermediate results into fp16 saves a bit of shared memory,
// but more importantly, seems to reduce the number of bank conflicts
// (stalls when multiple members of a warp try to access the same bank
// at the same time, where bank = address & 0x3c). The actual effect will
// naturally depend a lot on the scaling factor, but we're talking about
// a 5–15% improvement for NVIDIA in our microbenchmarks (although some are
// neutral and occasionally one might even be negative). Intel appears to be
// less sensitive, although that might be because more time overall goes
// towards the texture sampling.
//
// NVIDIA has an extension GL_NV_gpu_shader5 that gives native half types,
// but it doesn't buy us any speed, just less ugly syntax for packing/unpacking.
#ifdef GL_ARB_shading_language_packing
#define PACK_HALF 1
#else
#define PACK_HALF 0
#endif

// In theory, these could conflict with another effect since we don't use
// PREFIX(), but there can only be one instance of each compute shader,
// and having PREFIX() everywhere on non-uniforms becomes increasingly unreadable.
shared vec2 horizontal_coeffs[NUM_STORED_HORIZONTAL_COEFFS];
#if PACK_HALF
shared uvec2 column[NUM_TEMP_SAMPLES];  // this is more fixed-ish, see below
#else
shared vec4 column[NUM_TEMP_SAMPLES];  // this is more fixed-ish, see below
#endif

layout(local_size_x = 1, local_size_y = NUM_TEMP_SAMPLES) in;

// Find out where the C++ code placed the center of the filter (this is exactly the same calculation,
// just with whole_pixel_offset added in).
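//
// A hypothetical worked example (made-up numbers, purely for illustration): for a 2x vertical
// downscale with inv_vertical_scaling_factor = 2.0 and vertical_whole_pixel_offset = 0,
// output_y = 10 gives roundEven((10 + 0.5) * 2.0 - 0.5) = roundEven(20.5) = 20,
// i.e., the filter for output row 10 is centered on input row 20.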
int center_input_from_output(uint output_y)
{
	return int(roundEven((output_y + 0.5f) * PREFIX(inv_vertical_scaling_factor) - 0.5f)) + PREFIX(vertical_whole_pixel_offset);
}

float normalized_input_y(int min_input_y, int invocation_index)
{
	return (min_input_y + invocation_index) * PREFIX(inv_input_height) + PREFIX(input_texcoord_y_adjust);
}

vec2 get_horizontal_coeff(int x)
{
	ivec2 sample_tc;
	sample_tc.x = x;
	sample_tc.y = int(gl_GlobalInvocationID.x) % PREFIX(num_horizontal_filters);
	vec2 s = texelFetch(PREFIX(sample_tex_horizontal), sample_tc, 0).rg;
	s.g += ((int(gl_GlobalInvocationID.x) / PREFIX(num_horizontal_filters)) * PREFIX(slice_height) + PREFIX(horizontal_whole_pixel_offset));
	return s;
}

vec4 do_horizontal_sampling(float input_y)
{
	vec4 sum = vec4(0.0);
	if (PREFIX(num_horizontal_samples) <= NUM_STORED_HORIZONTAL_COEFFS) {
		for (int i = 0; i < PREFIX(num_horizontal_samples); ++i) {
			vec2 s = horizontal_coeffs[i];
			sum += INPUT(vec2(s.y, input_y)) * s.x;
		}
	} else {
		// Not enough shared memory available to hold the horizontal resampling coefficients,
		// so load the remaining ones as we go. This is generally somewhat slower, even though
		// all elements of the warp will be loading the same texture sample, so it is definitely
		// a slow path that we will only see in extreme downsampling (which is, unfortunately,
		// the case that is hurt the most by loading coefficients on-the-fly).
		//
		// Other strategies would be possible, including loading coefficients in
		// multiple phases (or skipping the shared memory altogether if there's no room),
		// but this is the simplest and not too slow.
		for (int i = 0; i < NUM_STORED_HORIZONTAL_COEFFS; ++i) {
			vec2 s = horizontal_coeffs[i];
			sum += INPUT(vec2(s.y, input_y)) * s.x;
		}
		for (int i = NUM_STORED_HORIZONTAL_COEFFS; i < PREFIX(num_horizontal_samples); ++i) {
			vec2 s = get_horizontal_coeff(i);
			sum += INPUT(vec2(s.y, input_y)) * s.x;
		}
	}
	return sum;
}

void FUNCNAME()
{
	// This is a bit tricky: The x and y workgroup IDs are in the _output_ texture,
	// but when doing horizontal sampling, the local y invocation ID is in the _input_
	// texture.
	uint min_output_y = uint(PREFIX(output_samples_per_block) * int(gl_WorkGroupID.y));
	uint max_output_y = min(min_output_y + uint(PREFIX(output_samples_per_block)), uint(PREFIX(output_size).y));  // Exclusive.

	int min_input_y = center_input_from_output(min_output_y) - PREFIX(vertical_int_radius);
	int max_input_y = center_input_from_output(max_output_y - 1u) + PREFIX(vertical_int_radius);  // Inclusive.

	// Load coefficients for the horizontal resampling.
	if (gl_LocalInvocationID.y < uint(PREFIX(num_horizontal_samples))) {
		horizontal_coeffs[gl_LocalInvocationID.y] = get_horizontal_coeff(int(gl_LocalInvocationID.y));
	}

	memoryBarrier();
	barrier();

	// Do the actual horizontal sampling for this row.
	if (min_input_y + int(gl_LocalInvocationID.y) <= max_input_y) {
		float input_y = normalized_input_y(min_input_y, int(gl_LocalInvocationID.y));
		vec4 val = do_horizontal_sampling(input_y);
#if PACK_HALF
		column[gl_LocalInvocationID.y] = uvec2(packHalf2x16(val.xy), packHalf2x16(val.zw));
#else
		column[gl_LocalInvocationID.y] = val;
#endif
	}

	memoryBarrier();
	barrier();

	// Vertical resampling. For downscaling, we'll only have one iteration
	// through this loop, but for upscaling, we may have several.
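	//
	// A hypothetical worked example (the numbers are made up, not from the host code):
	// if output_samples_per_block were 256 with NUM_TEMP_SAMPLES = 128, each invocation
	// would run two iterations, writing output rows (min_output_y + gl_LocalInvocationID.y)
	// and the same plus 128.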
	//
	// FIXME: if NUM_TEMP_SAMPLES is too small, we need yet more stuff
	for (uint output_y = min_output_y + gl_LocalInvocationID.y; output_y < max_output_y; output_y += uint(NUM_TEMP_SAMPLES)) {
		int base_idx = center_input_from_output(output_y) - PREFIX(vertical_int_radius) - min_input_y;
		int sample_y = int(output_y) % PREFIX(num_vertical_filters);

		vec4 sum = vec4(0.0);
		for (int i = 0; i < PREFIX(num_vertical_samples); ++i) {
			float weight = texelFetch(PREFIX(sample_tex_vertical), ivec2(i, sample_y), 0).r;
#if PACK_HALF
			uvec2 packed_val = column[base_idx + i];
			vec4 val;
			val.xy = unpackHalf2x16(packed_val.x);
			val.zw = unpackHalf2x16(packed_val.y);
			sum += val * weight;
#else
			sum += column[base_idx + i] * weight;
#endif
		}
		OUTPUT(uvec2(gl_GlobalInvocationID.x, output_y), sum);
	}
}

#undef PACK_HALF
#undef NUM_TEMP_SAMPLES
#undef NUM_STORED_HORIZONTAL_COEFFS
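
// For readers outside the framework: PREFIX(), FUNCNAME, INPUT() and OUTPUT() are macros
// supplied by the surrounding framework when this template is instantiated. As a rough,
// hypothetical sketch only (names and expansions are assumptions, not the actual glue code),
// they could expand to something like:
//
//   #define PREFIX(x) eff0_ ## x                                      // per-instance uniform name mangling
//   #define FUNCNAME eff0_main                                        // per-instance entry point
//   #define INPUT(tc) texture(eff0_input_tex, (tc))                   // sample the upstream effect
//   #define OUTPUT(coord, val) imageStore(eff0_output_image, ivec2(coord), (val))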