]> git.sesse.net Git - movit/blobdiff - resample_effect.comp
Compute version of ResampleEffect.
[movit] / resample_effect.comp
diff --git a/resample_effect.comp b/resample_effect.comp
new file mode 100644 (file)
index 0000000..5d7fa0f
--- /dev/null
@@ -0,0 +1,178 @@
+// Implicit uniforms:
+// uniform sampler2D PREFIX(sample_tex_horizontal);
+// uniform sampler2D PREFIX(sample_tex_vertical);
+// uniform int PREFIX(output_width);
+// uniform int PREFIX(output_height);
+// uniform int PREFIX(output_samples_per_block);
+// uniform ivec2 PREFIX(output_size);
+// uniform int PREFIX(num_horizontal_samples);
+// uniform int PREFIX(num_vertical_samples);
+// uniform int PREFIX(vertical_int_radius);
+// uniform int PREFIX(num_horizontal_filters);
+// uniform int PREFIX(num_vertical_filters);
+// uniform int PREFIX(vertical_whole_pixel_offset);
+// uniform float PREFIX(inv_vertical_scaling_factor);
+// uniform float PREFIX(slice_height);
+// uniform float PREFIX(horizontal_whole_pixel_offset);
+// uniform float PREFIX(inv_input_height);
+// uniform float PREFIX(input_texcoord_y_adjust);
+
+// Number of samples we have room for between the stages. If this gets
+// higher, we will have bigger but fewer blocks, which has a few pros and cons:
+//
+//  + More inter-block parallelism (too low means you cannot fill a warp)
+//  + Fewer pixels being wasted to overlap (less work being done overall)
+//  - More risk of overflowing the L1 cache
+//  - Less overall GPU parallelism (too few blocks means some GPU cores will
+//    not have a block to work on)
+//
+// The current value is a tradeoff; some source/destination sizes will
+// benefit from different values.
+#define NUM_TEMP_SAMPLES 128
+
+// Number of horizontal resampling coefficients (sample positions and
+// weight) we store in shared memory; generally, as long as we have less
+// than 25x downscaling or so, this will be enough.
+#define NUM_STORED_HORIZONTAL_COEFFS 128
+
+// Packing the intermediate results into fp16 saves a bit of shared memory,
+// but more importantly, seems to reduce the number of bank conflicts
+// (stalls when multiple members of a warp try to access the same bank
+// at the same time, where bank = address & 0x3c). The actual effect will
+// naturally depend a lot on the scaling factor, but we're talking about
+// 5–15% improvement for NVIDIA in our microbenchmarks (although some are
+// neutral and occasionally one might even be negative). Intel appears to be
+// less sensitive, although that might be because more time overall goes
+// towards the texture sampling.
+//
+// NVIDIA has an extension GL_NV_gpu_shader5 that gives native half types,
+// but it doesn't buy us any speed, just less ugly syntax for packing/unpacking.
+#ifdef GL_ARB_shading_language_packing
+#define PACK_HALF 1
+#else
+#define PACK_HALF 0
+#endif
+
+// In theory, these could conflict with another effect since we don't use
+// PREFIX(), but there can only be one instance of each compute shader,
+// and having PREFIX() everywhere on non-uniforms becomes increasingly unreadable.
+
+// Horizontal filter coefficients for this workgroup's output column:
+// .x = weight, .y = source x texcoord (see get_horizontal_coeff()).
+shared vec2 horizontal_coeffs[NUM_STORED_HORIZONTAL_COEFFS];
+// One column of horizontally-resampled pixels, handed from the horizontal
+// pass to the vertical pass; packed as 4 x fp16 when PACK_HALF is set.
+#if PACK_HALF
+shared uvec2 column[NUM_TEMP_SAMPLES];  // this is more fixed-ish, see below
+#else
+shared vec4 column[NUM_TEMP_SAMPLES];  // this is more fixed-ish, see below
+#endif
+
+// One invocation per temp sample; each workgroup handles one output column.
+layout(local_size_x = 1, local_size_y = NUM_TEMP_SAMPLES) in;
+
+// Find out where the C++ code placed the center of the filter (this is exactly the same calculation,
+// just with whole_pixel_offset added in). Uses roundEven() to match the C++
+// side's rounding exactly.
+int center_input_from_output(uint output_y)
+{
+	float filter_center = (float(output_y) + 0.5f) * PREFIX(inv_vertical_scaling_factor) - 0.5f;
+	return PREFIX(vertical_whole_pixel_offset) + int(roundEven(filter_center));
+}
+
+// Convert an absolute input row (min_input_y plus the local invocation's
+// offset) into a normalized texture coordinate for sampling the input.
+float normalized_input_y(int min_input_y, int invocation_index)
+{
+	int absolute_input_y = min_input_y + invocation_index;
+	return absolute_input_y * PREFIX(inv_input_height) + PREFIX(input_texcoord_y_adjust);
+}
+
+// Fetch the x-th horizontal resampling coefficient (.x = weight, .y = source
+// x texcoord) for this invocation's output column. Which filter row we read
+// depends on the column's position within its slice; the sample position is
+// then shifted into the correct slice.
+vec2 get_horizontal_coeff(int x)
+{
+	int filter_idx = int(gl_GlobalInvocationID.x) % PREFIX(num_horizontal_filters);
+	int slice_idx = int(gl_GlobalInvocationID.x) / PREFIX(num_horizontal_filters);
+	vec2 coeff = texelFetch(PREFIX(sample_tex_horizontal), ivec2(x, filter_idx), 0).rg;
+	coeff.g += slice_idx * PREFIX(slice_height) + PREFIX(horizontal_whole_pixel_offset);
+	return coeff;
+}
+
+// Produce one fully horizontally-resampled pixel for the input row at
+// normalized texcoord input_y, for this invocation's output column: a
+// weighted sum of num_horizontal_samples input texels, where each coefficient
+// is (.x = weight, .y = source x texcoord).
+vec4 do_horizontal_sampling(float input_y)
+{
+	vec4 sum = vec4(0.0);
+	if (PREFIX(num_horizontal_samples) <= NUM_STORED_HORIZONTAL_COEFFS) {
+		// Fast path: all coefficients were preloaded into shared memory
+		// by the caller before the barrier.
+		for (int i = 0; i < PREFIX(num_horizontal_samples); ++i) {
+			vec2 s = horizontal_coeffs[i];
+			sum += INPUT(vec2(s.y, input_y)) * s.x;
+		}
+	} else {
+		// Not enough shared memory available to hold the horizontal resampling coefficients,
+		// so load the remaining ones as we go. This is generally somewhat slower, even though
+		// all elements of the warp will be loading the same texture sample, so it is definitely
+		// a slow path that we will only see in extreme downsampling (which is, unfortunately,
+		// the case that's the most hurt by loading coefficients on-the-fly).
+		//
+		// Other strategies would be possible, including loading coefficients in
+		// multiple phases (or skipping the shared memory altogether if there's no room),
+		// but this is the simplest and not too slow.
+		for (int i = 0; i < NUM_STORED_HORIZONTAL_COEFFS; ++i) {
+			vec2 s = horizontal_coeffs[i];
+			sum += INPUT(vec2(s.y, input_y)) * s.x;
+		}
+		for (int i = NUM_STORED_HORIZONTAL_COEFFS; i < PREFIX(num_horizontal_samples); ++i) {
+			vec2 s = get_horizontal_coeff(i);
+			sum += INPUT(vec2(s.y, input_y)) * s.x;
+		}
+	}
+	return sum;
+}
+
+// Entry point: two-pass (horizontal, then vertical) resampling of one output
+// column per workgroup, staging the horizontal result in shared memory.
+void FUNCNAME() {
+	// This is a bit tricky: The x and y workgroup IDs are in the _output_ texture,
+	// but when doing horizontal sampling, the local y invocation ID is in the _input_
+	// texture.
+	uint min_output_y = uint(PREFIX(output_samples_per_block) * int(gl_WorkGroupID.y));
+	uint max_output_y = min(min_output_y + uint(PREFIX(output_samples_per_block)), uint(PREFIX(output_size).y));  // Exclusive.
+
+	// The input rows this block needs: the vertical filter's support around
+	// the centers of the first and last output rows.
+	int min_input_y = center_input_from_output(min_output_y) - PREFIX(vertical_int_radius);
+	int max_input_y = center_input_from_output(max_output_y - 1u) + PREFIX(vertical_int_radius);  // Inclusive.
+
+	// Load coefficients for the horizontal resampling.
+	// NOTE(review): this relies on local_size_y (NUM_TEMP_SAMPLES) being
+	// <= NUM_STORED_HORIZONTAL_COEFFS, or the write could go out of bounds of
+	// horizontal_coeffs[]; both are currently 128 — confirm if either changes.
+	if (gl_LocalInvocationID.y < uint(PREFIX(num_horizontal_samples))) {
+		horizontal_coeffs[gl_LocalInvocationID.y] = get_horizontal_coeff(int(gl_LocalInvocationID.y));
+	}
+
+	// Make the coefficient stores visible to the whole workgroup, and make
+	// sure everyone has passed the load phase, before anyone reads them.
+	memoryBarrier();
+	barrier();
+
+	// Do the actual horizontal sampling for this row.
+	if (min_input_y + int(gl_LocalInvocationID.y) <= max_input_y) {
+		float input_y = normalized_input_y(min_input_y, int(gl_LocalInvocationID.y));
+		vec4 val = do_horizontal_sampling(input_y);
+#if PACK_HALF
+		column[gl_LocalInvocationID.y] = uvec2(packHalf2x16(val.xy), packHalf2x16(val.zw));
+#else
+		column[gl_LocalInvocationID.y] = val;
+#endif
+	}
+
+	// Wait until the entire column of horizontally-sampled values is in
+	// shared memory before the vertical pass starts reading it.
+	memoryBarrier();
+	barrier();
+
+	// Vertical resampling. For downscaling, we'll only have one iteration
+	// through this loop, but for upscaling, we may have several.
+	//
+	// FIXME: if NUM_TEMP_SAMPLES is too small, we need yet more stuff
+	for (uint output_y = min_output_y + gl_LocalInvocationID.y; output_y < max_output_y; output_y += uint(NUM_TEMP_SAMPLES)) {
+		// Index into column[] of the first input row this output row needs.
+		int base_idx = center_input_from_output(output_y) - PREFIX(vertical_int_radius) - min_input_y;
+		int sample_y = int(output_y) % PREFIX(num_vertical_filters);
+
+		vec4 sum = vec4(0.0);
+		for (int i = 0; i < PREFIX(num_vertical_samples); ++i) {
+			float weight = texelFetch(PREFIX(sample_tex_vertical), ivec2(i, sample_y), 0).r;
+#if PACK_HALF
+			uvec2 packed_val = column[base_idx + i];
+			vec4 val;
+			val.xy = unpackHalf2x16(packed_val.x);
+			val.zw = unpackHalf2x16(packed_val.y);
+			sum += val * weight;
+#else
+			sum += column[base_idx + i] * weight;
+#endif
+		}
+		OUTPUT(uvec2(gl_GlobalInvocationID.x, output_y), sum);
+	}
+}
+
+#undef PACK_HALF
+#undef NUM_TEMP_SAMPLES
+#undef NUM_STORED_HORIZONTAL_COEFFS