2 // uniform sampler2D PREFIX(sample_tex_horizontal);
3 // uniform sampler2D PREFIX(sample_tex_vertical);
4 // uniform int PREFIX(output_width);
5 // uniform int PREFIX(output_height);
6 // uniform int PREFIX(num_horizontal_samples);
7 // uniform int PREFIX(num_vertical_samples);
8 // uniform int PREFIX(vertical_int_radius);
9 // uniform int PREFIX(num_horizontal_filters);
10 // uniform int PREFIX(num_vertical_filters);
11 // uniform int PREFIX(vertical_whole_pixel_offset);
12 // uniform float PREFIX(inv_vertical_scaling_factor);
13 // uniform float PREFIX(slice_height);
14 // uniform float PREFIX(horizontal_whole_pixel_offset);
15 // uniform float PREFIX(inv_input_height);
16 // uniform float PREFIX(input_texcoord_y_adjust);
18 // Number of samples we have room for between the stages. If this gets
19 // higher, we will have bigger but fewer blocks, which has a few pros and cons:
21 // + More inter-block parallelism (too low means you cannot fill a warp)
22 // + Fewer pixels being wasted to overlap (less work being done overall)
23 // - More risk of overflowing the L1 cache
24 // - Less overall GPU parallelism (too few blocks means some GPU cores will
25 // not have a block to work on)
27 // The current value is a tradeoff; some source/destination sizes will
28 // benefit from different values.
// This is also the local workgroup height (see the layout() declaration) and
// the size of the shared column[] buffer below.
29 #define NUM_TEMP_SAMPLES 128
31 // Number of horizontal resampling coefficients (sample positions and
32 // weight) we store in shared memory; generally, as long as we have less
33 // than 25x downscaling or so, this will be enough.
// Each stored coefficient is a vec2: .x = weight, .y = source texcoord
// (see horizontal_coeffs[] and get_horizontal_coeff() below).
34 #define NUM_STORED_HORIZONTAL_COEFFS 128
36 // Packing the intermediate results into fp16 saves a bit of shared memory,
37 // but more importantly, seems to reduce the number of bank conflicts
38 // (stalls when multiple members of a warp try to access the same bank
39 // at the same time, where bank = address & 0x3c). The actual effect will
40 // naturally depend a lot on the scaling factor, but we're talking about
41 // 5–15% improvement for NVIDIA in our microbenchmarks (although some are
42 // neutral and occasionally one might even be negative). Intel appears to be
43 // less sensitive, although that might be because more time overall goes
44 // towards the texture sampling.
46 // NVIDIA has an extension GL_NV_gpu_shader5 that gives native half types,
47 // but it doesn't buy us any speed, just less ugly syntax for packing/unpacking.
48 #ifdef GL_ARB_shading_language_packing
54 // In theory, these could conflict with another effect since we don't use
55 // PREFIX(), but there can only be one instance of each compute shader,
56 // and having PREFIX() everywhere on non-uniforms becomes increasingly unreadable.
// Horizontal filter taps cached in shared memory once per workgroup:
// .x = weight, .y = source x texcoord (see get_horizontal_coeff()).
58 shared vec2 horizontal_coeffs[NUM_STORED_HORIZONTAL_COEFFS];
// Intermediate column of horizontally-resampled pixels handed from the
// horizontal stage to the vertical stage. The uvec2 form packs the four
// channels as two fp16 pairs (packHalf2x16); the vec4 form stores them
// unpacked.
// NOTE(review): both declarations appear back-to-back with no #else/#endif
// visible between them -- presumably the uvec2 variant belongs to the
// GL_ARB_shading_language_packing branch above and the vec4 variant to its
// #else; confirm against the full file.
60 shared uvec2 column[NUM_TEMP_SAMPLES]; // this is more fixed-ish, see below
62 shared vec4 column[NUM_TEMP_SAMPLES]; // this is more fixed-ish, see below
// One invocation per intermediate row; gl_GlobalInvocationID.x selects the
// output column (see the OUTPUT() call in the main body).
65 layout(local_size_x = 1, local_size_y = NUM_TEMP_SAMPLES) in;
67 // Find out where the C++ code placed the center of the filter (this is exactly the same calculation,
68 // just with whole_pixel_offset added in).
//
// Maps an output row index to the input row at the center of its vertical
// filter kernel; roundEven() keeps the GPU rounding identical to the C++
// setup code, so both sides agree on filter placement.
// NOTE(review): the function's braces are elided from this view.
69 int center_input_from_output(uint output_y)
71 return int(roundEven((output_y + 0.5f) * PREFIX(inv_vertical_scaling_factor) - 0.5f)) + PREFIX(vertical_whole_pixel_offset);
// Converts an integer input row (min_input_y + invocation_index) into the
// normalized texture coordinate used when sampling the input texture
// (scaled by inv_input_height, shifted by input_texcoord_y_adjust).
// NOTE(review): the function's braces are elided from this view.
74 float normalized_input_y(int min_input_y, int invocation_index)
76 return (min_input_y + invocation_index) * PREFIX(inv_input_height) + PREFIX(input_texcoord_y_adjust);
// Fetches the x-th horizontal filter tap for this invocation's output column:
// .r (weight) and .g (source x texcoord), with .g shifted into the slice of
// the input this column belongs to.
// NOTE(review): the declaration of sample_tc (presumably an ivec2 with
// sample_tc.x = x) and the final "return s;" are elided from this view --
// confirm against the full file.
79 vec2 get_horizontal_coeff(int x)
// The filter set repeats every num_horizontal_filters output columns.
83 sample_tc.y = int(gl_GlobalInvocationID.x) % PREFIX(num_horizontal_filters);
84 vec2 s = texelFetch(PREFIX(sample_tex_horizontal), sample_tc, 0).rg;
// Offset the sample position by this column's slice, plus the global
// whole-pixel offset.
85 s.g += ((int(gl_GlobalInvocationID.x) / PREFIX(num_horizontal_filters)) * PREFIX(slice_height) + PREFIX(horizontal_whole_pixel_offset));
// Computes one horizontally-resampled pixel on input row input_y: a weighted
// sum over the horizontal taps (s.y = source x texcoord, s.x = weight).
// NOTE(review): the declaration/initialization of sum (presumably
// vec4 sum = vec4(0.0);), the else between the two paths, the closing braces
// and the return statement are elided from this view -- confirm against the
// full file.
89 vec4 do_horizontal_sampling(float input_y)
// Fast path: every tap was preloaded into shared memory.
92 if (PREFIX(num_horizontal_samples) <= NUM_STORED_HORIZONTAL_COEFFS) {
93 for (int i = 0; i < PREFIX(num_horizontal_samples); ++i) {
94 vec2 s = horizontal_coeffs[i];
95 sum += INPUT(vec2(s.y, input_y)) * s.x;
98 // Not enough shared memory available to hold the horizontal resampling coefficients,
99 // so load the remaining ones as we go. This is generally somewhat slower, even though
100 // all elements of the warp will be loading the same texture sample, so it is definitely
101 // a slow path that we will only see in extreme downsampling (which is, unfortunately,
102 // the case that's the most hurt by loading coefficients on-the-fly).
104 // Other strategies would be possible, including loading coefficients in
105 // multiple phases (or skipping the shared memory altogether if there's no room),
106 // but this is the simplest and not too slow.
// Use the cached taps first...
107 for (int i = 0; i < NUM_STORED_HORIZONTAL_COEFFS; ++i) {
108 vec2 s = horizontal_coeffs[i];
109 sum += INPUT(vec2(s.y, input_y)) * s.x;
// ...then fetch the remainder straight from the coefficient texture.
111 for (int i = NUM_STORED_HORIZONTAL_COEFFS; i < PREFIX(num_horizontal_samples); ++i) {
112 vec2 s = get_horizontal_coeff(i);
113 sum += INPUT(vec2(s.y, input_y)) * s.x;
120 // This is a bit tricky: The x and y workgroup IDs are in the _output_ texture,
121 // but when doing horizontal sampling, the local y invocation ID is in the _input_
// texture's row space.
// NOTE(review): the sentence above continues on a line elided from this view
// (as is the enclosing "void main()" header); confirm against the full file.
//
// Output rows this workgroup is responsible for: [min_output_y, max_output_y).
123 uint min_output_y = uint(PREFIX(output_samples_per_block) * int(gl_WorkGroupID.y));
124 uint max_output_y = min(min_output_y + uint(PREFIX(output_samples_per_block)), uint(PREFIX(output_size).y)); // Exclusive.
// Corresponding input rows needed, padded by the vertical filter radius on
// both ends.
126 int min_input_y = center_input_from_output(min_output_y) - PREFIX(vertical_int_radius);
127 int max_input_y = center_input_from_output(max_output_y - 1u) + PREFIX(vertical_int_radius); // Inclusive.
129 // Load coefficients for the horizontal resampling.
// Each invocation loads at most one tap into shared horizontal_coeffs[].
130 if (gl_LocalInvocationID.y < uint(PREFIX(num_horizontal_samples))) {
131 horizontal_coeffs[gl_LocalInvocationID.y] = get_horizontal_coeff(int(gl_LocalInvocationID.y));
// NOTE(review): a barrier() is required between the shared-memory writes
// above and the reads in do_horizontal_sampling() below; it is presumably on
// lines elided from this view -- confirm against the full file.
137 // Do the actual horizontal sampling for this row.
138 if (min_input_y + int(gl_LocalInvocationID.y) <= max_input_y) {
139 float input_y = normalized_input_y(min_input_y, int(gl_LocalInvocationID.y));
140 vec4 val = do_horizontal_sampling(input_y);
// Stash the result in the shared column buffer: packed to 2x fp16 pairs, or
// as a plain vec4.
// NOTE(review): both stores appear here with no conditional visible between
// them -- presumably the two arms of an elided
// #ifdef GL_ARB_shading_language_packing / #else; confirm.
142 column[gl_LocalInvocationID.y] = uvec2(packHalf2x16(val.xy), packHalf2x16(val.zw));
144 column[gl_LocalInvocationID.y] = val;
151 // Vertical resampling. For downscaling, we'll only have one iteration
152 // through this loop, but for upscaling, we may have several.
154 // FIXME: if NUM_TEMP_SAMPLES is too small, we need yet more stuff
// Each iteration produces one output pixel: a weighted sum over
// num_vertical_samples rows of the shared column buffer.
155 for (uint output_y = min_output_y + gl_LocalInvocationID.y; output_y < max_output_y; output_y += uint(NUM_TEMP_SAMPLES)) {
// First column-buffer row touched by this output row's filter window.
156 int base_idx = center_input_from_output(output_y) - PREFIX(vertical_int_radius) - min_input_y;
// Which precomputed vertical filter applies to this output row.
157 int sample_y = int(output_y) % PREFIX(num_vertical_filters);
159 vec4 sum = vec4(0.0);
160 for (int i = 0; i < PREFIX(num_vertical_samples); ++i) {
161 float weight = texelFetch(PREFIX(sample_tex_vertical), ivec2(i, sample_y), 0).r;
// NOTE(review): the packed (fp16) read below and the plain vec4 read after
// it are presumably the two arms of an elided #ifdef as well; the
// declaration of val and the packed-path accumulate into sum are likewise
// not visible in this view -- confirm against the full file.
163 uvec2 packed_val = column[base_idx + i];
165 val.xy = unpackHalf2x16(packed_val.x);
166 val.zw = unpackHalf2x16(packed_val.y);
169 sum += column[base_idx + i] * weight;
172 OUTPUT(uvec2(gl_GlobalInvocationID.x, output_y), sum);
// Clean up file-local macros so they don't leak into any shader source
// concatenated after this file.
177 #undef NUM_TEMP_SAMPLES
178 #undef NUM_STORED_HORIZONTAL_COEFFS