From: Steinar H. Gunderson
Date: Sun, 31 Dec 2017 09:43:56 +0000 (+0100)
Subject: Compute version of ResampleEffect.
X-Git-Url: https://git.sesse.net/?p=movit;a=commitdiff_plain;h=refs%2Fheads%2Fcompute_resample

Compute version of ResampleEffect.

This has been abandoned; see the comment in resample_effect.h for why.
---

diff --git a/header.comp b/header.comp
index 47f1315..1453448 100644
--- a/header.comp
+++ b/header.comp
@@ -2,6 +2,9 @@
 #extension GL_ARB_compute_shader : enable
 #extension GL_ARB_shader_image_load_store : enable
 #extension GL_ARB_shader_image_size : enable
+#ifdef GL_ARB_shading_language_packing
+#extension GL_ARB_shading_language_packing : enable
+#endif
 
 // FIXME this needs to be auto-output or something
 uniform restrict writeonly image2D tex_outbuf;
diff --git a/resample_effect.comp b/resample_effect.comp
new file mode 100644
index 0000000..5d7fa0f
--- /dev/null
+++ b/resample_effect.comp
@@ -0,0 +1,178 @@
+// Implicit uniforms:
+// uniform sampler2D PREFIX(sample_tex_horizontal);
+// uniform sampler2D PREFIX(sample_tex_vertical);
+// uniform int PREFIX(output_width);
+// uniform int PREFIX(output_height);
+// uniform int PREFIX(num_horizontal_samples);
+// uniform int PREFIX(num_vertical_samples);
+// uniform int PREFIX(vertical_int_radius);
+// uniform int PREFIX(output_samples_per_block);
+// uniform int PREFIX(num_horizontal_filters);
+// uniform int PREFIX(num_vertical_filters);
+// uniform int PREFIX(vertical_whole_pixel_offset);
+// uniform float PREFIX(inv_vertical_scaling_factor);
+// uniform float PREFIX(slice_height);
+// uniform float PREFIX(horizontal_whole_pixel_offset);
+// uniform float PREFIX(inv_input_height);
+// uniform float PREFIX(input_texcoord_y_adjust);
+
+// Number of samples we have room for between the stages. If this gets
+// higher, we will have bigger but fewer blocks, which has a few pros and cons:
+//
+//  + More intra-block parallelism (too low means you cannot fill a warp)
+//  + Fewer pixels wasted on overlap (less work being done overall)
+//  - More risk of overflowing the L1 cache
+//  - Less overall GPU parallelism (too few blocks means some GPU cores will
+//    not have a block to work on)
+//
+// The current value is a tradeoff; some source/destination sizes will
+// benefit from different values.
+#define NUM_TEMP_SAMPLES 128
+
+// Number of horizontal resampling coefficients (sample positions and
+// weights) we store in shared memory; generally, as long as we have less
+// than 25x downscaling or so, this will be enough.
+#define NUM_STORED_HORIZONTAL_COEFFS 128
+
+// Packing the intermediate results into fp16 saves a bit of shared memory,
+// but more importantly, seems to reduce the number of bank conflicts
+// (stalls when multiple members of a warp try to access the same bank
+// at the same time, where bank = address & 0x3c). The actual effect will
+// naturally depend a lot on the scaling factor, but we're talking about
+// 5–15% improvement for NVIDIA in our microbenchmarks (although some are
+// neutral and occasionally one might even be negative). Intel appears to be
+// less sensitive, although that might be because more time overall goes
+// towards the texture sampling.
+//
+// NVIDIA has an extension GL_NV_gpu_shader5 that gives native half types,
+// but it doesn't buy us any speed, just less ugly syntax for packing/unpacking.
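+//
+// Concretely, the round trip through shared memory when packing is available
+// looks like this (as used in do_horizontal_sampling() and the vertical loop):
+//
+//   uvec2 p = uvec2(packHalf2x16(val.xy), packHalf2x16(val.zw));
+//   vec4 restored = vec4(unpackHalf2x16(p.x), unpackHalf2x16(p.y));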
+#ifdef GL_ARB_shading_language_packing
+#define PACK_HALF 1
+#else
+#define PACK_HALF 0
+#endif
+
+// In theory, these could conflict with another effect since we don't use
+// PREFIX(), but there can only be one instance of each compute shader,
+// and having PREFIX() everywhere on non-uniforms becomes increasingly unreadable.
+
+shared vec2 horizontal_coeffs[NUM_STORED_HORIZONTAL_COEFFS];
+#if PACK_HALF
+shared uvec2 column[NUM_TEMP_SAMPLES];  // this is more fixed-ish, see below
+#else
+shared vec4 column[NUM_TEMP_SAMPLES];  // this is more fixed-ish, see below
+#endif
+
+layout(local_size_x = 1, local_size_y = NUM_TEMP_SAMPLES) in;
+
+// Find out where the C++ code placed the center of the filter (this is exactly the same calculation,
+// just with whole_pixel_offset added in).
+int center_input_from_output(uint output_y)
+{
+	return int(roundEven((output_y + 0.5f) * PREFIX(inv_vertical_scaling_factor) - 0.5f)) + PREFIX(vertical_whole_pixel_offset);
+}
+
+float normalized_input_y(int min_input_y, int invocation_index)
+{
+	return (min_input_y + invocation_index) * PREFIX(inv_input_height) + PREFIX(input_texcoord_y_adjust);
+}
+
+vec2 get_horizontal_coeff(int x)
+{
+	ivec2 sample_tc;
+	sample_tc.x = x;
+	sample_tc.y = int(gl_GlobalInvocationID.x) % PREFIX(num_horizontal_filters);
+	vec2 s = texelFetch(PREFIX(sample_tex_horizontal), sample_tc, 0).rg;
+	s.g += ((int(gl_GlobalInvocationID.x) / PREFIX(num_horizontal_filters)) * PREFIX(slice_height) + PREFIX(horizontal_whole_pixel_offset));
+	return s;
+}
+
+vec4 do_horizontal_sampling(float input_y)
+{
+	vec4 sum = vec4(0.0);
+	if (PREFIX(num_horizontal_samples) <= NUM_STORED_HORIZONTAL_COEFFS) {
+		for (int i = 0; i < PREFIX(num_horizontal_samples); ++i) {
+			vec2 s = horizontal_coeffs[i];
+			sum += INPUT(vec2(s.y, input_y)) * s.x;
+		}
+	} else {
+		// Not enough shared memory available to hold the horizontal resampling coefficients,
+		// so load the remaining ones as we go. This is generally somewhat slower, even though
+		// all elements of the warp will be loading the same texture sample, so it is definitely
+		// a slow path that we will only see in extreme downsampling (which is, unfortunately,
+		// the case that is hurt the most by loading coefficients on the fly).
+		//
+		// Other strategies would be possible, including loading coefficients in
+		// multiple phases (or skipping the shared memory altogether if there's no room),
+		// but this is the simplest and not too slow.
+		for (int i = 0; i < NUM_STORED_HORIZONTAL_COEFFS; ++i) {
+			vec2 s = horizontal_coeffs[i];
+			sum += INPUT(vec2(s.y, input_y)) * s.x;
+		}
+		for (int i = NUM_STORED_HORIZONTAL_COEFFS; i < PREFIX(num_horizontal_samples); ++i) {
+			vec2 s = get_horizontal_coeff(i);
+			sum += INPUT(vec2(s.y, input_y)) * s.x;
+		}
+	}
+	return sum;
+}
+
+void FUNCNAME() {
+	// This is a bit tricky: The x and y workgroup IDs are in the _output_ texture,
+	// but when doing horizontal sampling, the local y invocation ID is in the _input_
+	// texture.
+	uint min_output_y = uint(PREFIX(output_samples_per_block) * int(gl_WorkGroupID.y));
+	uint max_output_y = min(min_output_y + uint(PREFIX(output_samples_per_block)), uint(PREFIX(output_size).y));  // Exclusive.
+
+	int min_input_y = center_input_from_output(min_output_y) - PREFIX(vertical_int_radius);
+	int max_input_y = center_input_from_output(max_output_y - 1u) + PREFIX(vertical_int_radius);  // Inclusive.
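+
+	// (A worked example with hypothetical numbers: for 2x vertical upscaling,
+	// inv_vertical_scaling_factor = 0.5 and a zero whole-pixel offset, so
+	// output_y = 7 gives roundEven((7 + 0.5) * 0.5 - 0.5) = roundEven(3.25) = 3;
+	// that is, the filter for output row 7 is centered on input row 3.)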
+
+	// Load coefficients for the horizontal resampling.
+	if (gl_LocalInvocationID.y < uint(PREFIX(num_horizontal_samples))) {
+		horizontal_coeffs[gl_LocalInvocationID.y] = get_horizontal_coeff(int(gl_LocalInvocationID.y));
+	}
+
+	memoryBarrier();
+	barrier();
+
+	// Do the actual horizontal sampling for this row.
+	if (min_input_y + int(gl_LocalInvocationID.y) <= max_input_y) {
+		float input_y = normalized_input_y(min_input_y, int(gl_LocalInvocationID.y));
+		vec4 val = do_horizontal_sampling(input_y);
+#if PACK_HALF
+		column[gl_LocalInvocationID.y] = uvec2(packHalf2x16(val.xy), packHalf2x16(val.zw));
+#else
+		column[gl_LocalInvocationID.y] = val;
+#endif
+	}
+
+	memoryBarrier();
+	barrier();
+
+	// Vertical resampling. For downscaling, we'll only have one iteration
+	// through this loop, but for upscaling, we may have several.
+	//
+	// FIXME: if NUM_TEMP_SAMPLES is too small, we need yet more stuff
+	for (uint output_y = min_output_y + gl_LocalInvocationID.y; output_y < max_output_y; output_y += uint(NUM_TEMP_SAMPLES)) {
+		int base_idx = center_input_from_output(output_y) - PREFIX(vertical_int_radius) - min_input_y;
+		int sample_y = int(output_y) % PREFIX(num_vertical_filters);
+
+		vec4 sum = vec4(0.0);
+		for (int i = 0; i < PREFIX(num_vertical_samples); ++i) {
+			float weight = texelFetch(PREFIX(sample_tex_vertical), ivec2(i, sample_y), 0).r;
+#if PACK_HALF
+			uvec2 packed_val = column[base_idx + i];
+			vec4 val;
+			val.xy = unpackHalf2x16(packed_val.x);
+			val.zw = unpackHalf2x16(packed_val.y);
+			sum += val * weight;
+#else
+			sum += column[base_idx + i] * weight;
+#endif
+		}
+		OUTPUT(uvec2(gl_GlobalInvocationID.x, output_y), sum);
+	}
+}
+
+#undef PACK_HALF
+#undef NUM_TEMP_SAMPLES
+#undef NUM_STORED_HORIZONTAL_COEFFS
diff --git a/resample_effect.cpp b/resample_effect.cpp
index c688d70..7450864 100644
--- a/resample_effect.cpp
+++ b/resample_effect.cpp
@@ -197,6 +197,21 @@ void normalize_sum(Tap<T>* vals, unsigned num)
 	}
 }
 
+// Two passes, so that rounding error introduced by from_fp32() in the first
+// pass (e.g., fp16 quantization) is largely normalized away in the second.
+template<class T>
+void normalize_sum(T* vals, unsigned num)
+{
+	for (int normalize_pass = 0; normalize_pass < 2; ++normalize_pass) {
+		float sum = 0.0;
+		for (unsigned i = 0; i < num; ++i) {
+			sum += to_fp32(vals[i]);
+		}
+		float inv_sum = 1.0 / sum;
+		for (unsigned i = 0; i < num; ++i) {
+			vals[i] = from_fp32<T>(to_fp32(vals[i]) * inv_sum);
+		}
+	}
+}
+
 // Make use of the bilinear filtering in the GPU to reduce the number of samples
 // we need to make. This is a bit more complex than BlurEffect since we cannot combine
 // two neighboring samples if their weights have differing signs, so we first need to
@@ -309,13 +324,19 @@ ResampleEffect::ResampleEffect()
 	register_int("width", &output_width);
 	register_int("height", &output_height);
 
-	// The first blur pass will forward resolution information to us.
-	hpass_owner.reset(new SingleResamplePassEffect(this));
-	hpass = hpass_owner.get();
-	CHECK(hpass->set_int("direction", SingleResamplePassEffect::HORIZONTAL));
-	vpass_owner.reset(new SingleResamplePassEffect(this));
-	vpass = vpass_owner.get();
-	CHECK(vpass->set_int("direction", SingleResamplePassEffect::VERTICAL));
+	if (movit_compute_shaders_supported) {
+		// The effect will forward resolution information to us.
+		compute_effect_owner.reset(new ResampleComputeEffect(this));
+		compute_effect = compute_effect_owner.get();
+	} else {
+		// The first blur pass will forward resolution information to us.
+ hpass_owner.reset(new SingleResamplePassEffect(this)); + hpass = hpass_owner.get(); + CHECK(hpass->set_int("direction", SingleResamplePassEffect::HORIZONTAL)); + vpass_owner.reset(new SingleResamplePassEffect(this)); + vpass = vpass_owner.get(); + CHECK(vpass->set_int("direction", SingleResamplePassEffect::VERTICAL)); + } update_size(); } @@ -326,11 +347,17 @@ ResampleEffect::~ResampleEffect() void ResampleEffect::rewrite_graph(EffectChain *graph, Node *self) { - Node *hpass_node = graph->add_node(hpass_owner.release()); - Node *vpass_node = graph->add_node(vpass_owner.release()); - graph->connect_nodes(hpass_node, vpass_node); - graph->replace_receiver(self, hpass_node); - graph->replace_sender(self, vpass_node); + if (compute_effect != nullptr) { + Node *compute_node = graph->add_node(compute_effect_owner.release()); + graph->replace_receiver(self, compute_node); + graph->replace_sender(self, compute_node); + } else { + Node *hpass_node = graph->add_node(hpass_owner.release()); + Node *vpass_node = graph->add_node(vpass_owner.release()); + graph->connect_nodes(hpass_node, vpass_node); + graph->replace_receiver(self, hpass_node); + graph->replace_sender(self, vpass_node); + } self->disabled = true; } @@ -349,16 +376,22 @@ void ResampleEffect::inform_input_size(unsigned input_num, unsigned width, unsig void ResampleEffect::update_size() { bool ok = true; - ok |= hpass->set_int("input_width", input_width); - ok |= hpass->set_int("input_height", input_height); - ok |= hpass->set_int("output_width", output_width); - ok |= hpass->set_int("output_height", input_height); - - ok |= vpass->set_int("input_width", output_width); - ok |= vpass->set_int("input_height", input_height); - ok |= vpass->set_int("output_width", output_width); - ok |= vpass->set_int("output_height", output_height); + if (compute_effect != nullptr) { + ok |= compute_effect->set_int("input_width", input_width); + ok |= compute_effect->set_int("input_height", input_height); + ok |= compute_effect->set_int("output_width", output_width); + ok |= compute_effect->set_int("output_height", output_height); + } else { + ok |= hpass->set_int("input_width", input_width); + ok |= hpass->set_int("input_height", input_height); + ok |= hpass->set_int("output_width", output_width); + ok |= hpass->set_int("output_height", input_height); + ok |= vpass->set_int("input_width", output_width); + ok |= vpass->set_int("input_height", input_height); + ok |= vpass->set_int("output_width", output_width); + ok |= vpass->set_int("output_height", output_height); + } assert(ok); // The offset added due to zoom may have changed with the size. @@ -374,10 +407,17 @@ void ResampleEffect::update_offset_and_zoom() float extra_offset_x = zoom_center_x * (1.0f - 1.0f / zoom_x) * input_width; float extra_offset_y = (1.0f - zoom_center_y) * (1.0f - 1.0f / zoom_y) * input_height; - ok |= hpass->set_float("offset", extra_offset_x + offset_x); - ok |= vpass->set_float("offset", extra_offset_y - offset_y); // Compensate for the bottom-left origin. - ok |= hpass->set_float("zoom", zoom_x); - ok |= vpass->set_float("zoom", zoom_y); + if (compute_effect != nullptr) { + ok |= compute_effect->set_float("offset_x", extra_offset_x + offset_x); + ok |= compute_effect->set_float("offset_y", extra_offset_y - offset_y); // Compensate for the bottom-left origin. 
+		ok |= compute_effect->set_float("zoom_x", zoom_x);
+		ok |= compute_effect->set_float("zoom_y", zoom_y);
+	} else {
+		ok |= hpass->set_float("offset", extra_offset_x + offset_x);
+		ok |= vpass->set_float("offset", extra_offset_y - offset_y);  // Compensate for the bottom-left origin.
+		ok |= hpass->set_float("zoom", zoom_x);
+		ok |= vpass->set_float("zoom", zoom_y);
+	}
 
 	assert(ok);
 }
@@ -502,7 +542,7 @@ void SingleResamplePassEffect::update_texture(GLuint glsl_program_num, const str
 		assert(false);
 	}
 
-	ScalingWeights weights = calculate_bilinear_scaling_weights(src_size, dst_size, zoom, offset);
+	ScalingWeights weights = calculate_bilinear_scaling_weights(src_size, dst_size, zoom, offset, BilinearFormatConstraints::ALLOW_FP16_AND_FP32);
 	src_bilinear_samples = weights.src_bilinear_samples;
 	num_loops = weights.num_loops;
 	slice_height = 1.0f / weights.num_loops;
@@ -529,6 +569,106 @@ void SingleResamplePassEffect::update_texture(GLuint glsl_program_num, const str
 	tex.update(weights.src_bilinear_samples, weights.dst_samples, internal_format, GL_RG, type, pixels);
 }
 
+ResampleComputeEffect::ResampleComputeEffect(ResampleEffect *parent)
+	: parent(parent),
+	  input_width(1280),
+	  input_height(720),
+	  offset_x(0.0),
+	  offset_y(0.0),
+	  zoom_x(1.0),
+	  zoom_y(1.0),
+	  last_input_width(-1),
+	  last_input_height(-1),
+	  last_output_width(-1),
+	  last_output_height(-1),
+	  last_offset_x(0.0 / 0.0),  // NaN.
+	  last_offset_y(0.0 / 0.0),  // NaN.
+	  last_zoom_x(0.0 / 0.0),  // NaN.
+	  last_zoom_y(0.0 / 0.0)  // NaN.
+{
+	register_int("input_width", &input_width);
+	register_int("input_height", &input_height);
+	register_int("output_width", &output_width);
+	register_int("output_height", &output_height);
+	register_float("offset_x", &offset_x);
+	register_float("offset_y", &offset_y);
+	register_float("zoom_x", &zoom_x);
+	register_float("zoom_y", &zoom_y);
+	register_uniform_sampler2d("sample_tex_horizontal", &uniform_sample_tex_horizontal);
+	register_uniform_sampler2d("sample_tex_vertical", &uniform_sample_tex_vertical);
+	register_uniform_int("num_horizontal_samples", &uniform_num_horizontal_samples);
+	register_uniform_int("num_vertical_samples", &uniform_num_vertical_samples);
+	register_uniform_int("vertical_int_radius", &uniform_vertical_int_radius);
+	register_uniform_float("inv_vertical_scaling_factor", &uniform_inv_vertical_scaling_factor);
+	register_uniform_int("output_samples_per_block", &uniform_output_samples_per_block);
+	register_uniform_int("num_horizontal_filters", &uniform_num_horizontal_filters);
+	register_uniform_int("num_vertical_filters", &uniform_num_vertical_filters);
+	register_uniform_float("slice_height", &uniform_slice_height);
+	register_uniform_float("horizontal_whole_pixel_offset", &uniform_horizontal_whole_pixel_offset);
+	register_uniform_int("vertical_whole_pixel_offset", &uniform_vertical_whole_pixel_offset);
+	register_uniform_float("inv_input_height", &uniform_inv_input_height);
+	register_uniform_float("input_texcoord_y_adjust", &uniform_input_texcoord_y_adjust);
+
+	call_once(lanczos_table_init_done, init_lanczos_table);
+}
+
+ResampleComputeEffect::~ResampleComputeEffect()
+{
+}
+
+string ResampleComputeEffect::output_fragment_shader()
+{
+	char buf[256] = "";
+	return buf + read_file("resample_effect.comp");
+}
+
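+// A usage sketch of the two weight-generation paths used below (hypothetical
+// sizes; see update_texture() for the real calls): the compute path requests
+// fp32-only horizontal weights and raw, uncombined fp16 vertical weights:
+//
+//   ScalingWeights horiz = calculate_bilinear_scaling_weights(
+//       1280, 640, 1.0f, 0.0f, BilinearFormatConstraints::ALLOW_FP32_ONLY);
+//   ScalingWeights vert = calculate_raw_scaling_weights(720, 360, 1.0f, 0.0f);
+//
+// horiz then has bilinear_weights_fp32 set; vert has raw_weights set.
+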
+// The compute shader does horizontal scaling first, using exactly the same
+// two-component texture format as in the two-pass version (see the comments
+// on SingleResamplePassEffect). The vertical scaling calculates the offset values
+// in the shader, so we only store a one-component texture with the weights
+// for each filter.
+void ResampleComputeEffect::update_texture(GLuint glsl_program_num, const string &prefix, unsigned *sampler_num)
+{
+	ScalingWeights horiz_weights = calculate_bilinear_scaling_weights(input_width, output_width, zoom_x, offset_x, BilinearFormatConstraints::ALLOW_FP32_ONLY);
+	ScalingWeights vert_weights = calculate_raw_scaling_weights(input_height, output_height, zoom_y, offset_y);
+	uniform_vertical_int_radius = vert_weights.int_radius;
+	vertical_scaling_factor = vert_weights.scaling_factor;
+	uniform_inv_vertical_scaling_factor = 1.0f / vert_weights.scaling_factor;
+	src_horizontal_bilinear_samples = horiz_weights.src_bilinear_samples;
+	src_vertical_samples = vert_weights.src_bilinear_samples;
+	uniform_num_horizontal_filters = horiz_weights.dst_samples;
+	uniform_num_vertical_filters = vert_weights.dst_samples;
+	slice_height = 1.0f / horiz_weights.num_loops;
+
+	// Encode as a two-component texture. Note the GL_REPEAT.
+	glActiveTexture(GL_TEXTURE0 + *sampler_num);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, tex_horiz.get_texnum());
+	check_error();
+
+	tex_horiz.update(horiz_weights.src_bilinear_samples, horiz_weights.dst_samples, GL_RG32F, GL_RG, GL_FLOAT, horiz_weights.bilinear_weights_fp32.get());
+
+	glActiveTexture(GL_TEXTURE0 + *sampler_num + 1);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, tex_vert.get_texnum());
+	check_error();
+
+	// Storing the vertical weights as fp16 instead of fp32 saves a few
+	// percent on NVIDIA, and it doesn't seem to hurt quality any.
+	// (The horizontal weights are a different story, since the offsets
+	// can get large and are fairly accuracy-sensitive. Also, they are
+	// loaded only once per workgroup, at the very beginning.)
+	tex_vert.update(vert_weights.src_bilinear_samples, vert_weights.dst_samples, GL_R16F, GL_RED, GL_HALF_FLOAT, vert_weights.raw_weights.get());
+
+	// Figure out how many output samples each compute shader block is going to output.
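+	// (A worked example with hypothetical numbers: for 2x vertical downscaling,
+	// vertical_scaling_factor = 0.5; with vertical_int_radius = 6, a block of
+	// 128 input samples has 128 - 2 * 6 = 116 usable rows, which yields
+	// floor(116 * 0.5) = 58 output samples per block.)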
+	int usable_input_samples_per_block = 128 - 2 * uniform_vertical_int_radius;  // 128 matches NUM_TEMP_SAMPLES in resample_effect.comp.
+	int output_samples_per_block = int(floor(usable_input_samples_per_block * vertical_scaling_factor));
+	if (output_samples_per_block < 1) {
+		output_samples_per_block = 1;
+	}
+	uniform_output_samples_per_block = output_samples_per_block;
+}
+
 namespace {
 
 ScalingWeights calculate_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset)
@@ -632,15 +772,18 @@ ScalingWeights calculate_scaling_weights(unsigned src_size, unsigned dst_size, f
 	ScalingWeights ret;
 	ret.src_bilinear_samples = src_samples;
 	ret.dst_samples = dst_samples;
+	ret.int_radius = int_radius;
+	ret.scaling_factor = scaling_factor;
 	ret.num_loops = num_loops;
 	ret.bilinear_weights_fp16 = nullptr;
 	ret.bilinear_weights_fp32 = move(weights);
+	ret.raw_weights = nullptr;
 	return ret;
 }
 
 }  // namespace
 
-ScalingWeights calculate_bilinear_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset)
+ScalingWeights calculate_bilinear_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset, BilinearFormatConstraints constraints)
 {
 	ScalingWeights ret = calculate_scaling_weights(src_size, dst_size, zoom, offset);
 	unique_ptr<Tap<float>[]> weights = move(ret.bilinear_weights_fp32);
@@ -652,17 +795,23 @@ ScalingWeights calculate_bilinear_scaling_weights(unsigned src_size, unsigned ds
 	// samples, since one would assume overall errors in the shape don't matter as much.
 	const float max_error = 2.0f / (255.0f * 255.0f);
 	unique_ptr<Tap<fp16_int_t>[]> bilinear_weights_fp16;
-	int src_bilinear_samples = combine_many_samples(weights.get(), src_size, src_samples, ret.dst_samples, &bilinear_weights_fp16);
-	unique_ptr<Tap<float>[]> bilinear_weights_fp32 = nullptr;
+	unique_ptr<Tap<float>[]> bilinear_weights_fp32;
 	double max_sum_sq_error_fp16 = 0.0;
-	for (unsigned y = 0; y < ret.dst_samples; ++y) {
-		double sum_sq_error_fp16 = compute_sum_sq_error(
-			weights.get() + y * src_samples, src_samples,
-			bilinear_weights_fp16.get() + y * src_bilinear_samples, src_bilinear_samples,
-			src_size);
-		max_sum_sq_error_fp16 = std::max(max_sum_sq_error_fp16, sum_sq_error_fp16);
-		if (max_sum_sq_error_fp16 > max_error) {
-			break;
+	int src_bilinear_samples;
+	if (constraints == BilinearFormatConstraints::ALLOW_FP32_ONLY) {
+		max_sum_sq_error_fp16 = numeric_limits<double>::max();
+	} else {
+		assert(constraints == BilinearFormatConstraints::ALLOW_FP16_AND_FP32);
+		src_bilinear_samples = combine_many_samples(weights.get(), src_size, src_samples, ret.dst_samples, &bilinear_weights_fp16);
+		for (unsigned y = 0; y < ret.dst_samples; ++y) {
+			double sum_sq_error_fp16 = compute_sum_sq_error(
+				weights.get() + y * src_samples, src_samples,
+				bilinear_weights_fp16.get() + y * src_bilinear_samples, src_bilinear_samples,
+				src_size);
+			max_sum_sq_error_fp16 = std::max(max_sum_sq_error_fp16, sum_sq_error_fp16);
+			if (max_sum_sq_error_fp16 > max_error) {
+				break;
+			}
 		}
 	}
 
@@ -677,6 +826,28 @@ ScalingWeights calculate_bilinear_scaling_weights(unsigned src_size, unsigned ds
 	return ret;
 }
 
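+// (For reference, the "combining trickery" referred to below merges two
+// neighboring taps with same-sign weights w1 and w2 into a single bilinear
+// sample of weight w1 + w2 placed at x + w2 / (w1 + w2), so that the GPU's
+// bilinear filter reproduces w1 * T[x] + w2 * T[x + 1] in one fetch.)
+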
+// Unlike calculate_bilinear_scaling_weights(), this just converts the weights,
+// without any combining trickery. Thus, it is also much faster.
+ScalingWeights calculate_raw_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset)
+{
+	ScalingWeights ret = calculate_scaling_weights(src_size, dst_size, zoom, offset);
+	unique_ptr<Tap<float>[]> weights = move(ret.bilinear_weights_fp32);
+	const int src_samples = ret.src_bilinear_samples;
+
+	// Convert to fp16 (without any positions, as they are calculated implicitly
+	// by the compute shader) and normalize.
+	unique_ptr<fp16_int_t[]> raw_weights(new fp16_int_t[ret.dst_samples * src_samples]);
+	for (unsigned y = 0; y < ret.dst_samples; ++y) {
+		for (int i = 0; i < src_samples; ++i) {
+			raw_weights[y * src_samples + i] = fp32_to_fp16(weights[y * src_samples + i].weight);
+		}
+		normalize_sum(raw_weights.get() + y * src_samples, src_samples);
+	}
+
+	ret.raw_weights = move(raw_weights);
+	return ret;
+}
+
 void SingleResamplePassEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, unsigned *sampler_num)
 {
 	Effect::set_gl_state(glsl_program_num, prefix, sampler_num);
@@ -763,4 +934,65 @@ void Support2DTexture::update(GLint width, GLint height, GLenum internal_format,
 	}
 }
 
+void ResampleComputeEffect::get_compute_dimensions(unsigned output_width, unsigned output_height,
+                                                   unsigned *x, unsigned *y, unsigned *z) const
+{
+	*x = output_width;
+	*y = (output_height + uniform_output_samples_per_block - 1) / uniform_output_samples_per_block;
+	*z = 1;
+}
+
+void ResampleComputeEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, unsigned *sampler_num)
+{
+	Effect::set_gl_state(glsl_program_num, prefix, sampler_num);
+
+	assert(input_width > 0);
+	assert(input_height > 0);
+	assert(output_width > 0);
+	assert(output_height > 0);
+
+	if (input_width != last_input_width ||
+	    input_height != last_input_height ||
+	    output_width != last_output_width ||
+	    output_height != last_output_height ||
+	    offset_x != last_offset_x ||
+	    offset_y != last_offset_y ||
+	    zoom_x != last_zoom_x ||
+	    zoom_y != last_zoom_y) {
+		update_texture(glsl_program_num, prefix, sampler_num);
+		last_input_width = input_width;
+		last_input_height = input_height;
+		last_output_width = output_width;
+		last_output_height = output_height;
+		last_offset_x = offset_x;
+		last_offset_y = offset_y;
+		last_zoom_x = zoom_x;
+		last_zoom_y = zoom_y;
+	}
+
+	glActiveTexture(GL_TEXTURE0 + *sampler_num);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, tex_horiz.get_texnum());
+	check_error();
+	uniform_sample_tex_horizontal = *sampler_num;
+	++*sampler_num;
+
+	glActiveTexture(GL_TEXTURE0 + *sampler_num);
+	check_error();
+	glBindTexture(GL_TEXTURE_2D, tex_vert.get_texnum());
+	check_error();
+	uniform_sample_tex_vertical = *sampler_num;
+	++*sampler_num;
+
+	uniform_num_horizontal_samples = src_horizontal_bilinear_samples;
+	uniform_num_vertical_samples = src_vertical_samples;
+	uniform_slice_height = slice_height;
+
+	uniform_horizontal_whole_pixel_offset = lrintf(offset_x) / float(input_width);
+	uniform_vertical_whole_pixel_offset = lrintf(offset_y);
+
+	uniform_inv_input_height = 1.0f / float(input_height);
+	uniform_input_texcoord_y_adjust = 0.5f / float(input_height);
+}
+
 }  // namespace movit
diff --git a/resample_effect.h b/resample_effect.h
index cf5f3bb..12fa1e2 100644
--- a/resample_effect.h
+++ b/resample_effect.h
@@ -29,6 +29,7 @@ namespace movit {
 class EffectChain;
 class Node;
 class SingleResamplePassEffect;
+class ResampleComputeEffect;
 
 // Public so that it can be benchmarked externally.
 template<class T>
 struct Tap {
@@ -39,12 +40,20 @@ struct Tap {
 struct ScalingWeights {
 	unsigned src_bilinear_samples;
 	unsigned dst_samples, num_loops;
+	int int_radius;  // FIXME: really here?
+	float scaling_factor;  // FIXME: really here?
 
-	// Exactly one of these is set.
+	// Exactly one of these three is set.
 	std::unique_ptr<Tap<fp16_int_t>[]> bilinear_weights_fp16;
 	std::unique_ptr<Tap<float>[]> bilinear_weights_fp32;
+	std::unique_ptr<fp16_int_t[]> raw_weights;
 };
-ScalingWeights calculate_bilinear_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset);
+enum class BilinearFormatConstraints {
+	ALLOW_FP16_AND_FP32,
+	ALLOW_FP32_ONLY
+};
+ScalingWeights calculate_bilinear_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset, BilinearFormatConstraints constraints);
+ScalingWeights calculate_raw_scaling_weights(unsigned src_size, unsigned dst_size, float zoom, float offset);
 
 // A simple manager for support data stored in a 2D texture.
 // Consider moving it to a shared location if more classes should need it.
@@ -85,11 +94,17 @@ public:
 private:
 	void update_size();
 	void update_offset_and_zoom();
+
+	// If compute shaders are supported, contains the effect.
+	// If not, nullptr.
+	std::unique_ptr<ResampleComputeEffect> compute_effect_owner;
+	ResampleComputeEffect *compute_effect = nullptr;
 
 	// Both of these are owned by us if owns_effects is true (before finalize()),
 	// and otherwise owned by the EffectChain.
 	std::unique_ptr<SingleResamplePassEffect> hpass_owner, vpass_owner;
 	SingleResamplePassEffect *hpass = nullptr, *vpass = nullptr;
+
 	int input_width, input_height, output_width, output_height;
 
 	float offset_x, offset_y;
@@ -154,6 +169,78 @@ private:
 	Support2DTexture tex;
 };
 
+class ResampleComputeEffect : public Effect {
+public:
+	// If parent is non-nullptr, calls to inform_input_size will be forwarded,
+	// so that the parent can propagate the right input and output resolutions
+	// back to this effect.
+	ResampleComputeEffect(ResampleEffect *parent);
+	~ResampleComputeEffect();
+	std::string effect_type_id() const override { return "ResampleComputeEffect"; }
+
+	std::string output_fragment_shader() override;
+
+	// FIXME: This is the primary reason why this doesn't really work;
+	// there's no good reason why the regular resize should have bounce
+	// but we shouldn't. (If we did a 2D block instead of 1D columns,
+	// it would have been different, but we can't, due to the large size
+	// of the fringe.)
+	bool needs_texture_bounce() const override { return false; }
+	bool needs_srgb_primaries() const override { return false; }
+	AlphaHandling alpha_handling() const override { return INPUT_PREMULTIPLIED_ALPHA_KEEP_BLANK; }
+
+	// We specifically do not want mipmaps on the input texture;
+	// they break minification.
+ MipmapRequirements needs_mipmaps() const override { return CANNOT_ACCEPT_MIPMAPS; } + + void inform_added(EffectChain *chain) override { this->chain = chain; } + void inform_input_size(unsigned input_num, unsigned width, unsigned height) override { + if (parent != nullptr) { + parent->inform_input_size(input_num, width, height); + } + } + bool changes_output_size() const override { return true; } + bool sets_virtual_output_size() const override { return false; } + + void get_output_size(unsigned *width, unsigned *height, unsigned *virtual_width, unsigned *virtual_height) const override { + *virtual_width = *width = this->output_width; + *virtual_height = *height = this->output_height; + } + + bool is_compute_shader() const override { return true; } + void get_compute_dimensions(unsigned output_width, unsigned output_height, + unsigned *x, unsigned *y, unsigned *z) const override; + + void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num) override; + +private: + void update_texture(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num); + + ResampleEffect *parent; + EffectChain *chain; + Support2DTexture tex_horiz, tex_vert; + GLint uniform_sample_tex_horizontal, uniform_sample_tex_vertical; + float uniform_num_x_loops; + int uniform_num_horizontal_filters, uniform_num_vertical_filters; + float uniform_slice_height; + float uniform_horizontal_whole_pixel_offset; + int uniform_vertical_whole_pixel_offset; + int uniform_num_horizontal_samples, uniform_num_vertical_samples; + int uniform_output_samples_per_block; + + int input_width, input_height, output_width, output_height; + float offset_x, offset_y, zoom_x, zoom_y; + int last_input_width, last_input_height, last_output_width, last_output_height; + float last_offset_x, last_offset_y, last_zoom_x, last_zoom_y; + int src_horizontal_bilinear_samples; // Horizontal. 
+	int src_vertical_samples;
+	float slice_height;
+	float uniform_inv_input_height, uniform_input_texcoord_y_adjust;
+	int uniform_vertical_int_radius;
+	float vertical_scaling_factor;
+	float uniform_inv_vertical_scaling_factor;
+};
+
 }  // namespace movit
 
 #endif // !defined(_MOVIT_RESAMPLE_EFFECT_H)
diff --git a/resample_effect_test.cpp b/resample_effect_test.cpp
index 16bd70c..95fa108 100644
--- a/resample_effect_test.cpp
+++ b/resample_effect_test.cpp
@@ -36,7 +36,16 @@ float lanczos(float x, float a)
 
 }  // namespace
 
-TEST(ResampleEffectTest, IdentityTransformDoesNothing) {
+class ResampleEffectTest : public testing::TestWithParam<std::string> {
+protected:
+	ResampleEffectTest() : disabler(GetParam() == "fragment") {}
+	bool should_skip() { return disabler.should_skip(); }
+
+private:
+	DisableComputeShadersTemporarily disabler;
+};
+
+TEST_P(ResampleEffectTest, IdentityTransformDoesNothing) {
 	const int size = 4;
 
 	float data[size * size] = {
@@ -56,7 +65,7 @@ TEST(ResampleEffectTest, IdentityTransformDoesNothing) {
 	expect_equal(data, out_data, size, size);
 }
 
-TEST(ResampleEffectTest, UpscaleByTwoGetsCorrectPixelCenters) {
+TEST_P(ResampleEffectTest, UpscaleByTwoGetsCorrectPixelCenters) {
 	const int size = 5;
 
 	float data[size * size] = {
@@ -94,7 +103,7 @@ TEST(ResampleEffectTest, UpscaleByTwoGetsCorrectPixelCenters) {
 	expect_equal(expected_data, out_data, size * 2, size * 2);
 }
 
-TEST(ResampleEffectTest, DownscaleByTwoGetsCorrectPixelCenters) {
+TEST_P(ResampleEffectTest, DownscaleByTwoGetsCorrectPixelCenters) {
 	const int size = 5;
 
 	// This isn't a perfect dot, since the Lanczos filter has a slight
@@ -136,7 +145,7 @@ TEST(ResampleEffectTest, DownscaleByTwoGetsCorrectPixelCenters) {
 	expect_equal(expected_data, out_data, size, size);
 }
 
-TEST(ResampleEffectTest, UpscaleByThreeGetsCorrectPixelCenters) {
+TEST_P(ResampleEffectTest, UpscaleByThreeGetsCorrectPixelCenters) {
 	const int size = 5;
 
 	float data[size * size] = {
@@ -179,7 +188,7 @@ TEST(ResampleEffectTest, UpscaleByThreeGetsCorrectPixelCenters) {
 	}
 }
 
-TEST(ResampleEffectTest, HeavyResampleGetsSumRight) {
+TEST_P(ResampleEffectTest, HeavyResampleGetsSumRight) {
 	// Do only one resample pass, more specifically the last one, which goes to
 	// our fp32 output. This allows us to analyze the precision without intermediate
 	// fp16 rounding.
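+// (Each TEST_P body is presumably guarded near the top, so that the "compute"
+// variant is skipped on GPUs without compute shader support; those lines fall
+// outside the hunks shown here. A minimal sketch of the pattern, with a
+// hypothetical test name:
+//
+//   TEST_P(ResampleEffectTest, SomeTest) {
+//       if (should_skip()) return;
+//       ...
+//   }
+// )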
@@ -219,7 +228,7 @@ TEST(ResampleEffectTest, HeavyResampleGetsSumRight) { expect_equal(expected_data, out_data, dwidth, dheight, 0.12 / 1023.0); } -TEST(ResampleEffectTest, ReadWholePixelFromLeft) { +TEST_P(ResampleEffectTest, ReadWholePixelFromLeft) { const int size = 5; float data[size * size] = { @@ -248,7 +257,7 @@ TEST(ResampleEffectTest, ReadWholePixelFromLeft) { expect_equal(expected_data, out_data, size, size); } -TEST(ResampleEffectTest, ReadQuarterPixelFromLeft) { +TEST_P(ResampleEffectTest, ReadQuarterPixelFromLeft) { const int size = 5; float data[size * size] = { @@ -283,7 +292,7 @@ TEST(ResampleEffectTest, ReadQuarterPixelFromLeft) { expect_equal(expected_data, out_data, size, size); } -TEST(ResampleEffectTest, ReadQuarterPixelFromTop) { +TEST_P(ResampleEffectTest, ReadQuarterPixelFromTop) { const int width = 3; const int height = 5; @@ -315,7 +324,7 @@ TEST(ResampleEffectTest, ReadQuarterPixelFromTop) { expect_equal(expected_data, out_data, width, height); } -TEST(ResampleEffectTest, ReadHalfPixelFromLeftAndScale) { +TEST_P(ResampleEffectTest, ReadHalfPixelFromLeftAndScale) { const int src_width = 4; const int dst_width = 8; @@ -352,7 +361,7 @@ TEST(ResampleEffectTest, ReadHalfPixelFromLeftAndScale) { expect_equal(expected_data, out_data, dst_width, 1, 1.5f / 255.0f, 0.4f / 255.0f); } -TEST(ResampleEffectTest, Zoom) { +TEST_P(ResampleEffectTest, Zoom) { const int width = 5; const int height = 3; @@ -378,7 +387,7 @@ TEST(ResampleEffectTest, Zoom) { expect_equal(expected_data, out_data, width, height); } -TEST(ResampleEffectTest, VerticalZoomFromTop) { +TEST_P(ResampleEffectTest, VerticalZoomFromTop) { const int width = 5; const int height = 5; @@ -412,7 +421,7 @@ TEST(ResampleEffectTest, VerticalZoomFromTop) { expect_equal(expected_data, out_data, width, height); } -TEST(ResampleEffectTest, Precision) { +TEST_P(ResampleEffectTest, Precision) { const int size = 1920; // Difficult non-power-of-two size. 
 	const int offset = 5;
@@ -436,6 +445,10 @@
 	expect_equal(expected_data, out_data, size, 1);
 }
 
+INSTANTIATE_TEST_CASE_P(ResampleEffectTest,
+                        ResampleEffectTest,
+                        testing::Values("fragment", "compute"));
+
 #ifdef HAVE_BENCHMARK
 template<> inline uint8_t from_fp32<uint8_t>(float x) { return lrintf(x * 255.0f); }
@@ -479,6 +492,10 @@ BENCHMARK_CAPTURE(BM_ResampleEffectInt8, Int8Upscale, GAMMA_REC_709, "fragment")
 BENCHMARK_CAPTURE(BM_ResampleEffectHalf, Float16Upscale, GAMMA_LINEAR, "fragment")->Args({640, 360, 1280, 720})->Args({320, 180, 1280, 720})->Args({321, 181, 1280, 720})->UseRealTime()->Unit(benchmark::kMicrosecond);
 BENCHMARK_CAPTURE(BM_ResampleEffectInt8, Int8Downscale, GAMMA_REC_709, "fragment")->Args({1280, 720, 640, 360})->Args({1280, 720, 320, 180})->Args({1280, 720, 321, 181})->UseRealTime()->Unit(benchmark::kMicrosecond);
 BENCHMARK_CAPTURE(BM_ResampleEffectHalf, Float16Downscale, GAMMA_LINEAR, "fragment")->Args({1280, 720, 640, 360})->Args({1280, 720, 320, 180})->Args({1280, 720, 321, 181})->UseRealTime()->Unit(benchmark::kMicrosecond);
+BENCHMARK_CAPTURE(BM_ResampleEffectInt8, Int8UpscaleCompute, GAMMA_REC_709, "compute")->Args({640, 360, 1280, 720})->Args({320, 180, 1280, 720})->Args({321, 181, 1280, 720})->UseRealTime()->Unit(benchmark::kMicrosecond);
+BENCHMARK_CAPTURE(BM_ResampleEffectHalf, Float16UpscaleCompute, GAMMA_LINEAR, "compute")->Args({640, 360, 1280, 720})->Args({320, 180, 1280, 720})->Args({321, 181, 1280, 720})->UseRealTime()->Unit(benchmark::kMicrosecond);
+BENCHMARK_CAPTURE(BM_ResampleEffectInt8, Int8DownscaleCompute, GAMMA_REC_709, "compute")->Args({1280, 720, 640, 360})->Args({1280, 720, 320, 180})->Args({1280, 720, 321, 181})->UseRealTime()->Unit(benchmark::kMicrosecond);
+BENCHMARK_CAPTURE(BM_ResampleEffectHalf, Float16DownscaleCompute, GAMMA_LINEAR, "compute")->Args({1280, 720, 640, 360})->Args({1280, 720, 320, 180})->Args({1280, 720, 321, 181})->UseRealTime()->Unit(benchmark::kMicrosecond);
 
 void BM_ComputeBilinearScalingWeights(benchmark::State &state)
 {
@@ -488,16 +505,48 @@ void BM_ComputeBilinearScalingWeights(benchmark::State &state)
 	movit_texel_subpixel_precision = 64;  // To get consistent results across GPUs; this is a CPU test.
 
 	// One iteration warmup to make sure the Lanczos table is computed.
-	calculate_bilinear_scaling_weights(src_size, dst_size, 0.999f, 0.0f);
+	calculate_bilinear_scaling_weights(src_size, dst_size, 0.999f, 0.0f, BilinearFormatConstraints::ALLOW_FP16_AND_FP32);
 
 	for (auto _ : state) {
-		ScalingWeights weights = calculate_bilinear_scaling_weights(src_size, dst_size, 0.999f, 0.0f);
+		ScalingWeights weights = calculate_bilinear_scaling_weights(src_size, dst_size, 0.999f, 0.0f, BilinearFormatConstraints::ALLOW_FP16_AND_FP32);
 	}
 
 	movit_texel_subpixel_precision = old_precision;
 }
 BENCHMARK(BM_ComputeBilinearScalingWeights)->Unit(benchmark::kMicrosecond);
 
+void BM_ComputeBilinearScalingWeightsNoFP16(benchmark::State &state)
+{
+	constexpr unsigned src_size = 1280;
+	constexpr unsigned dst_size = 35;
+	int old_precision = movit_texel_subpixel_precision;
+	movit_texel_subpixel_precision = 64;  // To get consistent results across GPUs; this is a CPU test.
+
+	// One iteration warmup to make sure the Lanczos table is computed.
+ calculate_bilinear_scaling_weights(src_size, dst_size, 0.999f, 0.0f, BilinearFormatConstraints::ALLOW_FP32_ONLY); + + for (auto _ : state) { + ScalingWeights weights = calculate_bilinear_scaling_weights(src_size, dst_size, 0.999f, 0.0f, BilinearFormatConstraints::ALLOW_FP32_ONLY); + } + + movit_texel_subpixel_precision = old_precision; +} +BENCHMARK(BM_ComputeBilinearScalingWeightsNoFP16)->Unit(benchmark::kMicrosecond); + +void BM_ComputeRawScalingWeights(benchmark::State &state) +{ + constexpr unsigned src_size = 1280; + constexpr unsigned dst_size = 35; + + // One iteration warmup to make sure the Lanczos table is computed. + calculate_raw_scaling_weights(src_size, dst_size, 0.999f, 0.0f); + + for (auto _ : state) { + ScalingWeights weights = calculate_raw_scaling_weights(src_size, dst_size, 0.999f, 0.0f); + } +} +BENCHMARK(BM_ComputeRawScalingWeights)->Unit(benchmark::kMicrosecond); + #endif } // namespace movit
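
For context, a minimal usage sketch (not part of the patch; it follows the
standard movit EffectChain pattern, and the compute path is chosen
automatically inside ResampleEffect's constructor when
movit_compute_shaders_supported is set):

	ImageFormat format;
	format.color_space = COLORSPACE_sRGB;
	format.gamma_curve = GAMMA_REC_709;
	EffectChain chain(16, 9);  // 16:9 aspect.
	chain.add_input(new FlatInput(format, FORMAT_RGBA_POSTMULTIPLIED_ALPHA,
	                              GL_UNSIGNED_BYTE, 1280, 720));
	Effect *resample = chain.add_effect(new ResampleEffect());
	CHECK(resample->set_int("width", 640));
	CHECK(resample->set_int("height", 360));
	chain.add_output(format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
	chain.finalize();  // (set_pixel_data() and rendering omitted.)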