From 1bd97eb70a6fcb913bf954e369bc1a90ba17f74e Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 17 Sep 2015 19:28:43 +0200 Subject: [PATCH] Reduce the amount of computation in combine_two_samples(). Mostly microoptimization, but seemingly this function is somewhat expensive. --- blur_effect.cpp | 22 +++++++++++++--------- resample_effect.cpp | 12 ++++++++---- util.cpp | 12 +++++------- util.h | 9 ++++++--- 4 files changed, 32 insertions(+), 23 deletions(-) diff --git a/blur_effect.cpp b/blur_effect.cpp index 2084768..adffe08 100644 --- a/blur_effect.cpp +++ b/blur_effect.cpp @@ -6,6 +6,7 @@ #include "blur_effect.h" #include "effect_chain.h" #include "effect_util.h" +#include "init.h" #include "util.h" using namespace std; @@ -189,24 +190,27 @@ void SingleBlurPassEffect::set_gl_state(GLuint glsl_program_num, const string &p uniform_samples[2 * 0 + 0] = 0.0f; uniform_samples[2 * 0 + 1] = weight[0]; + int size; + if (direction == HORIZONTAL) { + size = width; + } else if (direction == VERTICAL) { + size = height; + } else { + assert(false); + } + float num_subtexels = size / movit_texel_subpixel_precision; + float inv_num_subtexels = movit_texel_subpixel_precision / size; + // All other samples. 
for (int i = 1; i < num_taps / 2 + 1; ++i) { unsigned base_pos = i * 2 - 1; float w1 = weight[base_pos]; float w2 = weight[base_pos + 1]; - int size; - if (direction == HORIZONTAL) { - size = width; - } else if (direction == VERTICAL) { - size = height; - } else { - assert(false); - } float pos1 = base_pos / (float)size; float pos2 = (base_pos + 1) / (float)size; float pos, total_weight; - combine_two_samples(w1, w2, pos1, pos2, size, &pos, &total_weight, NULL); + combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, NULL); uniform_samples[2 * i + 0] = pos; uniform_samples[2 * i + 1] = total_weight; diff --git a/resample_effect.cpp b/resample_effect.cpp index 4dd3ea8..79d5f21 100644 --- a/resample_effect.cpp +++ b/resample_effect.cpp @@ -61,7 +61,7 @@ unsigned gcd(unsigned a, unsigned b) } template -unsigned combine_samples(const Tap *src, Tap *dst, unsigned src_size, unsigned num_src_samples, unsigned max_samples_saved) +unsigned combine_samples(const Tap *src, Tap *dst, float num_subtexels, float inv_num_subtexels, unsigned num_src_samples, unsigned max_samples_saved) { // Cut off near-zero values at both sides. unsigned num_samples_saved = 0; @@ -109,7 +109,7 @@ unsigned combine_samples(const Tap *src, Tap *dst, unsigned sr fp16_int_t pos, total_weight; float sum_sq_error; - combine_two_samples(w1, w2, pos1, pos2, src_size, &pos, &total_weight, &sum_sq_error); + combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error); // If the interpolation error is larger than that of about sqrt(2) of // a level at 8-bit precision, don't combine. 
(You'd think 1.0 was enough, @@ -159,9 +159,12 @@ void normalize_sum(Tap* vals, unsigned num) template unsigned combine_many_samples(const Tap *weights, unsigned src_size, unsigned src_samples, unsigned dst_samples, Tap **bilinear_weights) { + float num_subtexels = src_size / movit_texel_subpixel_precision; + float inv_num_subtexels = movit_texel_subpixel_precision / src_size; int src_bilinear_samples = 0; + for (unsigned y = 0; y < dst_samples; ++y) { - unsigned num_samples_saved = combine_samples(weights + y * src_samples, NULL, src_size, src_samples, UINT_MAX); + unsigned num_samples_saved = combine_samples(weights + y * src_samples, NULL, num_subtexels, inv_num_subtexels, src_samples, UINT_MAX); src_bilinear_samples = max(src_bilinear_samples, src_samples - num_samples_saved); } @@ -172,7 +175,8 @@ unsigned combine_many_samples(const Tap *weights, unsigned src_size, unsi unsigned num_samples_saved = combine_samples( weights + y * src_samples, bilinear_weights_ptr, - src_size, + num_subtexels, + inv_num_subtexels, src_samples, src_samples - src_bilinear_samples); assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples); diff --git a/util.cpp b/util.cpp index e266fb8..da6057e 100644 --- a/util.cpp +++ b/util.cpp @@ -219,7 +219,7 @@ string output_glsl_vec3(const string &name, float x, float y, float z) } template -void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size, +void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels, DestFloat *offset, DestFloat *total_weight, float *sum_sq_error) { assert(movit_initialized); @@ -238,9 +238,7 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned si // Round to the minimum number of bits we have measured earlier. // The card will do this for us anyway, but if we know what the real z // is, we can pick a better total_weight below. 
- z *= size; // Move to pixel coordinates, - z = lrintf(z / movit_texel_subpixel_precision) * movit_texel_subpixel_precision; // Round. - z /= size; // Move back to normalized coordinates. + z = lrintf(z * num_subtexels) * inv_num_subtexels; // Choose total weight w so that we minimize total squared error // for the effective weights: @@ -255,7 +253,7 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned si // w = (a(1-z) + bz) / ((1-z)² + z²) // // If z had infinite precision, this would simply reduce to w = w1 + w2. - *total_weight = (w1 * (1 - z) + w2 * z) / (z * z + (1 - z) * (1 - z)); + *total_weight = (w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z)); if (sum_sq_error != NULL) { float err1 = *total_weight * (1 - z) - w1; @@ -266,11 +264,11 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned si // Explicit instantiations. template -void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size, +void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels, float *offset, float *total_weight, float *sum_sq_error); template -void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size, +void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels, fp16_int_t *offset, fp16_int_t *total_weight, float *sum_sq_error); GLuint fill_vertex_attribute(GLuint glsl_program_num, const string &attribute_name, GLint size, GLenum type, GLsizeiptr data_size, const GLvoid *data) diff --git a/util.h b/util.h index e102f21..8cabaf7 100644 --- a/util.h +++ b/util.h @@ -54,8 +54,11 @@ enum CombineRoundingBehavior { // Calculate where to sample, and with what weight, if one wants to use // the GPU's bilinear hardware to sample w1 * x[pos1] + w2 * x[pos2], // where pos1 and pos2 must be normalized coordinates describing neighboring -// pixels in the mipmap level at which you sample, 
and the total number of -// pixels (in given mipmap level) is <size>. +// texels in the mipmap level at which you sample. <num_subtexels> is the +// number of distinct accessible subtexels in the given mipmap level, +// calculated by num_texels / movit_texel_subpixel_precision. It is a float +// for performance reasons, even though it is expected to be a whole number. +// <inv_num_subtexels> is simply its inverse (1/x). // // Note that since the GPU might have limited precision in its linear // interpolation, the effective weights might be different from the ones you @@ -68,7 +71,7 @@ enum CombineRoundingBehavior { // rounded fp16 value. This enables more precise calculation of total_weight // and sum_sq_error. template -void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size, +void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels, DestFloat *offset, DestFloat *total_weight, float *sum_sq_error); // Create a VBO with the given data, and bind it to the vertex attribute -- 2.39.2