Reduce the amount of computation in combine_two_samples().

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 17 Sep 2015 17:28:43 +0000 (19:28 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 17 Sep 2015 17:28:43 +0000 (19:28 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 17 Sep 2015 17:28:43 +0000 (19:28 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 17 Sep 2015 17:28:43 +0000 (19:28 +0200)
diff --git a/blur_effect.cpp b/blur_effect.cpp

index 2084768bc54393985bbfccdf48f0be1f10741f08..adffe089a6d452d5f06a2c9907710aa152ee1f6f 100644 (file)
--- a/blur_effect.cpp
+++ b/blur_effect.cpp
@@ -6,6 +6,7 @@
  #include "blur_effect.h"
  #include "effect_chain.h"
  #include "effect_util.h"
+#include "init.h"
  #include "util.h"
  
  using namespace std;
@@ -189,24 +190,27 @@ void SingleBlurPassEffect::set_gl_state(GLuint glsl_program_num, const string &p
         uniform_samples[2 * 0 + 0] = 0.0f;
         uniform_samples[2 * 0 + 1] = weight[0];
  
+       int size;
+       if (direction == HORIZONTAL) {
+               size = width;
+       } else if (direction == VERTICAL) {
+               size = height;
+       } else {
+               assert(false);
+       }
+       float num_subtexels = size / movit_texel_subpixel_precision;
+       float inv_num_subtexels = movit_texel_subpixel_precision / size;
+
         // All other samples.
         for (int i = 1; i < num_taps / 2 + 1; ++i) {
                 unsigned base_pos = i * 2 - 1;
                 float w1 = weight[base_pos];
                 float w2 = weight[base_pos + 1];
-               int size;
-               if (direction == HORIZONTAL) {
-                       size = width;
-               } else if (direction == VERTICAL) {
-                       size = height;
-               } else {
-                       assert(false);
-               }
  
                 float pos1 = base_pos / (float)size;
                 float pos2 = (base_pos + 1) / (float)size;
                 float pos, total_weight;
-               combine_two_samples(w1, w2, pos1, pos2, size, &pos, &total_weight, NULL);
+               combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, NULL);
  
                 uniform_samples[2 * i + 0] = pos;
                 uniform_samples[2 * i + 1] = total_weight;
diff --git a/resample_effect.cpp b/resample_effect.cpp

index 4dd3ea87e261086939a91e2d26d51ef52c813168..79d5f21c45543a936f81e33a4ead19ca32d5a74f 100644 (file)
--- a/resample_effect.cpp
+++ b/resample_effect.cpp
@@ -61,7 +61,7 @@ unsigned gcd(unsigned a, unsigned b)
  }
  
  template<class DestFloat>
-unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, unsigned src_size, unsigned num_src_samples, unsigned max_samples_saved)
+unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, float num_subtexels, float inv_num_subtexels, unsigned num_src_samples, unsigned max_samples_saved)
  {
         // Cut off near-zero values at both sides.
         unsigned num_samples_saved = 0;
@@ -109,7 +109,7 @@ unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, unsigned sr
  
                 fp16_int_t pos, total_weight;
                 float sum_sq_error;
-               combine_two_samples(w1, w2, pos1, pos2, src_size, &pos, &total_weight, &sum_sq_error);
+               combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error);
  
                 // If the interpolation error is larger than that of about sqrt(2) of
                 // a level at 8-bit precision, don't combine. (You'd think 1.0 was enough,
@@ -159,9 +159,12 @@ void normalize_sum(Tap<T>* vals, unsigned num)
  template<class DestFloat>
  unsigned combine_many_samples(const Tap<float> *weights, unsigned src_size, unsigned src_samples, unsigned dst_samples, Tap<DestFloat> **bilinear_weights)
  {
+       float num_subtexels = src_size / movit_texel_subpixel_precision;
+       float inv_num_subtexels = movit_texel_subpixel_precision / src_size;
         int src_bilinear_samples = 0;
+
         for (unsigned y = 0; y < dst_samples; ++y) {
-               unsigned num_samples_saved = combine_samples<DestFloat>(weights + y * src_samples, NULL, src_size, src_samples, UINT_MAX);
+               unsigned num_samples_saved = combine_samples<DestFloat>(weights + y * src_samples, NULL, num_subtexels, inv_num_subtexels, src_samples, UINT_MAX);
                 src_bilinear_samples = max<int>(src_bilinear_samples, src_samples - num_samples_saved);
         }
  
@@ -172,7 +175,8 @@ unsigned combine_many_samples(const Tap<float> *weights, unsigned src_size, unsi
                 unsigned num_samples_saved = combine_samples(
                         weights + y * src_samples,
                         bilinear_weights_ptr,
-                       src_size,
+                       num_subtexels,
+                       inv_num_subtexels,
                         src_samples,
                         src_samples - src_bilinear_samples);
                 assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
diff --git a/util.cpp b/util.cpp

index e266fb813921e606abd95a374f0a5772c0fd1edb..da6057e52f13b4ad4af9ca82a4c83acab6d23388 100644 (file)
--- a/util.cpp
+++ b/util.cpp
@@ -219,7 +219,7 @@ string output_glsl_vec3(const string &name, float x, float y, float z)
  }
  
  template<class DestFloat>
-void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
                           DestFloat *offset, DestFloat *total_weight, float *sum_sq_error)
  {
         assert(movit_initialized);
@@ -238,9 +238,7 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned si
         // Round to the minimum number of bits we have measured earlier.
         // The card will do this for us anyway, but if we know what the real z
         // is, we can pick a better total_weight below.
-       z *= size;  // Move to pixel coordinates,
-       z = lrintf(z / movit_texel_subpixel_precision) * movit_texel_subpixel_precision;  // Round.
-       z /= size;  // Move back to normalized coordinates.
+       z = lrintf(z * num_subtexels) * inv_num_subtexels;
         
         // Choose total weight w so that we minimize total squared error
         // for the effective weights:
@@ -255,7 +253,7 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned si
         //   w = (a(1-z) + bz) / ((1-z)² + z²)
         //
         // If z had infinite precision, this would simply reduce to w = w1 + w2.
-       *total_weight = (w1 * (1 - z) + w2 * z) / (z * z + (1 - z) * (1 - z));
+       *total_weight = (w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z));
  
         if (sum_sq_error != NULL) {
                 float err1 = *total_weight * (1 - z) - w1;
@@ -266,11 +264,11 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned si
  
  // Explicit instantiations.
  template
-void combine_two_samples<float>(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples<float>(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
                                  float *offset, float *total_weight, float *sum_sq_error);
  
  template
-void combine_two_samples<fp16_int_t>(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples<fp16_int_t>(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
                                       fp16_int_t *offset, fp16_int_t *total_weight, float *sum_sq_error);
  
  GLuint fill_vertex_attribute(GLuint glsl_program_num, const string &attribute_name, GLint size, GLenum type, GLsizeiptr data_size, const GLvoid *data)
diff --git a/util.h b/util.h

index e102f2115612ede741798f86e58fec3a718622a4..8cabaf7c3eafe8be36928521bb324545521325ba 100644 (file)
--- a/util.h
+++ b/util.h
@@ -54,8 +54,11 @@ enum CombineRoundingBehavior {
  // Calculate where to sample, and with what weight, if one wants to use
  // the GPU's bilinear hardware to sample w1 * x[pos1] + w2 * x[pos2],
  // where pos1 and pos2 must be normalized coordinates describing neighboring
-// pixels in the mipmap level at which you sample, and the total number of
-// pixels (in given mipmap level) is <size>.
+// texels in the mipmap level at which you sample. <num_subtexels> is the
+// number of distinct accessible subtexels in the given mipmap level,
+// calculated as num_texels / movit_texel_subpixel_precision. It is a float
+// for performance reasons, even though it is expected to be a whole number.
+// <inv_num_subtexels> is its reciprocal, i.e. 1 / <num_subtexels>.
  //
  // Note that since the GPU might have limited precision in its linear
  // interpolation, the effective weights might be different from the ones you
@@ -68,7 +71,7 @@ enum CombineRoundingBehavior {
  // rounded fp16 value. This enables more precise calculation of total_weight
  // and sum_sq_error.
  template<class DestFloat>
-void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
                           DestFloat *offset, DestFloat *total_weight, float *sum_sq_error);
  
  // Create a VBO with the given data, and bind it to the vertex attribute
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 17 Sep 2015 17:28:43 +0000 (19:28 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 17 Sep 2015 17:28:43 +0000 (19:28 +0200)
blur_effect.cpp		patch | blob | history
resample_effect.cpp		patch | blob | history
util.cpp		patch | blob | history
util.h		patch | blob | history