Some microoptimizations in combine_two_samples(). Saves about 4% in ResampleEffect...

[movit] / util.cpp
diff --git a/util.cpp b/util.cpp

index 3141f8d2c5eb388aec128f31b6218c06c0b1bb8d..54c815cc2be0f7e45967bcfddabea04cd4060639 100644 (file)
--- a/util.cpp
+++ b/util.cpp
@@ -220,7 +220,7 @@ string output_glsl_vec3(const string &name, float x, float y, float z)
  }
  
  template<class DestFloat>
-void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
+void combine_two_samples(float w1, float w2, float pos1, float pos1_pos2_diff, float inv_pos1_pos2_diff, float num_subtexels, float inv_num_subtexels,
                           DestFloat *offset, DestFloat *total_weight, float *sum_sq_error)
  {
         assert(movit_initialized);
@@ -233,8 +233,8 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_s
         }
  
         // Round to the desired precision. Note that this might take z outside the 0..1 range.
-       *offset = from_fp32<DestFloat>(pos1 + z * (pos2 - pos1));
-       z = (to_fp32(*offset) - pos1) / (pos2 - pos1);
+       *offset = from_fp32<DestFloat>(pos1 + z * pos1_pos2_diff);
+       z = (to_fp32(*offset) - pos1) * inv_pos1_pos2_diff;
  
         // Round to the minimum number of bits we have measured earlier.
         // The card will do this for us anyway, but if we know what the real z
@@ -265,11 +265,11 @@ void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_s
  
  // Explicit instantiations.
  template
-void combine_two_samples<float>(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
+void combine_two_samples<float>(float w1, float w2, float pos1, float pos1_pos2_diff, float inv_pos1_pos2_diff, float num_subtexels, float inv_num_subtexels,
                                  float *offset, float *total_weight, float *sum_sq_error);
  
  template
-void combine_two_samples<fp16_int_t>(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
+void combine_two_samples<fp16_int_t>(float w1, float w2, float pos1, float pos1_pos2_diff, float inv_pos1_pos2_diff, float num_subtexels, float inv_num_subtexels,
                                       fp16_int_t *offset, fp16_int_t *total_weight, float *sum_sq_error);
  
  GLuint generate_vbo(GLint size, GLenum type, GLsizeiptr data_size, const GLvoid *data)
@@ -354,4 +354,39 @@ void *get_gl_context_identifier()
  #endif
  }
  
+void abort_gl_error(GLenum err, const char *filename, int line)  // Report GL error code `err` for filename:line on stderr, then abort.
+{
+       const char *err_text = "unknown";  // Kept as-is when err matches none of the cases below.
+       // Map err to the name of its GL constant so the message is readable without looking up the hex value.
+       // All errors listed in the glGetError(3G) man page.
+       switch (err) {
+       case GL_NO_ERROR:
+               err_text = "GL_NO_ERROR";  // Should not happen.
+               break;
+       case GL_INVALID_ENUM:
+               err_text = "GL_INVALID_ENUM";
+               break;
+       case GL_INVALID_VALUE:
+               err_text = "GL_INVALID_VALUE";
+               break;
+       case GL_INVALID_OPERATION:
+               err_text = "GL_INVALID_OPERATION";
+               break;
+       case GL_INVALID_FRAMEBUFFER_OPERATION:
+               err_text = "GL_INVALID_FRAMEBUFFER_OPERATION";
+               break;
+       case GL_OUT_OF_MEMORY:
+               err_text = "GL_OUT_OF_MEMORY";
+               break;
+       case GL_STACK_UNDERFLOW:
+               err_text = "GL_STACK_UNDERFLOW";
+               break;
+       case GL_STACK_OVERFLOW:
+               err_text = "GL_STACK_OVERFLOW";
+               break;
+       }  // Deliberately no default case: any other code is reported as "unknown".
+       fprintf(stderr, "GL error 0x%x (%s) at %s:%d\n", err, err_text, filename, line);  // Code in hex, its name, then the caller-supplied location.
+       abort();  // Terminates the process, so this function never returns to its caller.
+}
+
  }  // namespace movit