Mostly a micro-optimization, but this function seems to be somewhat expensive.
#include "blur_effect.h"
#include "effect_chain.h"
#include "effect_util.h"
+#include "init.h"
#include "util.h"
using namespace std;
uniform_samples[2 * 0 + 0] = 0.0f;
uniform_samples[2 * 0 + 1] = weight[0];
+ int size;
+ if (direction == HORIZONTAL) {
+ size = width;
+ } else if (direction == VERTICAL) {
+ size = height;
+ } else {
+ assert(false);
+ }
+ float num_subtexels = size / movit_texel_subpixel_precision;
+ float inv_num_subtexels = movit_texel_subpixel_precision / size;
+
// All other samples.
for (int i = 1; i < num_taps / 2 + 1; ++i) {
unsigned base_pos = i * 2 - 1;
float w1 = weight[base_pos];
float w2 = weight[base_pos + 1];
- int size;
- if (direction == HORIZONTAL) {
- size = width;
- } else if (direction == VERTICAL) {
- size = height;
- } else {
- assert(false);
- }
float pos1 = base_pos / (float)size;
float pos2 = (base_pos + 1) / (float)size;
float pos, total_weight;
- combine_two_samples(w1, w2, pos1, pos2, size, &pos, &total_weight, NULL);
+ combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, NULL);
uniform_samples[2 * i + 0] = pos;
uniform_samples[2 * i + 1] = total_weight;
}
template<class DestFloat>
-unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, unsigned src_size, unsigned num_src_samples, unsigned max_samples_saved)
+unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, float num_subtexels, float inv_num_subtexels, unsigned num_src_samples, unsigned max_samples_saved)
{
// Cut off near-zero values at both sides.
unsigned num_samples_saved = 0;
fp16_int_t pos, total_weight;
float sum_sq_error;
- combine_two_samples(w1, w2, pos1, pos2, src_size, &pos, &total_weight, &sum_sq_error);
+ combine_two_samples(w1, w2, pos1, pos2, num_subtexels, inv_num_subtexels, &pos, &total_weight, &sum_sq_error);
// If the interpolation error is larger than that of about sqrt(2) of
// a level at 8-bit precision, don't combine. (You'd think 1.0 was enough,
template<class DestFloat>
unsigned combine_many_samples(const Tap<float> *weights, unsigned src_size, unsigned src_samples, unsigned dst_samples, Tap<DestFloat> **bilinear_weights)
{
+ float num_subtexels = src_size / movit_texel_subpixel_precision;
+ float inv_num_subtexels = movit_texel_subpixel_precision / src_size;
int src_bilinear_samples = 0;
+
for (unsigned y = 0; y < dst_samples; ++y) {
- unsigned num_samples_saved = combine_samples<DestFloat>(weights + y * src_samples, NULL, src_size, src_samples, UINT_MAX);
+ unsigned num_samples_saved = combine_samples<DestFloat>(weights + y * src_samples, NULL, num_subtexels, inv_num_subtexels, src_samples, UINT_MAX);
src_bilinear_samples = max<int>(src_bilinear_samples, src_samples - num_samples_saved);
}
unsigned num_samples_saved = combine_samples(
weights + y * src_samples,
bilinear_weights_ptr,
- src_size,
+ num_subtexels,
+ inv_num_subtexels,
src_samples,
src_samples - src_bilinear_samples);
assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
}
template<class DestFloat>
-void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
DestFloat *offset, DestFloat *total_weight, float *sum_sq_error)
{
assert(movit_initialized);
// Round to the minimum number of bits we have measured earlier.
// The card will do this for us anyway, but if we know what the real z
// is, we can pick a better total_weight below.
- z *= size; // Move to pixel coordinates,
- z = lrintf(z / movit_texel_subpixel_precision) * movit_texel_subpixel_precision; // Round.
- z /= size; // Move back to normalized coordinates.
+ z = lrintf(z * num_subtexels) * inv_num_subtexels;
// Choose total weight w so that we minimize total squared error
// for the effective weights:
// w = (w1 * (1-z) + w2 * z) / ((1-z)² + z²)
//
// If z had infinite precision, this would simply reduce to w = w1 + w2.
- *total_weight = (w1 * (1 - z) + w2 * z) / (z * z + (1 - z) * (1 - z));
+ *total_weight = (w1 + z * (w2 - w1)) / (z * z + (1 - z) * (1 - z));
if (sum_sq_error != NULL) {
float err1 = *total_weight * (1 - z) - w1;
// Explicit instantiations.
template
-void combine_two_samples<float>(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples<float>(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
float *offset, float *total_weight, float *sum_sq_error);
template
-void combine_two_samples<fp16_int_t>(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples<fp16_int_t>(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
fp16_int_t *offset, fp16_int_t *total_weight, float *sum_sq_error);
GLuint fill_vertex_attribute(GLuint glsl_program_num, const string &attribute_name, GLint size, GLenum type, GLsizeiptr data_size, const GLvoid *data)
// Calculate where to sample, and with what weight, if one wants to use
// the GPU's bilinear hardware to sample w1 * x[pos1] + w2 * x[pos2],
// where pos1 and pos2 must be normalized coordinates describing neighboring
-// pixels in the mipmap level at which you sample, and the total number of
-// pixels (in given mipmap level) is <size>.
+// texels in the mipmap level at which you sample. <num_subtexels> is the
+// number of distinct accessible subtexels in the given mipmap level,
+// calculated as num_texels / movit_texel_subpixel_precision. It is a float
+// for performance reasons, even though it is expected to be a whole number.
+// <inv_num_subtexels> is simply its reciprocal (1 / num_subtexels).
//
// Note that since the GPU might have limited precision in its linear
// interpolation, the effective weights might be different from the ones you
// rounded fp16 value. This enables more precise calculation of total_weight
// and sum_sq_error.
template<class DestFloat>
-void combine_two_samples(float w1, float w2, float pos1, float pos2, unsigned size,
+void combine_two_samples(float w1, float w2, float pos1, float pos2, float num_subtexels, float inv_num_subtexels,
DestFloat *offset, DestFloat *total_weight, float *sum_sq_error);
// Create a VBO with the given data, and bind it to the vertex attribute