#include <math.h>
#include <stdio.h>
#include <algorithm>
+#include <Eigen/Sparse>
+#include <Eigen/SparseQR>
+#include <Eigen/OrderingMethods>
#include "effect_chain.h"
#include "effect_util.h"
#include "resample_effect.h"
#include "util.h"
+using namespace Eigen;
using namespace std;
namespace movit {
template<class DestFloat>
unsigned combine_samples(const Tap<float> *src, Tap<DestFloat> *dst, unsigned src_size, unsigned num_src_samples, unsigned max_samples_saved)
{
+ // Cut off near-zero values at both sides.
unsigned num_samples_saved = 0;
+ while (num_samples_saved < max_samples_saved &&
+ num_src_samples > 0 &&
+ fabs(src[0].weight) < 1e-6) {
+ ++src;
+ --num_src_samples;
+ ++num_samples_saved;
+ }
+ while (num_samples_saved < max_samples_saved &&
+ num_src_samples > 0 &&
+ fabs(src[num_src_samples - 1].weight) < 1e-6) {
+ --num_src_samples;
+ ++num_samples_saved;
+ }
+
for (unsigned i = 0, j = 0; i < num_src_samples; ++i, ++j) {
// Copy the sample directly; it will be overwritten later if we can combine.
if (dst != NULL) {
return num_samples_saved;
}
+// Normalize the weights of vals[0..num) in place so that the sum becomes one (only .weight is written). Note that we do it twice;
+// this sometimes helps a tiny little bit when we have many samples.
+template<class T>
+void normalize_sum(Tap<T>* vals, unsigned num)
+{
+ for (int normalize_pass = 0; normalize_pass < 2; ++normalize_pass) {  // Two identical passes; presumably the second mops up rounding from the conversion back to T -- TODO confirm.
+ double sum = 0.0;  // Accumulated in a double, independent of T.
+ for (unsigned i = 0; i < num; ++i) {
+ sum += to_fp64(vals[i].weight);
+ }
+ for (unsigned i = 0; i < num; ++i) {
+ vals[i].weight = from_fp64<T>(to_fp64(vals[i].weight) / sum);  // NOTE(review): no guard here for sum == 0 when num > 0.
+ }
+ }
+}
+
// Make use of the bilinear filtering in the GPU to reduce the number of samples
// we need to make. This is a bit more complex than BlurEffect since we cannot combine
// two neighboring samples if their weights have differing signs, so we first need to
src_samples,
src_samples - src_bilinear_samples);
assert(int(src_samples) - int(num_samples_saved) == src_bilinear_samples);
-
- // Normalize so that the sum becomes one. Note that we do it twice;
- // this sometimes helps a tiny little bit when we have many samples.
- for (int normalize_pass = 0; normalize_pass < 2; ++normalize_pass) {
- double sum = 0.0;
- for (int i = 0; i < src_bilinear_samples; ++i) {
- sum += to_fp64(bilinear_weights_ptr[i].weight);
- }
- for (int i = 0; i < src_bilinear_samples; ++i) {
- bilinear_weights_ptr[i].weight = from_fp64<DestFloat>(
- to_fp64(bilinear_weights_ptr[i].weight) / sum);
- }
- }
+ normalize_sum(bilinear_weights_ptr, src_bilinear_samples);
}
return src_bilinear_samples;
}
int lower_pos = int(floor(to_fp64(bilinear_weights[0].pos) * size - 0.5));
int upper_pos = int(ceil(to_fp64(bilinear_weights[num_bilinear_weights - 1].pos) * size - 0.5)) + 2;
lower_pos = min<int>(lower_pos, lrintf(weights[0].pos * size - 0.5));
- upper_pos = max<int>(upper_pos, lrintf(weights[num_weights - 1].pos * size - 0.5));
+ upper_pos = max<int>(upper_pos, lrintf(weights[num_weights - 1].pos * size - 0.5) + 1);
float* effective_weights = new float[upper_pos - lower_pos];
for (int i = 0; i < upper_pos - lower_pos; ++i) {
// Now make use of the bilinear filtering in the GPU to reduce the number of samples
// we need to make. Try fp16 first; if it's not accurate enough, we go to fp32.
+ // Our tolerance level for total error is a bit higher than the one for individual
+ // samples, since one would assume overall errors in the shape don't matter as much.
+ const float max_error = 2.0f / (255.0f * 255.0f);
Tap<fp16_int_t> *bilinear_weights_fp16;
src_bilinear_samples = combine_many_samples(weights, src_size, src_samples, dst_samples, &bilinear_weights_fp16);
Tap<float> *bilinear_weights_fp32 = NULL;
bilinear_weights_fp16 + y * src_bilinear_samples, src_bilinear_samples,
src_size);
max_sum_sq_error_fp16 = std::max(max_sum_sq_error_fp16, sum_sq_error_fp16);
+ if (max_sum_sq_error_fp16 > max_error) {
+ break;
+ }
}
- // Our tolerance level for total error is a bit higher than the one for invididual
- // samples, since one would assume overall errors in the shape don't matter as much.
- if (max_sum_sq_error_fp16 > 2.0f / (255.0f * 255.0f)) {
+ if (max_sum_sq_error_fp16 > max_error) {
fallback_to_fp32 = true;
src_bilinear_samples = combine_many_samples(weights, src_size, src_samples, dst_samples, &bilinear_weights_fp32);
}