From 3ccf5fb197c9a72545affc0b7286349d5603b72e Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 16 Mar 2014 19:25:53 +0100 Subject: [PATCH] Add an FFT convolution effect. A lot of the later commits have been leading up to this, and I finally got to the point where all the unit tests check out, everything seems to work (modulo maybe some overflow issues) and we have a model that matches what people actually expects from convolutions. Note that this adds a dependency on FFTW3; we could probably have added our own routines for such small needs, but like with Eigen, calling out to a library is fine as long as it's of good quality (which FFTW certainly is) and is widely available. --- Makefile.in | 8 +- README | 8 +- configure.ac | 1 + fft_convolution_effect.cpp | 274 ++++++++++++++++++++++++++++++++ fft_convolution_effect.h | 116 ++++++++++++++ fft_convolution_effect_test.cpp | 211 ++++++++++++++++++++++++ fft_input.cpp | 139 ++++++++++++++++ fft_input.h | 81 ++++++++++ slice_effect.cpp | 7 + slice_effect.frag | 3 +- slice_effect.h | 1 + slice_effect_test.cpp | 55 +++++++ util.cpp | 13 ++ util.h | 3 + 14 files changed, 912 insertions(+), 8 deletions(-) create mode 100644 fft_convolution_effect.cpp create mode 100644 fft_convolution_effect.h create mode 100644 fft_convolution_effect_test.cpp create mode 100644 fft_input.cpp create mode 100644 fft_input.h diff --git a/Makefile.in b/Makefile.in index 8164e38..e401340 100644 --- a/Makefile.in +++ b/Makefile.in @@ -12,9 +12,9 @@ with_coverage = @with_coverage@ CC=@CC@ CXX=@CXX@ -CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @Eigen3_CFLAGS@ @GLEW_CFLAGS@ -LDFLAGS=@GLEW_LIBS@ @SDL_LIBS@ -lpthread -DEMO_LDLIBS=@SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@ +CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @Eigen3_CFLAGS@ @GLEW_CFLAGS@ @FFTW3_CFLAGS@ +LDFLAGS=@GLEW_LIBS@ @SDL_LIBS@ @FFTW3_LIBS@ -lpthread +DEMO_LDLIBS=@SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@ @FFTW3_LIBS@ SHELL=@SHELL@ LIBTOOL=@LIBTOOL@ --tag=CXX RANLIB=ranlib @@ -59,11 +59,13 @@ TESTED_EFFECTS += vignette_effect TESTED_EFFECTS += slice_effect TESTED_EFFECTS += complex_modulate_effect TESTED_EFFECTS += luma_mix_effect +TESTED_EFFECTS += fft_convolution_effect UNTESTED_EFFECTS = sandbox_effect UNTESTED_EFFECTS += mirror_effect UNTESTED_EFFECTS += resize_effect UNTESTED_EFFECTS += multiply_effect +UNTESTED_EFFECTS += fft_input EFFECTS = $(TESTED_EFFECTS) $(UNTESTED_EFFECTS) diff --git a/README b/README index c6b20da..c74e8ef 100644 --- a/README +++ b/README @@ -39,10 +39,10 @@ for performance estimates. Still TL;DR, please give me the list of filters =============================================== -Blur, diffusion, glow, lift/gamma/gain (color correction), mirror, -mix (add two inputs), luma mix (use a map to wipe between two inputs), -overlay (the Porter-Duff “over” operation), scale (bilinear and Lanczos), -sharpen (both by unsharp mask and by Wiener filters), saturation +Blur, diffusion, FFT-based convolution, glow, lift/gamma/gain (color +correction), mirror, mix (add two inputs), luma mix (use a map to wipe between +two inputs), overlay (the Porter-Duff “over” operation), scale (bilinear and +Lanczos), sharpen (both by unsharp mask and by Wiener filters), saturation (or desaturation), vignette, and white balance. Yes, that's a short list. But they all look great, are fast and don't give diff --git a/configure.ac b/configure.ac index 5e03234..3e0b1be 100644 --- a/configure.ac +++ b/configure.ac @@ -8,6 +8,7 @@ AC_PROG_CC AC_PROG_CXX PKG_CHECK_MODULES([Eigen3], [eigen3]) PKG_CHECK_MODULES([GLEW], [glew]) +PKG_CHECK_MODULES([FFTW3], [fftw3]) # Needed for unit tests and the demo app. PKG_CHECK_MODULES([SDL], [sdl]) diff --git a/fft_convolution_effect.cpp b/fft_convolution_effect.cpp new file mode 100644 index 0000000..b3f77bd --- /dev/null +++ b/fft_convolution_effect.cpp @@ -0,0 +1,274 @@ +#include +#include + +#include "complex_modulate_effect.h" +#include "effect_chain.h" +#include "fft_convolution_effect.h" +#include "fft_input.h" +#include "fft_pass_effect.h" +#include "multiply_effect.h" +#include "padding_effect.h" +#include "slice_effect.h" +#include "util.h" + +using namespace std; + +namespace movit { + +FFTConvolutionEffect::FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height) + : input_width(input_width), + input_height(input_height), + convolve_width(convolve_width), + convolve_height(convolve_height), + fft_input(new FFTInput(convolve_width, convolve_height)), + crop_effect(new PaddingEffect()), + owns_effects(true) { + CHECK(crop_effect->set_int("width", input_width)); + CHECK(crop_effect->set_int("height", input_height)); + CHECK(crop_effect->set_float("top", 0)); + CHECK(crop_effect->set_float("left", 0)); +} + +FFTConvolutionEffect::~FFTConvolutionEffect() +{ + if (owns_effects) { + delete fft_input; + delete crop_effect; + } +} + +namespace { + +// Returns the last Effect in the new chain. +Effect *add_overlap_and_fft(EffectChain *chain, Effect *last_effect, int fft_size, int pad_size, FFTPassEffect::Direction direction) +{ + // Overlap. + { + Effect *overlap_effect = chain->add_effect(new SliceEffect(), last_effect); + CHECK(overlap_effect->set_int("input_slice_size", fft_size - pad_size)); + CHECK(overlap_effect->set_int("output_slice_size", fft_size)); + CHECK(overlap_effect->set_int("offset", -pad_size)); + if (direction == FFTPassEffect::HORIZONTAL) { + CHECK(overlap_effect->set_int("direction", SliceEffect::HORIZONTAL)); + } else { + assert(direction == FFTPassEffect::VERTICAL); + CHECK(overlap_effect->set_int("direction", SliceEffect::VERTICAL)); + } + + last_effect = overlap_effect; + } + + // FFT. + int num_passes = ffs(fft_size) - 1; + for (int i = 1; i <= num_passes; ++i) { + Effect *fft_effect = chain->add_effect(new FFTPassEffect(), last_effect); + CHECK(fft_effect->set_int("pass_number", i)); + CHECK(fft_effect->set_int("fft_size", fft_size)); + CHECK(fft_effect->set_int("direction", direction)); + CHECK(fft_effect->set_int("inverse", 0)); + + last_effect = fft_effect; + } + + return last_effect; +} + +// Returns the last Effect in the new chain. +Effect *add_ifft_and_discard(EffectChain *chain, Effect *last_effect, int fft_size, int pad_size, FFTPassEffect::Direction direction) +{ + // IFFT. + int num_passes = ffs(fft_size) - 1; + for (int i = 1; i <= num_passes; ++i) { + Effect *fft_effect = chain->add_effect(new FFTPassEffect(), last_effect); + CHECK(fft_effect->set_int("pass_number", i)); + CHECK(fft_effect->set_int("fft_size", fft_size)); + CHECK(fft_effect->set_int("direction", direction)); + CHECK(fft_effect->set_int("inverse", 1)); + + last_effect = fft_effect; + } + + // Discard. + { + Effect *discard_effect = chain->add_effect(new SliceEffect(), last_effect); + CHECK(discard_effect->set_int("input_slice_size", fft_size)); + CHECK(discard_effect->set_int("output_slice_size", fft_size - pad_size)); + if (direction == FFTPassEffect::HORIZONTAL) { + CHECK(discard_effect->set_int("direction", SliceEffect::HORIZONTAL)); + } else { + assert(direction == FFTPassEffect::VERTICAL); + CHECK(discard_effect->set_int("direction", SliceEffect::VERTICAL)); + } + CHECK(discard_effect->set_int("offset", pad_size)); + + last_effect = discard_effect; + } + + return last_effect; +} + +} // namespace + +void FFTConvolutionEffect::rewrite_graph(EffectChain *chain, Node *self) +{ + int pad_width = convolve_width - 1; + int pad_height = convolve_height - 1; + + // Try all possible FFT widths and heights to see which one is the + // cheapest. As a proxy for real performance, we use number of texel + // fetches; this isn't perfect by any means, but it's easy to work with + // and should be approximately correct. + int min_x = next_power_of_two(1 + pad_width); + int min_y = next_power_of_two(1 + pad_height); + int max_y = next_power_of_two(input_height + pad_width); + int max_x = next_power_of_two(input_width + pad_height); + + size_t best_cost = numeric_limits::max(); + int best_x = -1, best_y = -1, best_x_before_y_fft = -1, best_x_before_y_ifft = -1; + + // Try both + // + // overlap(X), FFT(X), overlap(Y), FFT(Y), modulate, IFFT(Y), discard(Y), IFFT(X), discard(X) and + // overlap(Y), FFT(Y), overlap(X), FFT(X), modulate, IFFT(X), discard(X), IFFT(Y), discard(Y) + // + // For simplicity, call them the XY-YX and YX-XY orders. In theory, we + // could have XY-XY and YX-YX orders as well, and I haven't found a + // convincing argument that they will never be optimal (although it + // sounds odd and should be rare), so we test all four possible ones. + // + // We assume that the kernel FFT is for free, since it is typically done + // only once and per frame. + for (int x_before_y_fft = 0; x_before_y_fft <= 1; ++x_before_y_fft) { + for (int x_before_y_ifft = 0; x_before_y_ifft <= 1; ++x_before_y_ifft) { + for (int y = min_y; y <= max_y; y *= 2) { + int y_pixels_per_block = y - pad_height; + int num_vertical_blocks = div_round_up(input_height, y_pixels_per_block); + size_t output_height = y * num_vertical_blocks; + for (int x = min_x; x <= max_x; x *= 2) { + int x_pixels_per_block = x - pad_width; + int num_horizontal_blocks = div_round_up(input_width, x_pixels_per_block); + size_t output_width = x * num_horizontal_blocks; + + size_t cost = 0; + + if (x_before_y_fft) { + // First, the cost of the horizontal padding. + cost = output_width * input_height; + + // log(X) FFT passes. Each pass reads two inputs per pixel, + // plus the support texture. + cost += (ffs(x) - 1) * 3 * output_width * input_height; + + // Now, horizontal padding. + cost += output_width * output_height; + + // log(Y) FFT passes, now at full resolution. + cost += (ffs(y) - 1) * 3 * output_width * output_height; + } else { + // First, the cost of the vertical padding. + cost = input_width * output_height; + + // log(Y) FFT passes. Each pass reads two inputs per pixel, + // plus the support texture. + cost += (ffs(y) - 1) * 3 * input_width * output_height; + + // Now, horizontal padding. + cost += output_width * output_height; + + // log(X) FFT passes, now at full resolution. + cost += (ffs(x) - 1) * 3 * output_width * output_height; + } + + // The actual modulation. Reads one pixel each from two textures. + cost += 2 * output_width * output_height; + + if (x_before_y_ifft) { + // log(X) IFFT passes. + cost += (ffs(x) - 1) * 3 * output_width * output_height; + + // Discard horizontally. + cost += input_width * output_height; + + // log(Y) IFFT passes. + cost += (ffs(y) - 1) * 3 * input_width * output_height; + + // Discard horizontally. + cost += input_width * input_height; + } else { + // log(Y) IFFT passes. + cost += (ffs(y) - 1) * 3 * output_width * output_height; + + // Discard vertically. + cost += output_width * input_height; + + // log(X) IFFT passes. + cost += (ffs(x) - 1) * 3 * output_width * input_height; + + // Discard horizontally. + cost += input_width * input_height; + } + + if (cost < best_cost) { + best_x = x; + best_y = y; + best_x_before_y_fft = x_before_y_fft; + best_x_before_y_ifft = x_before_y_ifft; + best_cost = cost; + } + } + } + } + } + + const int fft_width = best_x, fft_height = best_y; + + assert(self->incoming_links.size() == 1); + Node *last_node = self->incoming_links[0]; + self->incoming_links.clear(); + last_node->outgoing_links.clear(); + + // Do FFT. + Effect *last_effect = last_node->effect; + if (best_x_before_y_fft) { + last_effect = add_overlap_and_fft(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL); + last_effect = add_overlap_and_fft(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL); + } else { + last_effect = add_overlap_and_fft(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL); + last_effect = add_overlap_and_fft(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL); + } + + // Normalizer. + Effect *multiply_effect; + float fft_size = fft_width * fft_height; + float factor[4] = { 1.0f / fft_size, 1.0f / fft_size, 1.0f / fft_size, 1.0f / fft_size }; + last_effect = multiply_effect = chain->add_effect(new MultiplyEffect(), last_effect); + CHECK(multiply_effect->set_vec4("factor", factor)); + + // Multiply by the FFT of the convolution kernel. + CHECK(fft_input->set_int("fft_width", fft_width)); + CHECK(fft_input->set_int("fft_height", fft_height)); + chain->add_input(fft_input); + owns_effects = false; + + Effect *modulate_effect = chain->add_effect(new ComplexModulateEffect(), multiply_effect, fft_input); + CHECK(modulate_effect->set_int("num_repeats_x", div_round_up(input_width, fft_width - pad_width))); + CHECK(modulate_effect->set_int("num_repeats_y", div_round_up(input_height, fft_height - pad_height))); + last_effect = modulate_effect; + + // Finally, do IFFT. + if (best_x_before_y_ifft) { + last_effect = add_ifft_and_discard(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL); + last_effect = add_ifft_and_discard(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL); + } else { + last_effect = add_ifft_and_discard(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL); + last_effect = add_ifft_and_discard(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL); + } + + // ...and crop away any extra padding we have have added. + last_effect = chain->add_effect(crop_effect); + + chain->replace_sender(self, chain->find_node_for_effect(last_effect)); + self->disabled = true; +} + +} // namespace movit diff --git a/fft_convolution_effect.h b/fft_convolution_effect.h new file mode 100644 index 0000000..f601a49 --- /dev/null +++ b/fft_convolution_effect.h @@ -0,0 +1,116 @@ +#ifndef _MOVIT_FFT_CONVOLUTION_EFFECT_H +#define _MOVIT_FFT_CONVOLUTION_EFFECT_H 1 + +// FFTConvolutionEffect applies an arbitrary 2D convolution between the input image +// and a convolution kernel (assumed to be much smaller than the image). +// It does this convolution using multiple smaller FFTs and an algorithm called +// overlap-discard (also known as overlap-save) to achieve much higher +// efficiency than direct evaluation of the convolution, at some expense of +// accuracy. +// +// FFTConvolutionEffect follows the usual convention for convolution, which is that +// you sample from the origin pixel, and then up and to the left from that. This means +// that (in horizontal 1D) [1 0 0 0 0 ...] would be an identity transform, and that +// [0 1 0 0 0 ...] would mean sampling one pixel to the left of the origin, which +// effectively move would move the image one pixel to the right. +// +// The basic idea of the acceleration comes from the the convolution theorem +// (which holds in any number of dimensions), namely that FFT(A ⊙ B) = +// FFT(A) * FFT(B), where ⊙ is circular convolution and * is pointwise +// multiplication. This means that A ⊙ B = IFFT(FFT(A) * FFT(B)), and since +// we can do a 2D FFT in O(n² log n), this is asymptotically better than +// direct convolution, which is O(n²m²) (where m is the size of the convolution +// kernel). However, the convolution theorem is rarely _directly_ applicable, +// for two reasons: +// +// - ⊙ is _circular_ convolution, which means that inputs are taken to +// repeat (wrap around), which is rarely what you want. +// - A and B must be the same size, which means that to convolve a 1280x720 +// kernel with a 64x64 kernel, you need to zero pad the 64x64 kernel and +// then do _two_ full 1280x720 FFTs (one for each of A and B). +// +// The first problem is solved by adding m-1 zero pixels (horizontally +// and vertically) as padding, and then discarding the result of those pixels. +// This causes the output to be identical to a non-circular convolution. +// +// The second is slightly more tricky, and there are multiple ways of solving +// it. The one that appears to be the most suitable to suitable for GPU use, +// and the one that is used here, is overlap-discard (more commonly but less +// precisely known as overlap-save). In overlap-discard, the input is broken +// up into multiple equally-sized slices which are then FFTed and convolved +// with the kernel individually. (The kernel must still be zero padded to the +// same size as the slice, but this is typically much smaller then the full +// picture.) As before, the pad area contains data that's essentially junk, +// which is thrown away when the slices are put together again. +// +// The optimal slice size is a tradeoff. More slices means more space wasted +// for padding, since the padding is the same no matter the slice size, +// but fewer slices means we need to do larger FFTs (although fewer of them). +// There's no exact closed formula for this, especially since the 2D case +// makes things a bit more tricky with ordering of the X versus Y passes, +// so we simply try all possible sizes and orderings, attempting to estimate +// roughly how much each operation will cost. The result isn't perfect, though; +// FFTW has a mode for actually measuring, which they claim improves speeds +// by ~2x over simple estimation, but they also have much more freedom in +// their execution model than we do. +// +// The output _size_ of a convolution can be defined in a couple of different +// ways; in a sense, what's the most reasonable is using only the central part +// of the result (the mode “valid” in MATLAB/Octave), since that is the only +// one not used by any edge pixels. (FFTConvolutionEffect assumes normal Movit +// edge pixel behavior, which is to repeat the outermost pixels.) You could also +// keep all the output pixels (“full” in MATLAB/Octave), which is nicely symmetric. +// However, for video processing, typically what you want is to have the _same_ +// output size as input size, so we crop to the input size. This means that +// you'll get some of the edge-affected pixels but not all, but it's usually +// an okay tradeoff. +// +// FFTConvolutionEffect does not do any actual pixel work by itself; it +// rewrites itself into a long chain of SliceEffect, FFTPassEffect, FFTInput +// and ComplexModulationEffect to do its bidding. Note that currently, due to +// Movit limitations, we need to know the number of FFT passes at finalize() +// time, which in turn means you cannot change image or kernel size on the fly. + +#include +#include +#include + +#include "effect.h" +#include "fft_input.h" + +namespace movit { + +class PaddingEffect; + +class FFTConvolutionEffect : public Effect { +public: + FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height); + ~FFTConvolutionEffect(); + virtual std::string effect_type_id() const { return "FFTConvolutionEffect"; } + std::string output_fragment_shader() { assert(false); } + virtual void rewrite_graph(EffectChain *graph, Node *self); + + // See FFTInput::set_pixel_data(). + void set_convolution_kernel(const float *pixel_data) + { + assert(fft_input); + fft_input->set_pixel_data(pixel_data); + } + +private: + int input_width, input_height; + int convolve_width, convolve_height; + + // Chosen by algorithm. + int fft_width, fft_height; + + // Both of these are owned by us if owns_effects is true (before finalize()), + // and otherwise owned by the EffectChain. + FFTInput *fft_input; + PaddingEffect *crop_effect; + bool owns_effects; +}; + +} // namespace movit + +#endif // !defined(_MOVIT_FFT_CONVOLUTION_EFFECT_H) diff --git a/fft_convolution_effect_test.cpp b/fft_convolution_effect_test.cpp new file mode 100644 index 0000000..f0f8527 --- /dev/null +++ b/fft_convolution_effect_test.cpp @@ -0,0 +1,211 @@ +// Unit tests for FFTConvolutionEffect. + +#include +#include + +#include "effect_chain.h" +#include "gtest/gtest.h" +#include "image_format.h" +#include "test_util.h" +#include "fft_convolution_effect.h" + +namespace movit { + +TEST(FFTConvolutionEffectTest, Identity) { + const int size = 4; + + float data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + 0.4, 1.4, 2.4, 3.4, + }; + float out_data[size * size]; + + for (int convolve_size = 1; convolve_size < 10; ++convolve_size) { + float kernel[convolve_size * convolve_size]; + for (int i = 0; i < convolve_size * convolve_size; ++i) { + kernel[i] = 0.0f; + } + kernel[0] = 1.0f; + + EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size); + + FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size); + tester.get_chain()->add_effect(fft_effect); + fft_effect->set_convolution_kernel(kernel); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); + + expect_equal(data, out_data, size, size, 0.02, 0.003); + } +} + +TEST(FFTConvolutionEffectTest, Constant) { + const int size = 4, convolve_size = 17; + const float f = 7.0f; + + float data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + 0.4, 1.4, 2.4, 3.4, + }; + float expected_data[size * size] = { + f * 0.1, f * 1.1, f * 2.1, f * 3.1, + f * 0.2, f * 1.2, f * 2.2, f * 3.2, + f * 0.3, f * 1.3, f * 2.3, f * 3.3, + f * 0.4, f * 1.4, f * 2.4, f * 3.4, + }; + float out_data[size * size]; + float kernel[convolve_size * convolve_size]; + for (int i = 0; i < convolve_size * convolve_size; ++i) { + kernel[i] = 0.0f; + } + kernel[0] = f; + + EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size); + + FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size); + tester.get_chain()->add_effect(fft_effect); + fft_effect->set_convolution_kernel(kernel); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); + + // Somewhat looser bounds due to the higher magnitude. + expect_equal(expected_data, out_data, size, size, f * 0.03, f * 0.004); +} + +TEST(FFTConvolutionEffectTest, MoveRight) { + const int size = 4, convolve_size = 3; + + float data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + 0.4, 1.4, 2.4, 3.4, + }; + float kernel[convolve_size * convolve_size] = { + 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, + }; + float expected_data[size * size] = { + 0.1, 0.1, 1.1, 2.1, + 0.2, 0.2, 1.2, 2.2, + 0.3, 0.3, 1.3, 2.3, + 0.4, 0.4, 1.4, 2.4, + }; + float out_data[size * size]; + + EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size); + + FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size); + tester.get_chain()->add_effect(fft_effect); + fft_effect->set_convolution_kernel(kernel); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); + + expect_equal(expected_data, out_data, size, size, 0.02, 0.003); +} + +TEST(FFTConvolutionEffectTest, MoveDown) { + const int size = 4, convolve_size = 3; + + float data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + 0.4, 1.4, 2.4, 3.4, + }; + float kernel[convolve_size * convolve_size] = { + 0.0, 0.0, 0.0, + 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, + }; + float expected_data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + }; + float out_data[size * size]; + + EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size); + + FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size); + tester.get_chain()->add_effect(fft_effect); + fft_effect->set_convolution_kernel(kernel); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); + + expect_equal(expected_data, out_data, size, size, 0.02, 0.003); +} + +TEST(FFTConvolutionEffectTest, MergeWithLeft) { + const int size = 4, convolve_size = 3; + + float data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + 0.4, 1.4, 2.4, 3.4, + }; + float kernel[convolve_size * convolve_size] = { + 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, + }; + float expected_data[size * size] = { + 0.1 + 0.1, 0.1 + 1.1, 1.1 + 2.1, 2.1 + 3.1, + 0.2 + 0.2, 0.2 + 1.2, 1.2 + 2.2, 2.2 + 3.2, + 0.3 + 0.3, 0.3 + 1.3, 1.3 + 2.3, 2.3 + 3.3, + 0.4 + 0.4, 0.4 + 1.4, 1.4 + 2.4, 2.4 + 3.4, + }; + float out_data[size * size]; + + EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size); + + FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size); + tester.get_chain()->add_effect(fft_effect); + fft_effect->set_convolution_kernel(kernel); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); + + expect_equal(expected_data, out_data, size, size, 0.02, 0.003); +} + +TEST(FFTConvolutionEffectTest, NegativeCoefficients) { + const int size = 4; + const int convolve_width = 3, convolve_height = 2; + + float data[size * size] = { + 0.1, 1.1, 2.1, 3.1, + 0.2, 1.2, 2.2, 3.2, + 0.3, 1.3, 2.3, 3.3, + 0.4, 1.4, 2.4, 3.4, + }; + float kernel[convolve_width * convolve_height] = { + 1.0, 0.0, 0.0, + 0.0, 0.0, -0.5, + }; + float expected_data[size * size] = { + 0.1 - 0.5 * 0.1, 1.1 - 0.5 * 0.1, 2.1 - 0.5 * 0.1, 3.1 - 0.5 * 1.1, + 0.2 - 0.5 * 0.1, 1.2 - 0.5 * 0.1, 2.2 - 0.5 * 0.1, 3.2 - 0.5 * 1.1, + 0.3 - 0.5 * 0.2, 1.3 - 0.5 * 0.2, 2.3 - 0.5 * 0.2, 3.3 - 0.5 * 1.2, + 0.4 - 0.5 * 0.3, 1.4 - 0.5 * 0.3, 2.4 - 0.5 * 0.3, 3.4 - 0.5 * 1.3, + }; + float out_data[size * size]; + + EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size); + + FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_width, convolve_height); + tester.get_chain()->add_effect(fft_effect); + fft_effect->set_convolution_kernel(kernel); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); + + expect_equal(expected_data, out_data, size, size, 0.02, 0.003); +} + +} // namespace movit diff --git a/fft_input.cpp b/fft_input.cpp new file mode 100644 index 0000000..ba539b6 --- /dev/null +++ b/fft_input.cpp @@ -0,0 +1,139 @@ +#include +#include +#include +#include + +#include "effect_util.h" +#include "fp16.h" +#include "fft_input.h" +#include "resource_pool.h" +#include "util.h" + +using namespace std; + +namespace movit { + +FFTInput::FFTInput(unsigned width, unsigned height) + : texture_num(0), + fft_width(width), + fft_height(height), + convolve_width(width), + convolve_height(height), + pixel_data(NULL) +{ + register_int("fft_width", &fft_width); + register_int("fft_height", &fft_height); +} + +FFTInput::~FFTInput() +{ + if (texture_num != 0) { + resource_pool->release_2d_texture(texture_num); + } +} + +void FFTInput::set_gl_state(GLuint glsl_program_num, const string& prefix, unsigned *sampler_num) +{ + glActiveTexture(GL_TEXTURE0 + *sampler_num); + check_error(); + + if (texture_num == 0) { + assert(pixel_data != NULL); + + // Do the FFT. Our FFTs should typically be small enough and + // the data changed often enough that FFTW_ESTIMATE should be + // quite OK. Otherwise, we'd need to worry about caching these + // plans (possibly including FFTW wisdom). + fftw_complex *in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * fft_width * fft_height); + fftw_complex *out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * fft_width * fft_height); + fftw_plan p = fftw_plan_dft_2d(fft_height, fft_width, in, out, FFTW_FORWARD, FFTW_ESTIMATE); + + // Zero pad. + for (int i = 0; i < fft_height * fft_width; ++i) { + in[i][0] = 0.0; + in[i][1] = 0.0; + } + for (unsigned y = 0; y < convolve_height; ++y) { + for (unsigned x = 0; x < convolve_width; ++x) { + int i = y * fft_width + x; + in[i][0] = pixel_data[y * convolve_width + x]; + in[i][1] = 0.0; + } + } + + fftw_execute(p); + + // Convert to fp16. + fp16_int_t *kernel = new fp16_int_t[fft_width * fft_height * 2]; + for (int i = 0; i < fft_width * fft_height; ++i) { + kernel[i * 2 + 0] = fp64_to_fp16(out[i][0]); + kernel[i * 2 + 1] = fp64_to_fp16(out[i][1]); + } + + // (Re-)upload the texture. + texture_num = resource_pool->create_2d_texture(GL_RG16F, fft_width, fft_height); + glBindTexture(GL_TEXTURE_2D, texture_num); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + check_error(); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + check_error(); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fft_width, fft_height, GL_RG, GL_HALF_FLOAT, kernel); + check_error(); + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); + check_error(); + + fftw_free(in); + fftw_free(out); + delete[] kernel; + } else { + glBindTexture(GL_TEXTURE_2D, texture_num); + check_error(); + } + + // Bind it to a sampler. + set_uniform_int(glsl_program_num, prefix, "tex", *sampler_num); + ++*sampler_num; +} + +string FFTInput::output_fragment_shader() +{ + return read_file("flat_input.frag"); +} + +void FFTInput::invalidate_pixel_data() +{ + if (texture_num != 0) { + resource_pool->release_2d_texture(texture_num); + texture_num = 0; + } +} + +bool FFTInput::set_int(const std::string& key, int value) +{ + if (key == "needs_mipmaps") { + // We cannot supply mipmaps; it would not make any sense for FFT data. + return (value == 0); + } + if (key == "fft_width") { + if (value < int(convolve_width)) { + return false; + } + invalidate_pixel_data(); + } + if (key == "fft_height") { + if (value < int(convolve_height)) { + return false; + } + invalidate_pixel_data(); + } + return Effect::set_int(key, value); +} + +} // namespace movit diff --git a/fft_input.h b/fft_input.h new file mode 100644 index 0000000..681876b --- /dev/null +++ b/fft_input.h @@ -0,0 +1,81 @@ +#ifndef _MOVIT_FFT_INPUT_H +#define _MOVIT_FFT_INPUT_H 1 + +// FFTInput is used by FFTConvolutionEffect to send in the FFTed version of a +// mostly static, one-channel data set, typically the convolution kernel +// with some zero padding. +// +// Since the kernel is typically small and unlikely to change often, +// it will be faster to FFT it once on the CPU (using the excellent FFTW3 +// library) and keep it in a texture, rather than FFT-ing it over and over on +// the GPU. (We do not currently support caching Movit intermediates between +// frames.) As an extra bonus, we can then do it in double precision and round +// precisely to fp16 afterwards. +// +// This class is tested as part of by FFTConvolutionEffectTest. + +#include +#include +#include + +#include "effect.h" +#include "effect_chain.h" +#include "image_format.h" +#include "input.h" + +namespace movit { + +class ResourcePool; + +class FFTInput : public Input { +public: + FFTInput(unsigned width, unsigned height); + ~FFTInput(); + + virtual std::string effect_type_id() const { return "FFTInput"; } + std::string output_fragment_shader(); + + // FFTs the data and uploads the texture if it has changed since last time. + void set_gl_state(GLuint glsl_program_num, const std::string& prefix, unsigned *sampler_num); + + unsigned get_width() const { return fft_width; } + unsigned get_height() const { return fft_height; } + + // Strictly speaking, FFT data doesn't have any colorspace or gamma; + // these values are the Movit standards for “do nothing”. + Colorspace get_color_space() const { return COLORSPACE_sRGB; } + GammaCurve get_gamma_curve() const { return GAMMA_LINEAR; } + virtual AlphaHandling alpha_handling() const { return INPUT_AND_OUTPUT_PREMULTIPLIED_ALPHA; } + virtual bool is_single_texture() const { return true; } + virtual bool can_output_linear_gamma() const { return true; } + + // Tells the input where to fetch the actual pixel data. Note that if you change + // this data, you must either call set_pixel_data() again (using the same pointer + // is fine), or invalidate_pixel_data(). Otherwise, the FFT won't be recalculated, + // and the texture won't be re-uploaded on subsequent frames. + void set_pixel_data(const float *pixel_data) + { + this->pixel_data = pixel_data; + invalidate_pixel_data(); + } + + void invalidate_pixel_data(); + + virtual void inform_added(EffectChain *chain) + { + resource_pool = chain->get_resource_pool(); + } + + virtual bool set_int(const std::string& key, int value); + +private: + GLuint texture_num; + int fft_width, fft_height; + unsigned convolve_width, convolve_height; + const float *pixel_data; + ResourcePool *resource_pool; +}; + +} // namespace movit + +#endif // !defined(_MOVIT_FFT_INPUT_H) diff --git a/slice_effect.cpp b/slice_effect.cpp index 15f5392..1ec68f9 100644 --- a/slice_effect.cpp +++ b/slice_effect.cpp @@ -10,9 +10,14 @@ using namespace std; namespace movit { SliceEffect::SliceEffect() + : input_slice_size(1), + output_slice_size(1), + offset(0), + direction(VERTICAL) { register_int("input_slice_size", &input_slice_size); register_int("output_slice_size", &output_slice_size); + register_int("offset", &offset); register_int("direction", (int *)&direction); } @@ -55,10 +60,12 @@ void SliceEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, un set_uniform_float(glsl_program_num, prefix, "output_coord_to_slice_num", float(output_width) / float(output_slice_size)); set_uniform_float(glsl_program_num, prefix, "slice_num_to_input_coord", float(input_slice_size) / float(input_width)); set_uniform_float(glsl_program_num, prefix, "slice_offset_to_input_coord", float(output_slice_size) / float(input_width)); + set_uniform_float(glsl_program_num, prefix, "offset", float(offset) / float(input_width)); } else { set_uniform_float(glsl_program_num, prefix, "output_coord_to_slice_num", float(output_height) / float(output_slice_size)); set_uniform_float(glsl_program_num, prefix, "slice_num_to_input_coord", float(input_slice_size) / float(input_height)); set_uniform_float(glsl_program_num, prefix, "slice_offset_to_input_coord", float(output_slice_size) / float(input_height)); + set_uniform_float(glsl_program_num, prefix, "offset", float(offset) / float(input_height)); } // Normalized coordinates could potentially cause blurring of the image. diff --git a/slice_effect.frag b/slice_effect.frag index 1598cb0..59e3cc1 100644 --- a/slice_effect.frag +++ b/slice_effect.frag @@ -1,6 +1,7 @@ uniform float PREFIX(output_coord_to_slice_num); uniform float PREFIX(slice_num_to_input_coord); uniform float PREFIX(slice_offset_to_input_coord); +uniform float PREFIX(offset); vec4 FUNCNAME(vec2 tc) { // DIRECTION_VERTICAL will be #defined to 1 if we are expanding vertically, @@ -16,7 +17,7 @@ vec4 FUNCNAME(vec2 tc) { float slice_offset = fract(sliced_coord * PREFIX(output_coord_to_slice_num)); // Find out where this slice begins in the input data, and then offset from that. - float input_coord = slice_num * PREFIX(slice_num_to_input_coord) + slice_offset * PREFIX(slice_offset_to_input_coord); + float input_coord = slice_num * PREFIX(slice_num_to_input_coord) + slice_offset * PREFIX(slice_offset_to_input_coord) + PREFIX(offset); #if DIRECTION_VERTICAL return INPUT(vec2(tc.x, 1.0 - input_coord)); diff --git a/slice_effect.h b/slice_effect.h index 31a13fd..0ce5d08 100644 --- a/slice_effect.h +++ b/slice_effect.h @@ -37,6 +37,7 @@ private: EffectChain *chain; int input_width, input_height; int input_slice_size, output_slice_size; + int offset; Direction direction; }; diff --git a/slice_effect_test.cpp b/slice_effect_test.cpp index 88b77a2..3e56667 100644 --- a/slice_effect_test.cpp +++ b/slice_effect_test.cpp @@ -84,6 +84,30 @@ TEST(SliceEffectTest, HorizontalDiscard) { expect_equal(expected_data, out_data, 4, 2); } +TEST(SliceEffectTest, HorizontalOverlapWithOffset) { + float data[5 * 2] = { + /* 0.0f, */ 0.0f, 0.1f, 0.2f, 0.3f, 0.4f, + /* 0.4f, */ 0.4f, 0.3f, 0.2f, 0.1f, 0.0f, + }; + float expected_data[9 * 2] = { + 0.0f, 0.0f, 0.1f, 0.1f, 0.2f, 0.3f, 0.3f, 0.4f, 0.4f, + 0.4f, 0.4f, 0.3f, 0.3f, 0.2f, 0.1f, 0.1f, 0.0f, 0.0f, + }; + float out_data[9 * 2]; + + EffectChainTester tester(NULL, 9, 2, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, 5, 2); + + Effect *slice_effect = tester.get_chain()->add_effect(new SliceEffect()); + ASSERT_TRUE(slice_effect->set_int("input_slice_size", 2)); + ASSERT_TRUE(slice_effect->set_int("output_slice_size", 3)); + ASSERT_TRUE(slice_effect->set_int("offset", -1)); + ASSERT_TRUE(slice_effect->set_int("direction", SliceEffect::HORIZONTAL)); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR); + + expect_equal(expected_data, out_data, 9, 2); +} + TEST(SliceEffectTest, VerticalOverlapSlicesFromTop) { float data[2 * 3] = { 0.0f, 0.1f, @@ -114,4 +138,35 @@ TEST(SliceEffectTest, VerticalOverlapSlicesFromTop) { expect_equal(expected_data, out_data, 2, 6); } +TEST(SliceEffectTest, VerticalOverlapOffsetsFromTop) { + float data[2 * 3] = { + 0.0f, 0.1f, + 0.4f, 0.3f, + + 0.6f, 0.2f, + }; + float expected_data[2 * 6] = { + 0.4f, 0.3f, + 0.6f, 0.2f, + 0.6f, 0.2f, + + 0.6f, 0.2f, + 0.6f, 0.2f, + 0.6f, 0.2f, + }; + float out_data[2 * 6]; + + EffectChainTester tester(NULL, 2, 6, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR); + tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, 2, 3); + + Effect *slice_effect = tester.get_chain()->add_effect(new SliceEffect()); + ASSERT_TRUE(slice_effect->set_int("input_slice_size", 2)); + ASSERT_TRUE(slice_effect->set_int("output_slice_size", 3)); + ASSERT_TRUE(slice_effect->set_int("offset", 1)); + ASSERT_TRUE(slice_effect->set_int("direction", SliceEffect::VERTICAL)); + tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR); + + expect_equal(expected_data, out_data, 2, 6); +} + } // namespace movit diff --git a/util.cpp b/util.cpp index 8162f71..310e7be 100644 --- a/util.cpp +++ b/util.cpp @@ -220,4 +220,17 @@ unsigned div_round_up(unsigned a, unsigned b) return (a + b - 1) / b; } +// Algorithm from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2. +unsigned next_power_of_two(unsigned v) +{ + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + } // namespace movit diff --git a/util.h b/util.h index c59231d..c5dafed 100644 --- a/util.h +++ b/util.h @@ -53,6 +53,9 @@ GLuint fill_vertex_attribute(GLuint glsl_program_num, const std::string &attribu // Clean up after fill_vertex_attribute(). void cleanup_vertex_attribute(GLuint glsl_program_num, const std::string &attribute_name, GLuint vbo); +// If v is not already a power of two, return the first higher power of two. +unsigned next_power_of_two(unsigned v); + } // namespace movit #ifdef NDEBUG -- 2.39.2