From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Sun, 16 Mar 2014 18:36:25 +0000 (+0100)
Subject: Merge branch 'master' into epoxy
X-Git-Tag: 1.1~12^2~29
X-Git-Url: https://git.sesse.net/?p=movit;a=commitdiff_plain;h=9e219b96e914b2e5709ba3c3345afdc3cf13a78d

Merge branch 'master' into epoxy

Conflicts:
	Makefile.in
	configure.ac
---

9e219b96e914b2e5709ba3c3345afdc3cf13a78d
diff --cc Makefile.in
index cefbf78,e401340..ee3626b
--- a/Makefile.in
+++ b/Makefile.in
@@@ -13,12 -12,9 +13,12 @@@ with_coverage = @with_coverage
  
  CC=@CC@
  CXX=@CXX@
- CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @SDL2_CFLAGS@ @SDL_CFLAGS@ @Eigen3_CFLAGS@ @epoxy_CFLAGS@
 -CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @Eigen3_CFLAGS@ @GLEW_CFLAGS@ @FFTW3_CFLAGS@
 -LDFLAGS=@GLEW_LIBS@ @SDL_LIBS@ @FFTW3_LIBS@ -lpthread
 -DEMO_LDLIBS=@SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@ @FFTW3_LIBS@
++CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @SDL2_CFLAGS@ @SDL_CFLAGS@ @Eigen3_CFLAGS@ @epoxy_CFLAGS@ @FFTW3_CFLAGS@
 +ifeq ($(with_SDL2),yes)
 +CXXFLAGS += -DHAVE_SDL2
 +endif
- LDFLAGS=@epoxy_LIBS@ @SDL2_LIBS@ @SDL_LIBS@ -lpthread
- DEMO_LDLIBS=@SDL2_image_LIBS@ @SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@
++LDFLAGS=@epoxy_LIBS@ @SDL2_LIBS@ @SDL_LIBS@ @FFTW3_LIBS@ -lpthread
++DEMO_LDLIBS=@SDL2_image_LIBS@ @SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@ @FFTW3_LIBS@
  SHELL=@SHELL@
  LIBTOOL=@LIBTOOL@ --tag=CXX
  RANLIB=ranlib
diff --cc configure.ac
index f911746,3e0b1be..1007ce2
--- a/configure.ac
+++ b/configure.ac
@@@ -7,22 -7,15 +7,23 @@@ AC_CONFIG_SRCDIR(effect.cpp
  AC_PROG_CC
  AC_PROG_CXX
  PKG_CHECK_MODULES([Eigen3], [eigen3])
 -PKG_CHECK_MODULES([GLEW], [glew])
 +PKG_CHECK_MODULES([epoxy], [epoxy])
+ PKG_CHECK_MODULES([FFTW3], [fftw3])
  
 -# Needed for unit tests and the demo app.
 -PKG_CHECK_MODULES([SDL], [sdl])
 +# Needed for unit tests and the demo app. We prefer SDL2 if possible,
 +# but can also use classic SDL.
 +with_SDL2=no
 +with_demo_app=yes
 +PKG_CHECK_MODULES([SDL2], [sdl2], [with_SDL2=yes], [
 +  PKG_CHECK_MODULES([SDL], [sdl])
 +])
  
  # These are only needed for the demo app.
 -with_demo_app=yes
 -PKG_CHECK_MODULES([SDL_image], [SDL_image], [], [with_demo_app=no; AC_MSG_WARN([SDL_image not found, demo program will not be built])])
 +if test $with_SDL2 = "yes"; then
 +  PKG_CHECK_MODULES([SDL2_image], [SDL2_image], [], [with_demo_app=no; AC_MSG_WARN([SDL2_image not found, demo program will not be built])])
 +else
 +  PKG_CHECK_MODULES([SDL_image], [SDL_image], [], [with_demo_app=no; AC_MSG_WARN([SDL_image not found, demo program will not be built])])
 +fi
  PKG_CHECK_MODULES([libpng], [libpng12], [], [with_demo_app=no; AC_MSG_WARN([libpng12 not found, demo program will not be built])])
  
  AC_SUBST([with_demo_app])
diff --cc fft_convolution_effect.cpp
index 0000000,b3f77bd..1c48142
mode 000000,100644..100644
--- a/fft_convolution_effect.cpp
+++ b/fft_convolution_effect.cpp
@@@ -1,0 -1,274 +1,274 @@@
 -#include <GL/glew.h>
++#include <epoxy/gl.h>
+ #include <string.h>
+ 
+ #include "complex_modulate_effect.h"
+ #include "effect_chain.h"
+ #include "fft_convolution_effect.h"
+ #include "fft_input.h"
+ #include "fft_pass_effect.h"
+ #include "multiply_effect.h"
+ #include "padding_effect.h"
+ #include "slice_effect.h"
+ #include "util.h"
+ 
+ using namespace std;
+ 
+ namespace movit {
+ 
+ FFTConvolutionEffect::FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height)
+ 	: input_width(input_width),
+ 	  input_height(input_height),
+ 	  convolve_width(convolve_width),
+ 	  convolve_height(convolve_height),
+ 	  fft_input(new FFTInput(convolve_width, convolve_height)),
+ 	  crop_effect(new PaddingEffect()),
+ 	  owns_effects(true) {
+ 	CHECK(crop_effect->set_int("width", input_width));
+ 	CHECK(crop_effect->set_int("height", input_height));
+ 	CHECK(crop_effect->set_float("top", 0));
+ 	CHECK(crop_effect->set_float("left", 0));
+ }
+ 
+ FFTConvolutionEffect::~FFTConvolutionEffect()
+ {
+ 	if (owns_effects) {
+ 		delete fft_input;
+ 		delete crop_effect;
+ 	}
+ }
+ 
+ namespace {
+ 
+ // Returns the last Effect in the new chain.
+ Effect *add_overlap_and_fft(EffectChain *chain, Effect *last_effect, int fft_size, int pad_size, FFTPassEffect::Direction direction)
+ {
+ 	// Overlap.
+ 	{
+ 		Effect *overlap_effect = chain->add_effect(new SliceEffect(), last_effect);
+ 		CHECK(overlap_effect->set_int("input_slice_size", fft_size - pad_size));
+ 		CHECK(overlap_effect->set_int("output_slice_size", fft_size));
+ 		CHECK(overlap_effect->set_int("offset", -pad_size));
+ 		if (direction == FFTPassEffect::HORIZONTAL) {
+ 			CHECK(overlap_effect->set_int("direction", SliceEffect::HORIZONTAL));
+ 		} else {
+ 			assert(direction == FFTPassEffect::VERTICAL);
+ 			CHECK(overlap_effect->set_int("direction", SliceEffect::VERTICAL));
+ 		}
+ 
+ 		last_effect = overlap_effect;
+ 	}
+ 
+ 	// FFT.
+ 	int num_passes = ffs(fft_size) - 1;
+ 	for (int i = 1; i <= num_passes; ++i) {
+ 		Effect *fft_effect = chain->add_effect(new FFTPassEffect(), last_effect);
+ 		CHECK(fft_effect->set_int("pass_number", i));
+ 		CHECK(fft_effect->set_int("fft_size", fft_size));
+ 		CHECK(fft_effect->set_int("direction", direction));
+ 		CHECK(fft_effect->set_int("inverse", 0));
+ 
+ 		last_effect = fft_effect;
+ 	}
+ 
+ 	return last_effect;
+ }
+ 
+ // Returns the last Effect in the new chain.
+ Effect *add_ifft_and_discard(EffectChain *chain, Effect *last_effect, int fft_size, int pad_size, FFTPassEffect::Direction direction)
+ {
+ 	// IFFT.
+ 	int num_passes = ffs(fft_size) - 1;
+ 	for (int i = 1; i <= num_passes; ++i) {
+ 		Effect *fft_effect = chain->add_effect(new FFTPassEffect(), last_effect);
+ 		CHECK(fft_effect->set_int("pass_number", i));
+ 		CHECK(fft_effect->set_int("fft_size", fft_size));
+ 		CHECK(fft_effect->set_int("direction", direction));
+ 		CHECK(fft_effect->set_int("inverse", 1));
+ 
+ 		last_effect = fft_effect;
+ 	}
+ 
+ 	// Discard.
+ 	{
+ 		Effect *discard_effect = chain->add_effect(new SliceEffect(), last_effect);
+ 		CHECK(discard_effect->set_int("input_slice_size", fft_size));
+ 		CHECK(discard_effect->set_int("output_slice_size", fft_size - pad_size));
+ 		if (direction == FFTPassEffect::HORIZONTAL) {
+ 			CHECK(discard_effect->set_int("direction", SliceEffect::HORIZONTAL));
+ 		} else {
+ 			assert(direction == FFTPassEffect::VERTICAL);
+ 			CHECK(discard_effect->set_int("direction", SliceEffect::VERTICAL));
+ 		}
+ 		CHECK(discard_effect->set_int("offset", pad_size));
+ 
+ 		last_effect = discard_effect;
+ 	}
+ 
+ 	return last_effect;
+ }
+ 
+ }  // namespace
+ 
+ void FFTConvolutionEffect::rewrite_graph(EffectChain *chain, Node *self)
+ {
+ 	int pad_width = convolve_width - 1;
+ 	int pad_height = convolve_height - 1;
+ 
+ 	// Try all possible FFT widths and heights to see which one is the
+ 	// cheapest.  As a proxy for real performance, we use number of texel
+ 	// fetches; this isn't perfect by any means, but it's easy to work with
+ 	// and should be approximately correct.
+ 	int min_x = next_power_of_two(1 + pad_width);
+ 	int min_y = next_power_of_two(1 + pad_height);
+ 	int max_y = next_power_of_two(input_height + pad_width);
+ 	int max_x = next_power_of_two(input_width + pad_height);
+ 
+ 	size_t best_cost = numeric_limits<size_t>::max();
+ 	int best_x = -1, best_y = -1, best_x_before_y_fft = -1, best_x_before_y_ifft = -1;
+ 
+ 	// Try both
+ 	//
+ 	//   overlap(X), FFT(X), overlap(Y), FFT(Y), modulate, IFFT(Y), discard(Y), IFFT(X), discard(X) and
+ 	//   overlap(Y), FFT(Y), overlap(X), FFT(X), modulate, IFFT(X), discard(X), IFFT(Y), discard(Y)
+ 	//
+ 	// For simplicity, call them the XY-YX and YX-XY orders. In theory, we
+ 	// could have XY-XY and YX-YX orders as well, and I haven't found a
+ 	// convincing argument that they will never be optimal (although it
+ 	// sounds odd and should be rare), so we test all four possible ones.
+ 	//
+ 	// We assume that the kernel FFT is for free, since it is typically done
+ 	// only once and per frame.
+ 	for (int x_before_y_fft = 0; x_before_y_fft <= 1; ++x_before_y_fft) {
+ 		for (int x_before_y_ifft = 0; x_before_y_ifft <= 1; ++x_before_y_ifft) {
+ 			for (int y = min_y; y <= max_y; y *= 2) {
+ 				int y_pixels_per_block = y - pad_height;
+ 				int num_vertical_blocks = div_round_up(input_height, y_pixels_per_block);
+ 				size_t output_height = y * num_vertical_blocks;
+ 				for (int x = min_x; x <= max_x; x *= 2) {
+ 					int x_pixels_per_block = x - pad_width;
+ 					int num_horizontal_blocks = div_round_up(input_width, x_pixels_per_block);
+ 					size_t output_width = x * num_horizontal_blocks;
+ 
+ 					size_t cost = 0;
+ 
+ 					if (x_before_y_fft) {
+ 						// First, the cost of the horizontal padding.
+ 						cost = output_width * input_height;
+ 
+ 						// log(X) FFT passes. Each pass reads two inputs per pixel,
+ 						// plus the support texture.
+ 						cost += (ffs(x) - 1) * 3 * output_width * input_height;
+ 
+ 						// Now, horizontal padding.
+ 						cost += output_width * output_height;
+ 
+ 						// log(Y) FFT passes, now at full resolution.
+ 						cost += (ffs(y) - 1) * 3 * output_width * output_height;
+ 					} else {
+ 						// First, the cost of the vertical padding.
+ 						cost = input_width * output_height;
+ 
+ 						// log(Y) FFT passes. Each pass reads two inputs per pixel,
+ 						// plus the support texture.
+ 						cost += (ffs(y) - 1) * 3 * input_width * output_height;
+ 
+ 						// Now, horizontal padding.
+ 						cost += output_width * output_height;
+ 
+ 						// log(X) FFT passes, now at full resolution.
+ 						cost += (ffs(x) - 1) * 3 * output_width * output_height;
+ 					}
+ 
+ 					// The actual modulation. Reads one pixel each from two textures.
+ 					cost += 2 * output_width * output_height;
+ 
+ 					if (x_before_y_ifft) {
+ 						// log(X) IFFT passes.
+ 						cost += (ffs(x) - 1) * 3 * output_width * output_height;
+ 
+ 						// Discard horizontally.
+ 						cost += input_width * output_height;
+ 
+ 						// log(Y) IFFT passes.
+ 						cost += (ffs(y) - 1) * 3 * input_width * output_height;
+ 
+ 						// Discard horizontally.
+ 						cost += input_width * input_height;
+ 					} else {
+ 						// log(Y) IFFT passes.
+ 						cost += (ffs(y) - 1) * 3 * output_width * output_height;
+ 
+ 						// Discard vertically.
+ 						cost += output_width * input_height;
+ 
+ 						// log(X) IFFT passes.
+ 						cost += (ffs(x) - 1) * 3 * output_width * input_height;
+ 
+ 						// Discard horizontally.
+ 						cost += input_width * input_height;
+ 					}
+ 
+ 					if (cost < best_cost) {
+ 						best_x = x;
+ 						best_y = y;
+ 						best_x_before_y_fft = x_before_y_fft;
+ 						best_x_before_y_ifft = x_before_y_ifft;
+ 						best_cost = cost;
+ 					}
+ 				}
+ 			}
+ 		}
+ 	}
+ 
+ 	const int fft_width = best_x, fft_height = best_y;
+ 
+ 	assert(self->incoming_links.size() == 1);
+ 	Node *last_node = self->incoming_links[0];
+ 	self->incoming_links.clear();
+ 	last_node->outgoing_links.clear();
+ 
+ 	// Do FFT.
+ 	Effect *last_effect = last_node->effect;
+ 	if (best_x_before_y_fft) {
+ 		last_effect = add_overlap_and_fft(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+ 		last_effect = add_overlap_and_fft(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+ 	} else {
+ 		last_effect = add_overlap_and_fft(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+ 		last_effect = add_overlap_and_fft(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+ 	}
+ 
+ 	// Normalizer.
+ 	Effect *multiply_effect;
+ 	float fft_size = fft_width * fft_height;
+ 	float factor[4] = { 1.0f / fft_size, 1.0f / fft_size, 1.0f / fft_size, 1.0f / fft_size };
+ 	last_effect = multiply_effect = chain->add_effect(new MultiplyEffect(), last_effect);
+ 	CHECK(multiply_effect->set_vec4("factor", factor));
+ 
+ 	// Multiply by the FFT of the convolution kernel.
+ 	CHECK(fft_input->set_int("fft_width", fft_width));
+ 	CHECK(fft_input->set_int("fft_height", fft_height));
+ 	chain->add_input(fft_input);
+ 	owns_effects = false;
+ 
+ 	Effect *modulate_effect = chain->add_effect(new ComplexModulateEffect(), multiply_effect, fft_input);
+ 	CHECK(modulate_effect->set_int("num_repeats_x", div_round_up(input_width, fft_width - pad_width)));
+ 	CHECK(modulate_effect->set_int("num_repeats_y", div_round_up(input_height, fft_height - pad_height)));
+ 	last_effect = modulate_effect;
+ 
+ 	// Finally, do IFFT.
+ 	if (best_x_before_y_ifft) {
+ 		last_effect = add_ifft_and_discard(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+ 		last_effect = add_ifft_and_discard(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+ 	} else {
+ 		last_effect = add_ifft_and_discard(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+ 		last_effect = add_ifft_and_discard(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+ 	}
+ 
+ 	// ...and crop away any extra padding we have have added.
+ 	last_effect = chain->add_effect(crop_effect);
+ 
+ 	chain->replace_sender(self, chain->find_node_for_effect(last_effect));
+ 	self->disabled = true;
+ }
+ 
+ }  // namespace movit
diff --cc fft_convolution_effect.h
index 0000000,f601a49..4136a2a
mode 000000,100644..100644
--- a/fft_convolution_effect.h
+++ b/fft_convolution_effect.h
@@@ -1,0 -1,116 +1,116 @@@
+ #ifndef _MOVIT_FFT_CONVOLUTION_EFFECT_H
+ #define _MOVIT_FFT_CONVOLUTION_EFFECT_H 1
+ 
+ // FFTConvolutionEffect applies an arbitrary 2D convolution between the input image
+ // and a convolution kernel (assumed to be much smaller than the image).
+ // It does this convolution using multiple smaller FFTs and an algorithm called
+ // overlap-discard (also known as overlap-save) to achieve much higher
+ // efficiency than direct evaluation of the convolution, at some expense of
+ // accuracy.
+ //
+ // FFTConvolutionEffect follows the usual convention for convolution, which is that
+ // you sample from the origin pixel, and then up and to the left from that. This means
+ // that (in horizontal 1D) [1 0 0 0 0 ...] would be an identity transform, and that
+ // [0 1 0 0 0 ...] would mean sampling one pixel to the left of the origin, which
+ // effectively move would move the image one pixel to the right.
+ //
+ // The basic idea of the acceleration comes from the the convolution theorem
+ // (which holds in any number of dimensions), namely that FFT(A â B) =
+ // FFT(A) * FFT(B), where â is circular convolution and * is pointwise
+ // multiplication. This means that A â B = IFFT(FFT(A) * FFT(B)), and since
+ // we can do a 2D FFT in O(nÂ² log n), this is asymptotically better than
+ // direct convolution, which is O(nÂ²mÂ²) (where m is the size of the convolution
+ // kernel). However, the convolution theorem is rarely _directly_ applicable,
+ // for two reasons:
+ //
+ //  - â is _circular_ convolution, which means that inputs are taken to
+ //    repeat (wrap around), which is rarely what you want.
+ //  - A and B must be the same size, which means that to convolve a 1280x720
+ //    kernel with a 64x64 kernel, you need to zero pad the 64x64 kernel and
+ //    then do _two_ full 1280x720 FFTs (one for each of A and B).
+ //
+ // The first problem is solved by adding m-1 zero pixels (horizontally
+ // and vertically) as padding, and then discarding the result of those pixels.
+ // This causes the output to be identical to a non-circular convolution.
+ //
+ // The second is slightly more tricky, and there are multiple ways of solving
+ // it. The one that appears to be the most suitable to suitable for GPU use,
+ // and the one that is used here, is overlap-discard (more commonly but less
+ // precisely known as overlap-save). In overlap-discard, the input is broken
+ // up into multiple equally-sized slices which are then FFTed and convolved
+ // with the kernel individually. (The kernel must still be zero padded to the
+ // same size as the slice, but this is typically much smaller then the full
+ // picture.) As before, the pad area contains data that's essentially junk,
+ // which is thrown away when the slices are put together again.
+ //
+ // The optimal slice size is a tradeoff. More slices means more space wasted
+ // for padding, since the padding is the same no matter the slice size,
+ // but fewer slices means we need to do larger FFTs (although fewer of them).
+ // There's no exact closed formula for this, especially since the 2D case
+ // makes things a bit more tricky with ordering of the X versus Y passes,
+ // so we simply try all possible sizes and orderings, attempting to estimate
+ // roughly how much each operation will cost. The result isn't perfect, though;
+ // FFTW has a mode for actually measuring, which they claim improves speeds
+ // by ~2x over simple estimation, but they also have much more freedom in
+ // their execution model than we do.
+ //
+ // The output _size_ of a convolution can be defined in a couple of different
+ // ways; in a sense, what's the most reasonable is using only the central part
+ // of the result (the mode âvalidâ in MATLAB/Octave), since that is the only
+ // one not used by any edge pixels. (FFTConvolutionEffect assumes normal Movit
+ // edge pixel behavior, which is to repeat the outermost pixels.) You could also
+ // keep all the output pixels (âfullâ in MATLAB/Octave), which is nicely symmetric.
+ // However, for video processing, typically what you want is to have the _same_
+ // output size as input size, so we crop to the input size. This means that
+ // you'll get some of the edge-affected pixels but not all, but it's usually
+ // an okay tradeoff.
+ //
+ // FFTConvolutionEffect does not do any actual pixel work by itself; it
+ // rewrites itself into a long chain of SliceEffect, FFTPassEffect, FFTInput
+ // and ComplexModulationEffect to do its bidding. Note that currently, due to
+ // Movit limitations, we need to know the number of FFT passes at finalize()
+ // time, which in turn means you cannot change image or kernel size on the fly.
+ 
+ #include <assert.h>
 -#include <GL/glew.h>
++#include <epoxy/gl.h>
+ #include <string>
+ 
+ #include "effect.h"
+ #include "fft_input.h"
+ 
+ namespace movit {
+ 
+ class PaddingEffect;
+ 
+ class FFTConvolutionEffect : public Effect {
+ public:
+ 	FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height);
+ 	~FFTConvolutionEffect();
+ 	virtual std::string effect_type_id() const { return "FFTConvolutionEffect"; }
+ 	std::string output_fragment_shader() { assert(false); }
+ 	virtual void rewrite_graph(EffectChain *graph, Node *self);
+ 
+ 	// See FFTInput::set_pixel_data().
+ 	void set_convolution_kernel(const float *pixel_data)
+ 	{
+ 		assert(fft_input);
+ 		fft_input->set_pixel_data(pixel_data);
+ 	}
+ 
+ private:
+ 	int input_width, input_height;
+ 	int convolve_width, convolve_height;
+ 
+ 	// Chosen by algorithm.
+ 	int fft_width, fft_height;
+ 
+ 	// Both of these are owned by us if owns_effects is true (before finalize()),
+ 	// and otherwise owned by the EffectChain.
+ 	FFTInput *fft_input;
+ 	PaddingEffect *crop_effect;
+ 	bool owns_effects;
+ };
+ 
+ }  // namespace movit
+ 
+ #endif // !defined(_MOVIT_FFT_CONVOLUTION_EFFECT_H)
diff --cc fft_convolution_effect_test.cpp
index 0000000,f0f8527..a55fe31
mode 000000,100644..100644
--- a/fft_convolution_effect_test.cpp
+++ b/fft_convolution_effect_test.cpp
@@@ -1,0 -1,211 +1,211 @@@
+ // Unit tests for FFTConvolutionEffect.
+ 
 -#include <GL/glew.h>
++#include <epoxy/gl.h>
+ #include <math.h>
+ 
+ #include "effect_chain.h"
+ #include "gtest/gtest.h"
+ #include "image_format.h"
+ #include "test_util.h"
+ #include "fft_convolution_effect.h"
+ 
+ namespace movit {
+ 
+ TEST(FFTConvolutionEffectTest, Identity) {
+ 	const int size = 4;
+ 
+ 	float data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 		0.4, 1.4, 2.4, 3.4,
+ 	};
+ 	float out_data[size * size];
+ 
+ 	for (int convolve_size = 1; convolve_size < 10; ++convolve_size) {
+ 		float kernel[convolve_size * convolve_size];
+ 		for (int i = 0; i < convolve_size * convolve_size; ++i) {
+ 			kernel[i] = 0.0f;
+ 		}
+ 		kernel[0] = 1.0f;
+ 
+ 		EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+ 		tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+ 
+ 		FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+ 		tester.get_chain()->add_effect(fft_effect);
+ 		fft_effect->set_convolution_kernel(kernel);
+ 		tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+ 
+ 		expect_equal(data, out_data, size, size, 0.02, 0.003);
+ 	}
+ }
+ 
+ TEST(FFTConvolutionEffectTest, Constant) {
+ 	const int size = 4, convolve_size = 17;
+ 	const float f = 7.0f;
+ 
+ 	float data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 		0.4, 1.4, 2.4, 3.4,
+ 	};
+ 	float expected_data[size * size] = {
+ 		f * 0.1, f * 1.1, f * 2.1, f * 3.1,
+ 		f * 0.2, f * 1.2, f * 2.2, f * 3.2,
+ 		f * 0.3, f * 1.3, f * 2.3, f * 3.3,
+ 		f * 0.4, f * 1.4, f * 2.4, f * 3.4,
+ 	};
+ 	float out_data[size * size];
+ 	float kernel[convolve_size * convolve_size];
+ 	for (int i = 0; i < convolve_size * convolve_size; ++i) {
+ 		kernel[i] = 0.0f;
+ 	}
+ 	kernel[0] = f;
+ 
+ 	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+ 	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+ 
+ 	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+ 	tester.get_chain()->add_effect(fft_effect);
+ 	fft_effect->set_convolution_kernel(kernel);
+ 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+ 
+ 	// Somewhat looser bounds due to the higher magnitude.
+ 	expect_equal(expected_data, out_data, size, size, f * 0.03, f * 0.004);
+ }
+ 
+ TEST(FFTConvolutionEffectTest, MoveRight) {
+ 	const int size = 4, convolve_size = 3;
+ 
+ 	float data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 		0.4, 1.4, 2.4, 3.4,
+ 	};
+ 	float kernel[convolve_size * convolve_size] = {
+ 		0.0, 1.0, 0.0,
+ 		0.0, 0.0, 0.0,
+ 		0.0, 0.0, 0.0,
+ 	};
+ 	float expected_data[size * size] = {
+ 		0.1, 0.1, 1.1, 2.1,
+ 		0.2, 0.2, 1.2, 2.2,
+ 		0.3, 0.3, 1.3, 2.3,
+ 		0.4, 0.4, 1.4, 2.4,
+ 	};
+ 	float out_data[size * size];
+ 
+ 	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+ 	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+ 
+ 	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+ 	tester.get_chain()->add_effect(fft_effect);
+ 	fft_effect->set_convolution_kernel(kernel);
+ 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+ 
+ 	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+ }
+ 
+ TEST(FFTConvolutionEffectTest, MoveDown) {
+ 	const int size = 4, convolve_size = 3;
+ 
+ 	float data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 		0.4, 1.4, 2.4, 3.4,
+ 	};
+ 	float kernel[convolve_size * convolve_size] = {
+ 		0.0, 0.0, 0.0,
+ 		1.0, 0.0, 0.0,
+ 		0.0, 0.0, 0.0,
+ 	};
+ 	float expected_data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 	};
+ 	float out_data[size * size];
+ 
+ 	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+ 	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+ 
+ 	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+ 	tester.get_chain()->add_effect(fft_effect);
+ 	fft_effect->set_convolution_kernel(kernel);
+ 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+ 
+ 	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+ }
+ 
+ TEST(FFTConvolutionEffectTest, MergeWithLeft) {
+ 	const int size = 4, convolve_size = 3;
+ 
+ 	float data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 		0.4, 1.4, 2.4, 3.4,
+ 	};
+ 	float kernel[convolve_size * convolve_size] = {
+ 		1.0, 1.0, 0.0,
+ 		0.0, 0.0, 0.0,
+ 		0.0, 0.0, 0.0,
+ 	};
+ 	float expected_data[size * size] = {
+ 		0.1 + 0.1,  0.1 + 1.1,  1.1 + 2.1,  2.1 + 3.1,
+ 		0.2 + 0.2,  0.2 + 1.2,  1.2 + 2.2,  2.2 + 3.2,
+ 		0.3 + 0.3,  0.3 + 1.3,  1.3 + 2.3,  2.3 + 3.3,
+ 		0.4 + 0.4,  0.4 + 1.4,  1.4 + 2.4,  2.4 + 3.4,
+ 	};
+ 	float out_data[size * size];
+ 
+ 	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+ 	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+ 
+ 	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+ 	tester.get_chain()->add_effect(fft_effect);
+ 	fft_effect->set_convolution_kernel(kernel);
+ 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+ 
+ 	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+ }
+ 
+ TEST(FFTConvolutionEffectTest, NegativeCoefficients) {
+ 	const int size = 4;
+ 	const int convolve_width = 3, convolve_height = 2;
+ 
+ 	float data[size * size] = {
+ 		0.1, 1.1, 2.1, 3.1,
+ 		0.2, 1.2, 2.2, 3.2,
+ 		0.3, 1.3, 2.3, 3.3,
+ 		0.4, 1.4, 2.4, 3.4,
+ 	};
+ 	float kernel[convolve_width * convolve_height] = {
+ 		1.0, 0.0,  0.0,
+ 		0.0, 0.0, -0.5,
+ 	};
+ 	float expected_data[size * size] = {
+ 		0.1 - 0.5 * 0.1,  1.1 - 0.5 * 0.1,  2.1 - 0.5 * 0.1,  3.1 - 0.5 * 1.1,
+ 		0.2 - 0.5 * 0.1,  1.2 - 0.5 * 0.1,  2.2 - 0.5 * 0.1,  3.2 - 0.5 * 1.1,
+ 		0.3 - 0.5 * 0.2,  1.3 - 0.5 * 0.2,  2.3 - 0.5 * 0.2,  3.3 - 0.5 * 1.2,
+ 		0.4 - 0.5 * 0.3,  1.4 - 0.5 * 0.3,  2.4 - 0.5 * 0.3,  3.4 - 0.5 * 1.3,
+ 	};
+ 	float out_data[size * size];
+ 
+ 	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+ 	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+ 
+ 	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_width, convolve_height);
+ 	tester.get_chain()->add_effect(fft_effect);
+ 	fft_effect->set_convolution_kernel(kernel);
+ 	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+ 
+ 	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+ }
+ 
+ }  // namespace movit
diff --cc fft_input.cpp
index 0000000,ba539b6..76e77d6
mode 000000,100644..100644
--- a/fft_input.cpp
+++ b/fft_input.cpp
@@@ -1,0 -1,139 +1,139 @@@
+ #include <string.h>
+ #include <assert.h>
 -#include <GL/glew.h>
++#include <epoxy/gl.h>
+ #include <fftw3.h>
+ 
+ #include "effect_util.h"
+ #include "fp16.h"
+ #include "fft_input.h"
+ #include "resource_pool.h"
+ #include "util.h"
+ 
+ using namespace std;
+ 
+ namespace movit {
+ 
+ FFTInput::FFTInput(unsigned width, unsigned height)
+ 	: texture_num(0),
+ 	  fft_width(width),
+ 	  fft_height(height),
+ 	  convolve_width(width),
+ 	  convolve_height(height),
+ 	  pixel_data(NULL)
+ {
+ 	register_int("fft_width", &fft_width);
+ 	register_int("fft_height", &fft_height);
+ }
+ 
+ FFTInput::~FFTInput()
+ {
+ 	if (texture_num != 0) {
+ 		resource_pool->release_2d_texture(texture_num);
+ 	}
+ }
+ 
+ void FFTInput::set_gl_state(GLuint glsl_program_num, const string& prefix, unsigned *sampler_num)
+ {
+ 	glActiveTexture(GL_TEXTURE0 + *sampler_num);
+ 	check_error();
+ 
+ 	if (texture_num == 0) {
+ 		assert(pixel_data != NULL);
+ 
+ 		// Do the FFT. Our FFTs should typically be small enough and
+ 		// the data changed often enough that FFTW_ESTIMATE should be
+ 		// quite OK. Otherwise, we'd need to worry about caching these
+ 		// plans (possibly including FFTW wisdom).
+ 		fftw_complex *in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * fft_width * fft_height);
+ 		fftw_complex *out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * fft_width * fft_height);
+ 		fftw_plan p = fftw_plan_dft_2d(fft_height, fft_width, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+ 
+ 		// Zero pad.
+ 		for (int i = 0; i < fft_height * fft_width; ++i) {
+ 			in[i][0] = 0.0;
+ 			in[i][1] = 0.0;
+ 		}
+ 		for (unsigned y = 0; y < convolve_height; ++y) {
+ 			for (unsigned x = 0; x < convolve_width; ++x) {
+ 				int i = y * fft_width + x;
+ 				in[i][0] = pixel_data[y * convolve_width + x];
+ 				in[i][1] = 0.0;
+ 			}
+ 		}
+ 
+ 		fftw_execute(p);
+ 
+ 		// Convert to fp16.
+ 		fp16_int_t *kernel = new fp16_int_t[fft_width * fft_height * 2];
+ 		for (int i = 0; i < fft_width * fft_height; ++i) {
+ 			kernel[i * 2 + 0] = fp64_to_fp16(out[i][0]);
+ 			kernel[i * 2 + 1] = fp64_to_fp16(out[i][1]);
+ 		}
+ 
+ 		// (Re-)upload the texture.
+ 		texture_num = resource_pool->create_2d_texture(GL_RG16F, fft_width, fft_height);
+ 		glBindTexture(GL_TEXTURE_2D, texture_num);
+ 		check_error();
+ 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+ 		check_error();
+ 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+ 		check_error();
+ 		glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+ 		check_error();
+ 		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fft_width, fft_height, GL_RG, GL_HALF_FLOAT, kernel);
+ 		check_error();
+ 		glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
+ 		check_error();
+ 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+ 		check_error();
+ 		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+ 		check_error();
+ 
+ 		fftw_free(in);
+ 		fftw_free(out);
+ 		delete[] kernel;
+ 	} else {
+ 		glBindTexture(GL_TEXTURE_2D, texture_num);
+ 		check_error();
+ 	}
+ 
+ 	// Bind it to a sampler.
+ 	set_uniform_int(glsl_program_num, prefix, "tex", *sampler_num);
+ 	++*sampler_num;
+ }
+ 
+ string FFTInput::output_fragment_shader()
+ {
+ 	return read_file("flat_input.frag");
+ }
+ 
+ void FFTInput::invalidate_pixel_data()
+ {
+ 	if (texture_num != 0) {
+ 		resource_pool->release_2d_texture(texture_num);
+ 		texture_num = 0;
+ 	}
+ }
+ 
+ bool FFTInput::set_int(const std::string& key, int value)
+ {
+ 	if (key == "needs_mipmaps") {
+ 		// We cannot supply mipmaps; it would not make any sense for FFT data.
+ 		return (value == 0);
+ 	}
+ 	if (key == "fft_width") {
+ 		if (value < int(convolve_width)) {
+ 			return false;
+ 		}
+ 		invalidate_pixel_data();
+ 	}
+ 	if (key == "fft_height") {
+ 		if (value < int(convolve_height)) {
+ 			return false;
+ 		}
+ 		invalidate_pixel_data();
+ 	}
+ 	return Effect::set_int(key, value);
+ }
+ 
+ }  // namespace movit
diff --cc fft_input.h
index 0000000,681876b..b57723c
mode 000000,100644..100644
--- a/fft_input.h
+++ b/fft_input.h
@@@ -1,0 -1,81 +1,81 @@@
+ #ifndef _MOVIT_FFT_INPUT_H
+ #define _MOVIT_FFT_INPUT_H 1
+ 
+ // FFTInput is used by FFTConvolutionEffect to send in the FFTed version of a
+ // mostly static, one-channel data set, typically the convolution kernel
+ // with some zero padding.
+ //
+ // Since the kernel is typically small and unlikely to change often,
+ // it will be faster to FFT it once on the CPU (using the excellent FFTW3
+ // library) and keep it in a texture, rather than FFT-ing it over and over on
+ // the GPU. (We do not currently support caching Movit intermediates between
+ // frames.) As an extra bonus, we can then do it in double precision and round
+ // precisely to fp16 afterwards.
+ //
+ // This class is tested as part of by FFTConvolutionEffectTest.
+ 
 -#include <GL/glew.h>
++#include <epoxy/gl.h>
+ #include <assert.h>
+ #include <string>
+ 
+ #include "effect.h"
+ #include "effect_chain.h"
+ #include "image_format.h"
+ #include "input.h"
+ 
+ namespace movit {
+ 
+ class ResourcePool;
+ 
+ class FFTInput : public Input {
+ public:
+ 	FFTInput(unsigned width, unsigned height);
+ 	~FFTInput();
+ 
+ 	virtual std::string effect_type_id() const { return "FFTInput"; }
+ 	std::string output_fragment_shader();
+ 
+ 	// FFTs the data and uploads the texture if it has changed since last time.
+ 	void set_gl_state(GLuint glsl_program_num, const std::string& prefix, unsigned *sampler_num);
+ 
+ 	unsigned get_width() const { return fft_width; }
+ 	unsigned get_height() const { return fft_height; }
+ 
+ 	// Strictly speaking, FFT data doesn't have any colorspace or gamma;
+ 	// these values are the Movit standards for âdo nothingâ.
+ 	Colorspace get_color_space() const { return COLORSPACE_sRGB; }
+ 	GammaCurve get_gamma_curve() const { return GAMMA_LINEAR; }
+ 	virtual AlphaHandling alpha_handling() const { return INPUT_AND_OUTPUT_PREMULTIPLIED_ALPHA; }
+ 	virtual bool is_single_texture() const { return true; }
+ 	virtual bool can_output_linear_gamma() const { return true; }
+ 
+ 	// Tells the input where to fetch the actual pixel data. Note that if you change
+ 	// this data, you must either call set_pixel_data() again (using the same pointer
+ 	// is fine), or invalidate_pixel_data(). Otherwise, the FFT won't be recalculated,
+ 	// and the texture won't be re-uploaded on subsequent frames.
+ 	void set_pixel_data(const float *pixel_data)
+ 	{
+ 		this->pixel_data = pixel_data;
+ 		invalidate_pixel_data();
+ 	}
+ 
+ 	void invalidate_pixel_data();
+ 
+ 	virtual void inform_added(EffectChain *chain)
+ 	{
+ 		resource_pool = chain->get_resource_pool();
+ 	}
+ 
+ 	virtual bool set_int(const std::string& key, int value);
+ 
+ private:
+ 	GLuint texture_num;
+ 	int fft_width, fft_height;
+ 	unsigned convolve_width, convolve_height;
+ 	const float *pixel_data;
+ 	ResourcePool *resource_pool;
+ };
+ 
+ }  // namespace movit
+ 
+ #endif // !defined(_MOVIT_FFT_INPUT_H)
diff --cc slice_effect.h
index 1c363ea,0ce5d08..89aeb0e
--- a/slice_effect.h
+++ b/slice_effect.h
@@@ -7,10 -7,10 +7,10 @@@
  // discard roles, where one does convolutions by means of many small FFTs, but
  // could also work as a (relatively boring) video effect on its own.
  //
- // Note that vertical slices happen from the bottom, not the top, due to the
- // OpenGL coordinate system.
+ // Note that vertical slices happen from the top, consistent with the rest of
+ // Movit.
  
 -#include <GL/glew.h>
 +#include <epoxy/gl.h>
  #include <string>
  
  #include "effect.h"