From 3ccf5fb197c9a72545affc0b7286349d5603b72e Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Sun, 16 Mar 2014 19:25:53 +0100
Subject: [PATCH] Add an FFT convolution effect.

A lot of the later commits have been leading up to this, and I finally
got to the point where all the unit tests check out, everything seems
to work (modulo maybe some overflow issues) and we have a model that
matches what people actually expects from convolutions.

Note that this adds a dependency on FFTW3; we could probably have added
our own routines for such small needs, but like with Eigen, calling out to a
library is fine as long as it's of good quality (which FFTW certainly is) and
is widely available.
---
 Makefile.in                     |   8 +-
 README                          |   8 +-
 configure.ac                    |   1 +
 fft_convolution_effect.cpp      | 274 ++++++++++++++++++++++++++++++++
 fft_convolution_effect.h        | 116 ++++++++++++++
 fft_convolution_effect_test.cpp | 211 ++++++++++++++++++++++++
 fft_input.cpp                   | 139 ++++++++++++++++
 fft_input.h                     |  81 ++++++++++
 slice_effect.cpp                |   7 +
 slice_effect.frag               |   3 +-
 slice_effect.h                  |   1 +
 slice_effect_test.cpp           |  55 +++++++
 util.cpp                        |  13 ++
 util.h                          |   3 +
 14 files changed, 912 insertions(+), 8 deletions(-)
 create mode 100644 fft_convolution_effect.cpp
 create mode 100644 fft_convolution_effect.h
 create mode 100644 fft_convolution_effect_test.cpp
 create mode 100644 fft_input.cpp
 create mode 100644 fft_input.h

diff --git a/Makefile.in b/Makefile.in
index 8164e38..e401340 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -12,9 +12,9 @@ with_coverage = @with_coverage@
 
 CC=@CC@
 CXX=@CXX@
-CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @Eigen3_CFLAGS@ @GLEW_CFLAGS@
-LDFLAGS=@GLEW_LIBS@ @SDL_LIBS@ -lpthread
-DEMO_LDLIBS=@SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@
+CXXFLAGS=-Wall @CXXFLAGS@ -I$(GTEST_DIR)/include @Eigen3_CFLAGS@ @GLEW_CFLAGS@ @FFTW3_CFLAGS@
+LDFLAGS=@GLEW_LIBS@ @SDL_LIBS@ @FFTW3_LIBS@ -lpthread
+DEMO_LDLIBS=@SDL_image_LIBS@ -lrt -lpthread @libpng_LIBS@ @FFTW3_LIBS@
 SHELL=@SHELL@
 LIBTOOL=@LIBTOOL@ --tag=CXX
 RANLIB=ranlib
@@ -59,11 +59,13 @@ TESTED_EFFECTS += vignette_effect
 TESTED_EFFECTS += slice_effect
 TESTED_EFFECTS += complex_modulate_effect
 TESTED_EFFECTS += luma_mix_effect
+TESTED_EFFECTS += fft_convolution_effect
 
 UNTESTED_EFFECTS = sandbox_effect
 UNTESTED_EFFECTS += mirror_effect
 UNTESTED_EFFECTS += resize_effect
 UNTESTED_EFFECTS += multiply_effect
+UNTESTED_EFFECTS += fft_input
 
 EFFECTS = $(TESTED_EFFECTS) $(UNTESTED_EFFECTS)
 
diff --git a/README b/README
index c6b20da..c74e8ef 100644
--- a/README
+++ b/README
@@ -39,10 +39,10 @@ for performance estimates.
 Still TL;DR, please give me the list of filters
 ===============================================
 
-Blur, diffusion, glow, lift/gamma/gain (color correction), mirror,
-mix (add two inputs), luma mix (use a map to wipe between two inputs),
-overlay (the Porter-Duff âoverâ operation), scale (bilinear and Lanczos),
-sharpen (both by unsharp mask and by Wiener filters), saturation
+Blur, diffusion, FFT-based convolution, glow, lift/gamma/gain (color
+correction), mirror, mix (add two inputs), luma mix (use a map to wipe between
+two inputs), overlay (the Porter-Duff âoverâ operation), scale (bilinear and
+Lanczos), sharpen (both by unsharp mask and by Wiener filters), saturation
 (or desaturation), vignette, and white balance.
 
 Yes, that's a short list. But they all look great, are fast and don't give
diff --git a/configure.ac b/configure.ac
index 5e03234..3e0b1be 100644
--- a/configure.ac
+++ b/configure.ac
@@ -8,6 +8,7 @@ AC_PROG_CC
 AC_PROG_CXX
 PKG_CHECK_MODULES([Eigen3], [eigen3])
 PKG_CHECK_MODULES([GLEW], [glew])
+PKG_CHECK_MODULES([FFTW3], [fftw3])
 
 # Needed for unit tests and the demo app.
 PKG_CHECK_MODULES([SDL], [sdl])
diff --git a/fft_convolution_effect.cpp b/fft_convolution_effect.cpp
new file mode 100644
index 0000000..b3f77bd
--- /dev/null
+++ b/fft_convolution_effect.cpp
@@ -0,0 +1,274 @@
+#include <GL/glew.h>
+#include <string.h>
+
+#include "complex_modulate_effect.h"
+#include "effect_chain.h"
+#include "fft_convolution_effect.h"
+#include "fft_input.h"
+#include "fft_pass_effect.h"
+#include "multiply_effect.h"
+#include "padding_effect.h"
+#include "slice_effect.h"
+#include "util.h"
+
+using namespace std;
+
+namespace movit {
+
+FFTConvolutionEffect::FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height)
+	: input_width(input_width),
+	  input_height(input_height),
+	  convolve_width(convolve_width),
+	  convolve_height(convolve_height),
+	  fft_input(new FFTInput(convolve_width, convolve_height)),
+	  crop_effect(new PaddingEffect()),
+	  owns_effects(true) {
+	CHECK(crop_effect->set_int("width", input_width));
+	CHECK(crop_effect->set_int("height", input_height));
+	CHECK(crop_effect->set_float("top", 0));
+	CHECK(crop_effect->set_float("left", 0));
+}
+
+FFTConvolutionEffect::~FFTConvolutionEffect()
+{
+	if (owns_effects) {
+		delete fft_input;
+		delete crop_effect;
+	}
+}
+
+namespace {
+
+// Returns the last Effect in the new chain.
+Effect *add_overlap_and_fft(EffectChain *chain, Effect *last_effect, int fft_size, int pad_size, FFTPassEffect::Direction direction)
+{
+	// Overlap.
+	{
+		Effect *overlap_effect = chain->add_effect(new SliceEffect(), last_effect);
+		CHECK(overlap_effect->set_int("input_slice_size", fft_size - pad_size));
+		CHECK(overlap_effect->set_int("output_slice_size", fft_size));
+		CHECK(overlap_effect->set_int("offset", -pad_size));
+		if (direction == FFTPassEffect::HORIZONTAL) {
+			CHECK(overlap_effect->set_int("direction", SliceEffect::HORIZONTAL));
+		} else {
+			assert(direction == FFTPassEffect::VERTICAL);
+			CHECK(overlap_effect->set_int("direction", SliceEffect::VERTICAL));
+		}
+
+		last_effect = overlap_effect;
+	}
+
+	// FFT.
+	int num_passes = ffs(fft_size) - 1;
+	for (int i = 1; i <= num_passes; ++i) {
+		Effect *fft_effect = chain->add_effect(new FFTPassEffect(), last_effect);
+		CHECK(fft_effect->set_int("pass_number", i));
+		CHECK(fft_effect->set_int("fft_size", fft_size));
+		CHECK(fft_effect->set_int("direction", direction));
+		CHECK(fft_effect->set_int("inverse", 0));
+
+		last_effect = fft_effect;
+	}
+
+	return last_effect;
+}
+
+// Returns the last Effect in the new chain.
+Effect *add_ifft_and_discard(EffectChain *chain, Effect *last_effect, int fft_size, int pad_size, FFTPassEffect::Direction direction)
+{
+	// IFFT.
+	int num_passes = ffs(fft_size) - 1;
+	for (int i = 1; i <= num_passes; ++i) {
+		Effect *fft_effect = chain->add_effect(new FFTPassEffect(), last_effect);
+		CHECK(fft_effect->set_int("pass_number", i));
+		CHECK(fft_effect->set_int("fft_size", fft_size));
+		CHECK(fft_effect->set_int("direction", direction));
+		CHECK(fft_effect->set_int("inverse", 1));
+
+		last_effect = fft_effect;
+	}
+
+	// Discard.
+	{
+		Effect *discard_effect = chain->add_effect(new SliceEffect(), last_effect);
+		CHECK(discard_effect->set_int("input_slice_size", fft_size));
+		CHECK(discard_effect->set_int("output_slice_size", fft_size - pad_size));
+		if (direction == FFTPassEffect::HORIZONTAL) {
+			CHECK(discard_effect->set_int("direction", SliceEffect::HORIZONTAL));
+		} else {
+			assert(direction == FFTPassEffect::VERTICAL);
+			CHECK(discard_effect->set_int("direction", SliceEffect::VERTICAL));
+		}
+		CHECK(discard_effect->set_int("offset", pad_size));
+
+		last_effect = discard_effect;
+	}
+
+	return last_effect;
+}
+
+}  // namespace
+
+void FFTConvolutionEffect::rewrite_graph(EffectChain *chain, Node *self)
+{
+	int pad_width = convolve_width - 1;
+	int pad_height = convolve_height - 1;
+
+	// Try all possible FFT widths and heights to see which one is the
+	// cheapest.  As a proxy for real performance, we use number of texel
+	// fetches; this isn't perfect by any means, but it's easy to work with
+	// and should be approximately correct.
+	int min_x = next_power_of_two(1 + pad_width);
+	int min_y = next_power_of_two(1 + pad_height);
+	int max_y = next_power_of_two(input_height + pad_width);
+	int max_x = next_power_of_two(input_width + pad_height);
+
+	size_t best_cost = numeric_limits<size_t>::max();
+	int best_x = -1, best_y = -1, best_x_before_y_fft = -1, best_x_before_y_ifft = -1;
+
+	// Try both
+	//
+	//   overlap(X), FFT(X), overlap(Y), FFT(Y), modulate, IFFT(Y), discard(Y), IFFT(X), discard(X) and
+	//   overlap(Y), FFT(Y), overlap(X), FFT(X), modulate, IFFT(X), discard(X), IFFT(Y), discard(Y)
+	//
+	// For simplicity, call them the XY-YX and YX-XY orders. In theory, we
+	// could have XY-XY and YX-YX orders as well, and I haven't found a
+	// convincing argument that they will never be optimal (although it
+	// sounds odd and should be rare), so we test all four possible ones.
+	//
+	// We assume that the kernel FFT is for free, since it is typically done
+	// only once and per frame.
+	for (int x_before_y_fft = 0; x_before_y_fft <= 1; ++x_before_y_fft) {
+		for (int x_before_y_ifft = 0; x_before_y_ifft <= 1; ++x_before_y_ifft) {
+			for (int y = min_y; y <= max_y; y *= 2) {
+				int y_pixels_per_block = y - pad_height;
+				int num_vertical_blocks = div_round_up(input_height, y_pixels_per_block);
+				size_t output_height = y * num_vertical_blocks;
+				for (int x = min_x; x <= max_x; x *= 2) {
+					int x_pixels_per_block = x - pad_width;
+					int num_horizontal_blocks = div_round_up(input_width, x_pixels_per_block);
+					size_t output_width = x * num_horizontal_blocks;
+
+					size_t cost = 0;
+
+					if (x_before_y_fft) {
+						// First, the cost of the horizontal padding.
+						cost = output_width * input_height;
+
+						// log(X) FFT passes. Each pass reads two inputs per pixel,
+						// plus the support texture.
+						cost += (ffs(x) - 1) * 3 * output_width * input_height;
+
+						// Now, horizontal padding.
+						cost += output_width * output_height;
+
+						// log(Y) FFT passes, now at full resolution.
+						cost += (ffs(y) - 1) * 3 * output_width * output_height;
+					} else {
+						// First, the cost of the vertical padding.
+						cost = input_width * output_height;
+
+						// log(Y) FFT passes. Each pass reads two inputs per pixel,
+						// plus the support texture.
+						cost += (ffs(y) - 1) * 3 * input_width * output_height;
+
+						// Now, horizontal padding.
+						cost += output_width * output_height;
+
+						// log(X) FFT passes, now at full resolution.
+						cost += (ffs(x) - 1) * 3 * output_width * output_height;
+					}
+
+					// The actual modulation. Reads one pixel each from two textures.
+					cost += 2 * output_width * output_height;
+
+					if (x_before_y_ifft) {
+						// log(X) IFFT passes.
+						cost += (ffs(x) - 1) * 3 * output_width * output_height;
+
+						// Discard horizontally.
+						cost += input_width * output_height;
+
+						// log(Y) IFFT passes.
+						cost += (ffs(y) - 1) * 3 * input_width * output_height;
+
+						// Discard horizontally.
+						cost += input_width * input_height;
+					} else {
+						// log(Y) IFFT passes.
+						cost += (ffs(y) - 1) * 3 * output_width * output_height;
+
+						// Discard vertically.
+						cost += output_width * input_height;
+
+						// log(X) IFFT passes.
+						cost += (ffs(x) - 1) * 3 * output_width * input_height;
+
+						// Discard horizontally.
+						cost += input_width * input_height;
+					}
+
+					if (cost < best_cost) {
+						best_x = x;
+						best_y = y;
+						best_x_before_y_fft = x_before_y_fft;
+						best_x_before_y_ifft = x_before_y_ifft;
+						best_cost = cost;
+					}
+				}
+			}
+		}
+	}
+
+	const int fft_width = best_x, fft_height = best_y;
+
+	assert(self->incoming_links.size() == 1);
+	Node *last_node = self->incoming_links[0];
+	self->incoming_links.clear();
+	last_node->outgoing_links.clear();
+
+	// Do FFT.
+	Effect *last_effect = last_node->effect;
+	if (best_x_before_y_fft) {
+		last_effect = add_overlap_and_fft(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+		last_effect = add_overlap_and_fft(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+	} else {
+		last_effect = add_overlap_and_fft(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+		last_effect = add_overlap_and_fft(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+	}
+
+	// Normalizer.
+	Effect *multiply_effect;
+	float fft_size = fft_width * fft_height;
+	float factor[4] = { 1.0f / fft_size, 1.0f / fft_size, 1.0f / fft_size, 1.0f / fft_size };
+	last_effect = multiply_effect = chain->add_effect(new MultiplyEffect(), last_effect);
+	CHECK(multiply_effect->set_vec4("factor", factor));
+
+	// Multiply by the FFT of the convolution kernel.
+	CHECK(fft_input->set_int("fft_width", fft_width));
+	CHECK(fft_input->set_int("fft_height", fft_height));
+	chain->add_input(fft_input);
+	owns_effects = false;
+
+	Effect *modulate_effect = chain->add_effect(new ComplexModulateEffect(), multiply_effect, fft_input);
+	CHECK(modulate_effect->set_int("num_repeats_x", div_round_up(input_width, fft_width - pad_width)));
+	CHECK(modulate_effect->set_int("num_repeats_y", div_round_up(input_height, fft_height - pad_height)));
+	last_effect = modulate_effect;
+
+	// Finally, do IFFT.
+	if (best_x_before_y_ifft) {
+		last_effect = add_ifft_and_discard(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+		last_effect = add_ifft_and_discard(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+	} else {
+		last_effect = add_ifft_and_discard(chain, last_effect, fft_height, pad_height, FFTPassEffect::VERTICAL);
+		last_effect = add_ifft_and_discard(chain, last_effect, fft_width, pad_width, FFTPassEffect::HORIZONTAL);
+	}
+
+	// ...and crop away any extra padding we have have added.
+	last_effect = chain->add_effect(crop_effect);
+
+	chain->replace_sender(self, chain->find_node_for_effect(last_effect));
+	self->disabled = true;
+}
+
+}  // namespace movit
diff --git a/fft_convolution_effect.h b/fft_convolution_effect.h
new file mode 100644
index 0000000..f601a49
--- /dev/null
+++ b/fft_convolution_effect.h
@@ -0,0 +1,116 @@
+#ifndef _MOVIT_FFT_CONVOLUTION_EFFECT_H
+#define _MOVIT_FFT_CONVOLUTION_EFFECT_H 1
+
+// FFTConvolutionEffect applies an arbitrary 2D convolution between the input image
+// and a convolution kernel (assumed to be much smaller than the image).
+// It does this convolution using multiple smaller FFTs and an algorithm called
+// overlap-discard (also known as overlap-save) to achieve much higher
+// efficiency than direct evaluation of the convolution, at some expense of
+// accuracy.
+//
+// FFTConvolutionEffect follows the usual convention for convolution, which is that
+// you sample from the origin pixel, and then up and to the left from that. This means
+// that (in horizontal 1D) [1 0 0 0 0 ...] would be an identity transform, and that
+// [0 1 0 0 0 ...] would mean sampling one pixel to the left of the origin, which
+// effectively move would move the image one pixel to the right.
+//
+// The basic idea of the acceleration comes from the the convolution theorem
+// (which holds in any number of dimensions), namely that FFT(A â B) =
+// FFT(A) * FFT(B), where â is circular convolution and * is pointwise
+// multiplication. This means that A â B = IFFT(FFT(A) * FFT(B)), and since
+// we can do a 2D FFT in O(nÂ² log n), this is asymptotically better than
+// direct convolution, which is O(nÂ²mÂ²) (where m is the size of the convolution
+// kernel). However, the convolution theorem is rarely _directly_ applicable,
+// for two reasons:
+//
+//  - â is _circular_ convolution, which means that inputs are taken to
+//    repeat (wrap around), which is rarely what you want.
+//  - A and B must be the same size, which means that to convolve a 1280x720
+//    kernel with a 64x64 kernel, you need to zero pad the 64x64 kernel and
+//    then do _two_ full 1280x720 FFTs (one for each of A and B).
+//
+// The first problem is solved by adding m-1 zero pixels (horizontally
+// and vertically) as padding, and then discarding the result of those pixels.
+// This causes the output to be identical to a non-circular convolution.
+//
+// The second is slightly more tricky, and there are multiple ways of solving
+// it. The one that appears to be the most suitable to suitable for GPU use,
+// and the one that is used here, is overlap-discard (more commonly but less
+// precisely known as overlap-save). In overlap-discard, the input is broken
+// up into multiple equally-sized slices which are then FFTed and convolved
+// with the kernel individually. (The kernel must still be zero padded to the
+// same size as the slice, but this is typically much smaller then the full
+// picture.) As before, the pad area contains data that's essentially junk,
+// which is thrown away when the slices are put together again.
+//
+// The optimal slice size is a tradeoff. More slices means more space wasted
+// for padding, since the padding is the same no matter the slice size,
+// but fewer slices means we need to do larger FFTs (although fewer of them).
+// There's no exact closed formula for this, especially since the 2D case
+// makes things a bit more tricky with ordering of the X versus Y passes,
+// so we simply try all possible sizes and orderings, attempting to estimate
+// roughly how much each operation will cost. The result isn't perfect, though;
+// FFTW has a mode for actually measuring, which they claim improves speeds
+// by ~2x over simple estimation, but they also have much more freedom in
+// their execution model than we do.
+//
+// The output _size_ of a convolution can be defined in a couple of different
+// ways; in a sense, what's the most reasonable is using only the central part
+// of the result (the mode âvalidâ in MATLAB/Octave), since that is the only
+// one not used by any edge pixels. (FFTConvolutionEffect assumes normal Movit
+// edge pixel behavior, which is to repeat the outermost pixels.) You could also
+// keep all the output pixels (âfullâ in MATLAB/Octave), which is nicely symmetric.
+// However, for video processing, typically what you want is to have the _same_
+// output size as input size, so we crop to the input size. This means that
+// you'll get some of the edge-affected pixels but not all, but it's usually
+// an okay tradeoff.
+//
+// FFTConvolutionEffect does not do any actual pixel work by itself; it
+// rewrites itself into a long chain of SliceEffect, FFTPassEffect, FFTInput
+// and ComplexModulationEffect to do its bidding. Note that currently, due to
+// Movit limitations, we need to know the number of FFT passes at finalize()
+// time, which in turn means you cannot change image or kernel size on the fly.
+
+#include <assert.h>
+#include <GL/glew.h>
+#include <string>
+
+#include "effect.h"
+#include "fft_input.h"
+
+namespace movit {
+
+class PaddingEffect;
+
+class FFTConvolutionEffect : public Effect {
+public:
+	FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height);
+	~FFTConvolutionEffect();
+	virtual std::string effect_type_id() const { return "FFTConvolutionEffect"; }
+	std::string output_fragment_shader() { assert(false); }
+	virtual void rewrite_graph(EffectChain *graph, Node *self);
+
+	// See FFTInput::set_pixel_data().
+	void set_convolution_kernel(const float *pixel_data)
+	{
+		assert(fft_input);
+		fft_input->set_pixel_data(pixel_data);
+	}
+
+private:
+	int input_width, input_height;
+	int convolve_width, convolve_height;
+
+	// Chosen by algorithm.
+	int fft_width, fft_height;
+
+	// Both of these are owned by us if owns_effects is true (before finalize()),
+	// and otherwise owned by the EffectChain.
+	FFTInput *fft_input;
+	PaddingEffect *crop_effect;
+	bool owns_effects;
+};
+
+}  // namespace movit
+
+#endif // !defined(_MOVIT_FFT_CONVOLUTION_EFFECT_H)
diff --git a/fft_convolution_effect_test.cpp b/fft_convolution_effect_test.cpp
new file mode 100644
index 0000000..f0f8527
--- /dev/null
+++ b/fft_convolution_effect_test.cpp
@@ -0,0 +1,211 @@
+// Unit tests for FFTConvolutionEffect.
+
+#include <GL/glew.h>
+#include <math.h>
+
+#include "effect_chain.h"
+#include "gtest/gtest.h"
+#include "image_format.h"
+#include "test_util.h"
+#include "fft_convolution_effect.h"
+
+namespace movit {
+
+TEST(FFTConvolutionEffectTest, Identity) {
+	const int size = 4;
+
+	float data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+		0.4, 1.4, 2.4, 3.4,
+	};
+	float out_data[size * size];
+
+	for (int convolve_size = 1; convolve_size < 10; ++convolve_size) {
+		float kernel[convolve_size * convolve_size];
+		for (int i = 0; i < convolve_size * convolve_size; ++i) {
+			kernel[i] = 0.0f;
+		}
+		kernel[0] = 1.0f;
+
+		EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+		tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+
+		FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+		tester.get_chain()->add_effect(fft_effect);
+		fft_effect->set_convolution_kernel(kernel);
+		tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+
+		expect_equal(data, out_data, size, size, 0.02, 0.003);
+	}
+}
+
+TEST(FFTConvolutionEffectTest, Constant) {
+	const int size = 4, convolve_size = 17;
+	const float f = 7.0f;
+
+	float data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+		0.4, 1.4, 2.4, 3.4,
+	};
+	float expected_data[size * size] = {
+		f * 0.1, f * 1.1, f * 2.1, f * 3.1,
+		f * 0.2, f * 1.2, f * 2.2, f * 3.2,
+		f * 0.3, f * 1.3, f * 2.3, f * 3.3,
+		f * 0.4, f * 1.4, f * 2.4, f * 3.4,
+	};
+	float out_data[size * size];
+	float kernel[convolve_size * convolve_size];
+	for (int i = 0; i < convolve_size * convolve_size; ++i) {
+		kernel[i] = 0.0f;
+	}
+	kernel[0] = f;
+
+	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+
+	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+	tester.get_chain()->add_effect(fft_effect);
+	fft_effect->set_convolution_kernel(kernel);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+
+	// Somewhat looser bounds due to the higher magnitude.
+	expect_equal(expected_data, out_data, size, size, f * 0.03, f * 0.004);
+}
+
+TEST(FFTConvolutionEffectTest, MoveRight) {
+	const int size = 4, convolve_size = 3;
+
+	float data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+		0.4, 1.4, 2.4, 3.4,
+	};
+	float kernel[convolve_size * convolve_size] = {
+		0.0, 1.0, 0.0,
+		0.0, 0.0, 0.0,
+		0.0, 0.0, 0.0,
+	};
+	float expected_data[size * size] = {
+		0.1, 0.1, 1.1, 2.1,
+		0.2, 0.2, 1.2, 2.2,
+		0.3, 0.3, 1.3, 2.3,
+		0.4, 0.4, 1.4, 2.4,
+	};
+	float out_data[size * size];
+
+	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+
+	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+	tester.get_chain()->add_effect(fft_effect);
+	fft_effect->set_convolution_kernel(kernel);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+
+	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+}
+
+TEST(FFTConvolutionEffectTest, MoveDown) {
+	const int size = 4, convolve_size = 3;
+
+	float data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+		0.4, 1.4, 2.4, 3.4,
+	};
+	float kernel[convolve_size * convolve_size] = {
+		0.0, 0.0, 0.0,
+		1.0, 0.0, 0.0,
+		0.0, 0.0, 0.0,
+	};
+	float expected_data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+	};
+	float out_data[size * size];
+
+	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+
+	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+	tester.get_chain()->add_effect(fft_effect);
+	fft_effect->set_convolution_kernel(kernel);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+
+	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+}
+
+TEST(FFTConvolutionEffectTest, MergeWithLeft) {
+	const int size = 4, convolve_size = 3;
+
+	float data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+		0.4, 1.4, 2.4, 3.4,
+	};
+	float kernel[convolve_size * convolve_size] = {
+		1.0, 1.0, 0.0,
+		0.0, 0.0, 0.0,
+		0.0, 0.0, 0.0,
+	};
+	float expected_data[size * size] = {
+		0.1 + 0.1,  0.1 + 1.1,  1.1 + 2.1,  2.1 + 3.1,
+		0.2 + 0.2,  0.2 + 1.2,  1.2 + 2.2,  2.2 + 3.2,
+		0.3 + 0.3,  0.3 + 1.3,  1.3 + 2.3,  2.3 + 3.3,
+		0.4 + 0.4,  0.4 + 1.4,  1.4 + 2.4,  2.4 + 3.4,
+	};
+	float out_data[size * size];
+
+	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+
+	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_size, convolve_size);
+	tester.get_chain()->add_effect(fft_effect);
+	fft_effect->set_convolution_kernel(kernel);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+
+	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+}
+
+TEST(FFTConvolutionEffectTest, NegativeCoefficients) {
+	const int size = 4;
+	const int convolve_width = 3, convolve_height = 2;
+
+	float data[size * size] = {
+		0.1, 1.1, 2.1, 3.1,
+		0.2, 1.2, 2.2, 3.2,
+		0.3, 1.3, 2.3, 3.3,
+		0.4, 1.4, 2.4, 3.4,
+	};
+	float kernel[convolve_width * convolve_height] = {
+		1.0, 0.0,  0.0,
+		0.0, 0.0, -0.5,
+	};
+	float expected_data[size * size] = {
+		0.1 - 0.5 * 0.1,  1.1 - 0.5 * 0.1,  2.1 - 0.5 * 0.1,  3.1 - 0.5 * 1.1,
+		0.2 - 0.5 * 0.1,  1.2 - 0.5 * 0.1,  2.2 - 0.5 * 0.1,  3.2 - 0.5 * 1.1,
+		0.3 - 0.5 * 0.2,  1.3 - 0.5 * 0.2,  2.3 - 0.5 * 0.2,  3.3 - 0.5 * 1.2,
+		0.4 - 0.5 * 0.3,  1.4 - 0.5 * 0.3,  2.4 - 0.5 * 0.3,  3.4 - 0.5 * 1.3,
+	};
+	float out_data[size * size];
+
+	EffectChainTester tester(NULL, size, size, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, size, size);
+
+	FFTConvolutionEffect *fft_effect = new FFTConvolutionEffect(size, size, convolve_width, convolve_height);
+	tester.get_chain()->add_effect(fft_effect);
+	fft_effect->set_convolution_kernel(kernel);
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED);
+
+	expect_equal(expected_data, out_data, size, size, 0.02, 0.003);
+}
+
+}  // namespace movit
diff --git a/fft_input.cpp b/fft_input.cpp
new file mode 100644
index 0000000..ba539b6
--- /dev/null
+++ b/fft_input.cpp
@@ -0,0 +1,139 @@
+#include <string.h>
+#include <assert.h>
+#include <GL/glew.h>
+#include <fftw3.h>
+
+#include "effect_util.h"
+#include "fp16.h"
+#include "fft_input.h"
+#include "resource_pool.h"
+#include "util.h"
+
+using namespace std;
+
+namespace movit {
+
+FFTInput::FFTInput(unsigned width, unsigned height)
+	: texture_num(0),
+	  fft_width(width),
+	  fft_height(height),
+	  convolve_width(width),
+	  convolve_height(height),
+	  pixel_data(NULL)
+{
+	register_int("fft_width", &fft_width);
+	register_int("fft_height", &fft_height);
+}
+
+FFTInput::~FFTInput()
+{
+	if (texture_num != 0) {
+		resource_pool->release_2d_texture(texture_num);
+	}
+}
+
+void FFTInput::set_gl_state(GLuint glsl_program_num, const string& prefix, unsigned *sampler_num)
+{
+	glActiveTexture(GL_TEXTURE0 + *sampler_num);
+	check_error();
+
+	if (texture_num == 0) {
+		assert(pixel_data != NULL);
+
+		// Do the FFT. Our FFTs should typically be small enough and
+		// the data changed often enough that FFTW_ESTIMATE should be
+		// quite OK. Otherwise, we'd need to worry about caching these
+		// plans (possibly including FFTW wisdom).
+		fftw_complex *in = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * fft_width * fft_height);
+		fftw_complex *out = (fftw_complex *)fftw_malloc(sizeof(fftw_complex) * fft_width * fft_height);
+		fftw_plan p = fftw_plan_dft_2d(fft_height, fft_width, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
+
+		// Zero pad.
+		for (int i = 0; i < fft_height * fft_width; ++i) {
+			in[i][0] = 0.0;
+			in[i][1] = 0.0;
+		}
+		for (unsigned y = 0; y < convolve_height; ++y) {
+			for (unsigned x = 0; x < convolve_width; ++x) {
+				int i = y * fft_width + x;
+				in[i][0] = pixel_data[y * convolve_width + x];
+				in[i][1] = 0.0;
+			}
+		}
+
+		fftw_execute(p);
+
+		// Convert to fp16.
+		fp16_int_t *kernel = new fp16_int_t[fft_width * fft_height * 2];
+		for (int i = 0; i < fft_width * fft_height; ++i) {
+			kernel[i * 2 + 0] = fp64_to_fp16(out[i][0]);
+			kernel[i * 2 + 1] = fp64_to_fp16(out[i][1]);
+		}
+
+		// (Re-)upload the texture.
+		texture_num = resource_pool->create_2d_texture(GL_RG16F, fft_width, fft_height);
+		glBindTexture(GL_TEXTURE_2D, texture_num);
+		check_error();
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+		check_error();
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+		check_error();
+		glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+		check_error();
+		glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, fft_width, fft_height, GL_RG, GL_HALF_FLOAT, kernel);
+		check_error();
+		glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
+		check_error();
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+		check_error();
+		glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+		check_error();
+
+		fftw_free(in);
+		fftw_free(out);
+		delete[] kernel;
+	} else {
+		glBindTexture(GL_TEXTURE_2D, texture_num);
+		check_error();
+	}
+
+	// Bind it to a sampler.
+	set_uniform_int(glsl_program_num, prefix, "tex", *sampler_num);
+	++*sampler_num;
+}
+
+string FFTInput::output_fragment_shader()
+{
+	return read_file("flat_input.frag");
+}
+
+void FFTInput::invalidate_pixel_data()
+{
+	if (texture_num != 0) {
+		resource_pool->release_2d_texture(texture_num);
+		texture_num = 0;
+	}
+}
+
+bool FFTInput::set_int(const std::string& key, int value)
+{
+	if (key == "needs_mipmaps") {
+		// We cannot supply mipmaps; it would not make any sense for FFT data.
+		return (value == 0);
+	}
+	if (key == "fft_width") {
+		if (value < int(convolve_width)) {
+			return false;
+		}
+		invalidate_pixel_data();
+	}
+	if (key == "fft_height") {
+		if (value < int(convolve_height)) {
+			return false;
+		}
+		invalidate_pixel_data();
+	}
+	return Effect::set_int(key, value);
+}
+
+}  // namespace movit
diff --git a/fft_input.h b/fft_input.h
new file mode 100644
index 0000000..681876b
--- /dev/null
+++ b/fft_input.h
@@ -0,0 +1,81 @@
+#ifndef _MOVIT_FFT_INPUT_H
+#define _MOVIT_FFT_INPUT_H 1
+
+// FFTInput is used by FFTConvolutionEffect to send in the FFTed version of a
+// mostly static, one-channel data set, typically the convolution kernel
+// with some zero padding.
+//
+// Since the kernel is typically small and unlikely to change often,
+// it will be faster to FFT it once on the CPU (using the excellent FFTW3
+// library) and keep it in a texture, rather than FFT-ing it over and over on
+// the GPU. (We do not currently support caching Movit intermediates between
+// frames.) As an extra bonus, we can then do it in double precision and round
+// precisely to fp16 afterwards.
+//
+// This class is tested as part of by FFTConvolutionEffectTest.
+
+#include <GL/glew.h>
+#include <assert.h>
+#include <string>
+
+#include "effect.h"
+#include "effect_chain.h"
+#include "image_format.h"
+#include "input.h"
+
+namespace movit {
+
+class ResourcePool;
+
+class FFTInput : public Input {
+public:
+	FFTInput(unsigned width, unsigned height);
+	~FFTInput();
+
+	virtual std::string effect_type_id() const { return "FFTInput"; }
+	std::string output_fragment_shader();
+
+	// FFTs the data and uploads the texture if it has changed since last time.
+	void set_gl_state(GLuint glsl_program_num, const std::string& prefix, unsigned *sampler_num);
+
+	unsigned get_width() const { return fft_width; }
+	unsigned get_height() const { return fft_height; }
+
+	// Strictly speaking, FFT data doesn't have any colorspace or gamma;
+	// these values are the Movit standards for âdo nothingâ.
+	Colorspace get_color_space() const { return COLORSPACE_sRGB; }
+	GammaCurve get_gamma_curve() const { return GAMMA_LINEAR; }
+	virtual AlphaHandling alpha_handling() const { return INPUT_AND_OUTPUT_PREMULTIPLIED_ALPHA; }
+	virtual bool is_single_texture() const { return true; }
+	virtual bool can_output_linear_gamma() const { return true; }
+
+	// Tells the input where to fetch the actual pixel data. Note that if you change
+	// this data, you must either call set_pixel_data() again (using the same pointer
+	// is fine), or invalidate_pixel_data(). Otherwise, the FFT won't be recalculated,
+	// and the texture won't be re-uploaded on subsequent frames.
+	void set_pixel_data(const float *pixel_data)
+	{
+		this->pixel_data = pixel_data;
+		invalidate_pixel_data();
+	}
+
+	void invalidate_pixel_data();
+
+	virtual void inform_added(EffectChain *chain)
+	{
+		resource_pool = chain->get_resource_pool();
+	}
+
+	virtual bool set_int(const std::string& key, int value);
+
+private:
+	GLuint texture_num;
+	int fft_width, fft_height;
+	unsigned convolve_width, convolve_height;
+	const float *pixel_data;
+	ResourcePool *resource_pool;
+};
+
+}  // namespace movit
+
+#endif // !defined(_MOVIT_FFT_INPUT_H)
diff --git a/slice_effect.cpp b/slice_effect.cpp
index 15f5392..1ec68f9 100644
--- a/slice_effect.cpp
+++ b/slice_effect.cpp
@@ -10,9 +10,14 @@ using namespace std;
 namespace movit {
 
 SliceEffect::SliceEffect()
+	: input_slice_size(1),
+	  output_slice_size(1),
+	  offset(0),
+	  direction(VERTICAL)
 {
 	register_int("input_slice_size", &input_slice_size);
 	register_int("output_slice_size", &output_slice_size);
+	register_int("offset", &offset);
 	register_int("direction", (int *)&direction);
 }
 
@@ -55,10 +60,12 @@ void SliceEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, un
 		set_uniform_float(glsl_program_num, prefix, "output_coord_to_slice_num", float(output_width) / float(output_slice_size));
 		set_uniform_float(glsl_program_num, prefix, "slice_num_to_input_coord", float(input_slice_size) / float(input_width));
 		set_uniform_float(glsl_program_num, prefix, "slice_offset_to_input_coord", float(output_slice_size) / float(input_width));
+		set_uniform_float(glsl_program_num, prefix, "offset", float(offset) / float(input_width));
 	} else {
 		set_uniform_float(glsl_program_num, prefix, "output_coord_to_slice_num", float(output_height) / float(output_slice_size));
 		set_uniform_float(glsl_program_num, prefix, "slice_num_to_input_coord", float(input_slice_size) / float(input_height));
 		set_uniform_float(glsl_program_num, prefix, "slice_offset_to_input_coord", float(output_slice_size) / float(input_height));
+		set_uniform_float(glsl_program_num, prefix, "offset", float(offset) / float(input_height));
 	}
 
 	// Normalized coordinates could potentially cause blurring of the image.
diff --git a/slice_effect.frag b/slice_effect.frag
index 1598cb0..59e3cc1 100644
--- a/slice_effect.frag
+++ b/slice_effect.frag
@@ -1,6 +1,7 @@
 uniform float PREFIX(output_coord_to_slice_num);
 uniform float PREFIX(slice_num_to_input_coord);
 uniform float PREFIX(slice_offset_to_input_coord);
+uniform float PREFIX(offset);
  
 vec4 FUNCNAME(vec2 tc) {
 	// DIRECTION_VERTICAL will be #defined to 1 if we are expanding vertically,
@@ -16,7 +17,7 @@ vec4 FUNCNAME(vec2 tc) {
 	float slice_offset = fract(sliced_coord * PREFIX(output_coord_to_slice_num));
 
 	// Find out where this slice begins in the input data, and then offset from that.
-	float input_coord = slice_num * PREFIX(slice_num_to_input_coord) + slice_offset * PREFIX(slice_offset_to_input_coord);
+	float input_coord = slice_num * PREFIX(slice_num_to_input_coord) + slice_offset * PREFIX(slice_offset_to_input_coord) + PREFIX(offset);
 
 #if DIRECTION_VERTICAL
 	return INPUT(vec2(tc.x, 1.0 - input_coord));
diff --git a/slice_effect.h b/slice_effect.h
index 31a13fd..0ce5d08 100644
--- a/slice_effect.h
+++ b/slice_effect.h
@@ -37,6 +37,7 @@ private:
 	EffectChain *chain;
 	int input_width, input_height;
 	int input_slice_size, output_slice_size;
+	int offset;
 	Direction direction;
 };
 
diff --git a/slice_effect_test.cpp b/slice_effect_test.cpp
index 88b77a2..3e56667 100644
--- a/slice_effect_test.cpp
+++ b/slice_effect_test.cpp
@@ -84,6 +84,30 @@ TEST(SliceEffectTest, HorizontalDiscard) {
 	expect_equal(expected_data, out_data, 4, 2);
 }
 
+TEST(SliceEffectTest, HorizontalOverlapWithOffset) {
+	float data[5 * 2] = {
+		/* 0.0f, */ 0.0f,  0.1f, 0.2f,  0.3f, 0.4f,
+		/* 0.4f, */ 0.4f,  0.3f, 0.2f,  0.1f, 0.0f,
+	};
+	float expected_data[9 * 2] = {
+		0.0f, 0.0f, 0.1f,  0.1f, 0.2f, 0.3f,  0.3f, 0.4f, 0.4f,
+		0.4f, 0.4f, 0.3f,  0.3f, 0.2f, 0.1f,  0.1f, 0.0f, 0.0f,
+	};
+	float out_data[9 * 2];
+
+	EffectChainTester tester(NULL, 9, 2, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, 5, 2);
+
+	Effect *slice_effect = tester.get_chain()->add_effect(new SliceEffect());
+	ASSERT_TRUE(slice_effect->set_int("input_slice_size", 2));
+	ASSERT_TRUE(slice_effect->set_int("output_slice_size", 3));
+	ASSERT_TRUE(slice_effect->set_int("offset", -1));
+	ASSERT_TRUE(slice_effect->set_int("direction", SliceEffect::HORIZONTAL));
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR);
+
+	expect_equal(expected_data, out_data, 9, 2);
+}
+
 TEST(SliceEffectTest, VerticalOverlapSlicesFromTop) {
 	float data[2 * 3] = {
 		0.0f, 0.1f,
@@ -114,4 +138,35 @@ TEST(SliceEffectTest, VerticalOverlapSlicesFromTop) {
 	expect_equal(expected_data, out_data, 2, 6);
 }
 
+TEST(SliceEffectTest, VerticalOverlapOffsetsFromTop) {
+	float data[2 * 3] = {
+		0.0f, 0.1f,
+		0.4f, 0.3f,
+
+		0.6f, 0.2f,
+	};
+	float expected_data[2 * 6] = {
+		0.4f, 0.3f,
+		0.6f, 0.2f,
+		0.6f, 0.2f,
+
+		0.6f, 0.2f,
+		0.6f, 0.2f,
+		0.6f, 0.2f,
+	};
+	float out_data[2 * 6];
+
+	EffectChainTester tester(NULL, 2, 6, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR);
+	tester.add_input(data, FORMAT_GRAYSCALE, COLORSPACE_sRGB, GAMMA_LINEAR, 2, 3);
+
+	Effect *slice_effect = tester.get_chain()->add_effect(new SliceEffect());
+	ASSERT_TRUE(slice_effect->set_int("input_slice_size", 2));
+	ASSERT_TRUE(slice_effect->set_int("output_slice_size", 3));
+	ASSERT_TRUE(slice_effect->set_int("offset", 1));
+	ASSERT_TRUE(slice_effect->set_int("direction", SliceEffect::VERTICAL));
+	tester.run(out_data, GL_RED, COLORSPACE_sRGB, GAMMA_LINEAR);
+
+	expect_equal(expected_data, out_data, 2, 6);
+}
+
 }  // namespace movit
diff --git a/util.cpp b/util.cpp
index 8162f71..310e7be 100644
--- a/util.cpp
+++ b/util.cpp
@@ -220,4 +220,17 @@ unsigned div_round_up(unsigned a, unsigned b)
 	return (a + b - 1) / b;
 }
 
+// Algorithm from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2.
+unsigned next_power_of_two(unsigned v)
+{
+	v--;
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v++;
+	return v;
+}
+
 }  // namespace movit
diff --git a/util.h b/util.h
index c59231d..c5dafed 100644
--- a/util.h
+++ b/util.h
@@ -53,6 +53,9 @@ GLuint fill_vertex_attribute(GLuint glsl_program_num, const std::string &attribu
 // Clean up after fill_vertex_attribute().
 void cleanup_vertex_attribute(GLuint glsl_program_num, const std::string &attribute_name, GLuint vbo);
 
+// If v is not already a power of two, return the first higher power of two.
+unsigned next_power_of_two(unsigned v);
+
 }  // namespace movit
 
 #ifdef NDEBUG
-- 
2.39.2