git.sesse.net Git - movit/blob - fft_convolution_effect.h

   1 #ifndef _MOVIT_FFT_CONVOLUTION_EFFECT_H
   2 #define _MOVIT_FFT_CONVOLUTION_EFFECT_H 1
   3
   4 // FFTConvolutionEffect applies an arbitrary 2D convolution between the input image
   5 // and a convolution kernel (assumed to be much smaller than the image).
   6 // It does this convolution using multiple smaller FFTs and an algorithm called
   7 // overlap-discard (also known as overlap-save) to achieve much higher
   8 // efficiency than direct evaluation of the convolution, at some expense of
   9 // accuracy.
  10 //
  11 // FFTConvolutionEffect follows the usual convention for convolution, which is that
  12 // you sample from the origin pixel, and then up and to the left from that. This means
  13 // that (in horizontal 1D) [1 0 0 0 0 ...] would be an identity transform, and that
  14 // [0 1 0 0 0 ...] would mean sampling one pixel to the left of the origin, which
  15 // effectively move would move the image one pixel to the right.
  16 //
  17 // The basic idea of the acceleration comes from the the convolution theorem
  18 // (which holds in any number of dimensions), namely that FFT(A ⊙ B) =
  19 // FFT(A) * FFT(B), where ⊙ is circular convolution and * is pointwise
  20 // multiplication. This means that A ⊙ B = IFFT(FFT(A) * FFT(B)), and since
  21 // we can do a 2D FFT in O(n² log n), this is asymptotically better than
  22 // direct convolution, which is O(n²m²) (where m is the size of the convolution
  23 // kernel). However, the convolution theorem is rarely _directly_ applicable,
  24 // for two reasons:
  25 //
  26 //  - ⊙ is _circular_ convolution, which means that inputs are taken to
  27 //    repeat (wrap around), which is rarely what you want.
  28 //  - A and B must be the same size, which means that to convolve a 1280x720
  29 //    kernel with a 64x64 kernel, you need to zero pad the 64x64 kernel and
  30 //    then do _two_ full 1280x720 FFTs (one for each of A and B).
  31 //
  32 // The first problem is solved by adding m-1 zero pixels (horizontally
  33 // and vertically) as padding, and then discarding the result of those pixels.
  34 // This causes the output to be identical to a non-circular convolution.
  35 //
  36 // The second is slightly more tricky, and there are multiple ways of solving
  37 // it. The one that appears to be the most suitable to suitable for GPU use,
  38 // and the one that is used here, is overlap-discard (more commonly but less
  39 // precisely known as overlap-save). In overlap-discard, the input is broken
  40 // up into multiple equally-sized slices which are then FFTed and convolved
  41 // with the kernel individually. (The kernel must still be zero padded to the
  42 // same size as the slice, but this is typically much smaller then the full
  43 // picture.) As before, the pad area contains data that's essentially junk,
  44 // which is thrown away when the slices are put together again.
  45 //
  46 // The optimal slice size is a tradeoff. More slices means more space wasted
  47 // for padding, since the padding is the same no matter the slice size,
  48 // but fewer slices means we need to do larger FFTs (although fewer of them).
  49 // There's no exact closed formula for this, especially since the 2D case
  50 // makes things a bit more tricky with ordering of the X versus Y passes,
  51 // so we simply try all possible sizes and orderings, attempting to estimate
  52 // roughly how much each operation will cost. The result isn't perfect, though;
  53 // FFTW has a mode for actually measuring, which they claim improves speeds
  54 // by ~2x over simple estimation, but they also have much more freedom in
  55 // their execution model than we do.
  56 //
  57 // The output _size_ of a convolution can be defined in a couple of different
  58 // ways; in a sense, what's the most reasonable is using only the central part
  59 // of the result (the mode “valid” in MATLAB/Octave), since that is the only
  60 // one not used by any edge pixels. (FFTConvolutionEffect assumes normal Movit
  61 // edge pixel behavior, which is to repeat the outermost pixels.) You could also
  62 // keep all the output pixels (“full” in MATLAB/Octave), which is nicely symmetric.
  63 // However, for video processing, typically what you want is to have the _same_
  64 // output size as input size, so we crop to the input size. This means that
  65 // you'll get some of the edge-affected pixels but not all, but it's usually
  66 // an okay tradeoff.
  67 //
  68 // FFTConvolutionEffect does not do any actual pixel work by itself; it
  69 // rewrites itself into a long chain of SliceEffect, FFTPassEffect, FFTInput
  70 // and ComplexModulationEffect to do its bidding. Note that currently, due to
  71 // Movit limitations, we need to know the number of FFT passes at finalize()
  72 // time, which in turn means you cannot change image or kernel size on the fly.
  73
  74 #include <assert.h>
  75 #include <GL/glew.h>
  76 #include <string>
  77
  78 #include "effect.h"
  79 #include "fft_input.h"
  80
  81 namespace movit {
  82
  83 class PaddingEffect;
  84
  85 class FFTConvolutionEffect : public Effect {
  86 public:
  87         FFTConvolutionEffect(int input_width, int input_height, int convolve_width, int convolve_height);
  88         ~FFTConvolutionEffect();
  89         virtual std::string effect_type_id() const { return "FFTConvolutionEffect"; }
  90         std::string output_fragment_shader() { assert(false); }
  91         virtual void rewrite_graph(EffectChain *graph, Node *self);
  92
  93         // See FFTInput::set_pixel_data().
  94         void set_convolution_kernel(const float *pixel_data)
  95         {
  96                 assert(fft_input);
  97                 fft_input->set_pixel_data(pixel_data);
  98         }
  99
 100 private:
 101         int input_width, input_height;
 102         int convolve_width, convolve_height;
 103
 104         // Chosen by algorithm.
 105         int fft_width, fft_height;
 106
 107         // Both of these are owned by us if owns_effects is true (before finalize()),
 108         // and otherwise owned by the EffectChain.
 109         FFTInput *fft_input;
 110         PaddingEffect *crop_effect;
 111         bool owns_effects;
 112 };
 113
 114 }  // namespace movit
 115
 116 #endif // !defined(_MOVIT_FFT_CONVOLUTION_EFFECT_H)