GLuint uniform_diff_flow_tex;
GLuint uniform_equation_tex;
GLuint uniform_smoothness_x_tex, uniform_smoothness_y_tex;
+ GLuint uniform_image_size, uniform_phase;
};
SOR::SOR()
{
- sor_vs_obj = compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER);
+ sor_vs_obj = compile_shader(read_file("sor.vert"), GL_VERTEX_SHADER);
sor_fs_obj = compile_shader(read_file("sor.frag"), GL_FRAGMENT_SHADER);
sor_program = link_program(sor_vs_obj, sor_fs_obj);
uniform_equation_tex = glGetUniformLocation(sor_program, "equation_tex");
uniform_smoothness_x_tex = glGetUniformLocation(sor_program, "smoothness_x_tex");
uniform_smoothness_y_tex = glGetUniformLocation(sor_program, "smoothness_y_tex");
+ uniform_image_size = glGetUniformLocation(sor_program, "image_size");
+ uniform_phase = glGetUniformLocation(sor_program, "phase");
}
void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations)
bind_sampler(sor_program, uniform_smoothness_y_tex, 2, smoothness_y_tex, zero_border_sampler);
bind_sampler(sor_program, uniform_equation_tex, 3, equation_tex, nearest_sampler);
+ glProgramUniform2f(sor_program, uniform_image_size, level_width, level_height);
+
+ // NOTE: We bind to the texture we are rendering from, but we never write any value
+ // that we read in the same shader pass (we call discard for red values when we compute
+ // black, and vice versa), and we have barriers between the passes, so we're fine
+ // as per the spec.
glViewport(0, 0, level_width, level_height);
glDisable(GL_BLEND);
glBindVertexArray(sor_vao);
- fbos.render_to(diff_flow_tex); // NOTE: Bind to same as we render from!
+ fbos.render_to(diff_flow_tex);
for (int i = 0; i < num_iterations; ++i) {
+ glProgramUniform1i(sor_program, uniform_phase, 0);
+ glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+ glTextureBarrier();
+ glProgramUniform1i(sor_program, uniform_phase, 1);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
if (i != num_iterations - 1) {
glTextureBarrier();
#version 450 core
in vec2 tc;
+in float element_sum_idx;
out vec2 diff_flow;
uniform sampler2D diff_flow_tex, smoothness_x_tex, smoothness_y_tex;
uniform usampler2D equation_tex;
+uniform vec2 image_size;
+uniform int phase;
// See pack_floats_shared() in equations.frag.
vec2 unpack_floats_shared(uint c)
void main()
{
+ // Red-black SOR: Every other pass, we update every other element in a
+ // checkerboard pattern. This is rather suboptimal for the GPU, as it
+ // just immediately throws away half of the warp, but it helps convergence
+ // a _lot_ (rough testing indicates that five iterations of SOR is as good
+ // as ~50 iterations of Jacobi). We could probably do better by reorganizing
+ // the data into two-values-per-pixel, so-called “twin buffering”,
+ // but it makes for rather annoying code in the rest of the pipeline.
+ int color = int(round(element_sum_idx)) & 1;
+ if (color != phase) discard;
+
uvec4 equation = texture(equation_tex, tc);
float inv_A11 = uintBitsToFloat(equation.x);
float A12 = uintBitsToFloat(equation.y);
b += smooth_d * textureOffset(diff_flow_tex, tc, ivec2( 0, -1)).xy;
b += smooth_u * textureOffset(diff_flow_tex, tc, ivec2( 0, 1)).xy;
- // FIXME: omega=1.6 seems to make our entire system diverge.
- // Is this because we do Gauss-Seidel instead of Jacobi?
- // Figure out what's going on.
- const float omega = 1.0;
+ const float omega = 1.8; // Marginally better than 1.6, it seems.
diff_flow = texture(diff_flow_tex, tc).xy;
// From https://en.wikipedia.org/wiki/Successive_over-relaxation.
--- /dev/null
+#version 450 core
+
+in vec2 position;
+out vec2 tc;
+out float element_sum_idx;
+
+uniform vec2 image_size;
+
+void main()
+{
+ // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+ //
+ // 2.000 0.000 0.000 -1.000
+ // 0.000 2.000 0.000 -1.000
+ // 0.000 0.000 -2.000 -1.000
+ // 0.000 0.000 0.000 1.000
+ gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+ tc = position;
+
+ vec2 element_idx = position * image_size - 0.5;
+ element_sum_idx = element_idx.x + element_idx.y;
+}
iterating until it converges; a few iterations is good enough.)
Gauss-Seidel iterations improve on this by (surprisingly!) using this
iteration's computed du/dv values if they're ready; this improves convergence,
-but is hard to parallelize. On the GPU, we render to the same texture as
-we render from; as per the OpenGL spec, this will give us undefined behavior
-on read (since our read/write sets are neither identical nor disjoint),
-but in practice, we'll get either the old value (Jacobi) or the
-new one (Gauss-Seidel); more likely, the former.
+but is hard to parallelize.
Successive over-relaxation (SOR) improves further on this, in that it
assumes that the solution moves towards the right value, so why not
scope here, but SOR converges for any ω between 1 and 2 (1 gives plain
Gauss-Seidel, and over 2, we risk overshooting and never converging). Optimal
ω depends on the equation system; DIS uses ω = 1.6, which presumably was
-measured, and should be OK for us too, even if we are closer to Jacobi
-than to Gauss-Seidel.
+measured, while we do ω = 1.8 (seems to be marginally better after some
+light testing).
+
+Efficient GPU implementation of SOR is not trivial; as noted before,
+Gauss-Seidel is inherently serial, which is a poor match for the GPU.
+Worse, doing SOR with Jacobi as base instead of Gauss-Seidel makes for
+an algorithm which simply does not converge. We solve this by using a
+method called red-black SOR (not to be confused with red-black binary
+trees). Conceptually, it assigns every unknown a color, with every other
+being red or black (similar to a checkerboard). Since red values now
+only depend on black values and vice versa, one can do all red values
+in parallel, then all black values, and so on. (This is equivalent to
+reordering the equation set; different such orderings can have different
+convergence speeds.)
+
+Our GPU SOR implementation is not overly efficient, so essentially one such
+half-iteration of red-black SOR costs the same as one full iteration of
+Jacobi, but convergence is so much faster that it's worth it. Generally
+speaking, Gauss-Seidel converges twice as fast as Jacobi (ie., if Jacobi
+converges in N iterations, Gauss-Seidel does so in N/2), but SOR converges
+_geometrically_ faster, ie., in O(√N) iterations.
Do note that the DeepFlow code does not fully use SOR or even Gauss-Seidel;
it solves every 2x2 block (ie., single du/dv pair) using Cramer's rule,
more accurate if we didn't have SOR in the mix (since du and dv would
converge immediately relative to each other, bar Cramer's numerical issues),
but I'm not sure whether it's better given SOR. (DIS changes this to a more
-traditional SOR formulation, which we also use.)
+traditional SOR formulation, which we also use. It doesn't seem to be much
+different in practical testing; perhaps minutely worse, but I haven't done
+a deep analysis here.)
And that's it. References: