From c7f5b3e00a0b6581b42deb5b055420fbf2a629e5 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 12 Jul 2018 00:44:17 +0200 Subject: [PATCH] Start working on variational refinement. --- flow.cpp | 73 +++++- prewarp.frag | 20 ++ variational_refinement.txt | 520 +++++++++++++++++++++++++++++++++++++ 3 files changed, 612 insertions(+), 1 deletion(-) create mode 100644 prewarp.frag create mode 100644 variational_refinement.txt diff --git a/flow.cpp b/flow.cpp index b46e0a1..87cc06f 100644 --- a/flow.cpp +++ b/flow.cpp @@ -404,6 +404,67 @@ void Densify::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint d glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width_patches * height_patches); } +// Warp I_1 to I_w, and then compute the mean (I) and difference (I_t) of +// I_0 and I_w. The prewarping is what enables us to solve the variational +// flow for du,dv instead of u,v. +// +// See variational_refinement.txt for more information. +class Prewarp { +public: + Prewarp(); + void exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, int level_width, int level_height); + +private: + GLuint prewarp_vs_obj; + GLuint prewarp_fs_obj; + GLuint prewarp_program; + GLuint prewarp_vao; + + GLuint uniform_image0_tex, uniform_image1_tex, uniform_flow_tex; +}; + +Prewarp::Prewarp() +{ + prewarp_vs_obj = compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER); + prewarp_fs_obj = compile_shader(read_file("prewarp.frag"), GL_FRAGMENT_SHADER); + prewarp_program = link_program(prewarp_vs_obj, prewarp_fs_obj); + + // Set up the VAO containing all the required position/texcoord data. 
+ glCreateVertexArrays(1, &prewarp_vao); + glBindVertexArray(prewarp_vao); + glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo); + + GLint position_attrib = glGetAttribLocation(prewarp_program, "position"); + glEnableVertexArrayAttrib(prewarp_vao, position_attrib); + glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0)); + + uniform_image0_tex = glGetUniformLocation(prewarp_program, "image0_tex"); + uniform_image1_tex = glGetUniformLocation(prewarp_program, "image1_tex"); + uniform_flow_tex = glGetUniformLocation(prewarp_program, "flow_tex"); +} + +void Prewarp::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, int level_width, int level_height) +{ + glUseProgram(prewarp_program); + + bind_sampler(prewarp_program, uniform_image0_tex, 0, tex0_view, nearest_sampler); + bind_sampler(prewarp_program, uniform_image1_tex, 1, tex1_view, linear_sampler); + bind_sampler(prewarp_program, uniform_flow_tex, 2, flow_tex, nearest_sampler); + + GLuint prewarp_fbo; // TODO: cleanup + glCreateFramebuffers(1, &prewarp_fbo); + GLenum bufs[] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 }; + glNamedFramebufferDrawBuffers(prewarp_fbo, 2, bufs); + glNamedFramebufferTexture(prewarp_fbo, GL_COLOR_ATTACHMENT0, I_tex, 0); + glNamedFramebufferTexture(prewarp_fbo, GL_COLOR_ATTACHMENT1, I_t_tex, 0); + + glViewport(0, 0, level_width, level_height); + glDisable(GL_BLEND); + glBindVertexArray(prewarp_vao); + glBindFramebuffer(GL_FRAMEBUFFER, prewarp_fbo); + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); +} + int main(void) { if (SDL_Init(SDL_INIT_EVERYTHING) == -1) { @@ -464,6 +525,7 @@ int main(void) Sobel sobel; MotionSearch motion_search; Densify densify; + Prewarp prewarp; GLuint query; glGenQueries(1, &query); @@ -516,7 +578,16 @@ int main(void) // And draw. densify.exec(tex0_view, tex1_view, flow_out_tex, dense_flow_tex, level_width, level_height, width_patches, height_patches); - // TODO: Variational refinement. 
+ // Everything below here in the loop belongs to variational refinement. + // It is not done yet. + + // Prewarping; create I and I_t. + GLuint I_tex, I_t_tex; + glCreateTextures(GL_TEXTURE_2D, 1, &I_tex); + glCreateTextures(GL_TEXTURE_2D, 1, &I_t_tex); + glTextureStorage2D(I_tex, 1, GL_R16F, level_width, level_height); + glTextureStorage2D(I_t_tex, 1, GL_R16F, level_width, level_height); + prewarp.exec(tex0_view, tex1_view, dense_flow_tex, I_tex, I_t_tex, level_width, level_height); prev_level_flow_tex = dense_flow_tex; } diff --git a/prewarp.frag b/prewarp.frag new file mode 100644 index 0000000..1bf99f7 --- /dev/null +++ b/prewarp.frag @@ -0,0 +1,20 @@ +#version 450 core + +// Warps I_1 according to the flow, then computes the mean and difference to I_0. + +in vec2 tc; +out float I, I_t; + +uniform sampler2D image0_tex, image1_tex, flow_tex; + +void main() +{ + vec3 flow = texture(flow_tex, tc).xyz; + flow.xy /= flow.z; // Normalize the sum coming out of the densification. + + float I_0 = texture(image0_tex, tc).x; + float I_w = texture(image1_tex, tc + flow.xy).x; // NOTE: This is effectively a reverse warp since texture() is a gather operation and flow is conceptually scatter. + + I = 0.5f * (I_0 + I_w); + I_t = I_w - I_0; +} diff --git a/variational_refinement.txt b/variational_refinement.txt new file mode 100644 index 0000000..58d0663 --- /dev/null +++ b/variational_refinement.txt @@ -0,0 +1,520 @@ +Variational refinement -- an introduction and derivation + +The variational refinement is probably the most difficult part of the +algorithm to understand, in part because the description in most papers +are very heavy on notation and rather light on exposition. I've tried +to give a somewhat friendlier introduction to this specific algorithm +below. + +The general idea is fairly simple; we try to optimize the flow field +as a whole, by minimizing some mathematical notion of badness expressed +as an energy function. 
The one used in the dense inverse search paper [Kroeger05; see references below] has this form:
Unfortunately, that paper in turn refers to +[Brox11], which appears no more useful in clearing up the notation +to me. + +However, digging down in the references, finally one finds [Zimmer11], +which is where the tensor notation appears to come from. This allows +us to look at the first term in the energy, E_I, which comes from the +intensity constant assumption. The basic idea is optical flow nearly +by definition should preserve intensity after the warp: + + I_0(x + u) = I_1(x) + +where I_0 is the first picture, I_1 is the second, x is any 2D +coordinate and u is the flow at x (which we are optimizing over). +In general, we'll be optimizing over the entire field of u +(potentially hundreds of thousands of values), but we'll be looking +mostly at individual points, so we'll skip the coordinates when we +can (e.g. we write u instead of or u(x, y)). u is of course the 2D +flow, although often, we'll write its components separately as u and v +instead of as a vector u. + +Before we go further, we need to add some more notation: + + * I_x is the partial derivative of I with respect to x (at some + point), and similarly for I_y. These do not depend on u, + so they can be precalculated before the optimization. + * I_xx is the double partial derivative of I, and similar for + I_yy and I_xy (the latter is the same as I_yx). + * I_t is the temporal derivative of I, ie. in practice just + I_t(x) = I_1(x) - I_0(x). + +Returning now to our original assertion: + + I_0(x + u) = I_1(x) + +Classically in optical flow, one assumes that the flow is smooth +and linear around the point x, which allows one to approximate this +equation by + + I_x u + I_y v + I_t = 0 + +This is usually simply called “the optical flow constraint”, +and gives rise to a very natural part of the energy: + + E_I = I_x u + I_y v + I_t + +Remember that we send E_I through the function Ψ(a²) = sqrt(a² + ε²), +so clearly Ψ(E_I) will be minimized if indeed E_I is zero. 
+ +At this point, many papers start talking about Euler-Lagrange +multivariate equations, which is a fairly daunting concept +(at least the Wikipedia page is suitable for scaring small children). +However, for the first two terms, we don't need its general form, +and it reduces to something much simpler; just differentiate the energy +by u and equate the result to zero (finding some minimum; it can't be +a maximum, since *wave hands intensely*). Then differentiate the energy +by v and set that to zero, too; now you have two equations in two +unknowns (or, since we're optimizing over a field, maybe 500k +equations in 500k unknowns -- although the equation set will be +very sparse), which is hopefully solvable using linear methods. +We'll look at what this gives for E_I in a moment, then try to apply +the same notions to E_G and E_S later. + +First we modify E_I a bit by adding some normalization: + + E_I = β_0 (I_x u + I_y v + I_t) + +where β_0 = 1/(abs(∇I)² + 0.01). Note that β_0 depends on I only, +so for the purposes of optimizing u, it's a constant and can be +precomputed across I. (β_0 will, of course, depend on x, but so +do all the other terms in the equation.) + +Now we give it to Maple, differentiating first by u and then by v: + +> M := (u,v) -> B_0 * (I_x * u + I_y * v + I_t); + M := (u, v) -> B_0 (I_x u + I_y v + I_t) + +> diff(sqrt(M(u,v)^2 + e), u); + 2 + B_0 (I_x u + I_y v + I_t) I_x + ------------------------------------ + 2 2 1/2 + (B_0 (I_x u + I_y v + I_t) + e) + +> diff(sqrt(M(u,v)^2 + e), v); + 2 + B_0 (I_x u + I_y v + I_t) I_y + ------------------------------------ + 2 2 1/2 + (B_0 (I_x u + I_y v + I_t) + e) + + +So these are the two expressions to be set to zero (for each +point). 
We'll notice immediately that this isn't very linear +in u and v, so here's where the “fixed point iterations” come in; +we simply assume that our previous values for u and v are +approximately good enough for the denominator, and optimize +them in the numerator only. Then we get new values that are +hopefully a bit closer, which we can then use for the +denominator, and so on. (This is seemingly an old technique; +[Brox05] cites [Ciarlet78]. It is justifiable in the sense +that the only thing really held constant is the derivative +of the penalizer.) In other words, if we define the constant + + k1 = β_0² / sqrt(β_0² (I_x u' + I_y v' + I_t)² + ε²) + +(where u' and v' are the guesses for u and v from the previous +iteration) + +we have the much more manageable + + k1 I_x² u + k1 I_x I_y v = - k1 I_t + k1 I_x I_y u + k1 I_y² v = - k1 I_t + +ie., two linear equations in u and v. Now, you will notice two +immediate problems by this equation set: + + * The factor k1 is completely useless, since it's just multiplied + in everywhere. + * The set of equations is colinear (the determinant of the matrix + is zero), and thus there is an infinite number of possible + solutions—this is known as the so-called “aperture problem”. + It shouldn't be surprising, though, as we cannot expect that + starting with a single constraint should allow us to solve + for two unknowns. + +However, both problems will go away as soon as we start adding +more terms, so let's look at the gradient constancy term E_G next. +It is fairly similar to the brightness constancy term, except it +uses the (spatial) gradient instead of intensity: + + ∇I_0(x + u) = ∇I_1(x) + +or equivalently (by definition): + + (∂I/∂x)_0(x + u) = (∂I/∂x)_1(x) + (∂I/∂y)_0(x + u) = (∂I/∂y)_1(x) + +The idea is that this is more robust to changes in lighting. +It doesn't replace the intensity term, but augments it; the weighting +constants σ and γ control their relative importance. 
Also note that +this actually gives us two independent equations, unlike the brightness +constancy term. + +However, it is not obvious at all how to discretize this. In particular, +most papers, including [Brox04], appear to want _not_ to make any linear +assumptions of the flow in this case, and end up with tons of terms. +(The DIS and DeepFlow papers do, again, use some tensor notation that +I do not understand, but I'm not convinced it actually contains any +of the discretization.) + +Yet more paper searching eventually turns up [Fahad07], which simply +states that the discretized versions of these equations are: + + I_xx u + I_xy v + I_xt = 0 + I_yx u + I_yy v + I_yt = 0. + +which seems to match well what the DIS code uses. Note that even though +this is an equation set equal to zero, we can't just solve for them; +we need to make (penalized, normalized) energy terms and add them to +the other terms. This gives + + E_G = β_x (I_xx u + I_xy v + I_xt) + β_y (I_yx u + I_yy v + I_yt) + +with normalization terms + + β_x = 1 / (abs(∇(I_x))² + 0.01) (∇(I_x) is the gradient of ∂I/∂x) + β_y = 1 / (abs(∇(I_y))² + 0.01) + +(The DIS paper writes ∇I_dx and ∇I_dy instead of ∇I_x and ∇I_y, but I believe +that's a typo; the DeepFlow paper says ∇I_x and ∇I_y.) + +The papers both write that Ψ(E_G) is used, which would mean that the penalized +term is + + E_G = sqrt((β_x (I_xx u + I_xy v + I_xt) + β_y (I_yx u + I_yy v + I_yt))² + ε²) + +but that isn't what the code actually does. Instead, it seems that the two +terms are squared independently: + + E_G = sqrt((β_x (I_xx u + I_xy v + I_xt))² + (β_y (I_yx u + I_yy v + I_yt))² + ε²) + +Both are solvable just fine, and it probably does not matter all that much +which we use in practice (although [Zimmer11] suggests that if we are using +multichannel images, we should penalize the three channels separately), +but we follow what the code actually does here. 
+ +We can differentiate them and equate them to zero as before: + +> M_x := (u,v) -> B_x * (I_xx * u + I_xy * v + I_xt); + M_x := (u, v) -> B_x (I_xx u + I_xy v + I_xt) + +> M_y := (u,v) -> B_y * (I_xy * u + I_yy * v + I_yt); + M_y := (u, v) -> B_y (I_xy u + I_yy v + I_yt) + +> diff(sqrt(M_x(u,v)^2 + M_y(u,v)^2 + e), u); + 2 2 + 2 (I_xx u + I_xy v + I_xt) B_x I_xx + 2 B_y (I_xy u + I_yy v + I_yt) I_xy + --------------------------------------------------------------------------- + 2 2 2 2 1/2 + 2 ((I_xx u + I_xy v + I_xt) B_x + B_y (I_xy u + I_yy v + I_yt) + e) + +> diff(sqrt(M_x(u,v)^2 + M_y(u,v)^2 + e), v); + 2 2 + 2 (I_xx u + I_xy v + I_xt) B_x I_xy + 2 B_y (I_xy u + I_yy v + I_yt) I_yy + --------------------------------------------------------------------------- + 2 2 2 2 1/2 + 2 ((I_xx u + I_xy v + I_xt) B_x + B_y (I_xy u + I_yy v + I_yt) + e) + +Using the same fixed-point scheme where we hold the terms in the +denominator constant and equal to last iteration's values, we get +a new common constant + + k2 = 1 / sqrt(β_x² (I_xx u' + I_xy v' + I_xt)² + β_y² (I_xy u' + I_yy v' + I_yt)²) + +and for brevity + + k_x = k2 β_x² + k_y = k2 β_y² + +and thus, collecting terms for u and v, we get the two equations: + + (k_x I_xx² + k_y I_xy²) u + (k_x I_xx I_xy + k_y I_xy I_yy) v = - k_x I_xx I_xt - k_y I_xy I_yt + (k_x I_xx I_xy + k_y I_xy I_yy) u + (k_x I_xy² + k_y I_yy²) v = - k_x I_xy I_xt - k_y I_yy I_yt + +which is linear in u and v, not colinear (unless we are extremely +unlucky), and can be easily solved. + +Of course, for optimizing the weighted sum σ Ψ(E_I) + γ Ψ(E_G), +we just add the two equation sets pairwise with appropriate weights. + +There's a small discrepancy here: The equations suggest that we should +be be squaring the normalization terms β_0², β_x², β_y²; however, the +code does not appear to do so. It's possible that they were intended to be +added outside of the penalization, e.g. 
Ψ(a²) = sqrt(β a² + ε²), but given +that these come from [Zimmer11], which mentions nothing of the sort, +I'll just have to assume that this is an implementation mishap. + +The final smoothness term the one that binds the flow field together as a whole +so that we don't have WxH completely independent equations (with its positive +and negative sides, of course). It is the simplest in terms of notation, +but it requires the full power of the Euler-Lagrange equations to minimize, +so we'll need to figure that part out. + + E_S = abs(∇u)² + abs(∇v)² + +or + + E_S = (u_x² + u_y²) + (v_x² + v_y²) + +The penalized form used in the DeepFlow notation, contrary to what you'd expect +from the paper, is: + + E_S = sqrt(u_x² + u_y² + v_x² + v_y² + ε²) + +How would one go about to minimize such an expression by u? (We'll deal with v +later.) It's perhaps no big surprise that the expression involves double +derivatives, but the full form involves the Euler-Lagrange equations. +They allow us to minimize expressions that contain x, y, u(x, y) _and_ the partial +derivatives u_x(x, y) and u_y(x, y), although the answer becomes a differential +equation. + +The Wikipedia page is, unfortunately, suitable for scaring small +children, but the general idea is: Differentiate the expression by u_x +(yes, differentiating by a partial derivative!), negate it, and then +differentiate the result by x. Then do the same thing by u_y and y, +add the two results together and equate to zero. Mathematically +(https://en.wikipedia.org/wiki/Euler%E2%80%93Lagrange_equation#Several_functions_of_several_variables_with_single_derivative): + + ∂E/∂u - ∂/∂x (∂E/∂u_x) - ∂/∂y (∂E/∂u_y) = 0 + +The first term disappears, since we don't have a non-differentiated +u(x, y) in E_S. (Previously, the two _other_ terms would disappear, +because we didn't have u_x or u_y in E_I or E_G.) 
This means we get + + - ∂/∂x (u_x / sqrt(u_x² + u_y² + v_x² + v_y² + ε²)) - ∂/∂y (u_y / sqrt(u_x² + u_y² + v_x² + v_y² + ε²)) = 0 + +(We don't remove the minus signs since this is supposed to be added to +all the other terms.) + +This is what's called an _anisotropic diffusion_ (or Perona–Malik diffusion) +equation, and is extensively described in literature. It has the effect of +smoothing the flow more in some places than others; in particular, it does +not smooth as strongly near edges, so it is edge-preserving. (It's a bit odd to +call it anisotropic, since it does smooth equally in all directions; +[Brox05] calls it vector-valued diffusion.) + +We'd love to our usual trick of keeping the nonlinear terms in the denominator +constant, but alas, we can't do that yet, since it's under the differentiation +operator; this factor has to be discretized together with u before we can treat +it as a constant. So instead, we'll define it as a function (called the +_diffusivity_ at the given point): + + g(x, y) = -1 / sqrt(u_x² + u_y² + v_x² + v_y² + ε²) = 0 + +which gives us + + ∂/∂x ( g(x, y) u_x ) + ∂/∂y ( g(x, y) u_y ) = 0 + +We'll also have a similar equation for minimizing v, of course: + + ∂/∂x ( g(x, y) v_x ) + ∂/∂y ( g(x, y) v_y ) = 0 + +There's no normalization term β here, unlike the other terms; DeepFlow2 +adds one, but we're not including it here. + +At this point, we make a tweak. This seemingly goes back to at least +[Brox04], which also makes the same tweak to all the other terms +(which we don't, but see below). We split u (and v) into something +based on the original value plus a differential du (and dv), and then +solve for du (or dv) instead. (In math-speak, we are moving to an +implicit method, which is often more numerically stable.) In other words, + + u(x, y) = u0(x, y) + du(x, y) + +where u0(x, y) is the initial guess for the flow. (It's not the value +from previous iteration, for reasons that will be clear later, it's +the first one. 
[Brox04] differs here, but it does a number of things +differently in the numerics anyway.) + +This gives us: + + ∂/∂x ( g(x, y) (u0 + du)_x ) + ∂/∂y ( g(x, y) (u0 + du)_y ) = 0 + +or + + ∂/∂x ( g(x, y) du_x ) + ∂/∂y ( g(x, y) du_y ) = - ∂/∂x ( g(x, y) u0_x ) - ∂/∂y ( g(x, y) u0_y ) + +where the right-hand side is effectively a constant for these purposes +(although it still needs to be calculated anew for each iteration, +since g(x, y) changes). + +Of course, now we have a different problem; all the other terms are +formulated in terms of u and v, not du and dv. DeepFlow solves this +by not searching for the flow between I_0 and I_1, but between I_0 and +a pre-warped I_1. In other words, before any of the derivatives involving +I_t are calculated, we calculate an I_w with bilinear interpolation: + + I_w(x, y) = I_1(x + u0(x, y), y + v0(x, y)) + +and then redefine I_t (occasionally called I_z) as + + I_t(x, y) = I_w(x, y) - I_0(x, y) + +Note that the plus sign effectively means inverting the flow, so if +the u0 and v0 were already correctly estimated, perfectly smooth and linear +everywhere, I_w = I_0. (All spatial derivatives are calculated on the mean +between I_0 and I_w; the paper doesn't mention this.) After this, all the +equations for E_I and E_G earlier will still hold, they will just be +calculating du and dv instead. Note that this means we have three values +for the flow; there's u0 for the initial guess, du for the current guess +of delta from u0 (which makes u0 + du the current guess of the flow), +and du' for the previous guess of delta from u0. (The initial values for +du' and dv' will be zero.) 
+ +Now back to our equations, as we look at practical implementation: + + ∂/∂x ( g(x, y) du_x ) + ∂/∂y ( g(x, y) du_y ) = - ∂/∂x ( g(x, y) u0_x ) - ∂/∂y ( g(x, y) u0_y ) + ∂/∂x ( g(x, y) dv_x ) + ∂/∂y ( g(x, y) dv_y ) = - ∂/∂x ( g(x, y) v0_x ) - ∂/∂y ( g(x, y) v0_y ) + +We can discretize the left-hand and right-hand side identically, so let's +look only at + + ∂/∂x ( g(x, y) du_x ) + ∂/∂y ( g(x, y) du_y ) + +[Brox05] equation (2.14) (which refers to a 1998 book, although I couldn't +immediately find the equation in question in that book) discretizes this as + + 1/2 (g(x+1, y) + g(x, y)) (du(x+1, y) - du(x, y)) + - 1/2 (g(x-1, y) + g(x, y)) (du(x, y) - du(x-1, y)) + + 1/2 (g(x, y+1) + g(x, y)) (du(x, y+1) - du(x, y)) + - 1/2 (g(x, y-1) + g(x, y)) (du(x, y) - du(x, y-1)) + +It also mentions that it would be better to sample g at the half-way points, +e.g. g(x+0.5, y), but that begs the question exactly how we'd do that, and +DeepFlow doesn't seem to care, so we stick with their version. + +Now we can finally let g use the values of the flow (note that this is the +actual flow u and v, not du and dv!) from the previous iteration, as before: + + g(x, y) = -1 / sqrt(u'_x² + u'_y² + v'_x² + v'_y² + ε²) = 0 + +The single derivatives in g(x) are approximated by standard central differences +(see https://en.wikipedia.org/wiki/Finite_difference_coefficient), e.g. + + u_x(x, y) = 1/2 (u(x + 1, y) - u(x - 1, y)) + +although the derivatives of I are using the fancier + + I_x(x, y) = 1/12 (-I(x - 2, y) + 8 I(x - 1, y) - 8 I(x - 1, y) + I(x - 2, y)) + +I assume this is because I_x derivatives are calculated only once, so we can +afford more accurate derivatives (or possibly simply because of influence +from earlier papers). 
+ +Let's now define a smoothness constant between the neighbors (x,y) and (x1,y1): + + s(x1, y1) = 1/2 (g(x, y) + g(x1, y1)) + +Collecting all the du(x, y) terms of the discretized equation above, +ignoring the right-hand side, which is just a constant for us anyway: + + s(x+1, y) (du(x+1, y) - du(x, y)) + - s(x-1, y) (du(x, y) - du(x-1, y)) + + s(x, y+1) (du(x, y+1) - du(x, y)) + - s(x, y-1) (du(x, y) - du(x, y-1)) = C + + s(x+1, y) du(x+1, y) - s(x+1, y) du(x, y) + - s(x-1, y) du(x, y) + s(x-1, y) du(x-1, y) + + s(x, y+1) du(x, y+1) - s(x, y+1) du(x, y) + - s(x, y-1) du(x, y) + s(x, y-1) du(x, y-1) = C + + - (s(x+1, y) + s(x-1, y) + s(x, y+1) + s(x, y-1)) du(x, y) = + - s(x+1, y) du(x+1, y) - s(x-1, y) du(x-1, y) - s(x, y+1) du(x, y+1) - s(x, y-1) du(x, y-1) + C + +It is interesting to note that if s = 1 uniformly, which would be the case +without our penalizer Ψ(a²), we would have the familiar discrete Laplacian, +where du(x, y) would seek to simply become the average of its four immediate +neighbors. + +Now our equation system is finally complete and linear, and the rest is +fairly pedestrian. The last term connects all the unknowns together, +but we still solve them mostly as 2x2 matrices. The most basic iterative +method is Gauss-Seidel, where we solve du(x, y) and dv(x,y) using the +previous iteration's value for all other du/dv values. (That this converges +at all it beyond this text to prove, but it does. Not that we bother +iterating until it converges; a few iterations is good enough.) +Jacobi iterations improve on this in that (surprisingly!) using this +iteration's computed du/dv values if they're ready; this improves convergence, +but is hard to parallelize. 
On the GPU, we render to the same texture as +we render from; as per the OpenGL spec, this will give us undefined behavior +on read (since our read/write sets are neither identical nor disjoint), +but in practice, we'll get either the old value (Gauss-Seidel) or the +new one (Jacobi); more likely, the former. + +Successive over-relaxation (SOR) improves further on this, in that it +assumes that the solution moves towards the right value, so why not +just go a bit further? That is, if Jacobi would tell you to increase +the flow by 1.0 pixel to the right, perhaps go 1.5 pixels to the right +instead (this value is called ω). Again, the convergence proof is beyond the +scope here, but SOR converges for any ω between 1 and 2 (1 gives plain +Jacobi, and over 2, we risk overshooting and never converging). Optimal +ω depends on the equation system; DIS uses ω = 1.6, which presumably was +measured, and should be OK for us too, even if we are closer to Gauss-Seidel +than to Jacobi. + +Do note that the DeepFlow code does not fully use SOR or even Gauss-Seidel; +it solves every 2x2 block (ie., single du/dv pair) using Cramer's rule, +and then pushes that vector 80% further, SOR-style. This would be clearly +more accurate if we didn't have SOR in the mix (since du and dv would +converge immediately relative to each other, bar Cramer's numerical issues), +but I'm not sure whether it's better given SOR. (DIS changes this to a more +traditional SOR formulation, which we also use.) + +And that's it. 
References: + +[Brox04]: Brox, Bruhn, Papenberg, Weickert: “High Accuracy Optical Flow + Estimation Based on a Theory for Warping”, in Proceedings of the European + Conference on Computer Vision (ECCV), 2004 +[Brox05]: Brox: “From Pixels to Regions: Partial Differential Equations in + Image Analysis”, PhD thesis, 2005 +[Brox11]: Brox, Malik: “Large Displacement Optical Flow: Descriptor Matching in + Variational Motion Estimation”, IEEE Transactions on Pattern Analysis and + Machine Intelligence, 2011 +[Ciarlet78]: Ciarlet: “The Finite Element Method for Elliptic Problems”, 1978 +[Fahad07]: Fahad, Morris: “Multiple Combined Constraints for Optical Flow + Estimation”, in Proceedings of the 3rd International Conference on Advances + in Visual Computing (ISVC), 2007 +[Kroeger05]: Kroeger, Timofte, Dai, van Gool: “Fast Optical Flow using Dense + Inverse Search”, in Proceedings of the European Conference on Computer Vision + (ECCV), 2016 +[Weinzaepfel13]: Weinzaepfel, Revaud, Harchaoui, Schmid: “DeepFlow: Large + displacement optical flow with deep matching”, in IEEE International Conference + on Computer Vision (ICCV), 2013 +[Zimmer11]: Zimmer, Bruhn, Weickert: “Optic Flow in Harmony”, International + Journal of Computer Vision, 2011 -- 2.39.5