]> git.sesse.net Git - nageru/commitdiff
Support rendering forward and backward flow in parallel.
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 7 Aug 2018 22:02:41 +0000 (00:02 +0200)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 8 Aug 2018 22:23:32 +0000 (00:23 +0200)
~15% faster flow computation on GTX 950; the lower resolutions are
inherently so low-parallel that we get backward flow on those levels
essentially for free. Should be even more important on larger GPUs.

20 files changed:
add_base_flow.frag
blend.frag
densify.frag
densify.vert
derivatives.frag
diffusivity.frag
equations.frag
equations.vert
flow.cpp
gray.frag
motion_search.frag
motion_search.vert
prewarp.frag
resize_flow.frag
sobel.frag
sor.frag
sor.vert
splat.frag
splat.vert
vs.vert

index 2e774da8b79dc2927c1c597241d68f418e33901d..ac5610107466862b914e273358166709fce7c77a 100644 (file)
@@ -1,9 +1,9 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out vec2 diff_flow;
 
-uniform sampler2D diff_flow_tex;
+uniform sampler2DArray diff_flow_tex;
 
 void main()
 {
index 5f78259242775c876e93c62248230d4959a95c95..e88da50b929ce23516586d97a4960577ebafa824 100644 (file)
@@ -1,27 +1,28 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out vec4 rgba;
 
-uniform sampler2D image0_tex, image1_tex, flow_tex;
+uniform sampler2DArray image_tex;
+uniform sampler2D flow_tex;
 uniform float alpha;
 
 void main()
 {
-       vec2 flow = texture(flow_tex, tc).xy;
-       vec4 I_0 = texture(image0_tex, tc - alpha * flow);
-       vec4 I_1 = texture(image1_tex, tc + (1.0f - alpha) * flow);
+       vec2 flow = texture(flow_tex, tc.xy).xy;
+       vec4 I_0 = texture(image_tex, vec3(tc.xy - alpha * flow, 0));
+       vec4 I_1 = texture(image_tex, vec3(tc.xy + (1.0f - alpha) * flow, 1));
 
        // Occlusion reasoning:
 
-       vec2 size = textureSize(image0_tex, 0);
+       vec2 size = textureSize(image_tex, 0).xy;
 
        // Follow the flow back to the initial point (where we sample I_0 from), then forward again.
        // See how well we match the point we started at, which is out flow consistency.
-       float d0 = alpha * length(size * (texture(flow_tex, tc - alpha * flow).xy - flow));
+       float d0 = alpha * length(size * (texture(flow_tex, vec2(tc.xy - alpha * flow)).xy - flow));
 
        // Same for d1.
-       float d1 = (1.0f - alpha) * length(size * (texture(flow_tex, tc + (1.0f - alpha) * flow).xy - flow));
+       float d1 = (1.0f - alpha) * length(size * (texture(flow_tex, vec2(tc.xy + (1.0f - alpha) * flow)).xy - flow));
 
        if (max(d0, d1) < 3.0f) {  // Arbitrary constant, not all that tuned. The UW paper says 1.0 is fine for ground truth.
                // Both are visible, so blend.
index f9c67ecea6c1b8000d63248de0976833813d395e..3bca126b2a458974cdeb5537771065ee480087fc 100644 (file)
@@ -1,11 +1,12 @@
 #version 450 core
 
 in vec2 image_pos;
+flat in int image0_layer, image1_layer;
 flat in vec2 flow_du;
 flat in float mean_diff;
 out vec3 flow_contribution;
 
-uniform sampler2D image0_tex, image1_tex;
+uniform sampler2DArray image_tex;
 
 void main()
 {
@@ -16,7 +17,7 @@ void main()
        // Note that equation (2) says 1 for the minimum error, but the code says 2.0.
        // And it says L2 norm, but really, the code does absolute value even for
        // L2 error norm (it uses a square root formula for L1 norm).
-       float diff = texture(image0_tex, image_pos).x - texture(image1_tex, image_pos + flow_du).x;
+       float diff = texture(image_tex, vec3(image_pos, image0_layer)).x - texture(image_tex, vec3(image_pos + flow_du, image1_layer)).x;
        diff -= mean_diff;
        float weight = 1.0 / max(abs(diff), 2.0 / 255.0);
        flow_contribution = vec3(flow_du.x * weight, flow_du.y * weight, weight);
index 50d230f2f4fd46ae82f4adee4c7c7c1c6cf0006a..181c7f32b72203ed16244253097e0223fe4493c2 100644 (file)
@@ -1,22 +1,26 @@
 #version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
 
 layout(location=0) in vec2 position;
 out vec2 image_pos;
 flat out vec2 flow_du;
 flat out float mean_diff;
+flat out int image0_layer, image1_layer;
 
 uniform vec2 patch_size;  // In 0..1 coordinates.
-uniform sampler2D flow_tex;
+uniform sampler2DArray flow_tex;
 
 void main()
 {
+       int num_patches = textureSize(flow_tex, 0).x * textureSize(flow_tex, 0).y;
+       int patch_layer = gl_InstanceID / num_patches;
        int patch_x = gl_InstanceID % textureSize(flow_tex, 0).x;
-       int patch_y = gl_InstanceID / textureSize(flow_tex, 0).x;
+       int patch_y = (gl_InstanceID % num_patches) / textureSize(flow_tex, 0).x;
 
        // Convert the patch index to being the full 0..1 range, to match where
        // the motion search puts the patches. We don't bother with the locking
        // to texel centers, though.
-       vec2 patch_center = ivec2(patch_x, patch_y) / (textureSize(flow_tex, 0) - 1.0);
+       vec2 patch_center = ivec2(patch_x, patch_y) / (textureSize(flow_tex, 0).xy - 1.0);
 
        // Increase the patch size a bit; since patch spacing is not necessarily
        // an integer number of pixels, and we don't use conservative rasterization,
@@ -32,7 +36,7 @@ void main()
        image_pos = patch_center + patch_size * (grown_pos - 0.5f);
 
        // Find the flow value for this patch, and send it on to the fragment shader.
-       vec3 flow_du_and_mean_diff = texelFetch(flow_tex, ivec2(patch_x, patch_y), 0).xyz;
+       vec3 flow_du_and_mean_diff = texelFetch(flow_tex, ivec3(patch_x, patch_y, patch_layer), 0).xyz;
        flow_du = flow_du_and_mean_diff.xy;
        mean_diff = flow_du_and_mean_diff.z;
 
@@ -43,4 +47,9 @@ void main()
        //   0.000  0.000 -2.000 -1.000
        //   0.000  0.000  0.000  1.000
        gl_Position = vec4(2.0 * image_pos.x - 1.0, 2.0 * image_pos.y - 1.0, -1.0, 1.0);
+       gl_Layer = patch_layer;
+
+       // Forward flow (0) goes from 0 to 1. Backward flow (1) goes from 1 to 0.
+       image0_layer = patch_layer;
+       image1_layer = 1 - patch_layer;
 }
index b10125ea13cd1e539360ecaab519e065cc6b85bb..0e2fd687c9cd483ba2f33d5ca40f40046c0eea80 100644 (file)
@@ -1,10 +1,10 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out vec2 derivatives;
 out float beta_0;
 
-uniform sampler2D tex;
+uniform sampler2DArray tex;
 
 void main()
 {
index 8e83e48783e6f7a17fa87dd4e2e7a6e0c02330d0..345c3ebf0a8b1815646db7069ba6fbd8d85717c3 100644 (file)
@@ -1,10 +1,10 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out float g;
 const float eps_sq = 0.001 * 0.001;
 
-uniform sampler2D flow_tex, diff_flow_tex;
+uniform sampler2DArray flow_tex, diff_flow_tex;
 
 // Relative weighting of smoothness term.
 uniform float alpha;
index 8bf0cbb4f7a68bd49fc79be400365fc973ee7ad4..04e5370fd9d53baf5afda39ee140a526d6937945 100644 (file)
@@ -1,14 +1,14 @@
 #version 450 core
 
-in vec2 tc0, tc_left0, tc_down0;
-in vec2 tc1, tc_left1, tc_down1;
+in vec3 tc0, tc_left0, tc_down0;
+in vec3 tc1, tc_left1, tc_down1;
 in float line_offset;
 out uvec4 equation_red, equation_black;
 
-uniform sampler2D I_x_y_tex, I_t_tex;
-uniform sampler2D diff_flow_tex, base_flow_tex;
-uniform sampler2D beta_0_tex;
-uniform sampler2D diffusivity_tex;
+uniform sampler2DArray I_x_y_tex, I_t_tex;
+uniform sampler2DArray diff_flow_tex, base_flow_tex;
+uniform sampler2DArray beta_0_tex;
+uniform sampler2DArray diffusivity_tex;
 
 // Relative weighting of intensity term.
 uniform float delta;
@@ -67,7 +67,7 @@ float zero_if_outside_border(vec4 val)
        }
 }
 
-uvec4 compute_equation(vec2 tc, vec2 tc_left, vec2 tc_down)
+uvec4 compute_equation(vec3 tc, vec3 tc_left, vec3 tc_down)
 {
        // Read the flow (on top of the u0/v0 flow).
        float du, dv;
index eec3c170aa6c7cb2361d1d2a6daa744ff0ad85fd..731e1599170499c6776f4b37b05d4b0ef8e58b91 100644 (file)
@@ -1,11 +1,12 @@
 #version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
 
 layout(location=0) in vec2 position;
-out vec2 tc0, tc_left0, tc_down0;
-out vec2 tc1, tc_left1, tc_down1;
+out vec3 tc0, tc_left0, tc_down0;
+out vec3 tc1, tc_left1, tc_down1;
 out float line_offset;
 
-uniform sampler2D diffusivity_tex;
+uniform sampler2DArray diffusivity_tex;
 
 void main()
 {
@@ -16,21 +17,22 @@ void main()
        //   0.000  0.000 -2.000 -1.000
        //   0.000  0.000  0.000  1.000
        gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       gl_Layer = gl_InstanceID;
 
-       const vec2 half_texel = 0.5f / textureSize(diffusivity_tex, 0);
+       const vec2 half_texel = 0.5f / textureSize(diffusivity_tex, 0).xy;
 
        vec2 tc = position;
        vec2 tc_left = vec2(tc.x - half_texel.x, tc.y);
        vec2 tc_down = vec2(tc.x, tc.y - half_texel.y);
 
        // Adjust for different texel centers.
-       tc0 = vec2(tc.x - half_texel.x, tc.y);
-       tc_left0 = vec2(tc_left.x - half_texel.x, tc_left.y);
-       tc_down0 = vec2(tc_down.x - half_texel.x, tc_down.y);
+       tc0 = vec3(tc.x - half_texel.x, tc.y, gl_InstanceID);
+       tc_left0 = vec3(tc_left.x - half_texel.x, tc_left.y, gl_InstanceID);
+       tc_down0 = vec3(tc_down.x - half_texel.x, tc_down.y, gl_InstanceID);
 
-       tc1 = vec2(tc.x + half_texel.x, tc.y);
-       tc_left1 = vec2(tc_left.x + half_texel.x, tc_left.y);
-       tc_down1 = vec2(tc_down.x + half_texel.x, tc_down.y);
+       tc1 = vec3(tc.x + half_texel.x, tc.y, gl_InstanceID);
+       tc_left1 = vec3(tc_left.x + half_texel.x, tc_left.y, gl_InstanceID);
+       tc_down1 = vec3(tc_down.x + half_texel.x, tc_down.y, gl_InstanceID);
 
        line_offset = position.y * textureSize(diffusivity_tex, 0).y - 0.5f;
 }
index 9d04b0975c813f43b85f9b1f9a5b5ddccd51aa4a..b673b9d91cbde9dd14a98dfeb432e4709e3c599f 100644 (file)
--- a/flow.cpp
+++ b/flow.cpp
@@ -344,7 +344,7 @@ void PersistentFBOSetWithDepth<num_elements>::render_to(GLuint depth_rb, const a
 class GrayscaleConversion {
 public:
        GrayscaleConversion();
-       void exec(GLint tex, GLint gray_tex, int width, int height);
+       void exec(GLint tex, GLint gray_tex, int width, int height, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -373,7 +373,7 @@ GrayscaleConversion::GrayscaleConversion()
        uniform_tex = glGetUniformLocation(gray_program, "tex");
 }
 
-void GrayscaleConversion::exec(GLint tex, GLint gray_tex, int width, int height)
+void GrayscaleConversion::exec(GLint tex, GLint gray_tex, int width, int height, int num_layers)
 {
        glUseProgram(gray_program);
        bind_sampler(gray_program, uniform_tex, 0, tex, nearest_sampler);
@@ -382,7 +382,7 @@ void GrayscaleConversion::exec(GLint tex, GLint gray_tex, int width, int height)
        fbos.render_to(gray_tex);
        glBindVertexArray(gray_vao);
        glDisable(GL_BLEND);
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Compute gradients in every point, used for the motion search.
@@ -396,7 +396,7 @@ void GrayscaleConversion::exec(GLint tex, GLint gray_tex, int width, int height)
 class Sobel {
 public:
        Sobel();
-       void exec(GLint tex0_view, GLint grad0_tex, int level_width, int level_height);
+       void exec(GLint tex_view, GLint grad_tex, int level_width, int level_height, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -416,22 +416,22 @@ Sobel::Sobel()
        uniform_tex = glGetUniformLocation(sobel_program, "tex");
 }
 
-void Sobel::exec(GLint tex0_view, GLint grad0_tex, int level_width, int level_height)
+void Sobel::exec(GLint tex_view, GLint grad_tex, int level_width, int level_height, int num_layers)
 {
        glUseProgram(sobel_program);
-       bind_sampler(sobel_program, uniform_tex, 0, tex0_view, nearest_sampler);
+       bind_sampler(sobel_program, uniform_tex, 0, tex_view, nearest_sampler);
 
        glViewport(0, 0, level_width, level_height);
-       fbos.render_to(grad0_tex);
+       fbos.render_to(grad_tex);
        glDisable(GL_BLEND);
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Motion search to find the initial flow. See motion_search.frag for documentation.
 class MotionSearch {
 public:
        MotionSearch();
-       void exec(GLuint tex0_view, GLuint tex1_view, GLuint grad0_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches);
+       void exec(GLuint tex_view, GLuint grad_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -441,7 +441,7 @@ private:
        GLuint motion_search_program;
 
        GLuint uniform_inv_image_size, uniform_inv_prev_level_size, uniform_out_flow_size;
-       GLuint uniform_image1_tex, uniform_grad0_tex, uniform_flow_tex;
+       GLuint uniform_image_tex, uniform_grad_tex, uniform_flow_tex;
 };
 
 MotionSearch::MotionSearch()
@@ -453,18 +453,18 @@ MotionSearch::MotionSearch()
        uniform_inv_image_size = glGetUniformLocation(motion_search_program, "inv_image_size");
        uniform_inv_prev_level_size = glGetUniformLocation(motion_search_program, "inv_prev_level_size");
        uniform_out_flow_size = glGetUniformLocation(motion_search_program, "out_flow_size");
-       uniform_image1_tex = glGetUniformLocation(motion_search_program, "image1_tex");
-       uniform_grad0_tex = glGetUniformLocation(motion_search_program, "grad0_tex");
+       uniform_image_tex = glGetUniformLocation(motion_search_program, "image_tex");
+       uniform_grad_tex = glGetUniformLocation(motion_search_program, "grad_tex");
        uniform_flow_tex = glGetUniformLocation(motion_search_program, "flow_tex");
 }
 
-void MotionSearch::exec(GLuint tex0_view, GLuint tex1_view, GLuint grad0_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches)
+void MotionSearch::exec(GLuint tex_view, GLuint grad_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches, int num_layers)
 {
        glUseProgram(motion_search_program);
 
-       bind_sampler(motion_search_program, uniform_image1_tex, 1, tex1_view, linear_sampler);
-       bind_sampler(motion_search_program, uniform_grad0_tex, 2, grad0_tex, nearest_sampler);
-       bind_sampler(motion_search_program, uniform_flow_tex, 3, flow_tex, linear_sampler);
+       bind_sampler(motion_search_program, uniform_image_tex, 0, tex_view, linear_sampler);
+       bind_sampler(motion_search_program, uniform_grad_tex, 1, grad_tex, nearest_sampler);
+       bind_sampler(motion_search_program, uniform_flow_tex, 2, flow_tex, linear_sampler);
 
        glProgramUniform2f(motion_search_program, uniform_inv_image_size, 1.0f / level_width, 1.0f / level_height);
        glProgramUniform2f(motion_search_program, uniform_inv_prev_level_size, 1.0f / prev_level_width, 1.0f / prev_level_height);
@@ -472,7 +472,7 @@ void MotionSearch::exec(GLuint tex0_view, GLuint tex1_view, GLuint grad0_tex, GL
 
        glViewport(0, 0, width_patches, height_patches);
        fbos.render_to(flow_out_tex);
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Do “densification”, ie., upsampling of the flow patches to the flow field
@@ -486,7 +486,7 @@ void MotionSearch::exec(GLuint tex0_view, GLuint tex1_view, GLuint grad0_tex, GL
 class Densify {
 public:
        Densify();
-       void exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches);
+       void exec(GLuint tex_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -496,7 +496,7 @@ private:
        GLuint densify_program;
 
        GLuint uniform_patch_size;
-       GLuint uniform_image0_tex, uniform_image1_tex, uniform_flow_tex;
+       GLuint uniform_image_tex, uniform_flow_tex;
 };
 
 Densify::Densify()
@@ -506,18 +506,16 @@ Densify::Densify()
        densify_program = link_program(densify_vs_obj, densify_fs_obj);
 
        uniform_patch_size = glGetUniformLocation(densify_program, "patch_size");
-       uniform_image0_tex = glGetUniformLocation(densify_program, "image0_tex");
-       uniform_image1_tex = glGetUniformLocation(densify_program, "image1_tex");
+       uniform_image_tex = glGetUniformLocation(densify_program, "image_tex");
        uniform_flow_tex = glGetUniformLocation(densify_program, "flow_tex");
 }
 
-void Densify::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches)
+void Densify::exec(GLuint tex_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches, int num_layers)
 {
        glUseProgram(densify_program);
 
-       bind_sampler(densify_program, uniform_image0_tex, 0, tex0_view, nearest_sampler);
-       bind_sampler(densify_program, uniform_image1_tex, 1, tex1_view, linear_sampler);
-       bind_sampler(densify_program, uniform_flow_tex, 2, flow_tex, nearest_sampler);
+       bind_sampler(densify_program, uniform_image_tex, 0, tex_view, linear_sampler);
+       bind_sampler(densify_program, uniform_flow_tex, 1, flow_tex, nearest_sampler);
 
        glProgramUniform2f(densify_program, uniform_patch_size,
                float(patch_size_pixels) / level_width,
@@ -529,7 +527,7 @@ void Densify::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint d
        fbos.render_to(dense_flow_tex);
        glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
        glClear(GL_COLOR_BUFFER_BIT);
-       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width_patches * height_patches);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width_patches * height_patches * num_layers);
 }
 
 // Warp I_1 to I_w, and then compute the mean (I) and difference (I_t) of
@@ -543,7 +541,7 @@ void Densify::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint d
 class Prewarp {
 public:
        Prewarp();
-       void exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint normalized_flow_tex, GLuint I_tex, GLuint I_t_tex, int level_width, int level_height);
+       void exec(GLuint tex_view, GLuint flow_tex, GLuint normalized_flow_tex, GLuint I_tex, GLuint I_t_tex, int level_width, int level_height, int num_layers);
 
 private:
        PersistentFBOSet<3> fbos;
@@ -552,7 +550,7 @@ private:
        GLuint prewarp_fs_obj;
        GLuint prewarp_program;
 
-       GLuint uniform_image0_tex, uniform_image1_tex, uniform_flow_tex;
+       GLuint uniform_image_tex, uniform_flow_tex;
 };
 
 Prewarp::Prewarp()
@@ -561,23 +559,21 @@ Prewarp::Prewarp()
        prewarp_fs_obj = compile_shader(read_file("prewarp.frag"), GL_FRAGMENT_SHADER);
        prewarp_program = link_program(prewarp_vs_obj, prewarp_fs_obj);
 
-       uniform_image0_tex = glGetUniformLocation(prewarp_program, "image0_tex");
-       uniform_image1_tex = glGetUniformLocation(prewarp_program, "image1_tex");
+       uniform_image_tex = glGetUniformLocation(prewarp_program, "image_tex");
        uniform_flow_tex = glGetUniformLocation(prewarp_program, "flow_tex");
 }
 
-void Prewarp::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, GLuint normalized_flow_tex, int level_width, int level_height)
+void Prewarp::exec(GLuint tex_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, GLuint normalized_flow_tex, int level_width, int level_height, int num_layers)
 {
        glUseProgram(prewarp_program);
 
-       bind_sampler(prewarp_program, uniform_image0_tex, 0, tex0_view, nearest_sampler);
-       bind_sampler(prewarp_program, uniform_image1_tex, 1, tex1_view, linear_sampler);
-       bind_sampler(prewarp_program, uniform_flow_tex, 2, flow_tex, nearest_sampler);
+       bind_sampler(prewarp_program, uniform_image_tex, 0, tex_view, linear_sampler);
+       bind_sampler(prewarp_program, uniform_flow_tex, 1, flow_tex, nearest_sampler);
 
        glViewport(0, 0, level_width, level_height);
        glDisable(GL_BLEND);
        fbos.render_to(I_tex, I_t_tex, normalized_flow_tex);
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // From I, calculate the partial derivatives I_x and I_y. We use a four-tap
@@ -591,7 +587,7 @@ void Prewarp::exec(GLuint tex0_view, GLuint tex1_view, GLuint flow_tex, GLuint I
 class Derivatives {
 public:
        Derivatives();
-       void exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height);
+       void exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height, int num_layers);
 
 private:
        PersistentFBOSet<2> fbos;
@@ -612,7 +608,7 @@ Derivatives::Derivatives()
        uniform_tex = glGetUniformLocation(derivatives_program, "tex");
 }
 
-void Derivatives::exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height)
+void Derivatives::exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height, int num_layers)
 {
        glUseProgram(derivatives_program);
 
@@ -621,7 +617,7 @@ void Derivatives::exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, in
        glViewport(0, 0, level_width, level_height);
        glDisable(GL_BLEND);
        fbos.render_to(I_x_y_tex, beta_0_tex);
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Calculate the diffusivity for each pixels, g(x,y). Smoothness (s) will
@@ -633,7 +629,7 @@ void Derivatives::exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, in
 class ComputeDiffusivity {
 public:
        ComputeDiffusivity();
-       void exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow);
+       void exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -658,7 +654,7 @@ ComputeDiffusivity::ComputeDiffusivity()
        uniform_zero_diff_flow = glGetUniformLocation(diffusivity_program, "zero_diff_flow");
 }
 
-void ComputeDiffusivity::exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow)
+void ComputeDiffusivity::exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers)
 {
        glUseProgram(diffusivity_program);
 
@@ -671,7 +667,7 @@ void ComputeDiffusivity::exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diff
 
        glDisable(GL_BLEND);
        fbos.render_to(diffusivity_tex);
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Set up the equations set (two equations in two unknowns, per pixel).
@@ -696,7 +692,7 @@ void ComputeDiffusivity::exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diff
 class SetupEquations {
 public:
        SetupEquations();
-       void exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow);
+       void exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers);
 
 private:
        PersistentFBOSet<2> fbos;
@@ -729,7 +725,7 @@ SetupEquations::SetupEquations()
        uniform_zero_diff_flow = glGetUniformLocation(equations_program, "zero_diff_flow");
 }
 
-void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint base_flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow)
+void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint base_flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers)
 {
        glUseProgram(equations_program);
 
@@ -745,8 +741,8 @@ void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex
 
        glViewport(0, 0, (level_width + 1) / 2, level_height);
        glDisable(GL_BLEND);
-       fbos.render_to({equation_red_tex, equation_black_tex});
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       fbos.render_to(equation_red_tex, equation_black_tex);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Actually solve the equation sets made by SetupEquations, by means of
@@ -756,7 +752,7 @@ void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex
 class SOR {
 public:
        SOR();
-       void exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, ScopedTimer *sor_timer);
+       void exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, int num_layers, ScopedTimer *sor_timer);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -785,7 +781,7 @@ SOR::SOR()
        uniform_num_nonzero_phases = glGetUniformLocation(sor_program, "num_nonzero_phases");
 }
 
-void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, ScopedTimer *sor_timer)
+void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, int num_layers, ScopedTimer *sor_timer)
 {
        glUseProgram(sor_program);
 
@@ -813,7 +809,7 @@ void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_bl
                                glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 0);
                        }
                        glProgramUniform1i(sor_program, uniform_phase, 0);
-                       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+                       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
                        glTextureBarrier();
                }
                {
@@ -822,7 +818,7 @@ void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_bl
                                glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 1);
                        }
                        glProgramUniform1i(sor_program, uniform_phase, 1);
-                       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+                       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
                        if (zero_diff_flow && i == 0) {
                                glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 2);
                        }
@@ -838,7 +834,7 @@ void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_bl
 class AddBaseFlow {
 public:
        AddBaseFlow();
-       void exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height);
+       void exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -859,7 +855,7 @@ AddBaseFlow::AddBaseFlow()
        uniform_diff_flow_tex = glGetUniformLocation(add_flow_program, "diff_flow_tex");
 }
 
-void AddBaseFlow::exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height)
+void AddBaseFlow::exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height, int num_layers)
 {
        glUseProgram(add_flow_program);
 
@@ -870,14 +866,14 @@ void AddBaseFlow::exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_wid
        glBlendFunc(GL_ONE, GL_ONE);
        fbos.render_to(base_flow_tex);
 
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 // Take a copy of the flow, bilinearly interpolated and scaled up.
 class ResizeFlow {
 public:
        ResizeFlow();
-       void exec(GLuint in_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height);
+       void exec(GLuint in_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height, int num_layers);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -900,7 +896,7 @@ ResizeFlow::ResizeFlow()
        uniform_scale_factor = glGetUniformLocation(resize_flow_program, "scale_factor");
 }
 
-void ResizeFlow::exec(GLuint flow_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height)
+void ResizeFlow::exec(GLuint flow_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height, int num_layers)
 {
        glUseProgram(resize_flow_program);
 
@@ -912,12 +908,12 @@ void ResizeFlow::exec(GLuint flow_tex, GLuint out_tex, int input_width, int inpu
        glDisable(GL_BLEND);
        fbos.render_to(out_tex);
 
-       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
 }
 
 class TexturePool {
 public:
-       GLuint get_texture(GLenum format, GLuint width, GLuint height);
+       GLuint get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers = 0);
        void release_texture(GLuint tex_num);
        GLuint get_renderbuffer(GLenum format, GLuint width, GLuint height);
        void release_renderbuffer(GLuint tex_num);
@@ -926,7 +922,7 @@ private:
        struct Texture {
                GLuint tex_num;
                GLenum format;
-               GLuint width, height;
+               GLuint width, height, num_layers;
                bool in_use = false;
                bool is_renderbuffer = false;
        };
@@ -937,14 +933,19 @@ class DISComputeFlow {
 public:
        DISComputeFlow(int width, int height);
 
+       enum FlowDirection {
+               FORWARD,
+               FORWARD_AND_BACKWARD
+       };
        enum ResizeStrategy {
                DO_NOT_RESIZE_FLOW,
                RESIZE_FLOW_TO_FULL_SIZE
        };
 
+       // The texture must have two layers (first and second frame).
        // Returns a texture that must be released with release_texture()
        // after use.
-       GLuint exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_strategy);
+       GLuint exec(GLuint tex, FlowDirection flow_direction, ResizeStrategy resize_strategy);
 
        void release_texture(GLuint tex) {
                pool.release_texture(tex);
@@ -998,8 +999,8 @@ DISComputeFlow::DISComputeFlow(int width, int height)
        glSamplerParameterfv(zero_border_sampler, GL_TEXTURE_BORDER_COLOR, zero);
 
        // Initial flow is zero, 1x1.
-       glCreateTextures(GL_TEXTURE_2D, 1, &initial_flow_tex);
-       glTextureStorage2D(initial_flow_tex, 1, GL_RG16F, 1, 1);
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &initial_flow_tex);
+       glTextureStorage3D(initial_flow_tex, 1, GL_RG16F, 1, 1, 1);
        glClearTexImage(initial_flow_tex, 0, GL_RG, GL_FLOAT, nullptr);
 
        // Set up the vertex data that will be shared between all passes.
@@ -1021,8 +1022,9 @@ DISComputeFlow::DISComputeFlow(int width, int height)
        glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
 }
 
-GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_strategy)
+GLuint DISComputeFlow::exec(GLuint tex, FlowDirection flow_direction, ResizeStrategy resize_strategy)
 {
+       int num_layers = (flow_direction == FORWARD_AND_BACKWARD) ? 2 : 1;
        int prev_level_width = 1, prev_level_height = 1;
        GLuint prev_level_flow_tex = initial_flow_tex;
 
@@ -1050,44 +1052,41 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
                // Make sure we always read from the correct level; the chosen
                // mipmapping could otherwise be rather unpredictable, especially
                // during motion search.
-               GLuint tex0_view, tex1_view;
-               glGenTextures(1, &tex0_view);
-               glTextureView(tex0_view, GL_TEXTURE_2D, tex0, GL_R8, level, 1, 0, 1);
-               glGenTextures(1, &tex1_view);
-               glTextureView(tex1_view, GL_TEXTURE_2D, tex1, GL_R8, level, 1, 0, 1);
+               GLuint tex_view;
+               glGenTextures(1, &tex_view);
+               glTextureView(tex_view, GL_TEXTURE_2D_ARRAY, tex, GL_R8, level, 1, 0, 2);
 
-               // Create a new texture; we could be fancy and render use a multi-level
-               // texture, but meh.
-               GLuint grad0_tex = pool.get_texture(GL_R32UI, level_width, level_height);
+               // Create a new texture to hold the gradients.
+               GLuint grad_tex = pool.get_texture(GL_R32UI, level_width, level_height, num_layers);
 
                // Find the derivative.
                {
                        ScopedTimer timer("Sobel", &level_timer);
-                       sobel.exec(tex0_view, grad0_tex, level_width, level_height);
+                       sobel.exec(tex_view, grad_tex, level_width, level_height, num_layers);
                }
 
                // Motion search to find the initial flow. We use the flow from the previous
                // level (sampled bilinearly; no fancy tricks) as a guide, then search from there.
 
                // Create an output flow texture.
-               GLuint flow_out_tex = pool.get_texture(GL_RGB16F, width_patches, height_patches);
+               GLuint flow_out_tex = pool.get_texture(GL_RGB16F, width_patches, height_patches, num_layers);
 
                // And draw.
                {
                        ScopedTimer timer("Motion search", &level_timer);
-                       motion_search.exec(tex0_view, tex1_view, grad0_tex, prev_level_flow_tex, flow_out_tex, level_width, level_height, prev_level_width, prev_level_height, width_patches, height_patches);
+                       motion_search.exec(tex_view, grad_tex, prev_level_flow_tex, flow_out_tex, level_width, level_height, prev_level_width, prev_level_height, width_patches, height_patches, num_layers);
                }
-               pool.release_texture(grad0_tex);
+               pool.release_texture(grad_tex);
 
                // Densification.
 
                // Set up an output texture (cleared in Densify).
-               GLuint dense_flow_tex = pool.get_texture(GL_RGB16F, level_width, level_height);
+               GLuint dense_flow_tex = pool.get_texture(GL_RGB16F, level_width, level_height, num_layers);
 
                // And draw.
                {
                        ScopedTimer timer("Densification", &level_timer);
-                       densify.exec(tex0_view, tex1_view, flow_out_tex, dense_flow_tex, level_width, level_height, width_patches, height_patches);
+                       densify.exec(tex_view, flow_out_tex, dense_flow_tex, level_width, level_height, width_patches, height_patches, num_layers);
                }
                pool.release_texture(flow_out_tex);
 
@@ -1101,60 +1100,59 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
                // in pixels, not 0..1 normalized OpenGL texture coordinates.
                // This is because variational refinement depends so heavily on derivatives,
                // which are measured in intensity levels per pixel.
-               GLuint I_tex = pool.get_texture(GL_R16F, level_width, level_height);
-               GLuint I_t_tex = pool.get_texture(GL_R16F, level_width, level_height);
-               GLuint base_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+               GLuint I_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
+               GLuint I_t_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
+               GLuint base_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
                {
                        ScopedTimer timer("Prewarping", &varref_timer);
-                       prewarp.exec(tex0_view, tex1_view, dense_flow_tex, I_tex, I_t_tex, base_flow_tex, level_width, level_height);
+                       prewarp.exec(tex_view, dense_flow_tex, I_tex, I_t_tex, base_flow_tex, level_width, level_height, num_layers);
                }
                pool.release_texture(dense_flow_tex);
-               glDeleteTextures(1, &tex0_view);
-               glDeleteTextures(1, &tex1_view);
+               glDeleteTextures(1, &tex_view);
 
                // Calculate I_x and I_y. We're only calculating first derivatives;
                // the others will be taken on-the-fly in order to sample from fewer
                // textures overall, since sampling from the L1 cache is cheap.
                // (TODO: Verify that this is indeed faster than making separate
                // double-derivative textures.)
-               GLuint I_x_y_tex = pool.get_texture(GL_RG16F, level_width, level_height);
-               GLuint beta_0_tex = pool.get_texture(GL_R16F, level_width, level_height);
+               GLuint I_x_y_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
+               GLuint beta_0_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
                {
                        ScopedTimer timer("First derivatives", &varref_timer);
-                       derivatives.exec(I_tex, I_x_y_tex, beta_0_tex, level_width, level_height);
+                       derivatives.exec(I_tex, I_x_y_tex, beta_0_tex, level_width, level_height, num_layers);
                }
                pool.release_texture(I_tex);
 
                // We need somewhere to store du and dv (the flow increment, relative
                // to the non-refined base flow u0 and v0). It's initially garbage,
                // but not read until we've written something sane to it.
-               GLuint diff_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+               GLuint diff_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
 
                // And for diffusivity.
-               GLuint diffusivity_tex = pool.get_texture(GL_R16F, level_width, level_height);
+               GLuint diffusivity_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
 
                // And finally for the equation set. See SetupEquations for
                // the storage format.
-               GLuint equation_red_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height);
-               GLuint equation_black_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height);
+               GLuint equation_red_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height, num_layers);
+               GLuint equation_black_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height, num_layers);
 
                for (int outer_idx = 0; outer_idx < level + 1; ++outer_idx) {
                        // Calculate the diffusivity term for each pixel.
                        {
                                ScopedTimer timer("Compute diffusivity", &varref_timer);
-                               compute_diffusivity.exec(base_flow_tex, diff_flow_tex, diffusivity_tex, level_width, level_height, outer_idx == 0);
+                               compute_diffusivity.exec(base_flow_tex, diff_flow_tex, diffusivity_tex, level_width, level_height, outer_idx == 0, num_layers);
                        }
 
                        // Set up the 2x2 equation system for each pixel.
                        {
                                ScopedTimer timer("Set up equations", &varref_timer);
-                               setup_equations.exec(I_x_y_tex, I_t_tex, diff_flow_tex, base_flow_tex, beta_0_tex, diffusivity_tex, equation_red_tex, equation_black_tex, level_width, level_height, outer_idx == 0);
+                               setup_equations.exec(I_x_y_tex, I_t_tex, diff_flow_tex, base_flow_tex, beta_0_tex, diffusivity_tex, equation_red_tex, equation_black_tex, level_width, level_height, outer_idx == 0, num_layers);
                        }
 
                        // Run a few SOR iterations. Note that these are to/from the same texture.
                        {
                                ScopedTimer timer("SOR", &varref_timer);
-                               sor.exec(diff_flow_tex, equation_red_tex, equation_black_tex, diffusivity_tex, level_width, level_height, 5, outer_idx == 0, &timer);
+                               sor.exec(diff_flow_tex, equation_red_tex, equation_black_tex, diffusivity_tex, level_width, level_height, 5, outer_idx == 0, num_layers, &timer);
                        }
                }
 
@@ -1173,7 +1171,7 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
                // it is more efficient), but it helps debug the motion search.
                if (enable_variational_refinement) {
                        ScopedTimer timer("Add differential flow", &varref_timer);
-                       add_base_flow.exec(base_flow_tex, diff_flow_tex, level_width, level_height);
+                       add_base_flow.exec(base_flow_tex, diff_flow_tex, level_width, level_height, num_layers);
                }
                pool.release_texture(diff_flow_tex);
 
@@ -1194,8 +1192,8 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
        if (finest_level == 0 || resize_strategy == DO_NOT_RESIZE_FLOW) {
                return prev_level_flow_tex;
        } else {
-               GLuint final_tex = pool.get_texture(GL_RG16F, width, height);
-               resize_flow.exec(prev_level_flow_tex, final_tex, prev_level_width, prev_level_height, width, height);
+               GLuint final_tex = pool.get_texture(GL_RG16F, width, height, num_layers);
+               resize_flow.exec(prev_level_flow_tex, final_tex, prev_level_width, prev_level_height, width, height, num_layers);
                pool.release_texture(prev_level_flow_tex);
                return final_tex;
        }
@@ -1208,7 +1206,7 @@ public:
        Splat();
 
        // alpha is the time of the interpolated frame (0..1).
-       void exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha);
+       void exec(GLuint image_tex, GLuint bidirectional_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha);
 
 private:
        PersistentFBOSetWithDepth<1> fbos;
@@ -1217,8 +1215,8 @@ private:
        GLuint splat_fs_obj;
        GLuint splat_program;
 
-       GLuint uniform_invert_flow, uniform_splat_size, uniform_alpha;
-       GLuint uniform_image0_tex, uniform_image1_tex, uniform_flow_tex;
+       GLuint uniform_splat_size, uniform_alpha;
+       GLuint uniform_image_tex, uniform_flow_tex;
        GLuint uniform_inv_flow_size;
 };
 
@@ -1228,21 +1226,19 @@ Splat::Splat()
        splat_fs_obj = compile_shader(read_file("splat.frag"), GL_FRAGMENT_SHADER);
        splat_program = link_program(splat_vs_obj, splat_fs_obj);
 
-       uniform_invert_flow = glGetUniformLocation(splat_program, "invert_flow");
        uniform_splat_size = glGetUniformLocation(splat_program, "splat_size");
        uniform_alpha = glGetUniformLocation(splat_program, "alpha");
-       uniform_image0_tex = glGetUniformLocation(splat_program, "image0_tex");
-       uniform_image1_tex = glGetUniformLocation(splat_program, "image1_tex");
+       uniform_image_tex = glGetUniformLocation(splat_program, "image_tex");
        uniform_flow_tex = glGetUniformLocation(splat_program, "flow_tex");
        uniform_inv_flow_size = glGetUniformLocation(splat_program, "inv_flow_size");
 }
 
-void Splat::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha)
+void Splat::exec(GLuint image_tex, GLuint bidirectional_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha)
 {
        glUseProgram(splat_program);
 
-       bind_sampler(splat_program, uniform_image0_tex, 0, tex0, linear_sampler);
-       bind_sampler(splat_program, uniform_image1_tex, 1, tex1, linear_sampler);
+       bind_sampler(splat_program, uniform_image_tex, 0, image_tex, linear_sampler);
+       bind_sampler(splat_program, uniform_flow_tex, 1, bidirectional_flow_tex, nearest_sampler);
 
        // FIXME: This is set to 1.0 right now so not to trigger Haswell's “PMA stall”.
        // Move to 2.0 later, or even 4.0.
@@ -1266,15 +1262,7 @@ void Splat::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backw
        glClearDepth(1.0f);  // Effectively infinity.
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
 
-       // Do forward splatting.
-       bind_sampler(splat_program, uniform_flow_tex, 2, forward_flow_tex, nearest_sampler);
-       glProgramUniform1i(splat_program, uniform_invert_flow, 0);
-       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width * height);
-
-       // Do backward splatting.
-       bind_sampler(splat_program, uniform_flow_tex, 2, backward_flow_tex, nearest_sampler);
-       glProgramUniform1i(splat_program, uniform_invert_flow, 1);
-       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width * height);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width * height * 2);
 
        glDisable(GL_DEPTH_TEST);
 }
@@ -1436,7 +1424,7 @@ void HoleBlend::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int w
 class Blend {
 public:
        Blend();
-       void exec(GLuint tex0, GLuint tex1, GLuint flow_tex, GLuint output_tex, int width, int height, float alpha);
+       void exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int width, int height, float alpha);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -1444,7 +1432,7 @@ private:
        GLuint blend_fs_obj;
        GLuint blend_program;
 
-       GLuint uniform_image0_tex, uniform_image1_tex, uniform_flow_tex;
+       GLuint uniform_image_tex, uniform_flow_tex;
        GLuint uniform_alpha, uniform_flow_consistency_tolerance;
 };
 
@@ -1454,19 +1442,17 @@ Blend::Blend()
        blend_fs_obj = compile_shader(read_file("blend.frag"), GL_FRAGMENT_SHADER);
        blend_program = link_program(blend_vs_obj, blend_fs_obj);
 
-       uniform_image0_tex = glGetUniformLocation(blend_program, "image0_tex");
-       uniform_image1_tex = glGetUniformLocation(blend_program, "image1_tex");
+       uniform_image_tex = glGetUniformLocation(blend_program, "image_tex");
        uniform_flow_tex = glGetUniformLocation(blend_program, "flow_tex");
        uniform_alpha = glGetUniformLocation(blend_program, "alpha");
        uniform_flow_consistency_tolerance = glGetUniformLocation(blend_program, "flow_consistency_tolerance");
 }
 
-void Blend::exec(GLuint tex0, GLuint tex1, GLuint flow_tex, GLuint output_tex, int level_width, int level_height, float alpha)
+void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int level_width, int level_height, float alpha)
 {
        glUseProgram(blend_program);
-       bind_sampler(blend_program, uniform_image0_tex, 0, tex0, linear_sampler);
-       bind_sampler(blend_program, uniform_image1_tex, 1, tex1, linear_sampler);
-       bind_sampler(blend_program, uniform_flow_tex, 2, flow_tex, linear_sampler);  // May be upsampled.
+       bind_sampler(blend_program, uniform_image_tex, 0, image_tex, linear_sampler);
+       bind_sampler(blend_program, uniform_flow_tex, 1, flow_tex, linear_sampler);  // May be upsampled.
        glProgramUniform1f(blend_program, uniform_alpha, alpha);
 
        glViewport(0, 0, level_width, level_height);
@@ -1480,9 +1466,9 @@ public:
        Interpolate(int width, int height, int flow_level);
 
        // Returns a texture that must be released with release_texture()
-       // after use. tex0 and tex1 must be RGBA8 textures with mipmaps
+       // after use. image_tex must be a two-layer RGBA8 texture with mipmaps
        // (unless flow_level == 0).
-       GLuint exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint width, GLuint height, float alpha);
+       GLuint exec(GLuint image_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha);
 
        void release_texture(GLuint tex) {
                pool.release_texture(tex);
@@ -1520,7 +1506,7 @@ Interpolate::Interpolate(int width, int height, int flow_level)
        glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
 }
 
-GLuint Interpolate::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint width, GLuint height, float alpha)
+GLuint Interpolate::exec(GLuint image_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
 {
        GPUTimers timers;
 
@@ -1529,11 +1515,9 @@ GLuint Interpolate::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLui
        glBindVertexArray(vao);
 
        // Pick out the right level to test splatting results on.
-       GLuint tex0_view, tex1_view;
-       glGenTextures(1, &tex0_view);
-       glTextureView(tex0_view, GL_TEXTURE_2D, tex0, GL_RGBA8, flow_level, 1, 0, 1);
-       glGenTextures(1, &tex1_view);
-       glTextureView(tex1_view, GL_TEXTURE_2D, tex1, GL_RGBA8, flow_level, 1, 0, 1);
+       GLuint tex_view;
+       glGenTextures(1, &tex_view);
+       glTextureView(tex_view, GL_TEXTURE_2D_ARRAY, image_tex, GL_RGBA8, flow_level, 1, 0, 2);
 
        int flow_width = width >> flow_level;
        int flow_height = height >> flow_level;
@@ -1543,10 +1527,9 @@ GLuint Interpolate::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLui
 
        {
                ScopedTimer timer("Splat", &total_timer);
-               splat.exec(tex0_view, tex1_view, forward_flow_tex, backward_flow_tex, flow_tex, depth_rb, flow_width, flow_height, alpha);
+               splat.exec(tex_view, bidirectional_flow_tex, flow_tex, depth_rb, flow_width, flow_height, alpha);
        }
-       glDeleteTextures(1, &tex0_view);
-       glDeleteTextures(1, &tex1_view);
+       glDeleteTextures(1, &tex_view);
 
        GLuint temp_tex[3];
        temp_tex[0] = pool.get_texture(GL_RG16F, flow_width, flow_height);
@@ -1567,7 +1550,7 @@ GLuint Interpolate::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLui
        GLuint output_tex = pool.get_texture(GL_RGBA8, width, height);
        {
                ScopedTimer timer("Blend", &total_timer);
-               blend.exec(tex0, tex1, flow_tex, output_tex, width, height, alpha);
+               blend.exec(image_tex, flow_tex, output_tex, width, height, alpha);
        }
        pool.release_texture(flow_tex);
        total_timer.end();
@@ -1578,22 +1561,28 @@ GLuint Interpolate::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLui
        return output_tex;
 }
 
-GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height)
+GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers)
 {
        for (Texture &tex : textures) {
                if (!tex.in_use && !tex.is_renderbuffer && tex.format == format &&
-                   tex.width == width && tex.height == height) {
+                   tex.width == width && tex.height == height && tex.num_layers == num_layers) {
                        tex.in_use = true;
                        return tex.tex_num;
                }
        }
 
        Texture tex;
-       glCreateTextures(GL_TEXTURE_2D, 1, &tex.tex_num);
-       glTextureStorage2D(tex.tex_num, 1, format, width, height);
+       if (num_layers == 0) {
+               glCreateTextures(GL_TEXTURE_2D, 1, &tex.tex_num);
+               glTextureStorage2D(tex.tex_num, 1, format, width, height);
+       } else {
+               glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex.tex_num);
+               glTextureStorage3D(tex.tex_num, 1, format, width, height, num_layers);
+       }
        tex.format = format;
        tex.width = width;
        tex.height = height;
+       tex.num_layers = num_layers;
        tex.in_use = true;
        tex.is_renderbuffer = false;
        textures.push_back(tex);
@@ -1644,7 +1633,7 @@ void TexturePool::release_renderbuffer(GLuint tex_num)
                        return;
                }
        }
-       assert(false);
+       //assert(false);
 }
 
 // OpenGL uses a bottom-left coordinate system, .flo files use a top-left coordinate system.
@@ -1789,42 +1778,47 @@ void compute_flow_only(int argc, char **argv, int optind)
                exit(1);
        }
 
+       // Move them into an array texture, since that's how the rest of the code
+       // would like them.
+       GLuint image_tex;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &image_tex);
+       glTextureStorage3D(image_tex, 1, GL_RGBA8, width1, height1, 2);
+       glCopyImageSubData(tex0, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 0, width1, height1, 1);
+       glCopyImageSubData(tex1, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 1, width1, height1, 1);
+       glDeleteTextures(1, &tex0);
+       glDeleteTextures(1, &tex1);
+
        // Set up some PBOs to do asynchronous readback.
        GLuint pbos[5];
        glCreateBuffers(5, pbos);
        for (int i = 0; i < 5; ++i) {
-               glNamedBufferData(pbos[i], width1 * height1 * 2 * sizeof(float), nullptr, GL_STREAM_READ);
+               glNamedBufferData(pbos[i], width1 * height1 * 2 * 2 * sizeof(float), nullptr, GL_STREAM_READ);
                spare_pbos.push(pbos[i]);
        }
 
        int levels = find_num_levels(width1, height1);
-       GLuint tex0_gray, tex1_gray;
-       glCreateTextures(GL_TEXTURE_2D, 1, &tex0_gray);
-       glCreateTextures(GL_TEXTURE_2D, 1, &tex1_gray);
-       glTextureStorage2D(tex0_gray, levels, GL_R8, width1, height1);
-       glTextureStorage2D(tex1_gray, levels, GL_R8, width1, height1);
 
-       GrayscaleConversion gray;
-       gray.exec(tex0, tex0_gray, width1, height1);
-       glDeleteTextures(1, &tex0);
-       glGenerateTextureMipmap(tex0_gray);
+       GLuint tex_gray;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex_gray);
+       glTextureStorage3D(tex_gray, levels, GL_R8, width1, height1, 2);
 
-       gray.exec(tex1, tex1_gray, width1, height1);
-       glDeleteTextures(1, &tex1);
-       glGenerateTextureMipmap(tex1_gray);
+       GrayscaleConversion gray;
+       gray.exec(image_tex, tex_gray, width1, height1, /*num_layers=*/2);
+       glGenerateTextureMipmap(tex_gray);
 
        DISComputeFlow compute_flow(width1, height1);
 
        if (enable_warmup) {
                in_warmup = true;
                for (int i = 0; i < 10; ++i) {
-                       GLuint final_tex = compute_flow.exec(tex0_gray, tex1_gray, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+                       GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
                        compute_flow.release_texture(final_tex);
                }
                in_warmup = false;
        }
 
-       GLuint final_tex = compute_flow.exec(tex0_gray, tex1_gray, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+       GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+       //GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
 
        schedule_read<FlowType>(final_tex, width1, height1, filename0, filename1, flow_filename, "flow.ppm");
        compute_flow.release_texture(final_tex);
@@ -1843,8 +1837,7 @@ void compute_flow_only(int argc, char **argv, int optind)
                                filename0, width, height, width1, height1);
                        exit(1);
                }
-               gray.exec(tex0, tex0_gray, width, height);
-               glGenerateTextureMipmap(tex0_gray);
+               glCopyImageSubData(tex0, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 0, width1, height1, 1);
                glDeleteTextures(1, &tex0);
 
                GLuint tex1 = load_texture(filename1, &width, &height, WITHOUT_MIPMAPS);
@@ -1853,17 +1846,18 @@ void compute_flow_only(int argc, char **argv, int optind)
                                filename1, width, height, width1, height1);
                        exit(1);
                }
-               gray.exec(tex1, tex1_gray, width, height);
-               glGenerateTextureMipmap(tex1_gray);
+               glCopyImageSubData(tex1, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 1, width1, height1, 1);
                glDeleteTextures(1, &tex1);
 
-               GLuint final_tex = compute_flow.exec(tex0_gray, tex1_gray, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+               gray.exec(image_tex, tex_gray, width1, height1, /*num_layers=*/2);
+               glGenerateTextureMipmap(tex_gray);
+
+               GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
 
                schedule_read<FlowType>(final_tex, width1, height1, filename0, filename1, flow_filename, "");
                compute_flow.release_texture(final_tex);
        }
-       glDeleteTextures(1, &tex0_gray);
-       glDeleteTextures(1, &tex1_gray);
+       glDeleteTextures(1, &tex_gray);
 
        while (!reads_in_progress.empty()) {
                finish_one_read<FlowType>(width1, height1);
@@ -1893,6 +1887,18 @@ void interpolate_image(int argc, char **argv, int optind)
                exit(1);
        }
 
+       // Move them into an array texture, since that's how the rest of the code
+       // would like them.
+       int levels = find_num_levels(width1, height1);
+       GLuint image_tex;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &image_tex);
+       glTextureStorage3D(image_tex, levels, GL_RGBA8, width1, height1, 2);
+       glCopyImageSubData(tex0, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 0, width1, height1, 1);
+       glCopyImageSubData(tex1, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 1, width1, height1, 1);
+       glDeleteTextures(1, &tex0);
+       glDeleteTextures(1, &tex1);
+       glGenerateTextureMipmap(image_tex);
+
        // Set up some PBOs to do asynchronous readback.
        GLuint pbos[5];
        glCreateBuffers(5, pbos);
@@ -1905,41 +1911,31 @@ void interpolate_image(int argc, char **argv, int optind)
        GrayscaleConversion gray;
        Interpolate interpolate(width1, height1, finest_level);
 
-       int levels = find_num_levels(width1, height1);
-       GLuint tex0_gray, tex1_gray;
-       glCreateTextures(GL_TEXTURE_2D, 1, &tex0_gray);
-       glCreateTextures(GL_TEXTURE_2D, 1, &tex1_gray);
-       glTextureStorage2D(tex0_gray, levels, GL_R8, width1, height1);
-       glTextureStorage2D(tex1_gray, levels, GL_R8, width1, height1);
-
-       gray.exec(tex0, tex0_gray, width1, height1);
-       glGenerateTextureMipmap(tex0_gray);
-
-       gray.exec(tex1, tex1_gray, width1, height1);
-       glGenerateTextureMipmap(tex1_gray);
+       GLuint tex_gray;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex_gray);
+       glTextureStorage3D(tex_gray, levels, GL_R8, width1, height1, 2);
+       gray.exec(image_tex, tex_gray, width1, height1, /*num_layers=*/2);
+       glGenerateTextureMipmap(tex_gray);
 
        if (enable_warmup) {
                in_warmup = true;
                for (int i = 0; i < 10; ++i) {
-                       GLuint forward_flow_tex = compute_flow.exec(tex0_gray, tex1_gray, DISComputeFlow::DO_NOT_RESIZE_FLOW);
-                       GLuint backward_flow_tex = compute_flow.exec(tex1_gray, tex0_gray, DISComputeFlow::DO_NOT_RESIZE_FLOW);
-                       GLuint interpolated_tex = interpolate.exec(tex0, tex1, forward_flow_tex, backward_flow_tex, width1, height1, 0.5f);
-                       compute_flow.release_texture(forward_flow_tex);
-                       compute_flow.release_texture(backward_flow_tex);
+                       GLuint bidirectional_flow_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
+                       GLuint interpolated_tex = interpolate.exec(image_tex, bidirectional_flow_tex, width1, height1, 0.5f);
+                       compute_flow.release_texture(bidirectional_flow_tex);
                        interpolate.release_texture(interpolated_tex);
                }
                in_warmup = false;
        }
 
-       GLuint forward_flow_tex = compute_flow.exec(tex0_gray, tex1_gray, DISComputeFlow::DO_NOT_RESIZE_FLOW);
-       GLuint backward_flow_tex = compute_flow.exec(tex1_gray, tex0_gray, DISComputeFlow::DO_NOT_RESIZE_FLOW);
+       GLuint bidirectional_flow_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
 
        for (int frameno = 1; frameno < 60; ++frameno) {
                char ppm_filename[256];
                snprintf(ppm_filename, sizeof(ppm_filename), "interp%04d.ppm", frameno);
 
                float alpha = frameno / 60.0f;
-               GLuint interpolated_tex = interpolate.exec(tex0, tex1, forward_flow_tex, backward_flow_tex, width1, height1, alpha);
+               GLuint interpolated_tex = interpolate.exec(image_tex, bidirectional_flow_tex, width1, height1, alpha);
 
                schedule_read<RGBAType>(interpolated_tex, width1, height1, filename0, filename1, "", ppm_filename);
                interpolate.release_texture(interpolated_tex);
index df21dabe5bca160dcc7f9a6abfce74333d54ca37..57a6891c4b7032ff48d4760b3b7c45aff5924373 100644 (file)
--- a/gray.frag
+++ b/gray.frag
@@ -1,9 +1,9 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out vec4 gray;
 
-uniform sampler2D tex;
+uniform sampler2DArray tex;
 
 void main()
 {
index 12ebf1a38632ac533792c966ab828532f4767b92..29417f944d374df466982cfcb75a25cf0e31fdb7 100644 (file)
 const uint patch_size = 12;
 const uint num_iterations = 8;
 
-in vec2 flow_tc;
+in vec3 flow_tc;
 in vec2 patch_center;
+flat in int ref_layer, search_layer;
 out vec3 out_flow;
 
-uniform sampler2D flow_tex, image1_tex;
-uniform usampler2D grad0_tex;  // Also contains image0.
+uniform sampler2DArray flow_tex, image_tex;
+uniform usampler2DArray grad_tex;  // Also contains the corresponding reference image.
 uniform vec2 inv_image_size, inv_prev_level_size;
 
 vec3 unpack_gradients(uint v)
@@ -56,14 +57,14 @@ vec3 unpack_gradients(uint v)
 }
 
 // Note: The third variable is the actual pixel value.
-vec3 get_gradients(vec2 tc)
+vec3 get_gradients(vec3 tc)
 {
-       vec3 grad = unpack_gradients(texture(grad0_tex, tc).x);
+       vec3 grad = unpack_gradients(texture(grad_tex, tc).x);
 
        // Zero gradients outside the image. (We'd do this with a sampler,
        // but we want the repeat behavior for the actual texels, in the
        // z channel.)
-       if (any(lessThan(tc, vec2(0.0f))) || any(greaterThan(tc, vec2(1.0f)))) {
+       if (any(lessThan(tc.xy, vec2(0.0f))) || any(greaterThan(tc.xy, vec2(1.0f)))) {
                grad.xy = vec2(0.0f);
        }
 
@@ -72,7 +73,7 @@ vec3 get_gradients(vec2 tc)
 
 void main()
 {
-       vec2 image_size = textureSize(grad0_tex, 0);
+       vec2 image_size = textureSize(grad_tex, 0).xy;
 
        // Lock the patch center to an integer, so that we never get
        // any bilinear artifacts for the gradient. (NOTE: This assumes an
@@ -96,7 +97,7 @@ void main()
        for (uint y = 0; y < patch_size; ++y) {
                for (uint x = 0; x < patch_size; ++x) {
                        vec2 tc = base + uvec2(x, y) * inv_image_size;
-                       vec3 grad = get_gradients(tc);
+                       vec3 grad = get_gradients(vec3(tc, ref_layer));
                        H[0][0] += grad.x * grad.x;
                        H[1][1] += grad.y * grad.y;
                        H[0][1] += grad.x * grad.y;
@@ -130,9 +131,9 @@ void main()
                for (uint y = 0; y < patch_size; ++y) {
                        for (uint x = 0; x < patch_size; ++x) {
                                vec2 tc = base + uvec2(x, y) * inv_image_size;
-                               vec3 grad = get_gradients(tc);
+                               vec3 grad = get_gradients(vec3(tc, ref_layer));
                                float t = grad.z;
-                               float warped = texture(image1_tex, tc + u_norm).x;
+                               float warped = texture(image_tex, vec3(tc + u_norm, search_layer)).x;
                                du += grad.xy * (warped - t);
                                warped_sum += warped;
                        }
index ba21b846d8b98fe0484556351264e68f8479e340..d0232767fbcef03434d18ad532ffbc0e0bbf870d 100644 (file)
@@ -1,10 +1,12 @@
 #version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
 
 layout(location=0) in vec2 position;
-out vec2 flow_tc;
+out vec3 flow_tc;
 out vec2 patch_center;
+flat out int ref_layer, search_layer;
 
-uniform sampler2D flow_tex;
+uniform sampler2DArray flow_tex;
 uniform vec2 out_flow_size;
 
 void main()
@@ -35,5 +37,11 @@ void main()
        //   0.000  0.000 -2.000 -1.000
        //   0.000  0.000  0.000  1.000
        gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
-       flow_tc = position;
+       flow_tc = vec3(position, gl_InstanceID);
+
+       gl_Layer = gl_InstanceID;
+
+       // Forward flow (0) goes from 0 to 1. Backward flow (1) goes from 1 to 0.
+       ref_layer = gl_InstanceID;
+       search_layer = 1 - gl_InstanceID;
 }
index 4b2db4408ee8d9d35af8c4f72de10cafe5b05e3a..baf24d1b41e0aefb8c0d1249c1d9ec5b4d7d8fd1 100644 (file)
@@ -2,21 +2,21 @@
 
 // Warps I_1 according to the flow, then computes the mean and difference to I_0.
 
-in vec2 tc;
+in vec3 tc;
 out float I, I_t;
 out vec2 normalized_flow;
 
-uniform sampler2D image0_tex, image1_tex, flow_tex;
+uniform sampler2DArray image_tex, flow_tex;
 
 void main()
 {
        vec3 flow = texture(flow_tex, tc).xyz;
        flow.xy /= flow.z;  // Normalize the sum coming out of the densification.
 
-       float I_0 = texture(image0_tex, tc).x;
-       float I_w = texture(image1_tex, tc + flow.xy).x;  // NOTE: This is effectively a reverse warp since texture() is a gather operation and flow is conceptually scatter.
+       float I_0 = texture(image_tex, tc).x;
+       float I_w = texture(image_tex, vec3(tc.xy + flow.xy, 1.0f - tc.z)).x;  // NOTE: This is effectively a reverse warp since texture() is a gather operation and flow is conceptually scatter.
 
        I = 0.5f * (I_0 + I_w);
        I_t = I_w - I_0;
-       normalized_flow = flow.xy * textureSize(image0_tex, 0);
+       normalized_flow = flow.xy * textureSize(image_tex, 0).xy;
 }
index 834c6579e92f9e0055955bcc4a46e90616b55dc3..4efc975c778b78cd3cae548655f9217f44acb3b5 100644 (file)
@@ -1,9 +1,9 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out vec2 flow;
 
-uniform sampler2D flow_tex;
+uniform sampler2DArray flow_tex;
 uniform vec2 scale_factor;
 
 void main()
index 3066300dfec5f5ceb29e795bc8f779a4511a1855..8c5c6ee8a69184147cd1f4e65b415dc292a1eaaa 100644 (file)
@@ -1,9 +1,9 @@
 #version 450 core
 
-in vec2 tc;
+in vec3 tc;
 out uint packed_gradients;
 
-uniform sampler2D tex;
+uniform sampler2DArray tex;
 
 uint pack_gradients(float x, float y, float v)
 {
index b072839a2b3b86bc510f260bbb30ae3a93c5bb60..9a8e1e40aa0a32490091c5e2eb7b1b671ad3ff2e 100644 (file)
--- a/sor.frag
+++ b/sor.frag
@@ -1,12 +1,12 @@
 #version 450 core
 
-in vec2 tc, tc_left, tc_down;
-in vec2 equation_tc_assuming_left, equation_tc_assuming_right;
+in vec3 tc, tc_left, tc_down;
+in vec3 equation_tc_assuming_left, equation_tc_assuming_right;
 in float element_x_idx, element_sum_idx;
 out vec2 diff_flow;
 
-uniform sampler2D diff_flow_tex, diffusivity_tex;
-uniform usampler2D equation_red_tex, equation_black_tex;
+uniform sampler2DArray diff_flow_tex, diffusivity_tex;
+uniform usampler2DArray equation_red_tex, equation_black_tex;
 uniform int phase;
 
 uniform int num_nonzero_phases;
@@ -52,7 +52,7 @@ void main()
        if (color != phase) discard;
 
        uvec4 equation;
-       vec2 equation_tc;
+       vec3 equation_tc;
        if ((int(round(element_x_idx)) & 1) == 0) {
                equation_tc = equation_tc_assuming_left;
        } else {
index 32498decf948f4afe6972e313e822918416405f4..c68b1db922e636bbea64b024d2b6dc0dfa12dc38 100644 (file)
--- a/sor.vert
+++ b/sor.vert
@@ -1,13 +1,14 @@
 #version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
 
 layout(location=0) in vec2 position;
-out vec2 tc, tc_left, tc_down;
-out vec2 equation_tc_assuming_left, equation_tc_assuming_right;
+out vec3 tc, tc_left, tc_down;
+out vec3 equation_tc_assuming_left, equation_tc_assuming_right;
 out float element_x_idx;
 out float element_sum_idx;
 
-uniform sampler2D diff_flow_tex, diffusivity_tex;
-uniform usampler2D equation_red_tex;
+uniform sampler2DArray diff_flow_tex, diffusivity_tex;
+uniform usampler2DArray equation_red_tex;
 
 void main()
 {
@@ -18,20 +19,24 @@ void main()
        //   0.000  0.000 -2.000 -1.000
        //   0.000  0.000  0.000  1.000
        gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
-       tc = position;
-       tc_left = vec2(tc.x - 0.5f / textureSize(diffusivity_tex, 0).x, tc.y);
-       tc_down = vec2(tc.x, tc.y - 0.5f / textureSize(diffusivity_tex, 0).y);
+       gl_Layer = gl_InstanceID;
+
+       tc = vec3(position, gl_InstanceID);
+       tc_left = vec3(tc.x - 0.5f / textureSize(diffusivity_tex, 0).x, tc.y, gl_InstanceID);
+       tc_down = vec3(tc.x, tc.y - 0.5f / textureSize(diffusivity_tex, 0).y, gl_InstanceID);
 
        // The equation textures have half the horizontal width, so we need to adjust the texel centers.
        // It becomes extra tricky since the SOR texture might be of odd size, and then
        // the equation texture is not exactly half the size.
-       vec2 element_idx = position * textureSize(diff_flow_tex, 0) - 0.5f;
+       vec2 element_idx = position * textureSize(diff_flow_tex, 0).xy - 0.5f;
        float equation_texel_number_assuming_left = element_idx.x / 2.0f;
        float equation_texel_number_assuming_right = (element_idx.x - 1.0f) / 2.0f;
        equation_tc_assuming_left.x = (equation_texel_number_assuming_left + 0.5f) / textureSize(equation_red_tex, 0).x;
        equation_tc_assuming_right.x = (equation_texel_number_assuming_right + 0.5f) / textureSize(equation_red_tex, 0).x;
        equation_tc_assuming_left.y = tc.y;
        equation_tc_assuming_right.y = tc.y;
+       equation_tc_assuming_left.z = gl_InstanceID;
+       equation_tc_assuming_right.z = gl_InstanceID;
 
        element_x_idx = element_idx.x;
        element_sum_idx = element_idx.x + element_idx.y;
index 7b6875a131164f89d159b8417b4aa98bbd52e657..83fd68333751a9df65b00956b920ac3ae8fb97ba 100644 (file)
@@ -4,7 +4,7 @@ in vec2 image_pos;
 flat in vec2 flow, I_0_check_offset, I_1_check_offset;
 out vec2 out_flow;
 
-uniform sampler2D image0_tex, image1_tex;
+uniform sampler2DArray image_tex;
 
 void main()
 {
@@ -13,8 +13,8 @@ void main()
        // TODO: Check if we are sampling out-of-image.
        // TODO: See whether using intensity values gives equally good results
        // as RGB, since the rest of our pipeline is intensity.
-       vec3 I_0 = texture(image0_tex, image_pos + I_0_check_offset).rgb;
-       vec3 I_1 = texture(image1_tex, image_pos + I_1_check_offset).rgb;
+       vec3 I_0 = texture(image_tex, vec3(image_pos + I_0_check_offset, 0)).rgb;
+       vec3 I_1 = texture(image_tex, vec3(image_pos + I_1_check_offset, 1)).rgb;
        vec3 diff = abs(I_1 - I_0);
        gl_FragDepth = 0.125 * (diff.x + diff.y + diff.z);  // Make sure we stay well under the 1.0 maximum.
 }
index fd856662a4802b6102495631982b9533c472ed31..08462315264261ac9b3d230ca4c91d187f20394c 100644 (file)
@@ -4,22 +4,29 @@ layout(location=0) in vec2 position;
 out vec2 image_pos;
 flat out vec2 flow, I_0_check_offset, I_1_check_offset;
 
-uniform bool invert_flow;
 uniform vec2 splat_size;  // In 0..1 coordinates.
 uniform vec2 inv_flow_size;
 uniform float alpha;
-uniform sampler2D flow_tex;
+uniform sampler2DArray flow_tex;  // 0 = forward flow, 1 = backward flow.
 
 void main()
 {
-       int x = gl_InstanceID % textureSize(flow_tex, 0).x;
-       int y = gl_InstanceID / textureSize(flow_tex, 0).x;
+       int instance = gl_InstanceID;
+       int num_pixels_per_layer = textureSize(flow_tex, 0).x * textureSize(flow_tex, 0).y;
+       int src_layer;
+       if (instance >= num_pixels_per_layer) {
+               instance -= num_pixels_per_layer;
+               src_layer = 1;
+       } else {
+               src_layer = 0;
+       }
+       int x = instance % textureSize(flow_tex, 0).x;
+       int y = instance / textureSize(flow_tex, 0).x;
 
        // Find out where to splat this to.
-       // TODO: See if we can move some of these calculations into uniforms.
-       vec2 full_flow = texelFetch(flow_tex, ivec2(x, y), 0).xy;
+       vec2 full_flow = texelFetch(flow_tex, ivec3(x, y, src_layer), 0).xy;
        float splat_alpha;
-       if (invert_flow) {
+       if (src_layer == 1) {  // Reverse flow.
                full_flow = -full_flow;
                splat_alpha = 1.0f - alpha;
        } else {
diff --git a/vs.vert b/vs.vert
index 797e0c15f3e30bbb1a9fc4d0f1db2e562a6ecaee..61ad91a7cfd53c2ca962a7a28796bfff6461dfa7 100644 (file)
--- a/vs.vert
+++ b/vs.vert
@@ -1,7 +1,8 @@
 #version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
 
 layout(location=0) in vec2 position;
-out vec2 tc;
+out vec3 tc;
 
 void main()
 {
@@ -12,5 +13,8 @@ void main()
        //   0.000  0.000 -2.000 -1.000
        //   0.000  0.000  0.000  1.000
        gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
-       tc = position;
+       tc.xy = position;
+       tc.z = gl_InstanceID;
+
+       gl_Layer = gl_InstanceID;
 }