+}
+
+// Adds the differential flow found by the variational refinement to the base
+// flow. The sum is accumulated directly into base_flow_tex (exec() renders to
+// it with additive blending), so no new texture is needed.
+class AddBaseFlow {
+public:
+	// Compiles and links the shader program and looks up its uniform.
+	AddBaseFlow();
+
+	// Adds diff_flow_tex into base_flow_tex, over a viewport of
+	// level_width x level_height pixels.
+	void exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height);
+
+private:
+	PersistentFBOSet<1> fbos;
+
+	GLuint add_flow_vs_obj;
+	GLuint add_flow_fs_obj;
+	GLuint add_flow_program;
+
+	// Uniform locations are GLint, which is what glGetUniformLocation() returns;
+	// -1 means the uniform is not active in the linked program.
+	GLint uniform_diff_flow_tex;
+};
+
+// Builds the program from vs.vert and add_base_flow.frag, and caches the
+// location of its diff_flow_tex uniform. The initializers run in member
+// declaration order (shaders, then program, then uniform).
+AddBaseFlow::AddBaseFlow()
+	: add_flow_vs_obj(compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER)),
+	  add_flow_fs_obj(compile_shader(read_file("add_base_flow.frag"), GL_FRAGMENT_SHADER)),
+	  add_flow_program(link_program(add_flow_vs_obj, add_flow_fs_obj)),
+	  uniform_diff_flow_tex(glGetUniformLocation(add_flow_program, "diff_flow_tex"))
+{
+}
+
+// Accumulates diff_flow_tex into base_flow_tex: a quad is drawn into
+// base_flow_tex with GL_ONE/GL_ONE blending, so the shader output is added
+// to whatever the texture already holds.
+void AddBaseFlow::exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height)
+{
+	glUseProgram(add_flow_program);
+
+	// The differential flow is the only texture the shader samples.
+	bind_sampler(add_flow_program, uniform_diff_flow_tex, 0, diff_flow_tex, nearest_sampler);
+
+	// Additive blending. Note that blending is left enabled on return.
+	glEnable(GL_BLEND);
+	glBlendFunc(GL_ONE, GL_ONE);
+
+	// Draw one quad covering the entire level.
+	glViewport(0, 0, level_width, level_height);
+	fbos.render_to(base_flow_tex);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+}
+
+// Makes a resized copy of a flow texture. exec() hands the output/input size
+// ratio to the shader as the scale_factor uniform.
+// NOTE(review): this was described as a bilinearly interpolated copy, but
+// exec() binds the input with nearest_sampler; confirm which one is intended.
+class ResizeFlow {
+public:
+	// Compiles and links the shader program and looks up its uniforms.
+	ResizeFlow();
+
+	// Renders in_tex (input_width x input_height) into out_tex
+	// (output_width x output_height).
+	void exec(GLuint in_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height);
+
+private:
+	PersistentFBOSet<1> fbos;
+
+	GLuint resize_flow_vs_obj;
+	GLuint resize_flow_fs_obj;
+	GLuint resize_flow_program;
+
+	// Uniform locations are GLint, which is what glGetUniformLocation() returns;
+	// -1 means the uniform is not active in the linked program.
+	GLint uniform_flow_tex;
+	GLint uniform_scale_factor;
+};
+
+// Builds the program from vs.vert and resize_flow.frag, and caches the
+// locations of its two uniforms. The initializers run in member declaration
+// order (shaders, then program, then uniforms).
+ResizeFlow::ResizeFlow()
+	: resize_flow_vs_obj(compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER)),
+	  resize_flow_fs_obj(compile_shader(read_file("resize_flow.frag"), GL_FRAGMENT_SHADER)),
+	  resize_flow_program(link_program(resize_flow_vs_obj, resize_flow_fs_obj)),
+	  uniform_flow_tex(glGetUniformLocation(resize_flow_program, "flow_tex")),
+	  uniform_scale_factor(glGetUniformLocation(resize_flow_program, "scale_factor"))
+{
+}
+
+// Draws flow_tex into out_tex at the output size, with blending off.
+void ResizeFlow::exec(GLuint flow_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height)
+{
+	// Per-axis ratio between the output and input sizes.
+	const float scale_x = float(output_width) / input_width;
+	const float scale_y = float(output_height) / input_height;
+
+	glUseProgram(resize_flow_program);
+	bind_sampler(resize_flow_program, uniform_flow_tex, 0, flow_tex, nearest_sampler);
+	glProgramUniform2f(resize_flow_program, uniform_scale_factor, scale_x, scale_y);
+
+	// Plain overwrite of the destination; one quad covering the output.
+	glDisable(GL_BLEND);
+	glViewport(0, 0, output_width, output_height);
+	fbos.render_to(out_tex);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+}
+
+// Keeps track of 2D textures so that one with a matching format and size can
+// be handed out again instead of allocating a new one each time.
+class TexturePool {
+public:
+	// Returns a texture of the given format and dimensions, marked as in use.
+	GLuint get_texture(GLenum format, GLuint width, GLuint height);
+
+	// Marks a texture handed out by get_texture() as free for reuse.
+	void release_texture(GLuint tex_num);
+
+private:
+	// Bookkeeping for a single pooled texture.
+	struct Texture {
+		GLuint tex_num;
+		GLenum format;
+		GLuint width;
+		GLuint height;
+		bool in_use = false;
+	};
+
+	// Every texture created by the pool so far, whether in use or idle.
+	vector<Texture> textures;
+};
+
+// Runs the flow computation between two textures (see exec()), using the
+// passes held as members below and textures borrowed from its own pool.
+class DISComputeFlow {
+public:
+	DISComputeFlow(int width, int height);
+
+	// Selects whether exec() should scale its result up to width x height.
+	enum ResizeStrategy {
+		DO_NOT_RESIZE_FLOW,
+		RESIZE_FLOW_TO_FULL_SIZE
+	};
+
+	// The texture returned from here must be handed back through
+	// release_texture() once the caller is done with it.
+	GLuint exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_strategy);
+
+	// Returns a texture obtained from exec() to the internal pool.
+	void release_texture(GLuint tex)
+	{
+		pool.release_texture(tex);
+	}
+
+private:
+	int width;
+	int height;
+	GLuint initial_flow_tex;
+	GLuint vertex_vbo;
+	GLuint vao;
+	TexturePool pool;
+
+	// One object per pass.
+	Sobel sobel;
+	MotionSearch motion_search;
+	Densify densify;
+	Prewarp prewarp;
+	Derivatives derivatives;
+	ComputeDiffusivity compute_diffusivity;
+	SetupEquations setup_equations;
+	SOR sor;
+	AddBaseFlow add_base_flow;
+	ResizeFlow resize_flow;
+};
+
+// Creates the samplers, the 1x1 zero starting flow, and the quad
+// geometry (VBO + VAO) that is used when drawing.
+DISComputeFlow::DISComputeFlow(int width, int height)
+	: width(width), height(height)
+{
+	// The samplers differ only in filtering and wrap mode.
+	auto create_sampler = [](GLuint *sampler, GLint filter, GLint wrap_mode) {
+		glCreateSamplers(1, sampler);
+		glSamplerParameteri(*sampler, GL_TEXTURE_MIN_FILTER, filter);
+		glSamplerParameteri(*sampler, GL_TEXTURE_MAG_FILTER, filter);
+		glSamplerParameteri(*sampler, GL_TEXTURE_WRAP_S, wrap_mode);
+		glSamplerParameteri(*sampler, GL_TEXTURE_WRAP_T, wrap_mode);
+	};
+	create_sampler(&nearest_sampler, GL_NEAREST, GL_CLAMP_TO_EDGE);
+	create_sampler(&linear_sampler, GL_LINEAR, GL_CLAMP_TO_EDGE);
+
+	// The smoothness is sampled so that once we get to a smoothness involving
+	// a value outside the border, the diffusivity between the two becomes zero.
+	// Similarly, gradients are zero outside the border, since the edge is taken
+	// to be constant.
+	create_sampler(&zero_border_sampler, GL_LINEAR, GL_CLAMP_TO_BORDER);
+	// Zero alpha in the border color also lets a shader detect that it
+	// sampled outside the border.
+	const float transparent_black[] = { 0.0f, 0.0f, 0.0f, 0.0f };
+	glSamplerParameterfv(zero_border_sampler, GL_TEXTURE_BORDER_COLOR, transparent_black);
+
+	// The starting flow is a single zeroed texel (a null data pointer clears to zero).
+	glCreateTextures(GL_TEXTURE_2D, 1, &initial_flow_tex);
+	glTextureStorage2D(initial_flow_tex, 1, GL_RG16F, 1, 1);
+	glClearTexImage(initial_flow_tex, 0, GL_RG, GL_FLOAT, nullptr);
+
+	// A unit quad, laid out as a four-vertex triangle strip.
+	const float quad_vertices[] = {
+		0.0f, 1.0f,
+		0.0f, 0.0f,
+		1.0f, 1.0f,
+		1.0f, 0.0f,
+	};
+	glCreateBuffers(1, &vertex_vbo);
+	glNamedBufferData(vertex_vbo, sizeof(quad_vertices), quad_vertices, GL_STATIC_DRAW);
+
+	// The VAO (and the VBO) are left bound when the constructor returns.
+	glCreateVertexArrays(1, &vao);
+	glBindVertexArray(vao);
+	glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo);
+
+	const GLint position_attrib = 0;  // Hard-coded in every vertex shader.
+	glEnableVertexArrayAttrib(vao, position_attrib);
+	glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+}
+
+GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_strategy)
+{ // Processes the levels from coarsest_level down to finest_level; each level starts from the flow of the level before it.
+ int prev_level_width = 1, prev_level_height = 1;
+ GLuint prev_level_flow_tex = initial_flow_tex; // The first level starts from the 1x1 zero flow made in the constructor.
+
+ GPUTimers timers;
+
+ glBindVertexArray(vao);
+
+ ScopedTimer total_timer("Total", &timers);
+ for (int level = coarsest_level; level >= int(finest_level); --level) {
+ char timer_name[256];
+ snprintf(timer_name, sizeof(timer_name), "Level %d (%d x %d)", level, width >> level, height >> level);
+ ScopedTimer level_timer(timer_name, &total_timer);
+
+ int level_width = width >> level;
+ int level_height = height >> level;
+ float patch_spacing_pixels = patch_size_pixels * (1.0f - patch_overlap_ratio);
+
+ // Make sure we have patches at least every Nth pixel, e.g. for width=9
+ // and patch_spacing=3 (the default), we put out patch centers in
+ // x=0, x=3, x=6, x=9, which is four patches. The fragment shader will
+ // lock all the centers to integer coordinates if needed.
+ int width_patches = 1 + ceil(level_width / patch_spacing_pixels);
+ int height_patches = 1 + ceil(level_height / patch_spacing_pixels);
+
+ // Make sure we always read from the correct level; the chosen
+ // mipmapping could otherwise be rather unpredictable, especially
+ // during motion search.
+ GLuint tex0_view, tex1_view;
+ glGenTextures(1, &tex0_view);
+ glTextureView(tex0_view, GL_TEXTURE_2D, tex0, GL_R8, level, 1, 0, 1);
+ glGenTextures(1, &tex1_view);
+ glTextureView(tex1_view, GL_TEXTURE_2D, tex1, GL_R8, level, 1, 0, 1);
+
+ // Create a new texture; we could be fancy and use a multi-level
+ // texture, but meh.
+ GLuint grad0_tex = pool.get_texture(GL_R32UI, level_width, level_height);
+
+ // Find the derivative.
+ {
+ ScopedTimer timer("Sobel", &level_timer);
+ sobel.exec(tex0_view, grad0_tex, level_width, level_height);
+ }
+
+ // Motion search to find the initial flow. We use the flow from the previous
+ // level (sampled bilinearly; no fancy tricks) as a guide, then search from there.
+
+ // Create an output flow texture.
+ GLuint flow_out_tex = pool.get_texture(GL_RGB16F, width_patches, height_patches);
+
+ // And draw.
+ {
+ ScopedTimer timer("Motion search", &level_timer);
+ motion_search.exec(tex0_view, tex1_view, grad0_tex, prev_level_flow_tex, flow_out_tex, level_width, level_height, prev_level_width, prev_level_height, width_patches, height_patches);
+ }
+ pool.release_texture(grad0_tex);
+
+ // Densification.
+
+ // Set up an output texture (cleared in Densify).
+ GLuint dense_flow_tex = pool.get_texture(GL_RGB16F, level_width, level_height);
+
+ // And draw.
+ {
+ ScopedTimer timer("Densification", &level_timer);
+ densify.exec(tex0_view, tex1_view, flow_out_tex, dense_flow_tex, level_width, level_height, width_patches, height_patches);
+ }
+ pool.release_texture(flow_out_tex);
+
+ // Everything below here in the loop belongs to variational refinement.
+ ScopedTimer varref_timer("Variational refinement", &level_timer);
+
+ // Prewarping; create I and I_t, and a normalized base flow (so we don't
+ // have to normalize it over and over again, and also save some bandwidth).
+ //
+ // During the entire rest of the variational refinement, flow will be measured
+ // in pixels, not 0..1 normalized OpenGL texture coordinates.
+ // This is because variational refinement depends so heavily on derivatives,
+ // which are measured in intensity levels per pixel.
+ GLuint I_tex = pool.get_texture(GL_R16F, level_width, level_height);
+ GLuint I_t_tex = pool.get_texture(GL_R16F, level_width, level_height);
+ GLuint base_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+ {
+ ScopedTimer timer("Prewarping", &varref_timer);
+ prewarp.exec(tex0_view, tex1_view, dense_flow_tex, I_tex, I_t_tex, base_flow_tex, level_width, level_height);
+ }
+ pool.release_texture(dense_flow_tex);
+ glDeleteTextures(1, &tex0_view);
+ glDeleteTextures(1, &tex1_view);
+
+ // Calculate I_x and I_y. We're only calculating first derivatives;
+ // the others will be taken on-the-fly in order to sample from fewer
+ // textures overall, since sampling from the L1 cache is cheap.
+ // (TODO: Verify that this is indeed faster than making separate
+ // double-derivative textures.)
+ GLuint I_x_y_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+ GLuint beta_0_tex = pool.get_texture(GL_R16F, level_width, level_height);
+ {
+ ScopedTimer timer("First derivatives", &varref_timer);
+ derivatives.exec(I_tex, I_x_y_tex, beta_0_tex, level_width, level_height);
+ }
+ pool.release_texture(I_tex);
+
+ // We need somewhere to store du and dv (the flow increment, relative
+ // to the non-refined base flow u0 and v0). It's initially garbage,
+ // but not read until we've written something sane to it.
+ GLuint du_dv_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+
+ // And for diffusivity.
+ GLuint diffusivity_tex = pool.get_texture(GL_R16F, level_width, level_height);
+
+ // And finally for the equation set. See SetupEquations for
+ // the storage format.
+ GLuint equation_red_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height);
+ GLuint equation_black_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height);
+ // Outer iterations: there are level + 1 of them, so coarser levels get more. The first one is flagged with outer_idx == 0.
+ for (int outer_idx = 0; outer_idx < level + 1; ++outer_idx) {
+ // Calculate the diffusivity term for each pixel.
+ {
+ ScopedTimer timer("Compute diffusivity", &varref_timer);
+ compute_diffusivity.exec(base_flow_tex, du_dv_tex, diffusivity_tex, level_width, level_height, outer_idx == 0);
+ }
+
+ // Set up the 2x2 equation system for each pixel.
+ {
+ ScopedTimer timer("Set up equations", &varref_timer);
+ setup_equations.exec(I_x_y_tex, I_t_tex, du_dv_tex, base_flow_tex, beta_0_tex, diffusivity_tex, equation_red_tex, equation_black_tex, level_width, level_height, outer_idx == 0);
+ }
+
+ // Run a few SOR iterations. Note that these are to/from the same texture.
+ {
+ ScopedTimer timer("SOR", &varref_timer);
+ sor.exec(du_dv_tex, equation_red_tex, equation_black_tex, diffusivity_tex, level_width, level_height, 5, outer_idx == 0, &timer); // NOTE(review): 5 is presumably the SOR iteration count; confirm against SOR::exec.
+ }
+ }
+
+ pool.release_texture(I_t_tex);
+ pool.release_texture(I_x_y_tex);
+ pool.release_texture(beta_0_tex);
+ pool.release_texture(diffusivity_tex);
+ pool.release_texture(equation_red_tex);
+ pool.release_texture(equation_black_tex);
+
+ // Add the differential flow found by the variational refinement to the base flow,
+ // giving the final flow estimate for this level.
+ // The output is in base_flow_tex; we don't need to make a new texture.
+ //
+ // Disabling this doesn't save any time (although we could easily make it so that
+ // it is more efficient), but it helps debug the motion search.
+ if (enable_variational_refinement) {
+ ScopedTimer timer("Add differential flow", &varref_timer);
+ add_base_flow.exec(base_flow_tex, du_dv_tex, level_width, level_height);
+ }
+ pool.release_texture(du_dv_tex);
+ // The previous level's flow is done with. initial_flow_tex was not taken from the pool, so it must not be released to it.
+ if (prev_level_flow_tex != initial_flow_tex) {
+ pool.release_texture(prev_level_flow_tex);
+ }
+ prev_level_flow_tex = base_flow_tex;
+ prev_level_width = level_width;
+ prev_level_height = level_height;
+ }
+ total_timer.end();
+
+ timers.print();
+
+ // Scale up the flow to the final size (if needed). With finest_level == 0, the last level is already width x height.
+ if (finest_level == 0 || resize_strategy == DO_NOT_RESIZE_FLOW) {
+ return prev_level_flow_tex;
+ } else {
+ GLuint final_tex = pool.get_texture(GL_RG16F, width, height);
+ resize_flow.exec(prev_level_flow_tex, final_tex, prev_level_width, prev_level_height, width, height);
+ pool.release_texture(prev_level_flow_tex);
+ return final_tex;
+ }
+}
+
+// Forward-warp the flow half-way (or rather, by alpha). A non-zero “splatting”
+// radius fills most of the holes.
+class Splat {
+public:
+	// Compiles and links the shader program and looks up its uniforms.
+	Splat();
+
+	// alpha is the time of the interpolated frame (0..1). Both the forward and
+	// the backward flow are splatted into flow_tex, with depth_tex attached as
+	// the depth buffer that decides between competing candidates.
+	void exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint flow_tex, GLuint depth_tex, int width, int height, float alpha);
+
+private:
+	PersistentFBOSetWithDepth<1> fbos;
+
+	GLuint splat_vs_obj;
+	GLuint splat_fs_obj;
+	GLuint splat_program;
+
+	// Uniform locations are GLint, which is what glGetUniformLocation() returns;
+	// -1 means the uniform is not active in the linked program.
+	GLint uniform_invert_flow;
+	GLint uniform_splat_size;
+	GLint uniform_alpha;
+	GLint uniform_image0_tex;
+	GLint uniform_image1_tex;
+	GLint uniform_flow_tex;
+	GLint uniform_inv_flow_size;
+};
+
+// Builds the program from splat.vert and splat.frag, and caches its uniform
+// locations. The initializers run in member declaration order (shaders, then
+// program, then uniforms).
+Splat::Splat()
+	: splat_vs_obj(compile_shader(read_file("splat.vert"), GL_VERTEX_SHADER)),
+	  splat_fs_obj(compile_shader(read_file("splat.frag"), GL_FRAGMENT_SHADER)),
+	  splat_program(link_program(splat_vs_obj, splat_fs_obj)),
+	  uniform_invert_flow(glGetUniformLocation(splat_program, "invert_flow")),
+	  uniform_splat_size(glGetUniformLocation(splat_program, "splat_size")),
+	  uniform_alpha(glGetUniformLocation(splat_program, "alpha")),
+	  uniform_image0_tex(glGetUniformLocation(splat_program, "image0_tex")),
+	  uniform_image1_tex(glGetUniformLocation(splat_program, "image1_tex")),
+	  uniform_flow_tex(glGetUniformLocation(splat_program, "flow_tex")),
+	  uniform_inv_flow_size(glGetUniformLocation(splat_program, "inv_flow_size"))
+{
+}
+
+// Clears flow_tex/depth_tex and then splats first the forward and then the
+// backward flow into them, with the depth test picking between candidates.
+void Splat::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint flow_tex, GLuint depth_tex, int width, int height, float alpha)
+{
+	glUseProgram(splat_program);
+
+	bind_sampler(splat_program, uniform_image0_tex, 0, tex0, linear_sampler);
+	bind_sampler(splat_program, uniform_image1_tex, 1, tex1, linear_sampler);
+
+	// FIXME: This is set to 1.0 right now so not to trigger Haswell's “PMA stall”.
+	// Move to 2.0 later, or even 4.0.
+	// (Since we have hole filling, it's not critical, but larger values seem to do
+	// better than hole filling for large motion, blurs etc.)
+	// A 4x4 splat means 16x overdraw, a 2x2 splat means 4x overdraw.
+	const float splat_size = 1.0f;
+	glProgramUniform2f(splat_program, uniform_splat_size, splat_size / width, splat_size / height);
+	glProgramUniform1f(splat_program, uniform_alpha, alpha);
+	glProgramUniform2f(splat_program, uniform_inv_flow_size, 1.0f / width, 1.0f / height);
+
+	glViewport(0, 0, width, height);
+	glDisable(GL_BLEND);
+	glEnable(GL_DEPTH_TEST);
+	// Depth holds the difference between I_0 and I_1, where less difference is
+	// good. (The default of 1.0 is effectively +inf, which always loses.)
+	glDepthFunc(GL_LESS);
+
+	fbos.render_to(depth_tex, flow_tex);
+
+	// Evidently NVIDIA doesn't use fast clears for glClearTexImage, so clear now that
+	// we've got it bound.
+	glClearColor(1000.0f, 1000.0f, 0.0f, 1.0f);  // Invalid flow.
+	glClearDepth(1.0f);  // Effectively infinity.
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+	// One pass = one instanced draw: a four-vertex strip per pixel of the
+	// width x height area, reading the given flow on texture unit 2.
+	auto splat_pass = [&](GLuint pass_flow_tex, int invert_flow) {
+		bind_sampler(splat_program, uniform_flow_tex, 2, pass_flow_tex, nearest_sampler);
+		glProgramUniform1i(splat_program, uniform_invert_flow, invert_flow);
+		glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width * height);
+	};
+	splat_pass(forward_flow_tex, 0);   // Forward splatting.
+	splat_pass(backward_flow_tex, 1);  // Backward splatting.
+
+	glDisable(GL_DEPTH_TEST);
+}
+
+// Doing good and fast hole-filling on a GPU is nontrivial. We choose an option
+// that's fairly simple (given that most holes are really small) and also hopefully
+// cheap should the holes not be so small. Conceptually, we look for the first
+// non-hole to the left of us (i.e., shoot a ray until we hit something), then
+// the first non-hole to the right of us, then up and down, and then average them
+// all together. It's going to create “stars” if the holes are big, but OK, that's
+// a tradeoff.
+//
+// Our implementation here is efficient assuming that the hierarchical Z-buffer is
+// on even for shaders that do discard (this typically kills early Z, but hopefully
+// not hierarchical Z); we set up Z so that only holes are written to, which means
+// that as soon as a hole is filled, the rasterizer should just skip it. Most of the
+// fullscreen quads should just be discarded outright, really.
+class HoleFill {
+public:
+	// Compiles and links the shader program and looks up its uniforms.
+	HoleFill();
+
+	// Output will be in flow_tex, temp_tex[0, 1, 2], representing the filling
+	// from the down, left, right and up, respectively. Use HoleBlend to merge
+	// them into one.
+	void exec(GLuint flow_tex, GLuint depth_tex, GLuint temp_tex[3], int width, int height);
+
+private:
+	PersistentFBOSetWithDepth<1> fbos;
+
+	GLuint fill_vs_obj;
+	GLuint fill_fs_obj;
+	GLuint fill_program;
+
+	// Uniform locations are GLint, which is what glGetUniformLocation() returns;
+	// -1 means the uniform is not active in the linked program.
+	GLint uniform_tex;
+	GLint uniform_z;
+	GLint uniform_sample_offset;
+};
+
+// Builds the program from hole_fill.vert and hole_fill.frag, and caches its
+// uniform locations. The initializers run in member declaration order
+// (shaders, then program, then uniforms).
+HoleFill::HoleFill()
+	: fill_vs_obj(compile_shader(read_file("hole_fill.vert"), GL_VERTEX_SHADER)),
+	  fill_fs_obj(compile_shader(read_file("hole_fill.frag"), GL_FRAGMENT_SHADER)),
+	  fill_program(link_program(fill_vs_obj, fill_fs_obj)),
+	  uniform_tex(glGetUniformLocation(fill_program, "tex")),
+	  uniform_z(glGetUniformLocation(fill_program, "z")),
+	  uniform_sample_offset(glGetUniformLocation(fill_program, "sample_offset"))
+{
+}
+
+// Runs the four directional fill passes (left, right, up, down) over flow_tex.
+// The result after each of the first three is copied into temp_tex[0..2]; the
+// result of the last one stays in flow_tex.
+void HoleFill::exec(GLuint flow_tex, GLuint depth_tex, GLuint temp_tex[3], int width, int height)
+{
+	glUseProgram(fill_program);
+	bind_sampler(fill_program, uniform_tex, 0, flow_tex, nearest_sampler);
+
+	glViewport(0, 0, width, height);
+	glDisable(GL_BLEND);
+	glEnable(GL_DEPTH_TEST);
+	glDepthFunc(GL_LESS);  // Only update the values > 0.999f (i.e., only invalid pixels).
+
+	fbos.render_to(depth_tex, flow_tex);  // NOTE: Reading and writing to the same texture.
+
+	// One directional pass: draw with the sample offset at 1, 2, 4, 8, etc.
+	// pixels along (step_x, step_y), as long as the offset is below extent.
+	// A texture barrier follows every draw, since the draw reads the texture
+	// it renders to.
+	auto fill_pass = [this](float z, int step_x, int step_y, int extent) {
+		glProgramUniform1f(fill_program, uniform_z, z);
+		for (int offs = 1; offs < extent; offs *= 2) {
+			glProgramUniform2f(fill_program, uniform_sample_offset, (step_x * offs) / float(extent), (step_y * offs) / float(extent));
+			glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+			glTextureBarrier();
+		}
+	};
+
+	// Snapshot of the current flow_tex contents into one of the temporaries.
+	auto save_to = [&](GLuint dst_tex) {
+		glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, dst_tex, GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
+	};
+
+	// Fill holes from the left, by shifting 1, 2, 4, 8, etc. pixels to the right.
+	fill_pass(1.0f - 1.0f / 1024.0f, -1, 0, width);
+	save_to(temp_tex[0]);
+
+	// Similar to the right; adjust Z a bit down, so that we re-fill the pixels that
+	// were overwritten in the last algorithm.
+	fill_pass(1.0f - 2.0f / 1024.0f, 1, 0, width);
+	save_to(temp_tex[1]);
+
+	// Up.
+	fill_pass(1.0f - 3.0f / 1024.0f, 0, -1, height);
+	save_to(temp_tex[2]);
+
+	// Down. This result is not copied anywhere; it is what is left in flow_tex.
+	fill_pass(1.0f - 4.0f / 1024.0f, 0, 1, height);
+
+	glDisable(GL_DEPTH_TEST);
+}
+
+// Blend the four directions from HoleFill into one pixel, so that single-pixel
+// holes become the average of their four neighbors.
+class HoleBlend {
+public:
+	// Compiles and links the shader program and looks up its uniforms.
+	HoleBlend();
+
+	// Takes the same textures as HoleFill::exec(): temp_tex[0..2] and flow_tex
+	// are the four inputs, and the blended result is rendered into flow_tex.
+	void exec(GLuint flow_tex, GLuint depth_tex, GLuint temp_tex[3], int width, int height);
+
+private:
+	PersistentFBOSetWithDepth<1> fbos;
+
+	GLuint blend_vs_obj;
+	GLuint blend_fs_obj;
+	GLuint blend_program;
+
+	// Uniform locations are GLint, which is what glGetUniformLocation() returns;
+	// -1 means the uniform is not active in the linked program.
+	GLint uniform_left_tex;
+	GLint uniform_right_tex;
+	GLint uniform_up_tex;
+	GLint uniform_down_tex;
+	GLint uniform_z;
+	GLint uniform_sample_offset;
+};
+
+// Builds the program from hole_fill.vert (the vertex shader is reused from the
+// fill) and hole_blend.frag, and caches its uniform locations. The initializers
+// run in member declaration order (shaders, then program, then uniforms).
+HoleBlend::HoleBlend()
+	: blend_vs_obj(compile_shader(read_file("hole_fill.vert"), GL_VERTEX_SHADER)),
+	  blend_fs_obj(compile_shader(read_file("hole_blend.frag"), GL_FRAGMENT_SHADER)),
+	  blend_program(link_program(blend_vs_obj, blend_fs_obj)),
+	  uniform_left_tex(glGetUniformLocation(blend_program, "left_tex")),
+	  uniform_right_tex(glGetUniformLocation(blend_program, "right_tex")),
+	  uniform_up_tex(glGetUniformLocation(blend_program, "up_tex")),
+	  uniform_down_tex(glGetUniformLocation(blend_program, "down_tex")),
+	  uniform_z(glGetUniformLocation(blend_program, "z")),
+	  uniform_sample_offset(glGetUniformLocation(blend_program, "sample_offset"))
+{
+}
+
+// Draws one quad that combines the four directional fill results into flow_tex.
+void HoleBlend::exec(GLuint flow_tex, GLuint depth_tex, GLuint temp_tex[3], int width, int height)
+{
+	glUseProgram(blend_program);
+
+	// Three of the inputs come from the temporaries; the "down" result is
+	// flow_tex itself.
+	bind_sampler(blend_program, uniform_left_tex, 0, temp_tex[0], nearest_sampler);
+	bind_sampler(blend_program, uniform_right_tex, 1, temp_tex[1], nearest_sampler);
+	bind_sampler(blend_program, uniform_up_tex, 2, temp_tex[2], nearest_sampler);
+	bind_sampler(blend_program, uniform_down_tex, 3, flow_tex, nearest_sampler);
+
+	// Same Z as the last pass of HoleFill::exec(), and no sample offset.
+	glProgramUniform1f(blend_program, uniform_z, 1.0f - 4.0f / 1024.0f);
+	glProgramUniform2f(blend_program, uniform_sample_offset, 0.0f, 0.0f);
+
+	glDisable(GL_BLEND);
+	glEnable(GL_DEPTH_TEST);
+	glDepthFunc(GL_LEQUAL);  // Skip over all of the pixels that were never holes to begin with.
+
+	glViewport(0, 0, width, height);
+	fbos.render_to(depth_tex, flow_tex);  // NOTE: Reading and writing to the same texture.
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+	glDisable(GL_DEPTH_TEST);
+}
+
+// Renders the output image: exec() binds tex0, tex1 and the flow as inputs,
+// passes alpha to the shader, and draws into output_tex.
+class Blend {
+public:
+	// Compiles and links the shader program and looks up its uniforms.
+	Blend();
+
+	// Draws a width x height quad into output_tex.
+	void exec(GLuint tex0, GLuint tex1, GLuint flow_tex, GLuint output_tex, int width, int height, float alpha);
+
+private:
+	PersistentFBOSet<1> fbos;
+	GLuint blend_vs_obj;
+	GLuint blend_fs_obj;
+	GLuint blend_program;
+
+	// Uniform locations are GLint, which is what glGetUniformLocation() returns;
+	// -1 means the uniform is not active in the linked program.
+	GLint uniform_image0_tex;
+	GLint uniform_image1_tex;
+	GLint uniform_flow_tex;
+	GLint uniform_alpha;
+	// NOTE(review): looked up in the constructor, but exec() never assigns a
+	// value to it; confirm whether that is intended.
+	GLint uniform_flow_consistency_tolerance;
+};
+
+// Builds the program from vs.vert and blend.frag, and caches its uniform
+// locations. The initializers run in member declaration order (shaders, then
+// program, then uniforms).
+Blend::Blend()
+	: blend_vs_obj(compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER)),
+	  blend_fs_obj(compile_shader(read_file("blend.frag"), GL_FRAGMENT_SHADER)),
+	  blend_program(link_program(blend_vs_obj, blend_fs_obj)),
+	  uniform_image0_tex(glGetUniformLocation(blend_program, "image0_tex")),
+	  uniform_image1_tex(glGetUniformLocation(blend_program, "image1_tex")),
+	  uniform_flow_tex(glGetUniformLocation(blend_program, "flow_tex")),
+	  uniform_alpha(glGetUniformLocation(blend_program, "alpha")),
+	  uniform_flow_consistency_tolerance(glGetUniformLocation(blend_program, "flow_consistency_tolerance"))
+{
+}
+
+// Draws a level_width x level_height quad into output_tex, with the two images
+// and the flow bound as inputs and alpha passed to the shader.
+void Blend::exec(GLuint tex0, GLuint tex1, GLuint flow_tex, GLuint output_tex, int level_width, int level_height, float alpha)
+{
+	glUseProgram(blend_program);
+
+	// Inputs; all three use linear filtering.
+	bind_sampler(blend_program, uniform_image0_tex, 0, tex0, linear_sampler);
+	bind_sampler(blend_program, uniform_image1_tex, 1, tex1, linear_sampler);
+	bind_sampler(blend_program, uniform_flow_tex, 2, flow_tex, linear_sampler);  // May be upsampled.
+	glProgramUniform1f(blend_program, uniform_alpha, alpha);
+
+	// Fixed-function blending stays off (a bit ironic, perhaps).
+	glDisable(GL_BLEND);
+
+	glViewport(0, 0, level_width, level_height);
+	fbos.render_to(output_tex);
+	glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+}
+
+// Produces an in-between frame from two images and their forward/backward
+// flows, by running the Splat, HoleFill, HoleBlend and Blend passes in turn.
+class Interpolate {
+public:
+	Interpolate(int width, int height, int flow_level);
+
+	// The texture returned from here must be handed back through
+	// release_texture() once the caller is done with it. tex0 and tex1
+	// must be RGBA8 textures with mipmaps (unless flow_level == 0).
+	GLuint exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint width, GLuint height, float alpha);
+
+	// Returns a texture obtained from exec() to the internal pool.
+	void release_texture(GLuint tex)
+	{
+		pool.release_texture(tex);
+	}
+
+private:
+	int width;
+	int height;
+	int flow_level;
+	GLuint vertex_vbo;
+	GLuint vao;
+	TexturePool pool;
+
+	// One object per pass.
+	Splat splat;
+	HoleFill hole_fill;
+	HoleBlend hole_blend;
+	Blend blend;
+};
+
+// Stores the dimensions and flow level, and creates the quad geometry
+// (VBO + VAO) that is used when drawing.
+Interpolate::Interpolate(int width, int height, int flow_level)
+	: width(width), height(height), flow_level(flow_level)
+{
+	// A unit quad, laid out as a four-vertex triangle strip.
+	const float quad_vertices[] = {
+		0.0f, 1.0f,
+		0.0f, 0.0f,
+		1.0f, 1.0f,
+		1.0f, 0.0f,
+	};
+	glCreateBuffers(1, &vertex_vbo);
+	glNamedBufferData(vertex_vbo, sizeof(quad_vertices), quad_vertices, GL_STATIC_DRAW);
+
+	// The VAO (and the VBO) are left bound when the constructor returns.
+	glCreateVertexArrays(1, &vao);
+	glBindVertexArray(vao);
+	glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo);
+
+	const GLint position_attrib = 0;  // Hard-coded in every vertex shader.
+	glEnableVertexArrayAttrib(vao, position_attrib);
+	glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+}
+
+GLuint Interpolate::exec(GLuint tex0, GLuint tex1, GLuint forward_flow_tex, GLuint backward_flow_tex, GLuint width, GLuint height, float alpha)
+{ // NOTE: the width and height parameters shadow the members of the same name; only the parameters are used below.
+ GPUTimers timers;
+
+ ScopedTimer total_timer("Total", &timers);
+
+ glBindVertexArray(vao);
+
+ // Pick out the right level to test splatting results on.
+ GLuint tex0_view, tex1_view;
+ glGenTextures(1, &tex0_view);
+ glTextureView(tex0_view, GL_TEXTURE_2D, tex0, GL_RGBA8, flow_level, 1, 0, 1);
+ glGenTextures(1, &tex1_view);
+ glTextureView(tex1_view, GL_TEXTURE_2D, tex1, GL_RGBA8, flow_level, 1, 0, 1);
+ // Splatting and hole filling run at the flow_level resolution.
+ int flow_width = width >> flow_level;
+ int flow_height = height >> flow_level;
+
+ GLuint flow_tex = pool.get_texture(GL_RG16F, flow_width, flow_height);
+ GLuint depth_tex = pool.get_texture(GL_DEPTH_COMPONENT32F, flow_width, flow_height); // Used for ranking flows.
+
+ {
+ ScopedTimer timer("Splat", &total_timer);
+ splat.exec(tex0_view, tex1_view, forward_flow_tex, backward_flow_tex, flow_tex, depth_tex, flow_width, flow_height, alpha);
+ }
+ glDeleteTextures(1, &tex0_view);
+ glDeleteTextures(1, &tex1_view);
+ // Scratch textures for three of HoleFill's directional results; HoleBlend merges them with flow_tex.
+ GLuint temp_tex[3];
+ temp_tex[0] = pool.get_texture(GL_RG16F, flow_width, flow_height);
+ temp_tex[1] = pool.get_texture(GL_RG16F, flow_width, flow_height);
+ temp_tex[2] = pool.get_texture(GL_RG16F, flow_width, flow_height);
+
+ {
+ ScopedTimer timer("Fill holes", &total_timer);
+ hole_fill.exec(flow_tex, depth_tex, temp_tex, flow_width, flow_height);
+ hole_blend.exec(flow_tex, depth_tex, temp_tex, flow_width, flow_height);
+ }
+
+ pool.release_texture(temp_tex[0]);
+ pool.release_texture(temp_tex[1]);
+ pool.release_texture(temp_tex[2]);
+ pool.release_texture(depth_tex);
+ // The final blend runs at the full width x height, on tex0/tex1 themselves rather than on the level views.
+ GLuint output_tex = pool.get_texture(GL_RGBA8, width, height);
+ {
+ ScopedTimer timer("Blend", &total_timer);
+ blend.exec(tex0, tex1, flow_tex, output_tex, width, height, alpha);
+ }
+ pool.release_texture(flow_tex);
+ total_timer.end();
+ timers.print();
+
+ return output_tex; // Still marked as in use in the pool; the caller releases it.
+}
+
+// Hands out a texture with the requested format and size: the first idle
+// pooled texture that matches exactly, or else a newly created one. Either
+// way, the texture is marked as in use.
+GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height)
+{
+	// Look for an idle texture to recycle.
+	for (auto it = textures.begin(); it != textures.end(); ++it) {
+		if (it->in_use) {
+			continue;
+		}
+		if (it->format != format || it->width != width || it->height != height) {
+			continue;
+		}
+		it->in_use = true;
+		return it->tex_num;
+	}
+
+	// Nothing suitable; create a single-level texture and keep track of it.
+	Texture fresh;
+	glCreateTextures(GL_TEXTURE_2D, 1, &fresh.tex_num);
+	glTextureStorage2D(fresh.tex_num, 1, format, width, height);
+	fresh.format = format;
+	fresh.width = width;
+	fresh.height = height;
+	fresh.in_use = true;
+	textures.push_back(fresh);
+	return fresh.tex_num;
+}
+
+// Marks tex_num as idle again. Asserts if the texture is not currently in
+// use, or if it is not known to the pool at all.
+void TexturePool::release_texture(GLuint tex_num)
+{
+	for (auto it = textures.begin(); it != textures.end(); ++it) {
+		if (it->tex_num != tex_num) {
+			continue;
+		}
+		assert(it->in_use);
+		it->in_use = false;
+		return;
+	}
+
+	// Not one of ours.
+	assert(false);
+}
+
+// OpenGL has its origin in the bottom-left corner, while .flo files have it in
+// the top-left, so the sign of every vertical flow component is flipped.
+// dense_flow is read as width * height interleaved (x, y) pairs.
+void flip_coordinate_system(float *dense_flow, unsigned width, unsigned height)
+{
+	const unsigned num_vectors = width * height;
+	for (unsigned idx = 0; idx != num_vectors; ++idx) {
+		float &vertical = dense_flow[idx * 2 + 1];
+		vertical = -vertical;
+	}
+}
+
+// Flipping is not relevant for RGB data, so this overload deliberately leaves
+// the buffer untouched.
+void flip_coordinate_system(uint8_t *dense_flow, unsigned width, unsigned height)
+{
+	(void)dense_flow;
+	(void)width;
+	(void)height;
+}
+
+void write_flow(const char *filename, const float *dense_flow, unsigned width, unsigned height)
+{
+ FILE *flowfp = fopen(filename, "wb");
+ fprintf(flowfp, "FEIH");
+ fwrite(&width, 4, 1, flowfp);
+ fwrite(&height, 4, 1, flowfp);
+ for (unsigned y = 0; y < height; ++y) {
+ int yy = height - y - 1;
+ fwrite(&dense_flow[yy * width * 2], width * 2 * sizeof(float), 1, flowfp);
+ }