]> git.sesse.net Git - nageru/blob - flow.cpp
Allow symlinked frame files. Useful for testing.
[nageru] / flow.cpp
1 #define NO_SDL_GLEXT 1
2
3 #include "flow.h"
4
5 #include "embedded_files.h"
6 #include "gpu_timers.h"
7 #include "util.h"
8
9 #include <algorithm>
10 #include <assert.h>
11 #include <deque>
12 #include <dlfcn.h>
13 #include <epoxy/gl.h>
14 #include <map>
15 #include <memory>
16 #include <stack>
17 #include <stdio.h>
18 #include <string.h>
19 #include <unistd.h>
20 #include <vector>
21
// Turns a byte offset into the pointer-typed "offset" argument expected by
// calls such as glVertexAttribPointer().
#define BUFFER_OFFSET(i) (static_cast<char *>(nullptr) + (i))
23
24 using namespace std;
25
// Weights for the individual terms of the variational refinement.
// They do not map 1:1 onto the values from the DIS paper, because some of
// our normalizations and ranges differ. The values come from a simple grid
// search over some MPI-Sintel data, although the error (EPE) appears fairly
// insensitive to the exact choice. Only the ratios between the weights
// matter, so alpha (the smoothness constant) is pinned to one and the
// others are tuned relative to it.
//
// TODO: Maybe this should not be global.
float vr_alpha = 1.0f;
float vr_delta = 0.25f;
float vr_gamma = 0.25f;
36
37 // Some global OpenGL objects.
38 // TODO: These should really be part of DISComputeFlow.
39 GLuint nearest_sampler, linear_sampler, zero_border_sampler;
40 GLuint vertex_vbo;
41
// Counts how many levels you get when repeatedly halving (rounding down)
// both dimensions until neither is larger than one. The full-size level
// is included in the count, so the result is always at least 1.
int find_num_levels(int width, int height)
{
        int remaining_w = width;
        int remaining_h = height;
        int num_levels = 1;
        while (remaining_w > 1 || remaining_h > 1) {
                remaining_w = remaining_w >> 1;
                remaining_h = remaining_h >> 1;
                num_levels++;
        }
        return num_levels;
}
52
// Reads the entire contents of a file into a string.
//
// If the file cannot be opened and a compiled-in copy is supplied
// (start != nullptr), the size bytes starting at start are returned instead.
// (We prefer disk if we can, since that makes it possible to work on shaders
// without recompiling all the time.)
//
// Any other failure (file missing with no fallback, seek/tell/read error,
// short read) prints a diagnostic to stderr and terminates the process with
// exit(1). An empty file yields an empty string.
std::string read_file(const std::string &filename, const unsigned char *start = nullptr, const size_t size = 0)
{
        FILE *fp = fopen(filename.c_str(), "r");
        if (fp == nullptr) {
                // Fall back to the version we compiled in, if any.
                if (start != nullptr) {
                        return std::string(reinterpret_cast<const char *>(start), size);
                }

                perror(filename.c_str());
                exit(1);
        }

        if (fseek(fp, 0, SEEK_END) == -1) {
                perror("fseek(SEEK_END)");
                exit(1);
        }

        // ftell() returns long, and -1 on failure; that must not end up
        // as the size we resize the string to.
        const long disk_size = ftell(fp);
        if (disk_size == -1) {
                perror("ftell");
                exit(1);
        }

        if (fseek(fp, 0, SEEK_SET) == -1) {
                perror("fseek(SEEK_SET)");
                exit(1);
        }

        std::string str;
        str.resize(disk_size);

        // fread() with a zero element size always returns 0, so an empty file
        // must be special-cased instead of being reported as a short read.
        if (disk_size > 0) {
                // fread() returns the number of complete elements read (a size_t,
                // never -1); errors and short reads are told apart using ferror().
                const size_t num_read = fread(&str[0], disk_size, 1, fp);
                if (num_read != 1) {
                        if (ferror(fp)) {
                                perror("fread");
                        } else {
                                fprintf(stderr, "Short read when trying to read %ld bytes from %s\n",
                                        disk_size, filename.c_str());
                        }
                        exit(1);
                }
        }
        fclose(fp);

        return str;
}
99
100 GLuint compile_shader(const string &shader_src, GLenum type)
101 {
102         GLuint obj = glCreateShader(type);
103         const GLchar *source[] = { shader_src.data() };
104         const GLint length[] = { (GLint)shader_src.size() };
105         glShaderSource(obj, 1, source, length);
106         glCompileShader(obj);
107
108         GLchar info_log[4096];
109         GLsizei log_length = sizeof(info_log) - 1;
110         glGetShaderInfoLog(obj, log_length, &log_length, info_log);
111         info_log[log_length] = 0;
112         if (strlen(info_log) > 0) {
113                 fprintf(stderr, "Shader compile log: %s\n", info_log);
114         }
115
116         GLint status;
117         glGetShaderiv(obj, GL_COMPILE_STATUS, &status);
118         if (status == GL_FALSE) {
119                 // Add some line numbers to easier identify compile errors.
120                 string src_with_lines = "/*   1 */ ";
121                 size_t lineno = 1;
122                 for (char ch : shader_src) {
123                         src_with_lines.push_back(ch);
124                         if (ch == '\n') {
125                                 char buf[32];
126                                 snprintf(buf, sizeof(buf), "/* %3zu */ ", ++lineno);
127                                 src_with_lines += buf;
128                         }
129                 }
130
131                 fprintf(stderr, "Failed to compile shader:\n%s\n", src_with_lines.c_str());
132                 exit(1);
133         }
134
135         return obj;
136 }
137
138 GLuint link_program(GLuint vs_obj, GLuint fs_obj)
139 {
140         GLuint program = glCreateProgram();
141         glAttachShader(program, vs_obj);
142         glAttachShader(program, fs_obj);
143         glLinkProgram(program);
144         GLint success;
145         glGetProgramiv(program, GL_LINK_STATUS, &success);
146         if (success == GL_FALSE) {
147                 GLchar error_log[1024] = {0};
148                 glGetProgramInfoLog(program, 1024, nullptr, error_log);
149                 fprintf(stderr, "Error linking program: %s\n", error_log);
150                 exit(1);
151         }
152         return program;
153 }
154
155 void bind_sampler(GLuint program, GLint location, GLuint texture_unit, GLuint tex, GLuint sampler)
156 {
157         if (location == -1) {
158                 return;
159         }
160
161         glBindTextureUnit(texture_unit, tex);
162         glBindSampler(texture_unit, sampler);
163         glProgramUniform1i(program, location, texture_unit);
164 }
165
166 template<size_t num_elements>
167 void PersistentFBOSet<num_elements>::render_to(const array<GLuint, num_elements> &textures)
168 {
169         auto it = fbos.find(textures);
170         if (it != fbos.end()) {
171                 glBindFramebuffer(GL_FRAMEBUFFER, it->second);
172                 return;
173         }
174
175         GLuint fbo;
176         glCreateFramebuffers(1, &fbo);
177         GLenum bufs[num_elements];
178         for (size_t i = 0; i < num_elements; ++i) {
179                 glNamedFramebufferTexture(fbo, GL_COLOR_ATTACHMENT0 + i, textures[i], 0);
180                 bufs[i] = GL_COLOR_ATTACHMENT0 + i;
181         }
182         glNamedFramebufferDrawBuffers(fbo, num_elements, bufs);
183
184         fbos[textures] = fbo;
185         glBindFramebuffer(GL_FRAMEBUFFER, fbo);
186 }
187
188 template<size_t num_elements>
189 void PersistentFBOSetWithDepth<num_elements>::render_to(GLuint depth_rb, const array<GLuint, num_elements> &textures)
190 {
191         auto key = make_pair(depth_rb, textures);
192
193         auto it = fbos.find(key);
194         if (it != fbos.end()) {
195                 glBindFramebuffer(GL_FRAMEBUFFER, it->second);
196                 return;
197         }
198
199         GLuint fbo;
200         glCreateFramebuffers(1, &fbo);
201         GLenum bufs[num_elements];
202         glNamedFramebufferRenderbuffer(fbo, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, depth_rb);
203         for (size_t i = 0; i < num_elements; ++i) {
204                 glNamedFramebufferTexture(fbo, GL_COLOR_ATTACHMENT0 + i, textures[i], 0);
205                 bufs[i] = GL_COLOR_ATTACHMENT0 + i;
206         }
207         glNamedFramebufferDrawBuffers(fbo, num_elements, bufs);
208
209         fbos[key] = fbo;
210         glBindFramebuffer(GL_FRAMEBUFFER, fbo);
211 }
212
213 GrayscaleConversion::GrayscaleConversion()
214 {
215         gray_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
216         gray_fs_obj = compile_shader(read_file("gray.frag", _binary_gray_frag_data, _binary_gray_frag_size), GL_FRAGMENT_SHADER);
217         gray_program = link_program(gray_vs_obj, gray_fs_obj);
218
219         // Set up the VAO containing all the required position/texcoord data.
220         glCreateVertexArrays(1, &gray_vao);
221         glBindVertexArray(gray_vao);
222
223         GLint position_attrib = glGetAttribLocation(gray_program, "position");
224         glEnableVertexArrayAttrib(gray_vao, position_attrib);
225         glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
226
227         uniform_tex = glGetUniformLocation(gray_program, "tex");
228 }
229
230 void GrayscaleConversion::exec(GLint tex, GLint gray_tex, int width, int height, int num_layers)
231 {
232         glUseProgram(gray_program);
233         bind_sampler(gray_program, uniform_tex, 0, tex, nearest_sampler);
234
235         glViewport(0, 0, width, height);
236         fbos.render_to(gray_tex);
237         glBindVertexArray(gray_vao);
238         glDisable(GL_BLEND);
239         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
240 }
241
242 Sobel::Sobel()
243 {
244         sobel_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
245         sobel_fs_obj = compile_shader(read_file("sobel.frag", _binary_sobel_frag_data, _binary_sobel_frag_size), GL_FRAGMENT_SHADER);
246         sobel_program = link_program(sobel_vs_obj, sobel_fs_obj);
247
248         uniform_tex = glGetUniformLocation(sobel_program, "tex");
249 }
250
251 void Sobel::exec(GLint tex_view, GLint grad_tex, int level_width, int level_height, int num_layers)
252 {
253         glUseProgram(sobel_program);
254         bind_sampler(sobel_program, uniform_tex, 0, tex_view, nearest_sampler);
255
256         glViewport(0, 0, level_width, level_height);
257         fbos.render_to(grad_tex);
258         glDisable(GL_BLEND);
259         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
260 }
261
262 MotionSearch::MotionSearch(const OperatingPoint &op)
263         : op(op)
264 {
265         motion_vs_obj = compile_shader(read_file("motion_search.vert", _binary_motion_search_vert_data, _binary_motion_search_vert_size), GL_VERTEX_SHADER);
266         motion_fs_obj = compile_shader(read_file("motion_search.frag", _binary_motion_search_frag_data, _binary_motion_search_frag_size), GL_FRAGMENT_SHADER);
267         motion_search_program = link_program(motion_vs_obj, motion_fs_obj);
268
269         uniform_inv_image_size = glGetUniformLocation(motion_search_program, "inv_image_size");
270         uniform_inv_prev_level_size = glGetUniformLocation(motion_search_program, "inv_prev_level_size");
271         uniform_out_flow_size = glGetUniformLocation(motion_search_program, "out_flow_size");
272         uniform_image_tex = glGetUniformLocation(motion_search_program, "image_tex");
273         uniform_grad_tex = glGetUniformLocation(motion_search_program, "grad_tex");
274         uniform_flow_tex = glGetUniformLocation(motion_search_program, "flow_tex");
275         uniform_patch_size = glGetUniformLocation(motion_search_program, "patch_size");
276         uniform_num_iterations = glGetUniformLocation(motion_search_program, "num_iterations");
277 }
278
279 void MotionSearch::exec(GLuint tex_view, GLuint grad_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches, int num_layers)
280 {
281         glUseProgram(motion_search_program);
282
283         bind_sampler(motion_search_program, uniform_image_tex, 0, tex_view, linear_sampler);
284         bind_sampler(motion_search_program, uniform_grad_tex, 1, grad_tex, nearest_sampler);
285         bind_sampler(motion_search_program, uniform_flow_tex, 2, flow_tex, linear_sampler);
286
287         glProgramUniform2f(motion_search_program, uniform_inv_image_size, 1.0f / level_width, 1.0f / level_height);
288         glProgramUniform2f(motion_search_program, uniform_inv_prev_level_size, 1.0f / prev_level_width, 1.0f / prev_level_height);
289         glProgramUniform2f(motion_search_program, uniform_out_flow_size, width_patches, height_patches);
290         glProgramUniform1ui(motion_search_program, uniform_patch_size, op.patch_size_pixels);
291         glProgramUniform1ui(motion_search_program, uniform_num_iterations, op.search_iterations);
292
293         glViewport(0, 0, width_patches, height_patches);
294         fbos.render_to(flow_out_tex);
295         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
296 }
297
298 Densify::Densify(const OperatingPoint &op)
299         : op(op)
300 {
301         densify_vs_obj = compile_shader(read_file("densify.vert", _binary_densify_vert_data, _binary_densify_vert_size), GL_VERTEX_SHADER);
302         densify_fs_obj = compile_shader(read_file("densify.frag", _binary_densify_frag_data, _binary_densify_frag_size), GL_FRAGMENT_SHADER);
303         densify_program = link_program(densify_vs_obj, densify_fs_obj);
304
305         uniform_patch_size = glGetUniformLocation(densify_program, "patch_size");
306         uniform_image_tex = glGetUniformLocation(densify_program, "image_tex");
307         uniform_flow_tex = glGetUniformLocation(densify_program, "flow_tex");
308 }
309
310 void Densify::exec(GLuint tex_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches, int num_layers)
311 {
312         glUseProgram(densify_program);
313
314         bind_sampler(densify_program, uniform_image_tex, 0, tex_view, linear_sampler);
315         bind_sampler(densify_program, uniform_flow_tex, 1, flow_tex, nearest_sampler);
316
317         glProgramUniform2f(densify_program, uniform_patch_size,
318                 float(op.patch_size_pixels) / level_width,
319                 float(op.patch_size_pixels) / level_height);
320
321         glViewport(0, 0, level_width, level_height);
322         glEnable(GL_BLEND);
323         glBlendFunc(GL_ONE, GL_ONE);
324         fbos.render_to(dense_flow_tex);
325         glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
326         glClear(GL_COLOR_BUFFER_BIT);
327         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width_patches * height_patches * num_layers);
328 }
329
330 Prewarp::Prewarp()
331 {
332         prewarp_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
333         prewarp_fs_obj = compile_shader(read_file("prewarp.frag", _binary_prewarp_frag_data, _binary_prewarp_frag_size), GL_FRAGMENT_SHADER);
334         prewarp_program = link_program(prewarp_vs_obj, prewarp_fs_obj);
335
336         uniform_image_tex = glGetUniformLocation(prewarp_program, "image_tex");
337         uniform_flow_tex = glGetUniformLocation(prewarp_program, "flow_tex");
338 }
339
340 void Prewarp::exec(GLuint tex_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, GLuint normalized_flow_tex, int level_width, int level_height, int num_layers)
341 {
342         glUseProgram(prewarp_program);
343
344         bind_sampler(prewarp_program, uniform_image_tex, 0, tex_view, linear_sampler);
345         bind_sampler(prewarp_program, uniform_flow_tex, 1, flow_tex, nearest_sampler);
346
347         glViewport(0, 0, level_width, level_height);
348         glDisable(GL_BLEND);
349         fbos.render_to(I_tex, I_t_tex, normalized_flow_tex);
350         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
351 }
352
353 Derivatives::Derivatives()
354 {
355         derivatives_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
356         derivatives_fs_obj = compile_shader(read_file("derivatives.frag", _binary_derivatives_frag_data, _binary_derivatives_frag_size), GL_FRAGMENT_SHADER);
357         derivatives_program = link_program(derivatives_vs_obj, derivatives_fs_obj);
358
359         uniform_tex = glGetUniformLocation(derivatives_program, "tex");
360 }
361
362 void Derivatives::exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height, int num_layers)
363 {
364         glUseProgram(derivatives_program);
365
366         bind_sampler(derivatives_program, uniform_tex, 0, input_tex, nearest_sampler);
367
368         glViewport(0, 0, level_width, level_height);
369         glDisable(GL_BLEND);
370         fbos.render_to(I_x_y_tex, beta_0_tex);
371         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
372 }
373
374 ComputeDiffusivity::ComputeDiffusivity()
375 {
376         diffusivity_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
377         diffusivity_fs_obj = compile_shader(read_file("diffusivity.frag", _binary_diffusivity_frag_data, _binary_diffusivity_frag_size), GL_FRAGMENT_SHADER);
378         diffusivity_program = link_program(diffusivity_vs_obj, diffusivity_fs_obj);
379
380         uniform_flow_tex = glGetUniformLocation(diffusivity_program, "flow_tex");
381         uniform_diff_flow_tex = glGetUniformLocation(diffusivity_program, "diff_flow_tex");
382         uniform_alpha = glGetUniformLocation(diffusivity_program, "alpha");
383         uniform_zero_diff_flow = glGetUniformLocation(diffusivity_program, "zero_diff_flow");
384 }
385
386 void ComputeDiffusivity::exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers)
387 {
388         glUseProgram(diffusivity_program);
389
390         bind_sampler(diffusivity_program, uniform_flow_tex, 0, flow_tex, nearest_sampler);
391         bind_sampler(diffusivity_program, uniform_diff_flow_tex, 1, diff_flow_tex, nearest_sampler);
392         glProgramUniform1f(diffusivity_program, uniform_alpha, vr_alpha);
393         glProgramUniform1i(diffusivity_program, uniform_zero_diff_flow, zero_diff_flow);
394
395         glViewport(0, 0, level_width, level_height);
396
397         glDisable(GL_BLEND);
398         fbos.render_to(diffusivity_tex);
399         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
400 }
401
402 SetupEquations::SetupEquations()
403 {
404         equations_vs_obj = compile_shader(read_file("equations.vert", _binary_equations_vert_data, _binary_equations_vert_size), GL_VERTEX_SHADER);
405         equations_fs_obj = compile_shader(read_file("equations.frag", _binary_equations_frag_data, _binary_equations_frag_size), GL_FRAGMENT_SHADER);
406         equations_program = link_program(equations_vs_obj, equations_fs_obj);
407
408         uniform_I_x_y_tex = glGetUniformLocation(equations_program, "I_x_y_tex");
409         uniform_I_t_tex = glGetUniformLocation(equations_program, "I_t_tex");
410         uniform_diff_flow_tex = glGetUniformLocation(equations_program, "diff_flow_tex");
411         uniform_base_flow_tex = glGetUniformLocation(equations_program, "base_flow_tex");
412         uniform_beta_0_tex = glGetUniformLocation(equations_program, "beta_0_tex");
413         uniform_diffusivity_tex = glGetUniformLocation(equations_program, "diffusivity_tex");
414         uniform_gamma = glGetUniformLocation(equations_program, "gamma");
415         uniform_delta = glGetUniformLocation(equations_program, "delta");
416         uniform_zero_diff_flow = glGetUniformLocation(equations_program, "zero_diff_flow");
417 }
418
419 void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint base_flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers)
420 {
421         glUseProgram(equations_program);
422
423         bind_sampler(equations_program, uniform_I_x_y_tex, 0, I_x_y_tex, nearest_sampler);
424         bind_sampler(equations_program, uniform_I_t_tex, 1, I_t_tex, nearest_sampler);
425         bind_sampler(equations_program, uniform_diff_flow_tex, 2, diff_flow_tex, nearest_sampler);
426         bind_sampler(equations_program, uniform_base_flow_tex, 3, base_flow_tex, nearest_sampler);
427         bind_sampler(equations_program, uniform_beta_0_tex, 4, beta_0_tex, nearest_sampler);
428         bind_sampler(equations_program, uniform_diffusivity_tex, 5, diffusivity_tex, zero_border_sampler);
429         glProgramUniform1f(equations_program, uniform_delta, vr_delta);
430         glProgramUniform1f(equations_program, uniform_gamma, vr_gamma);
431         glProgramUniform1i(equations_program, uniform_zero_diff_flow, zero_diff_flow);
432
433         glViewport(0, 0, (level_width + 1) / 2, level_height);
434         glDisable(GL_BLEND);
435         fbos.render_to(equation_red_tex, equation_black_tex);
436         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
437 }
438
439 SOR::SOR()
440 {
441         sor_vs_obj = compile_shader(read_file("sor.vert", _binary_sor_vert_data, _binary_sor_vert_size), GL_VERTEX_SHADER);
442         sor_fs_obj = compile_shader(read_file("sor.frag", _binary_sor_frag_data, _binary_sor_frag_size), GL_FRAGMENT_SHADER);
443         sor_program = link_program(sor_vs_obj, sor_fs_obj);
444
445         uniform_diff_flow_tex = glGetUniformLocation(sor_program, "diff_flow_tex");
446         uniform_equation_red_tex = glGetUniformLocation(sor_program, "equation_red_tex");
447         uniform_equation_black_tex = glGetUniformLocation(sor_program, "equation_black_tex");
448         uniform_diffusivity_tex = glGetUniformLocation(sor_program, "diffusivity_tex");
449         uniform_phase = glGetUniformLocation(sor_program, "phase");
450         uniform_num_nonzero_phases = glGetUniformLocation(sor_program, "num_nonzero_phases");
451 }
452
453 void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, int num_layers, ScopedTimer *sor_timer)
454 {
455         glUseProgram(sor_program);
456
457         bind_sampler(sor_program, uniform_diff_flow_tex, 0, diff_flow_tex, nearest_sampler);
458         bind_sampler(sor_program, uniform_diffusivity_tex, 1, diffusivity_tex, zero_border_sampler);
459         bind_sampler(sor_program, uniform_equation_red_tex, 2, equation_red_tex, nearest_sampler);
460         bind_sampler(sor_program, uniform_equation_black_tex, 3, equation_black_tex, nearest_sampler);
461
462         if (!zero_diff_flow) {
463                 glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 2);
464         }
465
466         // NOTE: We bind to the texture we are rendering from, but we never write any value
467         // that we read in the same shader pass (we call discard for red values when we compute
468         // black, and vice versa), and we have barriers between the passes, so we're fine
469         // as per the spec.
470         glViewport(0, 0, level_width, level_height);
471         glDisable(GL_BLEND);
472         fbos.render_to(diff_flow_tex);
473
474         for (int i = 0; i < num_iterations; ++i) {
475                 {
476                         ScopedTimer timer("Red pass", sor_timer);
477                         if (zero_diff_flow && i == 0) {
478                                 glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 0);
479                         }
480                         glProgramUniform1i(sor_program, uniform_phase, 0);
481                         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
482                         glTextureBarrier();
483                 }
484                 {
485                         ScopedTimer timer("Black pass", sor_timer);
486                         if (zero_diff_flow && i == 0) {
487                                 glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 1);
488                         }
489                         glProgramUniform1i(sor_program, uniform_phase, 1);
490                         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
491                         if (zero_diff_flow && i == 0) {
492                                 glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 2);
493                         }
494                         if (i != num_iterations - 1) {
495                                 glTextureBarrier();
496                         }
497                 }
498         }
499 }
500
501 AddBaseFlow::AddBaseFlow()
502 {
503         add_flow_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
504         add_flow_fs_obj = compile_shader(read_file("add_base_flow.frag", _binary_add_base_flow_frag_data, _binary_add_base_flow_frag_size), GL_FRAGMENT_SHADER);
505         add_flow_program = link_program(add_flow_vs_obj, add_flow_fs_obj);
506
507         uniform_diff_flow_tex = glGetUniformLocation(add_flow_program, "diff_flow_tex");
508 }
509
510 void AddBaseFlow::exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height, int num_layers)
511 {
512         glUseProgram(add_flow_program);
513
514         bind_sampler(add_flow_program, uniform_diff_flow_tex, 0, diff_flow_tex, nearest_sampler);
515
516         glViewport(0, 0, level_width, level_height);
517         glEnable(GL_BLEND);
518         glBlendFunc(GL_ONE, GL_ONE);
519         fbos.render_to(base_flow_tex);
520
521         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
522 }
523
524 ResizeFlow::ResizeFlow()
525 {
526         resize_flow_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
527         resize_flow_fs_obj = compile_shader(read_file("resize_flow.frag", _binary_resize_flow_frag_data, _binary_resize_flow_frag_size), GL_FRAGMENT_SHADER);
528         resize_flow_program = link_program(resize_flow_vs_obj, resize_flow_fs_obj);
529
530         uniform_flow_tex = glGetUniformLocation(resize_flow_program, "flow_tex");
531         uniform_scale_factor = glGetUniformLocation(resize_flow_program, "scale_factor");
532 }
533
534 void ResizeFlow::exec(GLuint flow_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height, int num_layers)
535 {
536         glUseProgram(resize_flow_program);
537
538         bind_sampler(resize_flow_program, uniform_flow_tex, 0, flow_tex, nearest_sampler);
539
540         glProgramUniform2f(resize_flow_program, uniform_scale_factor, float(output_width) / input_width, float(output_height) / input_height);
541
542         glViewport(0, 0, output_width, output_height);
543         glDisable(GL_BLEND);
544         fbos.render_to(out_tex);
545
546         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
547 }
548
549 DISComputeFlow::DISComputeFlow(int width, int height, const OperatingPoint &op)
550         : width(width), height(height), op(op), motion_search(op), densify(op)
551 {
552         // Make some samplers.
553         glCreateSamplers(1, &nearest_sampler);
554         glSamplerParameteri(nearest_sampler, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
555         glSamplerParameteri(nearest_sampler, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
556         glSamplerParameteri(nearest_sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
557         glSamplerParameteri(nearest_sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
558
559         glCreateSamplers(1, &linear_sampler);
560         glSamplerParameteri(linear_sampler, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
561         glSamplerParameteri(linear_sampler, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
562         glSamplerParameteri(linear_sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
563         glSamplerParameteri(linear_sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
564
565         // The smoothness is sampled so that once we get to a smoothness involving
566         // a value outside the border, the diffusivity between the two becomes zero.
567         // Similarly, gradients are zero outside the border, since the edge is taken
568         // to be constant.
569         glCreateSamplers(1, &zero_border_sampler);
570         glSamplerParameteri(zero_border_sampler, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
571         glSamplerParameteri(zero_border_sampler, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
572         glSamplerParameteri(zero_border_sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
573         glSamplerParameteri(zero_border_sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
574         float zero[] = { 0.0f, 0.0f, 0.0f, 0.0f };  // Note that zero alpha means we can also see whether we sampled outside the border or not.
575         glSamplerParameterfv(zero_border_sampler, GL_TEXTURE_BORDER_COLOR, zero);
576
577         // Initial flow is zero, 1x1.
578         glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &initial_flow_tex);
579         glTextureStorage3D(initial_flow_tex, 1, GL_RG16F, 1, 1, 1);
580         glClearTexImage(initial_flow_tex, 0, GL_RG, GL_FLOAT, nullptr);
581
582         // Set up the vertex data that will be shared between all passes.
583         float vertices[] = {
584                 0.0f, 1.0f,
585                 0.0f, 0.0f,
586                 1.0f, 1.0f,
587                 1.0f, 0.0f,
588         };
589         glCreateBuffers(1, &vertex_vbo);
590         glNamedBufferData(vertex_vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
591
592         glCreateVertexArrays(1, &vao);
593         glBindVertexArray(vao);
594         glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo);
595
596         GLint position_attrib = 0;  // Hard-coded in every vertex shader.
597         glEnableVertexArrayAttrib(vao, position_attrib);
598         glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
599 }
600
601 GLuint DISComputeFlow::exec(GLuint tex, FlowDirection flow_direction, ResizeStrategy resize_strategy)
602 {
603         int num_layers = (flow_direction == FORWARD_AND_BACKWARD) ? 2 : 1;
604         int prev_level_width = 1, prev_level_height = 1;
605         GLuint prev_level_flow_tex = initial_flow_tex;
606
607         GPUTimers timers;
608
609         glBindVertexArray(vao);
610         glDisable(GL_DITHER);
611
612         ScopedTimer total_timer("Compute flow", &timers);
613         for (int level = op.coarsest_level; level >= int(op.finest_level); --level) {
614                 char timer_name[256];
615                 snprintf(timer_name, sizeof(timer_name), "Level %d (%d x %d)", level, width >> level, height >> level);
616                 ScopedTimer level_timer(timer_name, &total_timer);
617
618                 int level_width = width >> level;
619                 int level_height = height >> level;
620                 float patch_spacing_pixels = op.patch_size_pixels * (1.0f - op.patch_overlap_ratio);
621
622                 // Make sure we have patches at least every Nth pixel, e.g. for width=9
623                 // and patch_spacing=3 (the default), we put out patch centers in
624                 // x=0, x=3, x=6, x=9, which is four patches. The fragment shader will
625                 // lock all the centers to integer coordinates if needed.
626                 int width_patches = 1 + ceil(level_width / patch_spacing_pixels);
627                 int height_patches = 1 + ceil(level_height / patch_spacing_pixels);
628
629                 // Make sure we always read from the correct level; the chosen
630                 // mipmapping could otherwise be rather unpredictable, especially
631                 // during motion search.
632                 GLuint tex_view;
633                 glGenTextures(1, &tex_view);
634                 glTextureView(tex_view, GL_TEXTURE_2D_ARRAY, tex, GL_R8, level, 1, 0, 2);
635
636                 // Create a new texture to hold the gradients.
637                 GLuint grad_tex = pool.get_texture(GL_R32UI, level_width, level_height, num_layers);
638
639                 // Find the derivative.
640                 {
641                         ScopedTimer timer("Sobel", &level_timer);
642                         sobel.exec(tex_view, grad_tex, level_width, level_height, num_layers);
643                 }
644
645                 // Motion search to find the initial flow. We use the flow from the previous
646                 // level (sampled bilinearly; no fancy tricks) as a guide, then search from there.
647
648                 // Create an output flow texture.
649                 GLuint flow_out_tex = pool.get_texture(GL_RGB16F, width_patches, height_patches, num_layers);
650
651                 // And draw.
652                 {
653                         ScopedTimer timer("Motion search", &level_timer);
654                         motion_search.exec(tex_view, grad_tex, prev_level_flow_tex, flow_out_tex, level_width, level_height, prev_level_width, prev_level_height, width_patches, height_patches, num_layers);
655                 }
656                 pool.release_texture(grad_tex);
657
658                 // Densification.
659
660                 // Set up an output texture (cleared in Densify).
661                 GLuint dense_flow_tex = pool.get_texture(GL_RGB16F, level_width, level_height, num_layers);
662
663                 // And draw.
664                 {
665                         ScopedTimer timer("Densification", &level_timer);
666                         densify.exec(tex_view, flow_out_tex, dense_flow_tex, level_width, level_height, width_patches, height_patches, num_layers);
667                 }
668                 pool.release_texture(flow_out_tex);
669
670                 // Everything below here in the loop belongs to variational refinement.
671                 ScopedTimer varref_timer("Variational refinement", &level_timer);
672
673                 // Prewarping; create I and I_t, and a normalized base flow (so we don't
674                 // have to normalize it over and over again, and also save some bandwidth).
675                 //
676                 // During the entire rest of the variational refinement, flow will be measured
677                 // in pixels, not 0..1 normalized OpenGL texture coordinates.
678                 // This is because variational refinement depends so heavily on derivatives,
679                 // which are measured in intensity levels per pixel.
680                 GLuint I_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
681                 GLuint I_t_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
682                 GLuint base_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
683                 {
684                         ScopedTimer timer("Prewarping", &varref_timer);
685                         prewarp.exec(tex_view, dense_flow_tex, I_tex, I_t_tex, base_flow_tex, level_width, level_height, num_layers);
686                 }
687                 pool.release_texture(dense_flow_tex);
688                 glDeleteTextures(1, &tex_view);
689
690                 // TODO: If we don't have variational refinement, we don't need I and I_t,
691                 // so computing them is a waste.
692                 if (op.variational_refinement) {
693                         // Calculate I_x and I_y. We're only calculating first derivatives;
694                         // the others will be taken on-the-fly in order to sample from fewer
695                         // textures overall, since sampling from the L1 cache is cheap.
696                         // (TODO: Verify that this is indeed faster than making separate
697                         // double-derivative textures.)
698                         GLuint I_x_y_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
699                         GLuint beta_0_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
700                         {
701                                 ScopedTimer timer("First derivatives", &varref_timer);
702                                 derivatives.exec(I_tex, I_x_y_tex, beta_0_tex, level_width, level_height, num_layers);
703                         }
704                         pool.release_texture(I_tex);
705
706                         // We need somewhere to store du and dv (the flow increment, relative
707                         // to the non-refined base flow u0 and v0). It's initially garbage,
708                         // but not read until we've written something sane to it.
709                         GLuint diff_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
710
711                         // And for diffusivity.
712                         GLuint diffusivity_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
713
714                         // And finally for the equation set. See SetupEquations for
715                         // the storage format.
716                         GLuint equation_red_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height, num_layers);
717                         GLuint equation_black_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height, num_layers);
718
719                         for (int outer_idx = 0; outer_idx < level + 1; ++outer_idx) {
720                                 // Calculate the diffusivity term for each pixel.
721                                 {
722                                         ScopedTimer timer("Compute diffusivity", &varref_timer);
723                                         compute_diffusivity.exec(base_flow_tex, diff_flow_tex, diffusivity_tex, level_width, level_height, outer_idx == 0, num_layers);
724                                 }
725
726                                 // Set up the 2x2 equation system for each pixel.
727                                 {
728                                         ScopedTimer timer("Set up equations", &varref_timer);
729                                         setup_equations.exec(I_x_y_tex, I_t_tex, diff_flow_tex, base_flow_tex, beta_0_tex, diffusivity_tex, equation_red_tex, equation_black_tex, level_width, level_height, outer_idx == 0, num_layers);
730                                 }
731
732                                 // Run a few SOR iterations. Note that these are to/from the same texture.
733                                 {
734                                         ScopedTimer timer("SOR", &varref_timer);
735                                         sor.exec(diff_flow_tex, equation_red_tex, equation_black_tex, diffusivity_tex, level_width, level_height, 5, outer_idx == 0, num_layers, &timer);
736                                 }
737                         }
738
739                         pool.release_texture(I_t_tex);
740                         pool.release_texture(I_x_y_tex);
741                         pool.release_texture(beta_0_tex);
742                         pool.release_texture(diffusivity_tex);
743                         pool.release_texture(equation_red_tex);
744                         pool.release_texture(equation_black_tex);
745
746                         // Add the differential flow found by the variational refinement to the base flow,
747                         // giving the final flow estimate for this level.
748                         // The output is in base_flow_tex; we don't need to make a new texture.
749                         {
750                                 ScopedTimer timer("Add differential flow", &varref_timer);
751                                 add_base_flow.exec(base_flow_tex, diff_flow_tex, level_width, level_height, num_layers);
752                         }
753                         pool.release_texture(diff_flow_tex);
754                 }
755
756                 if (prev_level_flow_tex != initial_flow_tex) {
757                         pool.release_texture(prev_level_flow_tex);
758                 }
759                 prev_level_flow_tex = base_flow_tex;
760                 prev_level_width = level_width;
761                 prev_level_height = level_height;
762         }
763         total_timer.end();
764
765         if (!in_warmup) {
766                 timers.print();
767         }
768
769         // Scale up the flow to the final size (if needed).
770         if (op.finest_level == 0 || resize_strategy == DO_NOT_RESIZE_FLOW) {
771                 return prev_level_flow_tex;
772         } else {
773                 GLuint final_tex = pool.get_texture(GL_RG16F, width, height, num_layers);
774                 resize_flow.exec(prev_level_flow_tex, final_tex, prev_level_width, prev_level_height, width, height, num_layers);
775                 pool.release_texture(prev_level_flow_tex);
776                 return final_tex;
777         }
778 }
779
780 Splat::Splat(const OperatingPoint &op)
781         : op(op)
782 {
783         splat_vs_obj = compile_shader(read_file("splat.vert", _binary_splat_vert_data, _binary_splat_vert_size), GL_VERTEX_SHADER);
784         splat_fs_obj = compile_shader(read_file("splat.frag", _binary_splat_frag_data, _binary_splat_frag_size), GL_FRAGMENT_SHADER);
785         splat_program = link_program(splat_vs_obj, splat_fs_obj);
786
787         uniform_splat_size = glGetUniformLocation(splat_program, "splat_size");
788         uniform_alpha = glGetUniformLocation(splat_program, "alpha");
789         uniform_gray_tex = glGetUniformLocation(splat_program, "gray_tex");
790         uniform_flow_tex = glGetUniformLocation(splat_program, "flow_tex");
791         uniform_inv_flow_size = glGetUniformLocation(splat_program, "inv_flow_size");
792 }
793
794 void Splat::exec(GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha)
795 {
796         glUseProgram(splat_program);
797
798         bind_sampler(splat_program, uniform_gray_tex, 0, gray_tex, linear_sampler);
799         bind_sampler(splat_program, uniform_flow_tex, 1, bidirectional_flow_tex, nearest_sampler);
800
801         glProgramUniform2f(splat_program, uniform_splat_size, op.splat_size / width, op.splat_size / height);
802         glProgramUniform1f(splat_program, uniform_alpha, alpha);
803         glProgramUniform2f(splat_program, uniform_inv_flow_size, 1.0f / width, 1.0f / height);
804
805         glViewport(0, 0, width, height);
806         glDisable(GL_BLEND);
807         glEnable(GL_DEPTH_TEST);
808         glDepthMask(GL_TRUE);
809         glDepthFunc(GL_LESS);  // We store the difference between I_0 and I_1, where less difference is good. (Default 1.0 is effectively +inf, which always loses.)
810
811         fbos.render_to(depth_rb, flow_tex);
812
813         // Evidently NVIDIA doesn't use fast clears for glClearTexImage, so clear now that
814         // we've got it bound.
815         glClearColor(1000.0f, 1000.0f, 0.0f, 1.0f);  // Invalid flow.
816         glClearDepth(1.0f);  // Effectively infinity.
817         glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
818
819         glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width * height * 2);
820
821         glDisable(GL_DEPTH_TEST);
822 }
823
824 HoleFill::HoleFill()
825 {
826         fill_vs_obj = compile_shader(read_file("hole_fill.vert", _binary_hole_fill_vert_data, _binary_hole_fill_vert_size), GL_VERTEX_SHADER);
827         fill_fs_obj = compile_shader(read_file("hole_fill.frag", _binary_hole_fill_frag_data, _binary_hole_fill_frag_size), GL_FRAGMENT_SHADER);
828         fill_program = link_program(fill_vs_obj, fill_fs_obj);
829
830         uniform_tex = glGetUniformLocation(fill_program, "tex");
831         uniform_z = glGetUniformLocation(fill_program, "z");
832         uniform_sample_offset = glGetUniformLocation(fill_program, "sample_offset");
833 }
834
835 void HoleFill::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int width, int height)
836 {
837         glUseProgram(fill_program);
838
839         bind_sampler(fill_program, uniform_tex, 0, flow_tex, nearest_sampler);
840
841         glProgramUniform1f(fill_program, uniform_z, 1.0f - 1.0f / 1024.0f);
842
843         glViewport(0, 0, width, height);
844         glDisable(GL_BLEND);
845         glEnable(GL_DEPTH_TEST);
846         glDepthFunc(GL_LESS);  // Only update the values > 0.999f (ie., only invalid pixels).
847
848         fbos.render_to(depth_rb, flow_tex);  // NOTE: Reading and writing to the same texture.
849
850         // Fill holes from the left, by shifting 1, 2, 4, 8, etc. pixels to the right.
851         for (int offs = 1; offs < width; offs *= 2) {
852                 glProgramUniform2f(fill_program, uniform_sample_offset, -offs / float(width), 0.0f);
853                 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
854                 glTextureBarrier();
855         }
856         glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, temp_tex[0], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
857
858         // Similar to the right; adjust Z a bit down, so that we re-fill the pixels that
859         // were overwritten in the last algorithm.
860         glProgramUniform1f(fill_program, uniform_z, 1.0f - 2.0f / 1024.0f);
861         for (int offs = 1; offs < width; offs *= 2) {
862                 glProgramUniform2f(fill_program, uniform_sample_offset, offs / float(width), 0.0f);
863                 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
864                 glTextureBarrier();
865         }
866         glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, temp_tex[1], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
867
868         // Up.
869         glProgramUniform1f(fill_program, uniform_z, 1.0f - 3.0f / 1024.0f);
870         for (int offs = 1; offs < height; offs *= 2) {
871                 glProgramUniform2f(fill_program, uniform_sample_offset, 0.0f, -offs / float(height));
872                 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
873                 glTextureBarrier();
874         }
875         glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, temp_tex[2], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
876
877         // Down.
878         glProgramUniform1f(fill_program, uniform_z, 1.0f - 4.0f / 1024.0f);
879         for (int offs = 1; offs < height; offs *= 2) {
880                 glProgramUniform2f(fill_program, uniform_sample_offset, 0.0f, offs / float(height));
881                 glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
882                 glTextureBarrier();
883         }
884
885         glDisable(GL_DEPTH_TEST);
886 }
887
888 HoleBlend::HoleBlend()
889 {
890         blend_vs_obj = compile_shader(read_file("hole_fill.vert", _binary_hole_fill_vert_data, _binary_hole_fill_vert_size), GL_VERTEX_SHADER);  // Reuse the vertex shader from the fill.
891         blend_fs_obj = compile_shader(read_file("hole_blend.frag", _binary_hole_blend_frag_data, _binary_hole_blend_frag_size), GL_FRAGMENT_SHADER);
892         blend_program = link_program(blend_vs_obj, blend_fs_obj);
893
894         uniform_left_tex = glGetUniformLocation(blend_program, "left_tex");
895         uniform_right_tex = glGetUniformLocation(blend_program, "right_tex");
896         uniform_up_tex = glGetUniformLocation(blend_program, "up_tex");
897         uniform_down_tex = glGetUniformLocation(blend_program, "down_tex");
898         uniform_z = glGetUniformLocation(blend_program, "z");
899         uniform_sample_offset = glGetUniformLocation(blend_program, "sample_offset");
900 }
901
902 void HoleBlend::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int width, int height)
903 {
904         glUseProgram(blend_program);
905
906         bind_sampler(blend_program, uniform_left_tex, 0, temp_tex[0], nearest_sampler);
907         bind_sampler(blend_program, uniform_right_tex, 1, temp_tex[1], nearest_sampler);
908         bind_sampler(blend_program, uniform_up_tex, 2, temp_tex[2], nearest_sampler);
909         bind_sampler(blend_program, uniform_down_tex, 3, flow_tex, nearest_sampler);
910
911         glProgramUniform1f(blend_program, uniform_z, 1.0f - 4.0f / 1024.0f);
912         glProgramUniform2f(blend_program, uniform_sample_offset, 0.0f, 0.0f);
913
914         glViewport(0, 0, width, height);
915         glDisable(GL_BLEND);
916         glEnable(GL_DEPTH_TEST);
917         glDepthFunc(GL_LEQUAL);  // Skip over all of the pixels that were never holes to begin with.
918
919         fbos.render_to(depth_rb, flow_tex);  // NOTE: Reading and writing to the same texture.
920
921         glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
922
923         glDisable(GL_DEPTH_TEST);
924 }
925
926 Blend::Blend(bool split_ycbcr_output)
927         : split_ycbcr_output(split_ycbcr_output)
928 {
929         string frag_shader = read_file("blend.frag", _binary_blend_frag_data, _binary_blend_frag_size);
930         if (split_ycbcr_output) {
931                 // Insert after the first #version line.
932                 size_t offset = frag_shader.find('\n');
933                 assert(offset != string::npos);
934                 frag_shader = frag_shader.substr(0, offset + 1) + "#define SPLIT_YCBCR_OUTPUT 1\n" + frag_shader.substr(offset + 1);
935         }
936
937         blend_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
938         blend_fs_obj = compile_shader(frag_shader, GL_FRAGMENT_SHADER);
939         blend_program = link_program(blend_vs_obj, blend_fs_obj);
940
941         uniform_image_tex = glGetUniformLocation(blend_program, "image_tex");
942         uniform_flow_tex = glGetUniformLocation(blend_program, "flow_tex");
943         uniform_alpha = glGetUniformLocation(blend_program, "alpha");
944         uniform_flow_consistency_tolerance = glGetUniformLocation(blend_program, "flow_consistency_tolerance");
945 }
946
947 void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int level_width, int level_height, float alpha)
948 {
949         glUseProgram(blend_program);
950         bind_sampler(blend_program, uniform_image_tex, 0, image_tex, linear_sampler);
951         bind_sampler(blend_program, uniform_flow_tex, 1, flow_tex, linear_sampler);  // May be upsampled.
952         glProgramUniform1f(blend_program, uniform_alpha, alpha);
953
954         glViewport(0, 0, level_width, level_height);
955         if (split_ycbcr_output) {
956                 fbos_split.render_to(output_tex, output2_tex);
957         } else {
958                 fbos.render_to(output_tex);
959         }
960         glDisable(GL_BLEND);  // A bit ironic, perhaps.
961         glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
962 }
963
964 Interpolate::Interpolate(const OperatingPoint &op, bool split_ycbcr_output)
965         : flow_level(op.finest_level),
966           split_ycbcr_output(split_ycbcr_output),
967           splat(op),
968           blend(split_ycbcr_output) {
969         // Set up the vertex data that will be shared between all passes.
970         float vertices[] = {
971                 0.0f, 1.0f,
972                 0.0f, 0.0f,
973                 1.0f, 1.0f,
974                 1.0f, 0.0f,
975         };
976         glCreateBuffers(1, &vertex_vbo);
977         glNamedBufferData(vertex_vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
978
979         glCreateVertexArrays(1, &vao);
980         glBindVertexArray(vao);
981         glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo);
982
983         GLint position_attrib = 0;  // Hard-coded in every vertex shader.
984         glEnableVertexArrayAttrib(vao, position_attrib);
985         glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
986 }
987
988 pair<GLuint, GLuint> Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
989 {
990         GPUTimers timers;
991
992         ScopedTimer total_timer("Interpolate", &timers);
993
994         glBindVertexArray(vao);
995         glDisable(GL_DITHER);
996
997         // Pick out the right level to test splatting results on.
998         GLuint tex_view;
999         glGenTextures(1, &tex_view);
1000         glTextureView(tex_view, GL_TEXTURE_2D_ARRAY, gray_tex, GL_R8, flow_level, 1, 0, 2);
1001
1002         int flow_width = width >> flow_level;
1003         int flow_height = height >> flow_level;
1004
1005         GLuint flow_tex = pool.get_texture(GL_RG16F, flow_width, flow_height);
1006         GLuint depth_rb = pool.get_renderbuffer(GL_DEPTH_COMPONENT16, flow_width, flow_height);  // Used for ranking flows.
1007
1008         {
1009                 ScopedTimer timer("Splat", &total_timer);
1010                 splat.exec(tex_view, bidirectional_flow_tex, flow_tex, depth_rb, flow_width, flow_height, alpha);
1011         }
1012         glDeleteTextures(1, &tex_view);
1013
1014         GLuint temp_tex[3];
1015         temp_tex[0] = pool.get_texture(GL_RG16F, flow_width, flow_height);
1016         temp_tex[1] = pool.get_texture(GL_RG16F, flow_width, flow_height);
1017         temp_tex[2] = pool.get_texture(GL_RG16F, flow_width, flow_height);
1018
1019         {
1020                 ScopedTimer timer("Fill holes", &total_timer);
1021                 hole_fill.exec(flow_tex, depth_rb, temp_tex, flow_width, flow_height);
1022                 hole_blend.exec(flow_tex, depth_rb, temp_tex, flow_width, flow_height);
1023         }
1024
1025         pool.release_texture(temp_tex[0]);
1026         pool.release_texture(temp_tex[1]);
1027         pool.release_texture(temp_tex[2]);
1028         pool.release_renderbuffer(depth_rb);
1029
1030         GLuint output_tex, output2_tex = 0;
1031         if (split_ycbcr_output) {
1032                 output_tex = pool.get_texture(GL_R8, width, height);
1033                 output2_tex = pool.get_texture(GL_RG8, width, height);
1034                 {
1035                         ScopedTimer timer("Blend", &total_timer);
1036                         blend.exec(image_tex, flow_tex, output_tex, output2_tex, width, height, alpha);
1037                 }
1038         } else {
1039                 output_tex = pool.get_texture(GL_RGBA8, width, height);
1040                 {
1041                         ScopedTimer timer("Blend", &total_timer);
1042                         blend.exec(image_tex, flow_tex, output_tex, 0, width, height, alpha);
1043                 }
1044         }
1045         pool.release_texture(flow_tex);
1046         total_timer.end();
1047         if (!in_warmup) {
1048                 timers.print();
1049         }
1050
1051         return make_pair(output_tex, output2_tex);
1052 }
1053
1054 GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers)
1055 {
1056         {
1057                 lock_guard<mutex> lock(mu);
1058                 for (Texture &tex : textures) {
1059                         if (!tex.in_use && !tex.is_renderbuffer && tex.format == format &&
1060                             tex.width == width && tex.height == height && tex.num_layers == num_layers) {
1061                                 tex.in_use = true;
1062                                 return tex.tex_num;
1063                         }
1064                 }
1065         }
1066
1067         Texture tex;
1068         if (num_layers == 0) {
1069                 glCreateTextures(GL_TEXTURE_2D, 1, &tex.tex_num);
1070                 glTextureStorage2D(tex.tex_num, 1, format, width, height);
1071         } else {
1072                 glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex.tex_num);
1073                 glTextureStorage3D(tex.tex_num, 1, format, width, height, num_layers);
1074         }
1075         tex.format = format;
1076         tex.width = width;
1077         tex.height = height;
1078         tex.num_layers = num_layers;
1079         tex.in_use = true;
1080         tex.is_renderbuffer = false;
1081         {
1082                 lock_guard<mutex> lock(mu);
1083                 textures.push_back(tex);
1084         }
1085         return tex.tex_num;
1086 }
1087
1088 GLuint TexturePool::get_renderbuffer(GLenum format, GLuint width, GLuint height)
1089 {
1090         {
1091                 lock_guard<mutex> lock(mu);
1092                 for (Texture &tex : textures) {
1093                         if (!tex.in_use && tex.is_renderbuffer && tex.format == format &&
1094                             tex.width == width && tex.height == height) {
1095                                 tex.in_use = true;
1096                                 return tex.tex_num;
1097                         }
1098                 }
1099         }
1100
1101         Texture tex;
1102         glCreateRenderbuffers(1, &tex.tex_num);
1103         glNamedRenderbufferStorage(tex.tex_num, format, width, height);
1104
1105         tex.format = format;
1106         tex.width = width;
1107         tex.height = height;
1108         tex.in_use = true;
1109         tex.is_renderbuffer = true;
1110         {
1111                 lock_guard<mutex> lock(mu);
1112                 textures.push_back(tex);
1113         }
1114         return tex.tex_num;
1115 }
1116
1117 void TexturePool::release_texture(GLuint tex_num)
1118 {
1119         lock_guard<mutex> lock(mu);
1120         for (Texture &tex : textures) {
1121                 if (!tex.is_renderbuffer && tex.tex_num == tex_num) {
1122                         assert(tex.in_use);
1123                         tex.in_use = false;
1124                         return;
1125                 }
1126         }
1127         assert(false);
1128 }
1129
1130 void TexturePool::release_renderbuffer(GLuint tex_num)
1131 {
1132         lock_guard<mutex> lock(mu);
1133         for (Texture &tex : textures) {
1134                 if (tex.is_renderbuffer && tex.tex_num == tex_num) {
1135                         assert(tex.in_use);
1136                         tex.in_use = false;
1137                         return;
1138                 }
1139         }
1140         //assert(false);
1141 }