git.sesse.net Git - nageru/commitdiff
Merge remote-tracking branch 'futatabi/master'
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Sat, 1 Dec 2018 23:11:12 +0000 (00:11 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Sat, 1 Dec 2018 23:11:12 +0000 (00:11 +0100)
This merges Nageru and Futatabi, since they are fairly closely related
and also share a fair amount of code.

88 files changed:
.gitignore
futatabi/add_base_flow.frag [new file with mode: 0644]
futatabi/bin2h.cpp [new file with mode: 0644]
futatabi/blend.frag [new file with mode: 0644]
futatabi/chroma_subsample.frag [new file with mode: 0644]
futatabi/chroma_subsample.vert [new file with mode: 0644]
futatabi/chroma_subsampler.cpp [new file with mode: 0644]
futatabi/chroma_subsampler.h [new file with mode: 0644]
futatabi/clip_list.cpp [new file with mode: 0644]
futatabi/clip_list.h [new file with mode: 0644]
futatabi/context.cpp [new file with mode: 0644]
futatabi/context.h [new file with mode: 0644]
futatabi/db.cpp [new file with mode: 0644]
futatabi/db.h [new file with mode: 0644]
futatabi/defs.h [new file with mode: 0644]
futatabi/densify.frag [new file with mode: 0644]
futatabi/densify.vert [new file with mode: 0644]
futatabi/derivatives.frag [new file with mode: 0644]
futatabi/diffusivity.frag [new file with mode: 0644]
futatabi/disk_space_estimator.cpp [new file with mode: 0644]
futatabi/disk_space_estimator.h [new file with mode: 0644]
futatabi/embedded_files.h [new file with mode: 0644]
futatabi/equations.frag [new file with mode: 0644]
futatabi/equations.vert [new file with mode: 0644]
futatabi/eval.cpp [new file with mode: 0644]
futatabi/ffmpeg_raii.cpp [new file with mode: 0644]
futatabi/ffmpeg_raii.h [new file with mode: 0644]
futatabi/flags.cpp [new file with mode: 0644]
futatabi/flags.h [new file with mode: 0644]
futatabi/flow.cpp [new file with mode: 0644]
futatabi/flow.h [new file with mode: 0644]
futatabi/flow_main.cpp [new file with mode: 0644]
futatabi/frame.proto [new file with mode: 0644]
futatabi/frame_on_disk.cpp [new file with mode: 0644]
futatabi/frame_on_disk.h [new file with mode: 0644]
futatabi/gpu_timers.cpp [new file with mode: 0644]
futatabi/gpu_timers.h [new file with mode: 0644]
futatabi/gray.frag [new file with mode: 0644]
futatabi/hole_blend.frag [new file with mode: 0644]
futatabi/hole_fill.frag [new file with mode: 0644]
futatabi/hole_fill.vert [new file with mode: 0644]
futatabi/httpd.cpp [new file with mode: 0644]
futatabi/httpd.h [new file with mode: 0644]
futatabi/jpeg_destroyer.h [new file with mode: 0644]
futatabi/jpeg_frame.h [new file with mode: 0644]
futatabi/jpeg_frame_view.cpp [new file with mode: 0644]
futatabi/jpeg_frame_view.h [new file with mode: 0644]
futatabi/main.cpp [new file with mode: 0644]
futatabi/mainwindow.cpp [new file with mode: 0644]
futatabi/mainwindow.h [new file with mode: 0644]
futatabi/mainwindow.ui [new file with mode: 0644]
futatabi/memcpy_interleaved.cpp [new file with mode: 0644]
futatabi/memcpy_interleaved.h [new file with mode: 0644]
futatabi/meson.build [new file with mode: 0644]
futatabi/metacube2.cpp [new file with mode: 0644]
futatabi/metacube2.h [new file with mode: 0644]
futatabi/motion_search.frag [new file with mode: 0644]
futatabi/motion_search.vert [new file with mode: 0644]
futatabi/mux.cpp [new file with mode: 0644]
futatabi/mux.h [new file with mode: 0644]
futatabi/player.cpp [new file with mode: 0644]
futatabi/player.h [new file with mode: 0644]
futatabi/post_to_main_thread.h [new file with mode: 0644]
futatabi/prewarp.frag [new file with mode: 0644]
futatabi/queue_spot_holder.h [new file with mode: 0644]
futatabi/ref_counted_gl_sync.h [new file with mode: 0644]
futatabi/resize_flow.frag [new file with mode: 0644]
futatabi/sobel.frag [new file with mode: 0644]
futatabi/sor.frag [new file with mode: 0644]
futatabi/sor.vert [new file with mode: 0644]
futatabi/splat.frag [new file with mode: 0644]
futatabi/splat.vert [new file with mode: 0644]
futatabi/state.proto [new file with mode: 0644]
futatabi/timebase.h [new file with mode: 0644]
futatabi/util.cpp [new file with mode: 0644]
futatabi/util.h [new file with mode: 0644]
futatabi/vaapi_jpeg_decoder.cpp [new file with mode: 0644]
futatabi/vaapi_jpeg_decoder.h [new file with mode: 0644]
futatabi/video_stream.cpp [new file with mode: 0644]
futatabi/video_stream.h [new file with mode: 0644]
futatabi/vis.cpp [new file with mode: 0644]
futatabi/vs.vert [new file with mode: 0644]
futatabi/ycbcr_converter.cpp [new file with mode: 0644]
futatabi/ycbcr_converter.h [new file with mode: 0644]
make-example-video.sh [new file with mode: 0644]
meson.build
nageru/meson.build
variational_refinement.txt [new file with mode: 0644]

diff --git a/.gitignore b/.gitignore
index 2416a678e7d3f25ea206b6982136c2f49dcbb1f1..c0b5588694a5fad6b6192b6aa67a6ed2226b7d32 100644 (file)
--- a/.gitignore
@@ -1 +1,2 @@
 obj/
+.ycm_extra_conf.py
diff --git a/futatabi/add_base_flow.frag b/futatabi/add_base_flow.frag
new file mode 100644 (file)
index 0000000..ac56101
--- /dev/null
@@ -0,0 +1,11 @@
+#version 450 core
+
+in vec3 tc;
+out vec2 diff_flow;
+
+uniform sampler2DArray diff_flow_tex;
+
+void main()
+{
+       diff_flow = texture(diff_flow_tex, tc).xy;
+}
diff --git a/futatabi/bin2h.cpp b/futatabi/bin2h.cpp
new file mode 100644 (file)
index 0000000..a396afe
--- /dev/null
@@ -0,0 +1,56 @@
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+using namespace std;
+
+int main(int argc, char **argv)
+{
+       if (argc != 4) {
+               fprintf(stderr, "Usage: bin2h INFILE BASENAME OUTFILE\n");
+               return 1;
+       }
+
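+       // Output format: a byte array _binary_<basename>, plus _binary_<basename>_data
+       // and _binary_<basename>_size aliases, so that generated headers can stand in
+       // for files embedded at build time (see embedded_files.h).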
+       string basename = argv[2];
+       for (char &ch : basename) {
+               if (!isalpha(ch) && !isdigit(ch)) {
+                       ch = '_';
+               }
+       }
+
+       FILE *infp = fopen(argv[1], "rb");
+       if (infp == nullptr) {
+               perror(argv[1]);
+               exit(1);
+       }
+
+       FILE *outfp = fopen(argv[3], "w");
+       if (outfp == nullptr) {
+               perror(argv[3]);
+               exit(1);
+       }
+
+       fprintf(outfp, "// Generated by bin2h.cpp from %s. Do not edit by hand.\n", argv[1]);
+       fprintf(outfp, "#include <stddef.h>\n");
+       fprintf(outfp, "unsigned char _binary_%s[] = {", basename.c_str());
+
+       size_t num_bytes = 0;
+       while (!feof(infp)) {
+               if (num_bytes++ % 16 == 0) {
+                       fprintf(outfp, "\n\t");
+               }
+               int ch = getc(infp);
+               if (ch == -1) {
+                       break;
+               }
+               fprintf(outfp, "0x%02x, ", ch);
+       }
+       fprintf(outfp, "\n};\n");
+       fprintf(outfp, "unsigned char *_binary_%s_data = _binary_%s;\n", basename.c_str(), basename.c_str());
+       fprintf(outfp, "size_t _binary_%s_size = sizeof(_binary_%s);\n", basename.c_str(), basename.c_str());
+       return 0;
+}
diff --git a/futatabi/blend.frag b/futatabi/blend.frag
new file mode 100644 (file)
index 0000000..eb3fc80
--- /dev/null
@@ -0,0 +1,49 @@
+#version 450 core
+
+in vec3 tc;
+
+#ifdef SPLIT_YCBCR_OUTPUT
+out float Y;
+out vec2 CbCr;
+#else
+out vec4 rgba;
+#endif
+
+uniform sampler2DArray image_tex;
+uniform sampler2D flow_tex;
+uniform float alpha;
+
+void main()
+{
+       vec2 flow = texture(flow_tex, tc.xy).xy;
+       vec4 I_0 = texture(image_tex, vec3(tc.xy - alpha * flow, 0));
+       vec4 I_1 = texture(image_tex, vec3(tc.xy + (1.0f - alpha) * flow, 1));
+
+       // Occlusion reasoning:
+
+       vec2 size = textureSize(image_tex, 0).xy;
+
+       // Follow the flow back to the initial point (where we sample I_0 from), then forward again.
+       // See how well we match the point we started at; this is our flow consistency.
+       float d0 = alpha * length(size * (texture(flow_tex, vec2(tc.xy - alpha * flow)).xy - flow));
+
+       // Same for d1.
+       float d1 = (1.0f - alpha) * length(size * (texture(flow_tex, vec2(tc.xy + (1.0f - alpha) * flow)).xy - flow));
+
+       vec4 result;
+       if (max(d0, d1) < 3.0f) {  // Arbitrary constant, not all that tuned. The UW paper says 1.0 is fine for ground truth.
+               // Both are visible, so blend.
+               result = I_0 + alpha * (I_1 - I_0);
+       } else if (d0 < d1) {
+               result = I_0;
+       } else {
+               result = I_1;
+       }
+
+#ifdef SPLIT_YCBCR_OUTPUT
+       Y = result.r;
+       CbCr = result.gb;
+#else
+       rgba = result;
+#endif
+}
diff --git a/futatabi/chroma_subsample.frag b/futatabi/chroma_subsample.frag
new file mode 100644 (file)
index 0000000..9a4155f
--- /dev/null
@@ -0,0 +1,10 @@
+#version 450 core
+in vec2 tc0, tc1;
+uniform sampler2D cbcr_tex;
+out float Cb, Cr;
+void main() {
+       vec2 result = 0.5 * (texture(cbcr_tex, tc0).rg + texture(cbcr_tex, tc1).rg);
+       Cb = result.r;
+       Cr = result.g;
+}
+
diff --git a/futatabi/chroma_subsample.vert b/futatabi/chroma_subsample.vert
new file mode 100644 (file)
index 0000000..81e1004
--- /dev/null
@@ -0,0 +1,21 @@
+#version 450 core
+
+layout(location=0) in vec2 position;
+out vec2 tc0, tc1;
+uniform vec2 chroma_offset_0;
+uniform vec2 chroma_offset_1;
+
+void main()
+{
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       vec2 flipped_tc = position;
+       tc0 = flipped_tc + chroma_offset_0;
+       tc1 = flipped_tc + chroma_offset_1;
+}
+
diff --git a/futatabi/chroma_subsampler.cpp b/futatabi/chroma_subsampler.cpp
new file mode 100644 (file)
index 0000000..d064bc7
--- /dev/null
@@ -0,0 +1,128 @@
+#include "chroma_subsampler.h"
+
+#include <movit/util.h>
+#include <string>
+
+#include "embedded_files.h"
+
+#define BUFFER_OFFSET(i) ((char *)nullptr + (i))
+
+using namespace std;
+
+string read_file(const string &filename, const unsigned char *start = nullptr, const size_t size = 0);
+GLuint compile_shader(const string &shader_src, GLenum type);
+GLuint link_program(GLuint vs_obj, GLuint fs_obj);
+void bind_sampler(GLuint program, GLint location, GLuint texture_unit, GLuint tex, GLuint sampler);
+
+extern GLuint linear_sampler;
+
+ChromaSubsampler::ChromaSubsampler()
+{
+       // Set up stuff for 4:2:2 conversion.
+       //
+       // Note: Due to the horizontally co-sited chroma/luma samples in H.264
+       // (chroma position is left for horizontal),
+       // we need to be a bit careful in our subsampling. A diagram will make
+       // this clearer, showing some luma and chroma samples:
+       //
+       //     a   b   c   d
+       //   +---+---+---+---+
+       //   |   |   |   |   |
+       //   | Y | Y | Y | Y |
+       //   |   |   |   |   |
+       //   +---+---+---+---+
+       //
+       // +-------+-------+
+       // |       |       |
+       // |   C   |   C   |
+       // |       |       |
+       // +-------+-------+
+       //
+       // Clearly, the rightmost chroma sample here needs to be equivalent to
+       // b/4 + c/2 + d/4. (We could also implement more sophisticated filters,
+       // of course, but as long as the upsampling is not going to be equally
+       // sophisticated, it's probably not worth it.) If we sample once with
+       // no mipmapping, we get just c, ie., no actual filtering in the
+       // horizontal direction. (For the vertical direction, we can just
+       // sample in the middle to get the right filtering.) One could imagine
+       // we could use mipmapping (assuming we can create mipmaps cheaply),
+       // but then, what we'd get is this:
+       //
+       //    (a+b)/2 (c+d)/2
+       //   +-------+-------+
+       //   |       |       |
+       //   |   Y   |   Y   |
+       //   |       |       |
+       //   +-------+-------+
+       //
+       // +-------+-------+
+       // |       |       |
+       // |   C   |   C   |
+       // |       |       |
+       // +-------+-------+
+       //
+       // which ends up sampling equally from a and b, which clearly isn't right. Instead,
+       // we need to do two (non-mipmapped) chroma samples, both hitting exactly in-between
+       // source pixels.
+       //
+       // Sampling in-between b and c gives us the sample (b+c)/2, and similarly for c and d.
+       // Taking the average of these gives us (b+c)/4 + (c+d)/4 = b/4 + c/2 + d/4, which is
+       // exactly what we want.
+       //
+       // See also http://www.poynton.com/PDFs/Merging_RGB_and_422.pdf, pages 6–7.
+
+       cbcr_vs_obj = compile_shader(read_file("chroma_subsample.vert", _binary_chroma_subsample_vert_data, _binary_chroma_subsample_vert_size), GL_VERTEX_SHADER);
+       cbcr_fs_obj = compile_shader(read_file("chroma_subsample.frag", _binary_chroma_subsample_frag_data, _binary_chroma_subsample_frag_size), GL_FRAGMENT_SHADER);
+       cbcr_program = link_program(cbcr_vs_obj, cbcr_fs_obj);
+
+       // Set up the VAO containing all the required position data.
+       glCreateVertexArrays(1, &vao);
+       glBindVertexArray(vao);
+
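+       // A single triangle that covers the entire screen: the vertex shader maps
+       // these [0,2] coordinates to clip space [-1,3], so one triangle spans the
+       // whole viewport without the diagonal seam of a two-triangle quad.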
+       float vertices[] = {
+               0.0f, 2.0f,
+               0.0f, 0.0f,
+               2.0f, 0.0f
+       };
+       glCreateBuffers(1, &vbo);
+       glNamedBufferData(vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
+       glBindBuffer(GL_ARRAY_BUFFER, vbo);
+
+       GLint position_attrib = 0;  // Hard-coded in every vertex shader.
+       glEnableVertexArrayAttrib(vao, position_attrib);
+       glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+
+       uniform_cbcr_tex = glGetUniformLocation(cbcr_program, "cbcr_tex");
+       uniform_chroma_offset_0 = glGetUniformLocation(cbcr_program, "chroma_offset_0");
+       uniform_chroma_offset_1 = glGetUniformLocation(cbcr_program, "chroma_offset_1");
+}
+
+ChromaSubsampler::~ChromaSubsampler()
+{
+       glDeleteProgram(cbcr_program);
+       check_error();
+       glDeleteBuffers(1, &vbo);
+       check_error();
+       glDeleteVertexArrays(1, &vao);
+       check_error();
+}
+
+void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex)
+{
+       glUseProgram(cbcr_program);
+       bind_sampler(cbcr_program, uniform_cbcr_tex, 0, cbcr_tex, linear_sampler);
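+       // The two offsets make both bilinear samples land exactly between two input
+       // chroma pixels, one pair a full input pixel to the left of the other;
+       // averaging them yields the 1/4 + 1/2 + 1/4 kernel derived above.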
+       glProgramUniform2f(cbcr_program, uniform_chroma_offset_0, -1.0f / width, 0.0f);
+       glProgramUniform2f(cbcr_program, uniform_chroma_offset_1, -0.0f / width, 0.0f);
+
+       glViewport(0, 0, width / 2, height);
+       fbos.render_to(cb_tex, cr_tex);
+
+       glBindVertexArray(vao);
+       glDrawArrays(GL_TRIANGLES, 0, 3);
+}
diff --git a/futatabi/chroma_subsampler.h b/futatabi/chroma_subsampler.h
new file mode 100644 (file)
index 0000000..ec43fe0
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _CHROMA_SUBSAMPLER_H
+#define _CHROMA_SUBSAMPLER_H 1
+
+#include "flow.h"
+
+#include <epoxy/gl.h>
+
+class ChromaSubsampler {
+public:
+       ChromaSubsampler();
+       ~ChromaSubsampler();
+
+       // Subsamples chroma (packed Cb and Cr) 2x1 to yield chroma suitable for
+       // planar 4:2:2. Chroma positioning is left (H.264 convention).
+       // width and height are the dimensions (in pixels) of the input texture.
+       void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex);
+
+private:
+       PersistentFBOSet<2> fbos;
+
+       GLuint vao;
+       GLuint vbo;  // Holds position data.
+
+       GLuint cbcr_vs_obj, cbcr_fs_obj, cbcr_program;
+       GLuint uniform_cbcr_tex;
+       GLuint uniform_chroma_offset_0, uniform_chroma_offset_1;
+};
+
+#endif  // !defined(_CHROMA_SUBSAMPLER_H)
diff --git a/futatabi/clip_list.cpp b/futatabi/clip_list.cpp
new file mode 100644 (file)
index 0000000..14f083e
--- /dev/null
@@ -0,0 +1,523 @@
+#include "clip_list.h"
+
+#include "mainwindow.h"
+#include "timebase.h"
+#include "ui_mainwindow.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <algorithm>
+#include <locale>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+string pts_to_string(int64_t pts)
+{
+       int64_t t = lrint((pts / double(TIMEBASE)) * 1e3);  // In milliseconds.
+       int ms = t % 1000;
+       t /= 1000;
+       int sec = t % 60;
+       t /= 60;
+       int min = t % 60;
+       t /= 60;
+       int hour = t;
+
+       char buf[256];
+       snprintf(buf, sizeof(buf), "%d:%02d:%02d.%03d", hour, min, sec, ms);
+       return buf;
+}
+
+string duration_to_string(int64_t pts_diff)
+{
+       int64_t t = lrint((pts_diff / double(TIMEBASE)) * 1e3);  // In milliseconds.
+       int ms = t % 1000;
+       t /= 1000;
+       int sec = t % 60;
+       t /= 60;
+       int min = t;
+
+       char buf[256];
+       snprintf(buf, sizeof(buf), "%d:%02d.%03d", min, sec, ms);
+       return buf;
+}
+
+int ClipList::rowCount(const QModelIndex &parent) const
+{
+       if (parent.isValid())
+               return 0;
+       return clips.size();
+}
+
+int PlayList::rowCount(const QModelIndex &parent) const
+{
+       if (parent.isValid())
+               return 0;
+       return clips.size();
+}
+
+int ClipList::columnCount(const QModelIndex &parent) const
+{
+       if (parent.isValid())
+               return 0;
+       return int(Column::NUM_COLUMNS);
+}
+
+int PlayList::columnCount(const QModelIndex &parent) const
+{
+       if (parent.isValid())
+               return 0;
+       return int(Column::NUM_COLUMNS);
+}
+
+QVariant ClipList::data(const QModelIndex &parent, int role) const
+{
+       if (!parent.isValid())
+               return QVariant();
+       const int row = parent.row(), column = parent.column();
+       if (size_t(row) >= clips.size())
+               return QVariant();
+
+       if (role == Qt::TextAlignmentRole) {
+               switch (Column(column)) {
+               case Column::IN:
+               case Column::OUT:
+               case Column::DURATION:
+                       return Qt::AlignRight + Qt::AlignVCenter;
+               default:
+                       return Qt::AlignLeft + Qt::AlignVCenter;
+               }
+       }
+
+       if (role != Qt::DisplayRole && role != Qt::EditRole)
+               return QVariant();
+
+       switch (Column(column)) {
+       case Column::IN:
+               return QString::fromStdString(pts_to_string(clips[row].pts_in));
+       case Column::OUT:
+               if (clips[row].pts_out >= 0) {
+                       return QString::fromStdString(pts_to_string(clips[row].pts_out));
+               } else {
+                       return QVariant();
+               }
+       case Column::DURATION:
+               if (clips[row].pts_out >= 0) {
+                       return QString::fromStdString(duration_to_string(clips[row].pts_out - clips[row].pts_in));
+               } else {
+                       return QVariant();
+               }
+       case Column::CAMERA_1:
+       case Column::CAMERA_2:
+       case Column::CAMERA_3:
+       case Column::CAMERA_4: {
+               unsigned stream_idx = column - int(Column::CAMERA_1);
+               return QString::fromStdString(clips[row].descriptions[stream_idx]);
+       }
+       default:
+               return "";
+       }
+}
+
+QVariant PlayList::data(const QModelIndex &parent, int role) const
+{
+       if (!parent.isValid())
+               return QVariant();
+       const int row = parent.row(), column = parent.column();
+       if (size_t(row) >= clips.size())
+               return QVariant();
+
+       if (role == Qt::TextAlignmentRole) {
+               switch (Column(column)) {
+               case Column::PLAYING:
+                       return Qt::AlignCenter;
+               case Column::IN:
+               case Column::OUT:
+               case Column::DURATION:
+               case Column::FADE_TIME:
+                       return Qt::AlignRight + Qt::AlignVCenter;
+               case Column::CAMERA:
+                       return Qt::AlignCenter;
+               default:
+                       return Qt::AlignLeft + Qt::AlignVCenter;
+               }
+       }
+       if (role == Qt::BackgroundRole) {
+               if (Column(column) == Column::PLAYING) {
+                       auto it = current_progress.find(row);
+                       if (it != current_progress.end()) {
+                               double play_progress = it->second;
+
+                               // This only really works well for the first column, for whatever odd Qt reason.
+                               QLinearGradient grad(QPointF(0, 0), QPointF(1, 0));
+                               grad.setCoordinateMode(QGradient::ObjectBoundingMode);
+                               grad.setColorAt(0.0f, QColor::fromRgbF(0.0f, 0.0f, 1.0f, 0.2f));
+                               grad.setColorAt(play_progress, QColor::fromRgbF(0.0f, 0.0f, 1.0f, 0.2f));
+                               if (play_progress + 0.01f <= 1.0f) {
+                                       grad.setColorAt(play_progress + 0.01f, QColor::fromRgbF(0.0f, 0.0f, 1.0f, 0.0f));
+                               }
+                               return QBrush(grad);
+                       } else {
+                               return QVariant();
+                       }
+               } else {
+                       return QVariant();
+               }
+       }
+
+       if (role != Qt::DisplayRole && role != Qt::EditRole)
+               return QVariant();
+
+       switch (Column(column)) {
+       case Column::PLAYING:
+               return current_progress.count(row) ? "→" : "";
+       case Column::IN:
+               return QString::fromStdString(pts_to_string(clips[row].pts_in));
+       case Column::OUT:
+               if (clips[row].pts_out >= 0) {
+                       return QString::fromStdString(pts_to_string(clips[row].pts_out));
+               } else {
+                       return QVariant();
+               }
+       case Column::DURATION:
+               if (clips[row].pts_out >= 0) {
+                       return QString::fromStdString(duration_to_string(clips[row].pts_out - clips[row].pts_in));
+               } else {
+                       return QVariant();
+               }
+       case Column::CAMERA:
+               return qlonglong(clips[row].stream_idx + 1);
+       case Column::DESCRIPTION:
+               return QString::fromStdString(clips[row].descriptions[clips[row].stream_idx]);
+       case Column::FADE_TIME: {
+               stringstream ss;
+               ss.imbue(locale("C"));
+               ss.precision(3);
+               ss << fixed << clips[row].fade_time_seconds;
+               return QString::fromStdString(ss.str());
+       }
+       default:
+               return "";
+       }
+}
+
+QVariant ClipList::headerData(int section, Qt::Orientation orientation, int role) const
+{
+       if (role != Qt::DisplayRole)
+               return QVariant();
+       if (orientation != Qt::Horizontal)
+               return QVariant();
+
+       switch (Column(section)) {
+       case Column::IN:
+               return "In";
+       case Column::OUT:
+               return "Out";
+       case Column::DURATION:
+               return "Duration";
+       case Column::CAMERA_1:
+               return "Camera 1";
+       case Column::CAMERA_2:
+               return "Camera 2";
+       case Column::CAMERA_3:
+               return "Camera 3";
+       case Column::CAMERA_4:
+               return "Camera 4";
+       default:
+               return "";
+       }
+}
+
+QVariant PlayList::headerData(int section, Qt::Orientation orientation, int role) const
+{
+       if (role != Qt::DisplayRole)
+               return QVariant();
+       if (orientation != Qt::Horizontal)
+               return QVariant();
+
+       switch (Column(section)) {
+       case Column::PLAYING:
+               return "";
+       case Column::IN:
+               return "In";
+       case Column::OUT:
+               return "Out";
+       case Column::DURATION:
+               return "Duration";
+       case Column::CAMERA:
+               return "Camera";
+       case Column::DESCRIPTION:
+               return "Description";
+       case Column::FADE_TIME:
+               return "Fade time";
+       default:
+               return "";
+       }
+}
+
+Qt::ItemFlags ClipList::flags(const QModelIndex &index) const
+{
+       if (!index.isValid())
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable;
+       const int row = index.row(), column = index.column();
+       if (size_t(row) >= clips.size())
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable;
+
+       switch (Column(column)) {
+       case Column::CAMERA_1:
+       case Column::CAMERA_2:
+       case Column::CAMERA_3:
+       case Column::CAMERA_4:
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable | Qt::ItemIsEditable | Qt::ItemIsDragEnabled;
+       default:
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable;
+       }
+}
+
+Qt::ItemFlags PlayList::flags(const QModelIndex &index) const
+{
+       if (!index.isValid())
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable;
+       const int row = index.row(), column = index.column();
+       if (size_t(row) >= clips.size())
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable;
+
+       switch (Column(column)) {
+       case Column::DESCRIPTION:
+       case Column::CAMERA:
+       case Column::FADE_TIME:
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable | Qt::ItemIsEditable;
+       default:
+               return Qt::ItemIsEnabled | Qt::ItemIsSelectable;
+       }
+}
+
+bool ClipList::setData(const QModelIndex &index, const QVariant &value, int role)
+{
+       if (!index.isValid() || role != Qt::EditRole) {
+               return false;
+       }
+
+       const int row = index.row(), column = index.column();
+       if (size_t(row) >= clips.size())
+               return false;
+
+       switch (Column(column)) {
+       case Column::CAMERA_1:
+       case Column::CAMERA_2:
+       case Column::CAMERA_3:
+       case Column::CAMERA_4: {
+               unsigned stream_idx = column - int(Column::CAMERA_1);
+               clips[row].descriptions[stream_idx] = value.toString().toStdString();
+               emit_data_changed(row);
+               return true;
+       }
+       default:
+               return false;
+       }
+}
+
+bool PlayList::setData(const QModelIndex &index, const QVariant &value, int role)
+{
+       if (!index.isValid() || role != Qt::EditRole) {
+               return false;
+       }
+
+       const int row = index.row(), column = index.column();
+       if (size_t(row) >= clips.size())
+               return false;
+
+       switch (Column(column)) {
+       case Column::DESCRIPTION:
+               clips[row].descriptions[clips[row].stream_idx] = value.toString().toStdString();
+               emit_data_changed(row);
+               return true;
+       case Column::CAMERA: {
+               bool ok;
+               int camera_idx = value.toInt(&ok);
+               if (!ok || camera_idx < 1 || camera_idx > NUM_CAMERAS) {
+                       return false;
+               }
+               clips[row].stream_idx = camera_idx - 1;
+               emit_data_changed(row);
+               return true;
+       }
+       case Column::FADE_TIME: {
+               bool ok;
+               double val = value.toDouble(&ok);
+               if (!ok || !(val >= 0.0)) {
+                       return false;
+               }
+               clips[row].fade_time_seconds = val;
+               emit_data_changed(row);
+               return true;
+       }
+       default:
+               return false;
+       }
+}
+
+void ClipList::add_clip(const Clip &clip)
+{
+       beginInsertRows(QModelIndex(), clips.size(), clips.size());
+       clips.push_back(clip);
+       endInsertRows();
+       emit any_content_changed();
+}
+
+void PlayList::add_clip(const Clip &clip)
+{
+       beginInsertRows(QModelIndex(), clips.size(), clips.size());
+       clips.push_back(clip);
+       endInsertRows();
+       emit any_content_changed();
+}
+
+void PlayList::duplicate_clips(size_t first, size_t last)
+{
+       beginInsertRows(QModelIndex(), first, last);
+       // Copy the clips first; inserting a range from the same vector is undefined.
+       const vector<Clip> duplicates(clips.begin() + first, clips.begin() + last + 1);
+       clips.insert(clips.begin() + first, duplicates.begin(), duplicates.end());
+       endInsertRows();
+       emit any_content_changed();
+}
+
+void PlayList::erase_clips(size_t first, size_t last)
+{
+       beginRemoveRows(QModelIndex(), first, last);
+       clips.erase(clips.begin() + first, clips.begin() + last + 1);
+       endRemoveRows();
+       emit any_content_changed();
+}
+
+void PlayList::move_clips(size_t first, size_t last, int delta)
+{
+       if (delta == -1) {
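+               // Moving up: rotate the clip just above the block to the position
+               // right after it, shifting the whole block one row upwards.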
+               beginMoveRows(QModelIndex(), first, last, QModelIndex(), first - 1);
+               rotate(clips.begin() + first - 1, clips.begin() + first, clips.begin() + last + 1);
+       } else {
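+               // Moving down: the same trick mirrored, using reverse iterators
+               // so that the clip just below the block is the one being rotated.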
+               beginMoveRows(QModelIndex(), first, last, QModelIndex(), first + (last - first + 1) + 1);
+               first = clips.size() - first - 1;
+               last = clips.size() - last - 1;
+               rotate(clips.rbegin() + last - 1, clips.rbegin() + last, clips.rbegin() + first + 1);
+       }
+       endMoveRows();
+       emit any_content_changed();
+}
+
+void ClipList::emit_data_changed(size_t row)
+{
+       emit dataChanged(index(row, 0), index(row, int(Column::NUM_COLUMNS) - 1));
+       emit any_content_changed();
+}
+
+void PlayList::emit_data_changed(size_t row)
+{
+       emit dataChanged(index(row, 0), index(row, int(Column::NUM_COLUMNS) - 1));
+       emit any_content_changed();
+}
+
+void PlayList::set_currently_playing(int index, double progress)
+{
+       int old_index = currently_playing_index;
+       int column = int(Column::PLAYING);
+       if (index != old_index) {
+               currently_playing_index = index;
+               play_progress = progress;
+               if (old_index != -1) {
+                       emit dataChanged(this->index(old_index, column), this->index(old_index, column));
+               }
+               if (index != -1) {
+                       emit dataChanged(this->index(index, column), this->index(index, column));
+               }
+       } else if (index != -1 && fabs(progress - play_progress) > 1e-3) {
+               play_progress = progress;
+               emit dataChanged(this->index(index, column), this->index(index, column));
+       }
+}
+
+void PlayList::set_progress(const map<size_t, double> &progress)
+{
+       const int column = int(Column::PLAYING);
+       map<size_t, double> old_progress = move(this->current_progress);
+       this->current_progress = progress;
+
+       for (auto it : old_progress) {
+               size_t index = it.first;
+               if (current_progress.count(index) == 0) {
+                       emit dataChanged(this->index(index, column), this->index(index, column));
+               }
+       }
+       for (auto it : current_progress) {
+               size_t index = it.first;
+               emit dataChanged(this->index(index, column), this->index(index, column));
+       }
+}
+
+namespace {
+
+Clip deserialize_clip(const ClipProto &clip_proto)
+{
+       Clip clip;
+       clip.pts_in = clip_proto.pts_in();
+       clip.pts_out = clip_proto.pts_out();
+       for (int camera_idx = 0; camera_idx < min(clip_proto.description_size(), NUM_CAMERAS); ++camera_idx) {
+               clip.descriptions[camera_idx] = clip_proto.description(camera_idx);
+       }
+       clip.stream_idx = clip_proto.stream_idx();
+       clip.fade_time_seconds = clip_proto.fade_time_seconds();
+       return clip;
+}
+
+void serialize_clip(const Clip &clip, ClipProto *clip_proto)
+{
+       clip_proto->set_pts_in(clip.pts_in);
+       clip_proto->set_pts_out(clip.pts_out);
+       for (int camera_idx = 0; camera_idx < NUM_CAMERAS; ++camera_idx) {
+               *clip_proto->add_description() = clip.descriptions[camera_idx];
+       }
+       clip_proto->set_stream_idx(clip.stream_idx);
+       clip_proto->set_fade_time_seconds(clip.fade_time_seconds);
+}
+
+}  // namespace
+
+ClipList::ClipList(const ClipListProto &serialized)
+{
+       for (const ClipProto &clip_proto : serialized.clip()) {
+               clips.push_back(deserialize_clip(clip_proto));
+       }
+}
+
+ClipListProto ClipList::serialize() const
+{
+       ClipListProto ret;
+       for (const Clip &clip : clips) {
+               serialize_clip(clip, ret.add_clip());
+       }
+       return ret;
+}
+
+PlayList::PlayList(const ClipListProto &serialized)
+{
+       for (const ClipProto &clip_proto : serialized.clip()) {
+               clips.push_back(deserialize_clip(clip_proto));
+       }
+}
+
+ClipListProto PlayList::serialize() const
+{
+       ClipListProto ret;
+       for (const Clip &clip : clips) {
+               serialize_clip(clip, ret.add_clip());
+       }
+       return ret;
+}
diff --git a/futatabi/clip_list.h b/futatabi/clip_list.h
new file mode 100644 (file)
index 0000000..8dead83
--- /dev/null
@@ -0,0 +1,152 @@
+#ifndef _CLIP_LIST_H
+#define _CLIP_LIST_H 1
+
+#include "defs.h"
+#include "state.pb.h"
+
+#include <QAbstractTableModel>
+#include <stdint.h>
+#include <map>
+#include <string>
+#include <vector>
+
+struct Clip {
+       int64_t pts_in = -1, pts_out = -1;  // pts_in is inclusive, pts_out is exclusive.
+       std::string descriptions[NUM_CAMERAS];
+       unsigned stream_idx = 0;  // For the playlist only.
+       double fade_time_seconds = 0.5;  // For the playlist only.
+};
+
+class DataChangedReceiver {
+public:
+       virtual ~DataChangedReceiver() {}
+       virtual void emit_data_changed(size_t row) = 0;
+};
+
+// Like a smart pointer to a Clip, but emits dataChanged when it goes out of scope.
+struct ClipProxy {
+public:
+       ClipProxy(Clip &clip, DataChangedReceiver *clip_list, size_t row)
+               : clip(clip), clip_list(clip_list), row(row) {}
+       ~ClipProxy()
+       {
+               if (clip_list != nullptr) {
+                       clip_list->emit_data_changed(row);
+               }
+       }
+       Clip *operator->() { return &clip; }
+       Clip &operator*() { return clip; }
+
+private:
+       Clip &clip;
+       DataChangedReceiver *clip_list;
+       size_t row;
+};
+
+class ClipList : public QAbstractTableModel, public DataChangedReceiver {
+       Q_OBJECT
+
+public:
+       explicit ClipList(const ClipListProto &serialized);
+
+       enum class Column {
+               IN,
+               OUT,
+               DURATION,
+               CAMERA_1,
+               CAMERA_2,
+               CAMERA_3,
+               CAMERA_4,
+               NUM_COLUMNS
+       };
+
+       int rowCount(const QModelIndex &parent) const override;
+       int columnCount(const QModelIndex &parent) const override;
+       QVariant data(const QModelIndex &parent, int role) const override;
+       QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const override;
+       Qt::ItemFlags flags(const QModelIndex &index) const override;
+       bool setData(const QModelIndex &index, const QVariant &value, int role = Qt::EditRole) override;
+
+       void add_clip(const Clip &clip);
+       size_t size() const { return clips.size(); }
+       bool empty() const { return clips.empty(); }
+
+       ClipProxy mutable_clip(size_t index) { return ClipProxy(clips[index], this, index); }
+       const Clip *clip(size_t index) const { return &clips[index]; }
+
+       ClipProxy mutable_back() { return mutable_clip(size() - 1); }
+       const Clip *back() const { return clip(size() - 1); }
+
+       ClipListProto serialize() const;
+
+       void emit_data_changed(size_t row) override;
+
+signals:
+       void any_content_changed();
+
+private:
+       std::vector<Clip> clips;
+};
+
+class PlayList : public QAbstractTableModel, public DataChangedReceiver {
+       Q_OBJECT
+
+public:
+       explicit PlayList(const ClipListProto &serialized);
+
+       enum class Column {
+               PLAYING,
+               IN,
+               OUT,
+               DURATION,
+               CAMERA,
+               DESCRIPTION,
+               FADE_TIME,
+               NUM_COLUMNS
+       };
+
+       int rowCount(const QModelIndex &parent) const override;
+       int columnCount(const QModelIndex &parent) const override;
+       QVariant data(const QModelIndex &parent, int role) const override;
+       QVariant headerData(int section, Qt::Orientation orientation, int role = Qt::DisplayRole) const override;
+       Qt::ItemFlags flags(const QModelIndex &index) const override;
+       bool setData(const QModelIndex &index, const QVariant &value, int role = Qt::EditRole) override;
+
+       void add_clip(const Clip &clip);
+
+       // <last> is inclusive in all of these.
+       void duplicate_clips(size_t first, size_t last);
+       void erase_clips(size_t first, size_t last);
+       // <delta> is -1 to move upwards, +1 to move downwards.
+       void move_clips(size_t first, size_t last, int delta);
+
+       size_t size() const { return clips.size(); }
+       bool empty() const { return clips.empty(); }
+
+       ClipProxy mutable_clip(size_t index) { return ClipProxy(clips[index], this, index); }
+       const Clip *clip(size_t index) const { return &clips[index]; }
+
+       ClipProxy mutable_back() { return mutable_clip(size() - 1); }
+       const Clip *back() const { return clip(size() - 1); }
+
+       // TODO: Move these out of PlayList.
+       void set_currently_playing(int index, double progress);  // -1 = none.
+       int get_currently_playing() const { return currently_playing_index; }
+
+       void set_progress(const std::map<size_t, double> &progress);
+
+       ClipListProto serialize() const;
+
+       void emit_data_changed(size_t row) override;
+
+signals:
+       void any_content_changed();
+
+private:
+       std::vector<Clip> clips;
+       int currently_playing_index = -1;
+       double play_progress = 0.0;
+       std::map<size_t, double> current_progress;
+};
+
+#endif  // !defined (_CLIP_LIST_H)
diff --git a/futatabi/context.cpp b/futatabi/context.cpp
new file mode 100644 (file)
index 0000000..0b17bfa
--- /dev/null
@@ -0,0 +1,69 @@
+#include <QGL>
+#include <QOffscreenSurface>
+#include <QOpenGLContext>
+#include <QSurface>
+#include <QSurfaceFormat>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+
+QGLWidget *global_share_widget = nullptr;
+
+using namespace std;
+
+QSurface *create_surface()
+{
+       QSurfaceFormat fmt;
+       fmt.setDepthBufferSize(0);
+       fmt.setStencilBufferSize(0);
+       fmt.setProfile(QSurfaceFormat::CoreProfile);
+       fmt.setMajorVersion(4);
+       fmt.setMinorVersion(5);
+       fmt.setSwapInterval(0);
+       QOffscreenSurface *surface = new QOffscreenSurface;
+       surface->setFormat(fmt);
+       surface->create();
+       if (!surface->isValid()) {
+               fprintf(stderr, "ERROR: surface not valid!\n");
+               exit(1);
+       }
+       return surface;
+}
+
+QSurface *create_surface(const QSurfaceFormat &format)
+{
+       QOffscreenSurface *surface = new QOffscreenSurface;
+       surface->setFormat(format);
+       surface->create();
+       if (!surface->isValid()) {
+               fprintf(stderr, "ERROR: surface not valid!\n");
+               exit(1);
+       }
+       return surface;
+}
+
+QSurface *create_surface_with_same_format(const QSurface *surface)
+{
+       return create_surface(surface->format());
+}
+
+QOpenGLContext *create_context(const QSurface *surface)
+{
+       QOpenGLContext *context = new QOpenGLContext;
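+       // Share objects (textures, buffers, etc.) with the global context, so that
+       // resources created in one context can be used from the others.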
+       context->setShareContext(global_share_widget->context()->contextHandle());
+       context->setFormat(surface->format());
+       context->create();
+       return context;
+}
+
+bool make_current(QOpenGLContext *context, QSurface *surface)
+{
+       return context->makeCurrent(surface);
+}
+
+void delete_context(QOpenGLContext *context)
+{
+       delete context;
+}
diff --git a/futatabi/context.h b/futatabi/context.h
new file mode 100644 (file)
index 0000000..aebba96
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef _CONTEXT_H
+#define _CONTEXT_H 1
+
+// Needs to be in its own file because Qt and libepoxy seemingly don't coexist well
+// within the same file.
+
+class QSurface;
+class QOpenGLContext;
+class QSurfaceFormat;
+class QGLWidget;
+
+extern bool using_egl;
+extern QGLWidget *global_share_widget;
+QSurface *create_surface();
+QSurface *create_surface(const QSurfaceFormat &format);
+QSurface *create_surface_with_same_format(const QSurface *surface);
+QOpenGLContext *create_context(const QSurface *surface);
+bool make_current(QOpenGLContext *context, QSurface *surface);
+void delete_context(QOpenGLContext *context);
+
+#endif  // !defined(_CONTEXT_H)
diff --git a/futatabi/db.cpp b/futatabi/db.cpp
new file mode 100644 (file)
index 0000000..39fd557
--- /dev/null
@@ -0,0 +1,330 @@
+#include "db.h"
+
+#include "frame.pb.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+using namespace std;
+
+DB::DB(const string &filename)
+{
+       int ret = sqlite3_open(filename.c_str(), &db);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "%s: %s\n", filename.c_str(), sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       sqlite3_exec(db, R"(
+               CREATE TABLE IF NOT EXISTS state (state BLOB);
+       )", nullptr, nullptr, nullptr);  // Ignore errors.
+
+       sqlite3_exec(db, R"(
+               DROP TABLE file;
+       )", nullptr, nullptr, nullptr);  // Ignore errors.
+
+       sqlite3_exec(db, R"(
+               DROP TABLE frame;
+       )", nullptr, nullptr, nullptr);  // Ignore errors.
+
+       sqlite3_exec(db, R"(
+               CREATE TABLE IF NOT EXISTS filev2 (
+                       file INTEGER NOT NULL PRIMARY KEY,
+                       filename VARCHAR NOT NULL UNIQUE,
+                       size BIGINT NOT NULL,
+                       frames BLOB NOT NULL
+               );
+       )", nullptr, nullptr, nullptr);  // Ignore errors.
+
+       sqlite3_exec(db, "PRAGMA journal_mode=WAL", nullptr, nullptr, nullptr);  // Ignore errors.
+       sqlite3_exec(db, "PRAGMA synchronous=NORMAL", nullptr, nullptr, nullptr);  // Ignore errors.
+}
+
+StateProto DB::get_state()
+{
+       StateProto state;
+
+       sqlite3_stmt *stmt;
+       int ret = sqlite3_prepare_v2(db, "SELECT state FROM state", -1, &stmt, 0);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "SELECT prepare: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_step(stmt);
+       if (ret == SQLITE_ROW) {
+               bool ok = state.ParseFromArray(sqlite3_column_blob(stmt, 0), sqlite3_column_bytes(stmt, 0));
+               if (!ok) {
+                       fprintf(stderr, "State in database is corrupted!\n");
+                       exit(1);
+               }
+       } else if (ret != SQLITE_DONE) {
+               fprintf(stderr, "SELECT step: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_finalize(stmt);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "SELECT finalize: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       return state;
+}
+
+void DB::store_state(const StateProto &state)
+{
+       string serialized;
+       state.SerializeToString(&serialized);
+
+       int ret = sqlite3_exec(db, "BEGIN", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "BEGIN: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_exec(db, "DELETE FROM state", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "DELETE: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       sqlite3_stmt *stmt;
+       ret = sqlite3_prepare_v2(db, "INSERT INTO state VALUES (?)", -1, &stmt, 0);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "INSERT prepare: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       sqlite3_bind_blob(stmt, 1, serialized.data(), serialized.size(), SQLITE_STATIC);
+
+       ret = sqlite3_step(stmt);
+       if (ret != SQLITE_DONE) {
+               fprintf(stderr, "INSERT step: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_finalize(stmt);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "INSERT finalize: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_exec(db, "COMMIT", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "COMMIT: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+}
+
+vector<DB::FrameOnDiskAndStreamIdx> DB::load_frame_file(const string &filename, size_t size, unsigned filename_idx)
+{
+       FileContentsProto file_contents;
+
+       sqlite3_stmt *stmt;
+       int ret = sqlite3_prepare_v2(db, "SELECT frames FROM filev2 WHERE filename=? AND size=?", -1, &stmt, 0);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "SELECT prepare: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       sqlite3_bind_text(stmt, 1, filename.data(), filename.size(), SQLITE_STATIC);
+       sqlite3_bind_int64(stmt, 2, size);
+
+       ret = sqlite3_step(stmt);
+       if (ret == SQLITE_ROW) {
+               bool ok = file_contents.ParseFromArray(sqlite3_column_blob(stmt, 0), sqlite3_column_bytes(stmt, 0));
+               if (!ok) {
+                       fprintf(stderr, "Frame list in database is corrupted!\n");
+                       exit(1);
+               }
+       } else if (ret != SQLITE_DONE) {
+               fprintf(stderr, "SELECT step: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_finalize(stmt);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "SELECT finalize: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       vector<FrameOnDiskAndStreamIdx> frames;
+       for (const StreamContentsProto &stream : file_contents.stream()) {
+               FrameOnDiskAndStreamIdx frame;
+               frame.stream_idx = stream.stream_idx();
+               for (int i = 0; i < stream.pts_size(); ++i) {
+                       frame.frame.filename_idx = filename_idx;
+                       frame.frame.pts = stream.pts(i);
+                       frame.frame.offset = stream.offset(i);
+                       frame.frame.size = stream.file_size(i);
+                       frames.push_back(frame);
+               }
+       }
+
+       return frames;
+}
+
+void DB::store_frame_file(const string &filename, size_t size, const vector<FrameOnDiskAndStreamIdx> &frames)
+{
+       int ret = sqlite3_exec(db, "BEGIN", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "BEGIN: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       // Delete any existing instances with this filename.
+       sqlite3_stmt *stmt;
+
+       ret = sqlite3_prepare_v2(db, "DELETE FROM filev2 WHERE filename=?", -1, &stmt, 0);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "DELETE prepare: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       sqlite3_bind_text(stmt, 1, filename.data(), filename.size(), SQLITE_STATIC);
+
+       ret = sqlite3_step(stmt);
+       if (ret != SQLITE_DONE) {
+               fprintf(stderr, "DELETE step: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_finalize(stmt);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "DELETE finalize: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       // Create the protobuf blob for the new row.
+       FileContentsProto file_contents;
+       unordered_set<unsigned> seen_stream_idx;  // Usually only one.
+       for (const FrameOnDiskAndStreamIdx &frame : frames) {
+               seen_stream_idx.insert(frame.stream_idx);
+       }
+       for (unsigned stream_idx : seen_stream_idx) {
+               StreamContentsProto *stream = file_contents.add_stream();
+               stream->set_stream_idx(stream_idx);
+               stream->mutable_pts()->Reserve(frames.size());
+               stream->mutable_offset()->Reserve(frames.size());
+               stream->mutable_file_size()->Reserve(frames.size());
+               for (const FrameOnDiskAndStreamIdx &frame : frames) {
+                       if (frame.stream_idx != stream_idx) {
+                               continue;
+                       }
+                       stream->add_pts(frame.frame.pts);
+                       stream->add_offset(frame.frame.offset);
+                       stream->add_file_size(frame.frame.size);
+               }
+       }
+       string serialized;
+       file_contents.SerializeToString(&serialized);
+
+       // Insert the new row.
+       ret = sqlite3_prepare_v2(db, "INSERT INTO filev2 (filename, size, frames) VALUES (?, ?, ?)", -1, &stmt, 0);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "INSERT prepare: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       sqlite3_bind_text(stmt, 1, filename.data(), filename.size(), SQLITE_STATIC);
+       sqlite3_bind_int64(stmt, 2, size);
+       sqlite3_bind_blob(stmt, 3, serialized.data(), serialized.size(), SQLITE_STATIC);
+
+       ret = sqlite3_step(stmt);
+       if (ret != SQLITE_DONE) {
+               fprintf(stderr, "INSERT step: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_finalize(stmt);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "INSERT finalize: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       // Commit.
+       ret = sqlite3_exec(db, "COMMIT", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "COMMIT: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+}
+
+void DB::clean_unused_frame_files(const vector<string> &used_filenames)
+{
+       int ret = sqlite3_exec(db, "BEGIN", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "BEGIN: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_exec(db, R"(
+               CREATE TEMPORARY TABLE used_filenames ( filename VARCHAR NOT NULL PRIMARY KEY )
+       )", nullptr, nullptr, nullptr);
+
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "CREATE TEMPORARY TABLE: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       // Insert the new rows.
+       sqlite3_stmt *stmt;
+       ret = sqlite3_prepare_v2(db, "INSERT INTO used_filenames (filename) VALUES (?)", -1, &stmt, 0);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "INSERT prepare: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       for (const string &filename : used_filenames) {
+               sqlite3_bind_text(stmt, 1, filename.data(), filename.size(), SQLITE_STATIC);
+
+               ret = sqlite3_step(stmt);
+               if (ret != SQLITE_DONE) {
+                       fprintf(stderr, "INSERT step: %s\n", sqlite3_errmsg(db));
+                       exit(1);
+               }
+
+               ret = sqlite3_reset(stmt);
+               if (ret != SQLITE_OK) {
+                       fprintf(stderr, "INSERT reset: %s\n", sqlite3_errmsg(db));
+                       exit(1);
+               }
+       }
+
+       ret = sqlite3_finalize(stmt);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "INSERT finalize: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_exec(db, R"(
+               DELETE FROM filev2 WHERE filename NOT IN ( SELECT filename FROM used_filenames )
+       )", nullptr, nullptr, nullptr);
+
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "DELETE: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       ret = sqlite3_exec(db, R"(
+               DROP TABLE used_filenames
+       )", nullptr, nullptr, nullptr);
+
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "DROP TABLE: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+
+       // Commit.
+       ret = sqlite3_exec(db, "COMMIT", nullptr, nullptr, nullptr);
+       if (ret != SQLITE_OK) {
+               fprintf(stderr, "COMMIT: %s\n", sqlite3_errmsg(db));
+               exit(1);
+       }
+}
diff --git a/futatabi/db.h b/futatabi/db.h
new file mode 100644 (file)
index 0000000..f8032c0
--- /dev/null
@@ -0,0 +1,33 @@
+#ifndef DB_H
+#define DB_H 1
+
+#include "state.pb.h"
+
+#include <sqlite3.h>
+#include <string>
+#include <vector>
+
+#include "frame_on_disk.h"
+
+class DB {
+public:
+       explicit DB(const std::string &filename);
+       DB(const DB &) = delete;
+
+       StateProto get_state();
+       void store_state(const StateProto &state);
+
+       struct FrameOnDiskAndStreamIdx {
+               FrameOnDisk frame;
+               unsigned stream_idx;
+       };
+       std::vector<FrameOnDiskAndStreamIdx> load_frame_file(const std::string &filename, size_t size, unsigned filename_idx);  // Empty = none found, or there were no frames.
+       void store_frame_file(const std::string &filename, size_t size, const std::vector<FrameOnDiskAndStreamIdx> &frames);
+       void clean_unused_frame_files(const std::vector<std::string> &used_filenames);
+
+private:
+       StateProto state;
+       sqlite3 *db;
+};
+
+#endif  // !defined(DB_H)
diff --git a/futatabi/defs.h b/futatabi/defs.h
new file mode 100644 (file)
index 0000000..d03b06c
--- /dev/null
@@ -0,0 +1,23 @@
+#ifndef _DEFS_H
+#define _DEFS_H 1
+
+#define MAX_STREAMS 16
+#define CACHE_SIZE_MB 2048
+#define NUM_CAMERAS 4
+#define MUX_BUFFER_SIZE 10485760
+
+#define DEFAULT_STREAM_MUX_NAME "nut"  // Only for HTTP. Local dump guesses from LOCAL_DUMP_SUFFIX.
+#define DEFAULT_HTTPD_PORT 9095
+#define MUX_OPTS { \
+       /* Make seekable .mov files, and keep MP4 muxer from using unlimited amounts of memory. */ \
+       { "movflags", "empty_moov+frag_keyframe+default_base_moof+skip_trailer" }, \
+       \
+       /* Make for somewhat less bursty stream output when using .mov. */ \
+       { "frag_duration", "125000" }, \
+       \
+       /* Keep nut muxer from using unlimited amounts of memory. */ \
+       { "write_index", "0" } \
+}
+
+
+#endif  // !defined(_DEFS_H)
diff --git a/futatabi/densify.frag b/futatabi/densify.frag
new file mode 100644 (file)
index 0000000..3bca126
--- /dev/null
@@ -0,0 +1,24 @@
+#version 450 core
+
+in vec2 image_pos;
+flat in int image0_layer, image1_layer;
+flat in vec2 flow_du;
+flat in float mean_diff;
+out vec3 flow_contribution;
+
+uniform sampler2DArray image_tex;
+
+void main()
+{
+       // Equation (3) from the paper. We're using additive blending, so the
+       // sum will happen automatically for us, and normalization happens on
+       // next read.
+       //
+       // Note that equation (2) says 1 for the minimum error, but the code says 2.0.
+       // And it says L2 norm, but really, the code does absolute value even for
+       // L2 error norm (it uses a square root formula for L1 norm).
+       float diff = texture(image_tex, vec3(image_pos, image0_layer)).x - texture(image_tex, vec3(image_pos + flow_du, image1_layer)).x;
+       diff -= mean_diff;
+       float weight = 1.0 / max(abs(diff), 2.0 / 255.0);
+       flow_contribution = vec3(flow_du.x * weight, flow_du.y * weight, weight);
+}
diff --git a/futatabi/densify.vert b/futatabi/densify.vert
new file mode 100644 (file)
index 0000000..181c7f3
--- /dev/null
@@ -0,0 +1,55 @@
+#version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
+
+layout(location=0) in vec2 position;
+out vec2 image_pos;
+flat out vec2 flow_du;
+flat out float mean_diff;
+flat out int image0_layer, image1_layer;
+
+uniform vec2 patch_size;  // In 0..1 coordinates.
+uniform sampler2DArray flow_tex;
+
+void main()
+{
+       int num_patches = textureSize(flow_tex, 0).x * textureSize(flow_tex, 0).y;
+       int patch_layer = gl_InstanceID / num_patches;
+       int patch_x = gl_InstanceID % textureSize(flow_tex, 0).x;
+       int patch_y = (gl_InstanceID % num_patches) / textureSize(flow_tex, 0).x;
+
+       // Convert the patch index to being the full 0..1 range, to match where
+       // the motion search puts the patches. We don't bother with the locking
+       // to texel centers, though.
+       vec2 patch_center = ivec2(patch_x, patch_y) / (textureSize(flow_tex, 0).xy - 1.0);
+
+       // Increase the patch size a bit; since patch spacing is not necessarily
+       // an integer number of pixels, and we don't use conservative rasterization,
+       // we could be missing the outer edges of the patch. And it seemingly helps
+       // a little bit in general to have some more candidates as well -- although
+       // this is measured without variational refinement, so it might be moot
+       // with it.
+       //
+       // This maps [0.0,1.0] to [-0.25,1.25], ie. extends the patch by 25% in
+       // all directions.
+       vec2 grown_pos = (position * 1.5) - 0.25;
+
+       image_pos = patch_center + patch_size * (grown_pos - 0.5f);
+
+       // Find the flow value for this patch, and send it on to the fragment shader.
+       vec3 flow_du_and_mean_diff = texelFetch(flow_tex, ivec3(patch_x, patch_y, patch_layer), 0).xyz;
+       flow_du = flow_du_and_mean_diff.xy;
+       mean_diff = flow_du_and_mean_diff.z;
+
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * image_pos.x - 1.0, 2.0 * image_pos.y - 1.0, -1.0, 1.0);
+       gl_Layer = patch_layer;
+
+       // Forward flow (0) goes from 0 to 1. Backward flow (1) goes from 1 to 0.
+       image0_layer = patch_layer;
+       image1_layer = 1 - patch_layer;
+}
diff --git a/futatabi/derivatives.frag b/futatabi/derivatives.frag
new file mode 100644 (file)
index 0000000..0e2fd68
--- /dev/null
@@ -0,0 +1,32 @@
+#version 450 core
+
+in vec3 tc;
+out vec2 derivatives;
+out float beta_0;
+
+uniform sampler2DArray tex;
+
+void main()
+{
+       float x_m2 = textureOffset(tex, tc, ivec2(-2,  0)).x;
+       float x_m1 = textureOffset(tex, tc, ivec2(-1,  0)).x;
+       float x_p1 = textureOffset(tex, tc, ivec2( 1,  0)).x;
+       float x_p2 = textureOffset(tex, tc, ivec2( 2,  0)).x;
+
+       float y_m2 = textureOffset(tex, tc, ivec2( 0, -2)).x;
+       float y_m1 = textureOffset(tex, tc, ivec2( 0, -1)).x;
+       float y_p1 = textureOffset(tex, tc, ivec2( 0,  1)).x;
+       float y_p2 = textureOffset(tex, tc, ivec2( 0,  2)).x;
+
+       derivatives.x = (x_p1 - x_m1) * (2.0/3.0) + (x_m2 - x_p2) * (1.0/12.0);
+       derivatives.y = (y_p1 - y_m1) * (2.0/3.0) + (y_m2 - y_p2) * (1.0/12.0);
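+
+       // (This is the standard fourth-order central difference,
+       //   f'(x) ≈ (8*(f(x+1) - f(x-1)) - (f(x+2) - f(x-2))) / 12,
+       // with the coefficients factored out as 2/3 and 1/12.)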
+
+       // The nudge term in the square root in the DeepFlow paper is ζ² = 0.1² = 0.01.
+       // But this is assuming a 0..255 level. Given the nonlinearities in the expression
+       // where β_0 appears, there's no 100% equivalent way to adjust this
+       // constant that I can see, but taking it to (0.1/255)² ~= 1.53e-7 ~=
+       // 1e-7 ought to be good enough. I guess the basic idea is that it
+       // will only matter for near-zero derivatives anyway. I am a tiny
+       // bit worried about fp16 precision when storing these numbers, but OK.
+       beta_0 = 1.0 / (derivatives.x * derivatives.x + derivatives.y * derivatives.y + 1e-7);
+}
diff --git a/futatabi/diffusivity.frag b/futatabi/diffusivity.frag
new file mode 100644 (file)
index 0000000..345c3eb
--- /dev/null
@@ -0,0 +1,39 @@
+#version 450 core
+
+in vec3 tc;
+out float g;
+const float eps_sq = 0.001 * 0.001;
+
+uniform sampler2DArray flow_tex, diff_flow_tex;
+
+// Relative weighting of smoothness term.
+uniform float alpha;
+
+uniform bool zero_diff_flow;
+
+// This must be a macro, since the offset needs to be a constant expression.
+#define get_flow(x_offs, y_offs) \
+       (textureOffset(flow_tex, tc, ivec2((x_offs), (y_offs))).xy + \
+       textureOffset(diff_flow_tex, tc, ivec2((x_offs), (y_offs))).xy)
+
+#define get_flow_no_diff(x_offs, y_offs) \
+       textureOffset(flow_tex, tc, ivec2((x_offs), (y_offs))).xy
+
+float diffusivity(float u_x, float u_y, float v_x, float v_y)
+{
+       return alpha * inversesqrt(u_x * u_x + u_y * u_y + v_x * v_x + v_y * v_y + eps_sq);
+}
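+
+// In other words, g = α / sqrt(u_x² + u_y² + v_x² + v_y² + ε²); the diffusivity
+// of a total-variation-like smoothness term, which falls off near flow edges so
+// that smoothing does not blur across them.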
+
+void main()
+{
+       // Find diffusivity (g) for this pixel, using central differences.
+       if (zero_diff_flow) {
+               vec2 uv_x = get_flow_no_diff(1, 0) - get_flow_no_diff(-1,  0);
+               vec2 uv_y = get_flow_no_diff(0, 1) - get_flow_no_diff( 0, -1);
+               g = diffusivity(uv_x.x, uv_y.x, uv_x.y, uv_y.y);
+       } else {
+               vec2 uv_x = get_flow(1, 0) - get_flow(-1,  0);
+               vec2 uv_y = get_flow(0, 1) - get_flow( 0, -1);
+               g = diffusivity(uv_x.x, uv_y.x, uv_x.y, uv_y.y);
+       }
+}
diff --git a/futatabi/disk_space_estimator.cpp b/futatabi/disk_space_estimator.cpp
new file mode 100644 (file)
index 0000000..69de4d5
--- /dev/null
@@ -0,0 +1,52 @@
+#include "disk_space_estimator.h"
+
+#include "timebase.h"
+
+#include <memory>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+
+DiskSpaceEstimator::DiskSpaceEstimator(DiskSpaceEstimator::callback_t callback)
+       : callback(callback)
+{
+}
+
+void DiskSpaceEstimator::report_write(const std::string &filename, size_t bytes, uint64_t pts)
+{
+       // Reject points that are out-of-order (happens with B-frames).
+       if (!measure_points.empty() && pts <= measure_points.back().pts) {
+               return;
+       }
+
+       // Remove too old points.
+       while (measure_points.size() > 1 && measure_points.front().pts + window_length < pts) {
+               measure_points.pop_front();
+       }
+
+       total_size += bytes;
+
+       struct statfs fst;
+       if (statfs(filename.c_str(), &fst) == -1) {
+               perror(filename.c_str());
+               return;
+       }
+
+       off_t free_bytes = off_t(fst.f_bavail) * fst.f_frsize;
+
+       if (!measure_points.empty()) {
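+               // <pts> deltas are in TIMEBASE units (TIMEBASE ticks per second),
+               // so the bytes-per-tick ratio times TIMEBASE gives bytes per second.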
+               double bytes_per_second = double(total_size - measure_points.front().size) /
+                       (pts - measure_points.front().pts) * TIMEBASE;
+               double seconds_left = free_bytes / bytes_per_second;
+
+               // Only report every second, since updating the UI can be expensive.
+               if (last_pts_reported == 0 || pts - last_pts_reported >= TIMEBASE) {
+                       callback(free_bytes, seconds_left);
+                       last_pts_reported = pts;
+               }
+       }
+
+       measure_points.push_back({ pts, total_size });
+}
+
+DiskSpaceEstimator *global_disk_space_estimator = nullptr;  // Created in MainWindow::MainWindow().
diff --git a/futatabi/disk_space_estimator.h b/futatabi/disk_space_estimator.h
new file mode 100644 (file)
index 0000000..f02cb9c
--- /dev/null
@@ -0,0 +1,47 @@
+#ifndef _DISK_SPACE_ESTIMATOR_H
+#define _DISK_SPACE_ESTIMATOR_H
+
+// A class responsible for measuring how much disk space is left when we
+// store our video to disk, and how much recording time that equates to.
+// It gets callbacks from the Mux writing the stream to disk (which also
+// knows which filesystem the file is going to), makes its calculations,
+// and calls back to the MainWindow, which shows it to the user.
+//
+// The bitrate is measured over a simple 30-second sliding window.
+
+#include "timebase.h"
+
+#include <deque>
+#include <functional>
+#include <stdint.h>
+#include <string>
+#include <sys/types.h>
+
+class DiskSpaceEstimator {
+public:
+       typedef std::function<void(off_t free_bytes, double estimated_seconds_left)> callback_t;
+       DiskSpaceEstimator(callback_t callback);
+
+       // Report that a video frame with the given pts and size has just been
+       // written (possibly appended) to the given file.
+       //
+       // <pts> is taken to be in TIMEBASE units (see timebase.h).
+       void report_write(const std::string &filename, size_t bytes, uint64_t pts);
+
+private:
+       static constexpr int64_t window_length = 30 * TIMEBASE;
+
+       callback_t callback;
+
+       struct MeasurePoint {
+               uint64_t pts;
+               off_t size;
+       };
+       std::deque<MeasurePoint> measure_points;
+       uint64_t last_pts_reported = 0;
+       off_t total_size = 0;
+};
+
+extern DiskSpaceEstimator *global_disk_space_estimator;
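+
+// A minimal usage sketch (illustrative only; the real instance is created in
+// MainWindow, and the real callback updates the UI):
+//
+//   global_disk_space_estimator = new DiskSpaceEstimator(
+//           [](off_t free_bytes, double estimated_seconds_left) {
+//                   printf("%ld bytes free (~%.0f seconds left)\n",
+//                          long(free_bytes), estimated_seconds_left);
+//           });
+//
+//   // Then, after every write of a video frame to the recording:
+//   global_disk_space_estimator->report_write(filename, bytes_written, pts);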
+
+#endif  // !defined(_DISK_SPACE_ESTIMATOR_H)
diff --git a/futatabi/embedded_files.h b/futatabi/embedded_files.h
new file mode 100644 (file)
index 0000000..83cf0fc
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _EMBEDDED_FILES_H
+#define _EMBEDDED_FILES_H 1
+
+// Files that are embedded into the binary as part of the build process.
+// They are used as a backup if the files are not available on disk
+// (which is typically the case if the program is installed, as opposed to
+// being run during development).
+
+#include <stddef.h>
+
+extern const unsigned char *_binary_add_base_flow_frag_data;
+extern const size_t _binary_add_base_flow_frag_size;
+extern const unsigned char *_binary_blend_frag_data;
+extern const size_t _binary_blend_frag_size;
+extern const unsigned char *_binary_chroma_subsample_frag_data;
+extern const size_t _binary_chroma_subsample_frag_size;
+extern const unsigned char *_binary_chroma_subsample_vert_data;
+extern const size_t _binary_chroma_subsample_vert_size;
+extern const unsigned char *_binary_densify_frag_data;
+extern const size_t _binary_densify_frag_size;
+extern const unsigned char *_binary_densify_vert_data;
+extern const size_t _binary_densify_vert_size;
+extern const unsigned char *_binary_derivatives_frag_data;
+extern const size_t _binary_derivatives_frag_size;
+extern const unsigned char *_binary_diffusivity_frag_data;
+extern const size_t _binary_diffusivity_frag_size;
+extern const unsigned char *_binary_equations_frag_data;
+extern const size_t _binary_equations_frag_size;
+extern const unsigned char *_binary_equations_vert_data;
+extern const size_t _binary_equations_vert_size;
+extern const unsigned char *_binary_gray_frag_data;
+extern const size_t _binary_gray_frag_size;
+extern const unsigned char *_binary_hole_blend_frag_data;
+extern const size_t _binary_hole_blend_frag_size;
+extern const unsigned char *_binary_hole_fill_frag_data;
+extern const size_t _binary_hole_fill_frag_size;
+extern const unsigned char *_binary_hole_fill_vert_data;
+extern const size_t _binary_hole_fill_vert_size;
+extern const unsigned char *_binary_motion_search_frag_data;
+extern const size_t _binary_motion_search_frag_size;
+extern const unsigned char *_binary_motion_search_vert_data;
+extern const size_t _binary_motion_search_vert_size;
+extern const unsigned char *_binary_prewarp_frag_data;
+extern const size_t _binary_prewarp_frag_size;
+extern const unsigned char *_binary_resize_flow_frag_data;
+extern const size_t _binary_resize_flow_frag_size;
+extern const unsigned char *_binary_sobel_frag_data;
+extern const size_t _binary_sobel_frag_size;
+extern const unsigned char *_binary_sor_frag_data;
+extern const size_t _binary_sor_frag_size;
+extern const unsigned char *_binary_sor_vert_data;
+extern const size_t _binary_sor_vert_size;
+extern const unsigned char *_binary_splat_frag_data;
+extern const size_t _binary_splat_frag_size;
+extern const unsigned char *_binary_splat_vert_data;
+extern const size_t _binary_splat_vert_size;
+extern const unsigned char *_binary_vs_vert_data;
+extern const size_t _binary_vs_vert_size;
+
+#endif  // !defined(_EMBEDDED_FILES_H)
diff --git a/futatabi/equations.frag b/futatabi/equations.frag
new file mode 100644 (file)
index 0000000..04e5370
--- /dev/null
@@ -0,0 +1,187 @@
+#version 450 core
+
+in vec3 tc0, tc_left0, tc_down0;
+in vec3 tc1, tc_left1, tc_down1;
+in float line_offset;
+out uvec4 equation_red, equation_black;
+
+uniform sampler2DArray I_x_y_tex, I_t_tex;
+uniform sampler2DArray diff_flow_tex, base_flow_tex;
+uniform sampler2DArray beta_0_tex;
+uniform sampler2DArray diffusivity_tex;
+
+// Relative weighting of intensity term.
+uniform float delta;
+
+// Relative weighting of gradient term.
+uniform float gamma;
+
+uniform bool zero_diff_flow;
+
+// Similar to packHalf2x16, but the two values share an exponent, and are stored
+// as 12-bit fixed-point numbers multiplied by that exponent (the leading one
+// can't be implicit in this kind of format). This allows us to store a much
+// greater range of numbers (an 8-bit exponent, i.e., the full fp32 range), and
+// also gives us an extra mantissa bit. (Well, ostensibly two, but because the
+// numbers have to be stored denormalized, we only really gain one.)
+//
+// The price we pay is that if the numbers are of very different magnitudes,
+// the smaller number gets less precision.
+uint pack_floats_shared(float a, float b)
+{
+       float greatest = max(abs(a), abs(b));
+
+       // Find the exponent, increase it by one, and negate it.
+       // E.g., if the nonbiased exponent is 3, the number is between
+       // 2^3 and 2^4, so our normalization factor to get within -1..1
+       // is going to be 2^-4.
+       //
+       // exponent -= 127;
+       // exponent = -(exponent + 1);
+       // exponent += 127;
+       //
+       // is the same as
+       //
+       // exponent = 252 - exponent;
+       uint e = floatBitsToUint(greatest) & 0x7f800000u;
+       float normalizer = uintBitsToFloat((252 << 23) - e);
+
+       // The exponent is the same range as fp32, so just copy it
+       // verbatim, shifted up to where the sign bit used to be.
+       e <<= 1;
+
+       // Quantize to 12 bits.
+       uint qa = uint(int(round(a * (normalizer * 2047.0))));
+       uint qb = uint(int(round(b * (normalizer * 2047.0))));
+
+       return (qa & 0xfffu) | ((qb & 0xfffu) << 12) | e;
+}
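+
+// Worked example (illustrative): pack_floats_shared(1.5, -0.25) finds
+// greatest = 1.5, whose biased exponent is 127, so the normalizer becomes
+// 2^(125 - 127) = 0.25 (biased exponent 252 - 127 = 125). Then
+// qa = round(1.5 * 0.25 * 2047.0) = 768 and qb = round(-0.25 * 0.25 * 2047.0)
+// = -128 (0xf80 in 12-bit two's complement), packed alongside the exponent bits.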
+
+float zero_if_outside_border(vec4 val)
+{
+       if (val.w < 1.0f) {
+               // We hit the border (or more like half-way to it), so zero smoothness.
+               return 0.0f;
+       } else {
+               return val.x;
+       }
+}
+
+uvec4 compute_equation(vec3 tc, vec3 tc_left, vec3 tc_down)
+{
+       // Read the flow (on top of the u0/v0 flow).
+       float du, dv;
+       if (zero_diff_flow) {
+               du = dv = 0.0f;
+       } else {
+               vec2 diff_flow = texture(diff_flow_tex, tc).xy;
+               du = diff_flow.x;
+               dv = diff_flow.y;
+       }
+
+       // Read the first derivatives.
+       vec2 I_x_y = texture(I_x_y_tex, tc).xy;
+       float I_x = I_x_y.x;
+       float I_y = I_x_y.y;
+       float I_t = texture(I_t_tex, tc).x;
+
+       // E_I term. Note that we don't square β_0, in line with DeepFlow;
+       // it's probably an error (see variational_refinement.txt),
+       // but squaring it seems to give worse results.
+       float beta_0 = texture(beta_0_tex, tc).x;
+       float k1 = delta * beta_0 * inversesqrt(beta_0 * (I_x * du + I_y * dv + I_t) * (I_x * du + I_y * dv + I_t) + 1e-6);
+       float A11 = k1 * I_x * I_x;
+       float A12 = k1 * I_x * I_y;
+       float A22 = k1 * I_y * I_y;
+       float b1 = -k1 * I_t * I_x;
+       float b2 = -k1 * I_t * I_y;
+
+       // Compute the second derivatives. First I_xx and I_xy.
+       vec2 I_x_y_m2 = textureOffset(I_x_y_tex, tc, ivec2(-2,  0)).xy;
+       vec2 I_x_y_m1 = textureOffset(I_x_y_tex, tc, ivec2(-1,  0)).xy;
+       vec2 I_x_y_p1 = textureOffset(I_x_y_tex, tc, ivec2( 1,  0)).xy;
+       vec2 I_x_y_p2 = textureOffset(I_x_y_tex, tc, ivec2( 2,  0)).xy;
+       vec2 I_xx_yx = (I_x_y_p1 - I_x_y_m1) * (2.0/3.0) + (I_x_y_m2 - I_x_y_p2) * (1.0/12.0);
+       float I_xx = I_xx_yx.x;
+       float I_xy = I_xx_yx.y;
+
+       // And now I_yy; I_yx = I_xy bar rounding differences, so we don't
+       // bother computing it. We still have to sample the x component,
+       // but we can throw it away immediately.
+       float I_y_m2 = textureOffset(I_x_y_tex, tc, ivec2(0, -2)).y;
+       float I_y_m1 = textureOffset(I_x_y_tex, tc, ivec2(0, -1)).y;
+       float I_y_p1 = textureOffset(I_x_y_tex, tc, ivec2(0,  1)).y;
+       float I_y_p2 = textureOffset(I_x_y_tex, tc, ivec2(0,  2)).y;
+       float I_yy = (I_y_p1 - I_y_m1) * (2.0/3.0) + (I_y_m2 - I_y_p2) * (1.0/12.0);
+
+       // Finally I_xt and I_yt. (We compute these as I_tx and I_yt.)
+       vec2 I_t_m2 = textureOffset(I_t_tex, tc, ivec2(-2,  0)).xy;
+       vec2 I_t_m1 = textureOffset(I_t_tex, tc, ivec2(-1,  0)).xy;
+       vec2 I_t_p1 = textureOffset(I_t_tex, tc, ivec2( 1,  0)).xy;
+       vec2 I_t_p2 = textureOffset(I_t_tex, tc, ivec2( 2,  0)).xy;
+       vec2 I_tx_ty = (I_t_p1 - I_t_m1) * (2.0/3.0) + (I_t_m2 - I_t_p2) * (1.0/12.0);
+       float I_xt = I_tx_ty.x;
+       float I_yt = I_tx_ty.y;
+
+       // E_G term. Same normalization as beta_0 (see derivatives.frag).
+       float beta_x = 1.0 / (I_xx * I_xx + I_xy * I_xy + 1e-7);
+       float beta_y = 1.0 / (I_xy * I_xy + I_yy * I_yy + 1e-7);
+       float k2 = gamma * inversesqrt(
+               beta_x * (I_xx * du + I_xy * dv + I_xt) * (I_xx * du + I_xy * dv + I_xt) +
+               beta_y * (I_xy * du + I_yy * dv + I_yt) * (I_xy * du + I_yy * dv + I_yt) +
+               1e-6);
+       float k_x = k2 * beta_x;
+       float k_y = k2 * beta_y;
+       A11 += k_x * I_xx * I_xx + k_y * I_xy * I_xy;
+       A12 += k_x * I_xx * I_xy + k_y * I_xy * I_yy;
+       A22 += k_x * I_xy * I_xy + k_y * I_yy * I_yy;
+       b1 -= k_x * I_xx * I_xt + k_y * I_xy * I_yt;
+       b2 -= k_x * I_xy * I_xt + k_y * I_yy * I_yt;
+
+       // E_S term, sans the part on the right-hand side that deals with
+       // the neighboring pixels. The gamma is multiplied in in smoothness.frag.
+       //
+       // Note that we sample in-between two texels, which gives us the 0.5 *
+       // (x[-1] + x[0]) part for free. If one of the texels is a border
+       // texel, it will have zero alpha, and zero_if_outside_border() will
+       // set smoothness to zero.
+       float smooth_l = zero_if_outside_border(texture(diffusivity_tex, tc_left));
+       float smooth_r = zero_if_outside_border(textureOffset(diffusivity_tex, tc_left, ivec2(1, 0)));
+       float smooth_d = zero_if_outside_border(texture(diffusivity_tex, tc_down));
+       float smooth_u = zero_if_outside_border(textureOffset(diffusivity_tex, tc_down, ivec2(0, 1)));
+       A11 += smooth_l + smooth_r + smooth_d + smooth_u;
+       A22 += smooth_l + smooth_r + smooth_d + smooth_u;
+
+       // Laplacian of (u0, v0).
+       vec2 laplacian =
+               smooth_l * textureOffset(base_flow_tex, tc, ivec2(-1,  0)).xy +
+               smooth_r * textureOffset(base_flow_tex, tc, ivec2( 1,  0)).xy +
+               smooth_d * textureOffset(base_flow_tex, tc, ivec2( 0, -1)).xy +
+               smooth_u * textureOffset(base_flow_tex, tc, ivec2( 0,  1)).xy -
+               (smooth_l + smooth_r + smooth_d + smooth_u) * texture(base_flow_tex, tc).xy;
+       b1 += laplacian.x;
+       b2 += laplacian.y;
+
+       // Encode the equation down into four uint32s.
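+       //
+       // The solver will reconstruct the symmetric 2x2 system
+       //
+       //   [ A11 A12 ] [du]   [b1]
+       //   [ A12 A22 ] [dv] = [b2]
+       //
+       // from these; A11 and A22 are stored as reciprocals so the solver can
+       // multiply instead of divide.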
+       uvec4 ret;
+       ret.x = floatBitsToUint(1.0 / A11);
+       ret.y = floatBitsToUint(A12);
+       ret.z = floatBitsToUint(1.0 / A22);
+       ret.w = pack_floats_shared(b1, b2);
+       return ret;
+}
+
+void main()
+{
+       uvec4 eq0 = compute_equation(tc0, tc_left0, tc_down0);
+       uvec4 eq1 = compute_equation(tc1, tc_left1, tc_down1);
+
+       if ((int(round(line_offset)) & 1) == 1) {
+               // Odd line, so the right-hand value (eq1) is the red one.
+               equation_red = eq1;
+               equation_black = eq0;
+       } else {
+               equation_red = eq0;
+               equation_black = eq1;
+       }
+}
diff --git a/futatabi/equations.vert b/futatabi/equations.vert
new file mode 100644 (file)
index 0000000..731e159
--- /dev/null
@@ -0,0 +1,38 @@
+#version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
+
+layout(location=0) in vec2 position;
+out vec3 tc0, tc_left0, tc_down0;
+out vec3 tc1, tc_left1, tc_down1;
+out float line_offset;
+
+uniform sampler2DArray diffusivity_tex;
+
+void main()
+{
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       gl_Layer = gl_InstanceID;
+
+       const vec2 half_texel = 0.5f / textureSize(diffusivity_tex, 0).xy;
+
+       vec2 tc = position;
+       vec2 tc_left = vec2(tc.x - half_texel.x, tc.y);
+       vec2 tc_down = vec2(tc.x, tc.y - half_texel.y);
+
+       // Adjust for different texel centers.
+       tc0 = vec3(tc.x - half_texel.x, tc.y, gl_InstanceID);
+       tc_left0 = vec3(tc_left.x - half_texel.x, tc_left.y, gl_InstanceID);
+       tc_down0 = vec3(tc_down.x - half_texel.x, tc_down.y, gl_InstanceID);
+
+       tc1 = vec3(tc.x + half_texel.x, tc.y, gl_InstanceID);
+       tc_left1 = vec3(tc_left.x + half_texel.x, tc_left.y, gl_InstanceID);
+       tc_down1 = vec3(tc_down.x + half_texel.x, tc_down.y, gl_InstanceID);
+
+       line_offset = position.y * textureSize(diffusivity_tex, 0).y - 0.5f;
+}
diff --git a/futatabi/eval.cpp b/futatabi/eval.cpp
new file mode 100644 (file)
index 0000000..85783bb
--- /dev/null
@@ -0,0 +1,41 @@
+// Evaluate a .flo file against ground truth,
+// outputting the average end-point error.
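+//
+// The per-pixel end-point error (EPE) is the Euclidean distance between the
+// computed and ground-truth flow vectors, sqrt((du - du_gt)^2 + (dv - dv_gt)^2),
+// averaged over all pixels of a file and then over all given file pairs.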
+
+#include "util.h"
+
+#include <assert.h>
+#include <math.h>
+#include <memory>
+#include <stdio.h>
+
+using namespace std;
+
+double eval_flow(const char *filename1, const char *filename2);
+
+int main(int argc, char **argv)
+{
+       // Flows are given in pairs: computed flow, then its ground truth.
+       if (argc < 3 || (argc % 2) == 0) {
+               fprintf(stderr, "Usage: eval FLOW GT_FLOW [FLOW GT_FLOW ...]\n");
+               return 1;
+       }
+       double sum_epe = 0.0;
+       int num_flows = 0;
+       for (int i = 1; i < argc; i += 2) {
+               sum_epe += eval_flow(argv[i], argv[i + 1]);
+               ++num_flows;
+       }
+       printf("Average EPE: %.2f pixels\n", sum_epe / num_flows);
+}
+
+double eval_flow(const char *filename1, const char *filename2)
+{
+       Flow flow = read_flow(filename1);
+       Flow gt = read_flow(filename2);
+       assert(gt.width == flow.width && gt.height == flow.height);
+
+       double sum = 0.0;
+       for (unsigned y = 0; y < unsigned(flow.height); ++y) {
+               for (unsigned x = 0; x < unsigned(flow.width); ++x) {
+                       float du = flow.flow[y * flow.width + x].du;
+                       float dv = flow.flow[y * flow.width + x].dv;
+                       float gt_du = gt.flow[y * flow.width + x].du;
+                       float gt_dv = gt.flow[y * flow.width + x].dv;
+                       sum += hypot(du - gt_du, dv - gt_dv);
+               }
+       }
+       return sum / (flow.width * flow.height);
+}
diff --git a/futatabi/ffmpeg_raii.cpp b/futatabi/ffmpeg_raii.cpp
new file mode 100644 (file)
index 0000000..746e03d
--- /dev/null
@@ -0,0 +1,77 @@
+#include "ffmpeg_raii.h"
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+#include <libavutil/dict.h>
+#include <libavutil/frame.h>
+#include <libswscale/swscale.h>
+}
+
+using namespace std;
+
+// AVFormatContext
+
+void avformat_close_input_unique::operator() (AVFormatContext *format_ctx) const
+{
+       avformat_close_input(&format_ctx);
+}
+
+AVFormatContextWithCloser avformat_open_input_unique(
+       const char *pathname, AVInputFormat *fmt,
+       AVDictionary **options)
+{
+       return avformat_open_input_unique(pathname, fmt, options, AVIOInterruptCB{ nullptr, nullptr });
+}
+
+AVFormatContextWithCloser avformat_open_input_unique(
+       const char *pathname, AVInputFormat *fmt,
+       AVDictionary **options,
+       const AVIOInterruptCB &interrupt_cb)
+{
+       AVFormatContext *format_ctx = avformat_alloc_context();
+       format_ctx->interrupt_callback = interrupt_cb;
+       if (avformat_open_input(&format_ctx, pathname, fmt, options) != 0) {
+               format_ctx = nullptr;
+       }
+       return AVFormatContextWithCloser(format_ctx);
+}
+
+// AVCodecContext
+
+void avcodec_free_context_unique::operator() (AVCodecContext *codec_ctx) const
+{
+       avcodec_free_context(&codec_ctx);
+}
+
+AVCodecContextWithDeleter avcodec_alloc_context3_unique(const AVCodec *codec)
+{
+       return AVCodecContextWithDeleter(avcodec_alloc_context3(codec));
+}
+
+
+// AVCodecParameters
+
+void avcodec_parameters_free_unique::operator() (AVCodecParameters *codec_par) const
+{
+       avcodec_parameters_free(&codec_par);
+}
+
+// AVFrame
+
+void av_frame_free_unique::operator() (AVFrame *frame) const
+{
+       av_frame_free(&frame);
+}
+
+AVFrameWithDeleter av_frame_alloc_unique()
+{
+       return AVFrameWithDeleter(av_frame_alloc());
+}
+
+// SwsContext
+
+void sws_free_context_unique::operator() (SwsContext *context) const
+{
+       sws_freeContext(context);
+}
diff --git a/futatabi/ffmpeg_raii.h b/futatabi/ffmpeg_raii.h
new file mode 100644 (file)
index 0000000..33d2334
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef _FFMPEG_RAII_H
+#define _FFMPEG_RAII_H 1
+
+// Some helpers to make RAII versions of FFmpeg objects.
+// The cleanup functions don't interact all that well with unique_ptr,
+// so things get a bit messy and verbose, but overall it's worth it to ensure
+// we never leak things by accident in error paths.
+//
+// This does not cover the types that can be declared directly as
+// a unique_ptr without any helper functions for the deleter.
+
+#include <memory>
+
+struct AVCodec;
+struct AVCodecContext;
+struct AVCodecParameters;
+struct AVDictionary;
+struct AVFormatContext;
+struct AVFrame;
+struct AVInputFormat;
+struct SwsContext;
+typedef struct AVIOInterruptCB AVIOInterruptCB;
+
+// AVFormatContext
+struct avformat_close_input_unique {
+       void operator() (AVFormatContext *format_ctx) const;
+};
+
+typedef std::unique_ptr<AVFormatContext, avformat_close_input_unique>
+       AVFormatContextWithCloser;
+
+AVFormatContextWithCloser avformat_open_input_unique(
+       const char *pathname, AVInputFormat *fmt,
+       AVDictionary **options);
+
+AVFormatContextWithCloser avformat_open_input_unique(
+       const char *pathname, AVInputFormat *fmt,
+       AVDictionary **options,
+       const AVIOInterruptCB &interrupt_cb);
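+
+// Usage sketch (illustrative; error handling beyond the null check elided):
+//
+//   AVFormatContextWithCloser ctx =
+//           avformat_open_input_unique("input.mp4", nullptr, nullptr);
+//   if (ctx == nullptr) {
+//           // avformat_open_input() failed; nothing to clean up.
+//   }
+//   // The context is closed automatically when <ctx> goes out of scope.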
+
+
+// AVCodecContext
+struct avcodec_free_context_unique {
+       void operator() (AVCodecContext *ctx) const;
+};
+
+typedef std::unique_ptr<AVCodecContext, avcodec_free_context_unique>
+       AVCodecContextWithDeleter;
+
+AVCodecContextWithDeleter avcodec_alloc_context3_unique(const AVCodec *codec);
+
+
+// AVCodecParameters
+struct avcodec_parameters_free_unique {
+       void operator() (AVCodecParameters *codec_par) const;
+};
+
+typedef std::unique_ptr<AVCodecParameters, avcodec_parameters_free_unique>
+       AVCodecParametersWithDeleter;
+
+
+// AVFrame
+struct av_frame_free_unique {
+       void operator() (AVFrame *frame) const;
+};
+
+typedef std::unique_ptr<AVFrame, av_frame_free_unique>
+       AVFrameWithDeleter;
+
+AVFrameWithDeleter av_frame_alloc_unique();
+
+// SwsContext
+struct sws_free_context_unique {
+       void operator() (SwsContext *context) const;
+};
+
+typedef std::unique_ptr<SwsContext, sws_free_context_unique>
+       SwsContextWithDeleter;
+
+#endif  // !defined(_FFMPEG_RAII_H)
diff --git a/futatabi/flags.cpp b/futatabi/flags.cpp
new file mode 100644 (file)
index 0000000..4c75370
--- /dev/null
@@ -0,0 +1,82 @@
+#include "flags.h"
+
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <utility>
+
+using namespace std;
+
+Flags global_flags;
+
+// Long options that have no corresponding short option.
+enum LongOption {
+       OPTION_HELP = 1000,
+       OPTION_SLOW_DOWN_INPUT = 1001,
+       OPTION_HTTP_PORT = 1002
+};
+
+void usage()
+{
+       fprintf(stderr, "Usage: futatabi [OPTION]... SOURCE_URL\n");
+       fprintf(stderr, "\n");
+       fprintf(stderr, "      --help                      print usage information\n");
+       fprintf(stderr, "      --slow-down-input           slow down input to realtime (default on if no\n");
+       fprintf(stderr, "                                    source URL given)\n");
+       fprintf(stderr, "  -q, --interpolation-quality N   1 = fastest\n");
+       fprintf(stderr, "                                  2 = default (realtime 720p on fast embedded GPUs)\n");
+       fprintf(stderr, "                                  3 = good (realtime 720p on GTX 970 or so)\n");
+       fprintf(stderr, "                                  4 = best (not realtime on any current GPU)\n");
+       fprintf(stderr, "  -d, --working-directory DIR     where to store frames and database\n");
+       fprintf(stderr, "      --http-port PORT            which port to listen on for output\n");
+}
+
+void parse_flags(int argc, char * const argv[])
+{
+       static const option long_options[] = {
+               { "help", no_argument, 0, OPTION_HELP },
+               { "slow-down-input", no_argument, 0, OPTION_SLOW_DOWN_INPUT },
+               { "interpolation-quality", required_argument, 0, 'q' },
+               { "working-directory", required_argument, 0, 'd' },
+               { "http-port", required_argument, 0, OPTION_HTTP_PORT },
+               { 0, 0, 0, 0 }
+       };
+       for ( ;; ) {
+               int option_index = 0;
+               int c = getopt_long(argc, argv, "q:d:", long_options, &option_index);
+
+               if (c == -1) {
+                       break;
+               }
+               switch (c) {
+               case OPTION_SLOW_DOWN_INPUT:
+                       global_flags.slow_down_input = true;
+                       break;
+               case 'q':
+                       global_flags.interpolation_quality = atoi(optarg);
+                       break;
+               case 'd':
+                       global_flags.working_directory = optarg;
+                       break;
+               case OPTION_HTTP_PORT:
+                       global_flags.http_port = atoi(optarg);
+                       break;
+               case OPTION_HELP:
+                       usage();
+                       exit(0);
+               default:
+                       fprintf(stderr, "Unknown option '%s'\n", argv[option_index]);
+                       fprintf(stderr, "\n");
+                       usage();
+                       exit(1);
+               }
+       }
+
+       if (global_flags.interpolation_quality < 1 || global_flags.interpolation_quality > 4) {
+               fprintf(stderr, "Interpolation quality must be 1, 2, 3 or 4.\n");
+               usage();
+               exit(1);
+       }
+}
diff --git a/futatabi/flags.h b/futatabi/flags.h
new file mode 100644 (file)
index 0000000..5e9d34b
--- /dev/null
@@ -0,0 +1,20 @@
+#ifndef _FLAGS_H
+#define _FLAGS_H
+
+#include <stdint.h>
+#include <string>
+
+#include "defs.h"
+
+struct Flags {
+       std::string stream_source;
+       std::string working_directory = ".";
+       bool slow_down_input = false;
+       int interpolation_quality = 2;
+       uint16_t http_port = DEFAULT_HTTPD_PORT;
+};
+extern Flags global_flags;
+
+void usage();
+void parse_flags(int argc, char * const argv[]);
+
+#endif  // !defined(_FLAGS_H)
diff --git a/futatabi/flow.cpp b/futatabi/flow.cpp
new file mode 100644 (file)
index 0000000..5125d26
--- /dev/null
@@ -0,0 +1,1141 @@
+#define NO_SDL_GLEXT 1
+
+#include "flow.h"
+
+#include "embedded_files.h"
+#include "gpu_timers.h"
+#include "util.h"
+
+#include <algorithm>
+#include <assert.h>
+#include <deque>
+#include <dlfcn.h>
+#include <epoxy/gl.h>
+#include <map>
+#include <memory>
+#include <stack>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <vector>
+
+#define BUFFER_OFFSET(i) ((char *)nullptr + (i))
+
+using namespace std;
+
+// Weighting constants for the different parts of the variational refinement.
+// These don't correspond 1:1 to the values given in the DIS paper,
+// since we have different normalizations and ranges in some cases.
+// These are found through a simple grid search on some MPI-Sintel data,
+// although the error (EPE) seems to be fairly insensitive to the precise values.
+// Only the relative values matter, so we fix alpha (the smoothness constant)
+// at unity and tweak the others.
+//
+// TODO: Maybe this should not be global.
+float vr_alpha = 1.0f, vr_delta = 0.25f, vr_gamma = 0.25f;
+
+// Some global OpenGL objects.
+// TODO: These should really be part of DISComputeFlow.
+GLuint nearest_sampler, linear_sampler, zero_border_sampler;
+GLuint vertex_vbo;
+
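+// Find the number of pyramid levels needed to take <width> x <height> all the
+// way down to 1x1; e.g., find_num_levels(1280, 720) == 11.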
+int find_num_levels(int width, int height)
+{
+       int levels = 1;
+       for (int w = width, h = height; w > 1 || h > 1; ) {
+               w >>= 1;
+               h >>= 1;
+               ++levels;
+       }
+       return levels;
+}
+
+string read_file(const string &filename, const unsigned char *start = nullptr, const size_t size = 0)
+{
+       FILE *fp = fopen(filename.c_str(), "r");
+       if (fp == nullptr) {
+               // Fall back to the version we compiled in. (We prefer disk if we can,
+               // since that makes it possible to work on shaders without recompiling
+               // all the time.)
+               if (start != nullptr) {
+                       return string(reinterpret_cast<const char *>(start),
+                               reinterpret_cast<const char *>(start) + size);
+               }
+
+               perror(filename.c_str());
+               exit(1);
+       }
+
+       int ret = fseek(fp, 0, SEEK_END);
+       if (ret == -1) {
+               perror("fseek(SEEK_END)");
+               exit(1);
+       }
+
+       int disk_size = ftell(fp);
+
+       ret = fseek(fp, 0, SEEK_SET);
+       if (ret == -1) {
+               perror("fseek(SEEK_SET)");
+               exit(1);
+       }
+
+       string str;
+       str.resize(disk_size);
+       // fread() returns the number of complete items read (0 or 1 here),
+       // never -1, so a short read and a read error look the same to us.
+       if (fread(&str[0], disk_size, 1, fp) != 1) {
+               fprintf(stderr, "Short read when trying to read %d bytes from %s\n",
+                       disk_size, filename.c_str());
+               exit(1);
+       }
+       fclose(fp);
+
+       return str;
+}
+
+GLuint compile_shader(const string &shader_src, GLenum type)
+{
+       GLuint obj = glCreateShader(type);
+       const GLchar *source[] = { shader_src.data() };
+       const GLint length[] = { (GLint)shader_src.size() };
+       glShaderSource(obj, 1, source, length);
+       glCompileShader(obj);
+
+       GLchar info_log[4096];
+       GLsizei log_length = sizeof(info_log) - 1;
+       glGetShaderInfoLog(obj, log_length, &log_length, info_log);
+       info_log[log_length] = 0;
+       if (strlen(info_log) > 0) {
+               fprintf(stderr, "Shader compile log: %s\n", info_log);
+       }
+
+       GLint status;
+       glGetShaderiv(obj, GL_COMPILE_STATUS, &status);
+       if (status == GL_FALSE) {
+               // Add some line numbers to make compile errors easier to identify.
+               string src_with_lines = "/*   1 */ ";
+               size_t lineno = 1;
+               for (char ch : shader_src) {
+                       src_with_lines.push_back(ch);
+                       if (ch == '\n') {
+                               char buf[32];
+                               snprintf(buf, sizeof(buf), "/* %3zu */ ", ++lineno);
+                               src_with_lines += buf;
+                       }
+               }
+
+               fprintf(stderr, "Failed to compile shader:\n%s\n", src_with_lines.c_str());
+               exit(1);
+       }
+
+       return obj;
+}
+
+GLuint link_program(GLuint vs_obj, GLuint fs_obj)
+{
+       GLuint program = glCreateProgram();
+       glAttachShader(program, vs_obj);
+       glAttachShader(program, fs_obj);
+       glLinkProgram(program);
+       GLint success;
+       glGetProgramiv(program, GL_LINK_STATUS, &success);
+       if (success == GL_FALSE) {
+               GLchar error_log[1024] = {0};
+               glGetProgramInfoLog(program, 1024, nullptr, error_log);
+               fprintf(stderr, "Error linking program: %s\n", error_log);
+               exit(1);
+       }
+       return program;
+}
+
+void bind_sampler(GLuint program, GLint location, GLuint texture_unit, GLuint tex, GLuint sampler)
+{
+       if (location == -1) {
+               return;
+       }
+
+       glBindTextureUnit(texture_unit, tex);
+       glBindSampler(texture_unit, sampler);
+       glProgramUniform1i(program, location, texture_unit);
+}
+
+template<size_t num_elements>
+void PersistentFBOSet<num_elements>::render_to(const array<GLuint, num_elements> &textures)
+{
+       auto it = fbos.find(textures);
+       if (it != fbos.end()) {
+               glBindFramebuffer(GL_FRAMEBUFFER, it->second);
+               return;
+       }
+
+       GLuint fbo;
+       glCreateFramebuffers(1, &fbo);
+       GLenum bufs[num_elements];
+       for (size_t i = 0; i < num_elements; ++i) {
+               glNamedFramebufferTexture(fbo, GL_COLOR_ATTACHMENT0 + i, textures[i], 0);
+               bufs[i] = GL_COLOR_ATTACHMENT0 + i;
+       }
+       glNamedFramebufferDrawBuffers(fbo, num_elements, bufs);
+
+       fbos[textures] = fbo;
+       glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+}
+
+template<size_t num_elements>
+void PersistentFBOSetWithDepth<num_elements>::render_to(GLuint depth_rb, const array<GLuint, num_elements> &textures)
+{
+       auto key = make_pair(depth_rb, textures);
+
+       auto it = fbos.find(key);
+       if (it != fbos.end()) {
+               glBindFramebuffer(GL_FRAMEBUFFER, it->second);
+               return;
+       }
+
+       GLuint fbo;
+       glCreateFramebuffers(1, &fbo);
+       GLenum bufs[num_elements];
+       glNamedFramebufferRenderbuffer(fbo, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, depth_rb);
+       for (size_t i = 0; i < num_elements; ++i) {
+               glNamedFramebufferTexture(fbo, GL_COLOR_ATTACHMENT0 + i, textures[i], 0);
+               bufs[i] = GL_COLOR_ATTACHMENT0 + i;
+       }
+       glNamedFramebufferDrawBuffers(fbo, num_elements, bufs);
+
+       fbos[key] = fbo;
+       glBindFramebuffer(GL_FRAMEBUFFER, fbo);
+}
+
+GrayscaleConversion::GrayscaleConversion()
+{
+       gray_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       gray_fs_obj = compile_shader(read_file("gray.frag", _binary_gray_frag_data, _binary_gray_frag_size), GL_FRAGMENT_SHADER);
+       gray_program = link_program(gray_vs_obj, gray_fs_obj);
+
+       // Set up the VAO containing all the required position/texcoord data.
+       glCreateVertexArrays(1, &gray_vao);
+       glBindVertexArray(gray_vao);
+
+       GLint position_attrib = glGetAttribLocation(gray_program, "position");
+       glEnableVertexArrayAttrib(gray_vao, position_attrib);
+       glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+
+       uniform_tex = glGetUniformLocation(gray_program, "tex");
+}
+
+void GrayscaleConversion::exec(GLint tex, GLint gray_tex, int width, int height, int num_layers)
+{
+       glUseProgram(gray_program);
+       bind_sampler(gray_program, uniform_tex, 0, tex, nearest_sampler);
+
+       glViewport(0, 0, width, height);
+       fbos.render_to(gray_tex);
+       glBindVertexArray(gray_vao);
+       glDisable(GL_BLEND);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+Sobel::Sobel()
+{
+       sobel_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       sobel_fs_obj = compile_shader(read_file("sobel.frag", _binary_sobel_frag_data, _binary_sobel_frag_size), GL_FRAGMENT_SHADER);
+       sobel_program = link_program(sobel_vs_obj, sobel_fs_obj);
+
+       uniform_tex = glGetUniformLocation(sobel_program, "tex");
+}
+
+void Sobel::exec(GLint tex_view, GLint grad_tex, int level_width, int level_height, int num_layers)
+{
+       glUseProgram(sobel_program);
+       bind_sampler(sobel_program, uniform_tex, 0, tex_view, nearest_sampler);
+
+       glViewport(0, 0, level_width, level_height);
+       fbos.render_to(grad_tex);
+       glDisable(GL_BLEND);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+MotionSearch::MotionSearch(const OperatingPoint &op)
+       : op(op)
+{
+       motion_vs_obj = compile_shader(read_file("motion_search.vert", _binary_motion_search_vert_data, _binary_motion_search_vert_size), GL_VERTEX_SHADER);
+       motion_fs_obj = compile_shader(read_file("motion_search.frag", _binary_motion_search_frag_data, _binary_motion_search_frag_size), GL_FRAGMENT_SHADER);
+       motion_search_program = link_program(motion_vs_obj, motion_fs_obj);
+
+       uniform_inv_image_size = glGetUniformLocation(motion_search_program, "inv_image_size");
+       uniform_inv_prev_level_size = glGetUniformLocation(motion_search_program, "inv_prev_level_size");
+       uniform_out_flow_size = glGetUniformLocation(motion_search_program, "out_flow_size");
+       uniform_image_tex = glGetUniformLocation(motion_search_program, "image_tex");
+       uniform_grad_tex = glGetUniformLocation(motion_search_program, "grad_tex");
+       uniform_flow_tex = glGetUniformLocation(motion_search_program, "flow_tex");
+       uniform_patch_size = glGetUniformLocation(motion_search_program, "patch_size");
+       uniform_num_iterations = glGetUniformLocation(motion_search_program, "num_iterations");
+}
+
+void MotionSearch::exec(GLuint tex_view, GLuint grad_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches, int num_layers)
+{
+       glUseProgram(motion_search_program);
+
+       bind_sampler(motion_search_program, uniform_image_tex, 0, tex_view, linear_sampler);
+       bind_sampler(motion_search_program, uniform_grad_tex, 1, grad_tex, nearest_sampler);
+       bind_sampler(motion_search_program, uniform_flow_tex, 2, flow_tex, linear_sampler);
+
+       glProgramUniform2f(motion_search_program, uniform_inv_image_size, 1.0f / level_width, 1.0f / level_height);
+       glProgramUniform2f(motion_search_program, uniform_inv_prev_level_size, 1.0f / prev_level_width, 1.0f / prev_level_height);
+       glProgramUniform2f(motion_search_program, uniform_out_flow_size, width_patches, height_patches);
+       glProgramUniform1ui(motion_search_program, uniform_patch_size, op.patch_size_pixels);
+       glProgramUniform1ui(motion_search_program, uniform_num_iterations, op.search_iterations);
+
+       glViewport(0, 0, width_patches, height_patches);
+       fbos.render_to(flow_out_tex);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+Densify::Densify(const OperatingPoint &op)
+       : op(op)
+{
+       densify_vs_obj = compile_shader(read_file("densify.vert", _binary_densify_vert_data, _binary_densify_vert_size), GL_VERTEX_SHADER);
+       densify_fs_obj = compile_shader(read_file("densify.frag", _binary_densify_frag_data, _binary_densify_frag_size), GL_FRAGMENT_SHADER);
+       densify_program = link_program(densify_vs_obj, densify_fs_obj);
+
+       uniform_patch_size = glGetUniformLocation(densify_program, "patch_size");
+       uniform_image_tex = glGetUniformLocation(densify_program, "image_tex");
+       uniform_flow_tex = glGetUniformLocation(densify_program, "flow_tex");
+}
+
+void Densify::exec(GLuint tex_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches, int num_layers)
+{
+       glUseProgram(densify_program);
+
+       bind_sampler(densify_program, uniform_image_tex, 0, tex_view, linear_sampler);
+       bind_sampler(densify_program, uniform_flow_tex, 1, flow_tex, nearest_sampler);
+
+       glProgramUniform2f(densify_program, uniform_patch_size,
+               float(op.patch_size_pixels) / level_width,
+               float(op.patch_size_pixels) / level_height);
+
+       glViewport(0, 0, level_width, level_height);
+       glEnable(GL_BLEND);
+       glBlendFunc(GL_ONE, GL_ONE);
+       fbos.render_to(dense_flow_tex);
+       glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
+       glClear(GL_COLOR_BUFFER_BIT);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width_patches * height_patches * num_layers);
+}
+
+Prewarp::Prewarp()
+{
+       prewarp_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       prewarp_fs_obj = compile_shader(read_file("prewarp.frag", _binary_prewarp_frag_data, _binary_prewarp_frag_size), GL_FRAGMENT_SHADER);
+       prewarp_program = link_program(prewarp_vs_obj, prewarp_fs_obj);
+
+       uniform_image_tex = glGetUniformLocation(prewarp_program, "image_tex");
+       uniform_flow_tex = glGetUniformLocation(prewarp_program, "flow_tex");
+}
+
+void Prewarp::exec(GLuint tex_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, GLuint normalized_flow_tex, int level_width, int level_height, int num_layers)
+{
+       glUseProgram(prewarp_program);
+
+       bind_sampler(prewarp_program, uniform_image_tex, 0, tex_view, linear_sampler);
+       bind_sampler(prewarp_program, uniform_flow_tex, 1, flow_tex, nearest_sampler);
+
+       glViewport(0, 0, level_width, level_height);
+       glDisable(GL_BLEND);
+       fbos.render_to(I_tex, I_t_tex, normalized_flow_tex);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+Derivatives::Derivatives()
+{
+       derivatives_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       derivatives_fs_obj = compile_shader(read_file("derivatives.frag", _binary_derivatives_frag_data, _binary_derivatives_frag_size), GL_FRAGMENT_SHADER);
+       derivatives_program = link_program(derivatives_vs_obj, derivatives_fs_obj);
+
+       uniform_tex = glGetUniformLocation(derivatives_program, "tex");
+}
+
+void Derivatives::exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height, int num_layers)
+{
+       glUseProgram(derivatives_program);
+
+       bind_sampler(derivatives_program, uniform_tex, 0, input_tex, nearest_sampler);
+
+       glViewport(0, 0, level_width, level_height);
+       glDisable(GL_BLEND);
+       fbos.render_to(I_x_y_tex, beta_0_tex);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+ComputeDiffusivity::ComputeDiffusivity()
+{
+       diffusivity_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       diffusivity_fs_obj = compile_shader(read_file("diffusivity.frag", _binary_diffusivity_frag_data, _binary_diffusivity_frag_size), GL_FRAGMENT_SHADER);
+       diffusivity_program = link_program(diffusivity_vs_obj, diffusivity_fs_obj);
+
+       uniform_flow_tex = glGetUniformLocation(diffusivity_program, "flow_tex");
+       uniform_diff_flow_tex = glGetUniformLocation(diffusivity_program, "diff_flow_tex");
+       uniform_alpha = glGetUniformLocation(diffusivity_program, "alpha");
+       uniform_zero_diff_flow = glGetUniformLocation(diffusivity_program, "zero_diff_flow");
+}
+
+void ComputeDiffusivity::exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers)
+{
+       glUseProgram(diffusivity_program);
+
+       bind_sampler(diffusivity_program, uniform_flow_tex, 0, flow_tex, nearest_sampler);
+       bind_sampler(diffusivity_program, uniform_diff_flow_tex, 1, diff_flow_tex, nearest_sampler);
+       glProgramUniform1f(diffusivity_program, uniform_alpha, vr_alpha);
+       glProgramUniform1i(diffusivity_program, uniform_zero_diff_flow, zero_diff_flow);
+
+       glViewport(0, 0, level_width, level_height);
+
+       glDisable(GL_BLEND);
+       fbos.render_to(diffusivity_tex);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+SetupEquations::SetupEquations()
+{
+       equations_vs_obj = compile_shader(read_file("equations.vert", _binary_equations_vert_data, _binary_equations_vert_size), GL_VERTEX_SHADER);
+       equations_fs_obj = compile_shader(read_file("equations.frag", _binary_equations_frag_data, _binary_equations_frag_size), GL_FRAGMENT_SHADER);
+       equations_program = link_program(equations_vs_obj, equations_fs_obj);
+
+       uniform_I_x_y_tex = glGetUniformLocation(equations_program, "I_x_y_tex");
+       uniform_I_t_tex = glGetUniformLocation(equations_program, "I_t_tex");
+       uniform_diff_flow_tex = glGetUniformLocation(equations_program, "diff_flow_tex");
+       uniform_base_flow_tex = glGetUniformLocation(equations_program, "base_flow_tex");
+       uniform_beta_0_tex = glGetUniformLocation(equations_program, "beta_0_tex");
+       uniform_diffusivity_tex = glGetUniformLocation(equations_program, "diffusivity_tex");
+       uniform_gamma = glGetUniformLocation(equations_program, "gamma");
+       uniform_delta = glGetUniformLocation(equations_program, "delta");
+       uniform_zero_diff_flow = glGetUniformLocation(equations_program, "zero_diff_flow");
+}
+
+void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint base_flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers)
+{
+       glUseProgram(equations_program);
+
+       bind_sampler(equations_program, uniform_I_x_y_tex, 0, I_x_y_tex, nearest_sampler);
+       bind_sampler(equations_program, uniform_I_t_tex, 1, I_t_tex, nearest_sampler);
+       bind_sampler(equations_program, uniform_diff_flow_tex, 2, diff_flow_tex, nearest_sampler);
+       bind_sampler(equations_program, uniform_base_flow_tex, 3, base_flow_tex, nearest_sampler);
+       bind_sampler(equations_program, uniform_beta_0_tex, 4, beta_0_tex, nearest_sampler);
+       bind_sampler(equations_program, uniform_diffusivity_tex, 5, diffusivity_tex, zero_border_sampler);
+       glProgramUniform1f(equations_program, uniform_delta, vr_delta);
+       glProgramUniform1f(equations_program, uniform_gamma, vr_gamma);
+       glProgramUniform1i(equations_program, uniform_zero_diff_flow, zero_diff_flow);
+
+       glViewport(0, 0, (level_width + 1) / 2, level_height);
+       glDisable(GL_BLEND);
+       fbos.render_to(equation_red_tex, equation_black_tex);
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+SOR::SOR()
+{
+       sor_vs_obj = compile_shader(read_file("sor.vert", _binary_sor_vert_data, _binary_sor_vert_size), GL_VERTEX_SHADER);
+       sor_fs_obj = compile_shader(read_file("sor.frag", _binary_sor_frag_data, _binary_sor_frag_size), GL_FRAGMENT_SHADER);
+       sor_program = link_program(sor_vs_obj, sor_fs_obj);
+
+       uniform_diff_flow_tex = glGetUniformLocation(sor_program, "diff_flow_tex");
+       uniform_equation_red_tex = glGetUniformLocation(sor_program, "equation_red_tex");
+       uniform_equation_black_tex = glGetUniformLocation(sor_program, "equation_black_tex");
+       uniform_diffusivity_tex = glGetUniformLocation(sor_program, "diffusivity_tex");
+       uniform_phase = glGetUniformLocation(sor_program, "phase");
+       uniform_num_nonzero_phases = glGetUniformLocation(sor_program, "num_nonzero_phases");
+}
+
+void SOR::exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, int num_layers, ScopedTimer *sor_timer)
+{
+       glUseProgram(sor_program);
+
+       bind_sampler(sor_program, uniform_diff_flow_tex, 0, diff_flow_tex, nearest_sampler);
+       bind_sampler(sor_program, uniform_diffusivity_tex, 1, diffusivity_tex, zero_border_sampler);
+       bind_sampler(sor_program, uniform_equation_red_tex, 2, equation_red_tex, nearest_sampler);
+       bind_sampler(sor_program, uniform_equation_black_tex, 3, equation_black_tex, nearest_sampler);
+
+       if (!zero_diff_flow) {
+               glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 2);
+       }
+
+       // NOTE: We bind to the texture we are rendering from, but we never write any value
+       // that we read in the same shader pass (we call discard for red values when we compute
+       // black, and vice versa), and we have barriers between the passes, so we're fine
+       // as per the spec.
+       glViewport(0, 0, level_width, level_height);
+       glDisable(GL_BLEND);
+       fbos.render_to(diff_flow_tex);
+
+       for (int i = 0; i < num_iterations; ++i) {
+               {
+                       ScopedTimer timer("Red pass", sor_timer);
+                       if (zero_diff_flow && i == 0) {
+                               glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 0);
+                       }
+                       glProgramUniform1i(sor_program, uniform_phase, 0);
+                       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+                       glTextureBarrier();
+               }
+               {
+                       ScopedTimer timer("Black pass", sor_timer);
+                       if (zero_diff_flow && i == 0) {
+                               glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 1);
+                       }
+                       glProgramUniform1i(sor_program, uniform_phase, 1);
+                       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+                       if (zero_diff_flow && i == 0) {
+                               glProgramUniform1i(sor_program, uniform_num_nonzero_phases, 2);
+                       }
+                       if (i != num_iterations - 1) {
+                               glTextureBarrier();
+                       }
+               }
+       }
+}
+
+AddBaseFlow::AddBaseFlow()
+{
+       add_flow_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       add_flow_fs_obj = compile_shader(read_file("add_base_flow.frag", _binary_add_base_flow_frag_data, _binary_add_base_flow_frag_size), GL_FRAGMENT_SHADER);
+       add_flow_program = link_program(add_flow_vs_obj, add_flow_fs_obj);
+
+       uniform_diff_flow_tex = glGetUniformLocation(add_flow_program, "diff_flow_tex");
+}
+
+void AddBaseFlow::exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height, int num_layers)
+{
+       glUseProgram(add_flow_program);
+
+       bind_sampler(add_flow_program, uniform_diff_flow_tex, 0, diff_flow_tex, nearest_sampler);
+
+       glViewport(0, 0, level_width, level_height);
+       glEnable(GL_BLEND);
+       glBlendFunc(GL_ONE, GL_ONE);
+       fbos.render_to(base_flow_tex);
+
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+ResizeFlow::ResizeFlow()
+{
+       resize_flow_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       resize_flow_fs_obj = compile_shader(read_file("resize_flow.frag", _binary_resize_flow_frag_data, _binary_resize_flow_frag_size), GL_FRAGMENT_SHADER);
+       resize_flow_program = link_program(resize_flow_vs_obj, resize_flow_fs_obj);
+
+       uniform_flow_tex = glGetUniformLocation(resize_flow_program, "flow_tex");
+       uniform_scale_factor = glGetUniformLocation(resize_flow_program, "scale_factor");
+}
+
+void ResizeFlow::exec(GLuint flow_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height, int num_layers)
+{
+       glUseProgram(resize_flow_program);
+
+       bind_sampler(resize_flow_program, uniform_flow_tex, 0, flow_tex, nearest_sampler);
+
+       glProgramUniform2f(resize_flow_program, uniform_scale_factor, float(output_width) / input_width, float(output_height) / input_height);
+
+       glViewport(0, 0, output_width, output_height);
+       glDisable(GL_BLEND);
+       fbos.render_to(out_tex);
+
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, num_layers);
+}
+
+DISComputeFlow::DISComputeFlow(int width, int height, const OperatingPoint &op)
+       : width(width), height(height), op(op), motion_search(op), densify(op)
+{
+       // Make some samplers.
+       glCreateSamplers(1, &nearest_sampler);
+       glSamplerParameteri(nearest_sampler, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+       glSamplerParameteri(nearest_sampler, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+       glSamplerParameteri(nearest_sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+       glSamplerParameteri(nearest_sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+
+       glCreateSamplers(1, &linear_sampler);
+       glSamplerParameteri(linear_sampler, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+       glSamplerParameteri(linear_sampler, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+       glSamplerParameteri(linear_sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+       glSamplerParameteri(linear_sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+
+       // The smoothness is sampled so that once we get to a smoothness involving
+       // a value outside the border, the diffusivity between the two becomes zero.
+       // Similarly, gradients are zero outside the border, since the edge is taken
+       // to be constant.
+       glCreateSamplers(1, &zero_border_sampler);
+       glSamplerParameteri(zero_border_sampler, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+       glSamplerParameteri(zero_border_sampler, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+       glSamplerParameteri(zero_border_sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
+       glSamplerParameteri(zero_border_sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
+       float zero[] = { 0.0f, 0.0f, 0.0f, 0.0f };  // Note that zero alpha means we can also see whether we sampled outside the border or not.
+       glSamplerParameterfv(zero_border_sampler, GL_TEXTURE_BORDER_COLOR, zero);
+
+       // Initial flow is zero, 1x1.
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &initial_flow_tex);
+       glTextureStorage3D(initial_flow_tex, 1, GL_RG16F, 1, 1, 1);
+       glClearTexImage(initial_flow_tex, 0, GL_RG, GL_FLOAT, nullptr);
+
+       // Set up the vertex data that will be shared between all passes.
+       float vertices[] = {
+               0.0f, 1.0f,
+               0.0f, 0.0f,
+               1.0f, 1.0f,
+               1.0f, 0.0f,
+       };
+       glCreateBuffers(1, &vertex_vbo);
+       glNamedBufferData(vertex_vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
+
+       glCreateVertexArrays(1, &vao);
+       glBindVertexArray(vao);
+       glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo);
+
+       GLint position_attrib = 0;  // Hard-coded in every vertex shader.
+       glEnableVertexArrayAttrib(vao, position_attrib);
+       glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+}
+
+GLuint DISComputeFlow::exec(GLuint tex, FlowDirection flow_direction, ResizeStrategy resize_strategy)
+{
+       int num_layers = (flow_direction == FORWARD_AND_BACKWARD) ? 2 : 1;
+       int prev_level_width = 1, prev_level_height = 1;
+       GLuint prev_level_flow_tex = initial_flow_tex;
+
+       GPUTimers timers;
+
+       glBindVertexArray(vao);
+       glDisable(GL_DITHER);
+
+       ScopedTimer total_timer("Compute flow", &timers);
+       for (int level = op.coarsest_level; level >= int(op.finest_level); --level) {
+               char timer_name[256];
+               snprintf(timer_name, sizeof(timer_name), "Level %d (%d x %d)", level, width >> level, height >> level);
+               ScopedTimer level_timer(timer_name, &total_timer);
+
+               int level_width = width >> level;
+               int level_height = height >> level;
+               float patch_spacing_pixels = op.patch_size_pixels * (1.0f - op.patch_overlap_ratio);
+
+               // Make sure we have patches at least every Nth pixel, e.g. for width=9
+               // and patch_spacing=3 (the default), we put out patch centers in
+               // x=0, x=3, x=6, x=9, which is four patches. The fragment shader will
+               // lock all the centers to integer coordinates if needed.
+               int width_patches = 1 + ceil(level_width / patch_spacing_pixels);
+               int height_patches = 1 + ceil(level_height / patch_spacing_pixels);
+
+               // Make sure we always read from the correct level; the chosen
+               // mipmapping could otherwise be rather unpredictable, especially
+               // during motion search.
+               GLuint tex_view;
+               glGenTextures(1, &tex_view);
+               glTextureView(tex_view, GL_TEXTURE_2D_ARRAY, tex, GL_R8, level, 1, 0, 2);
+
+               // Create a new texture to hold the gradients.
+               GLuint grad_tex = pool.get_texture(GL_R32UI, level_width, level_height, num_layers);
+
+               // Find the derivative.
+               {
+                       ScopedTimer timer("Sobel", &level_timer);
+                       sobel.exec(tex_view, grad_tex, level_width, level_height, num_layers);
+               }
+
+               // Motion search to find the initial flow. We use the flow from the previous
+               // level (sampled bilinearly; no fancy tricks) as a guide, then search from there.
+
+               // Create an output flow texture.
+               GLuint flow_out_tex = pool.get_texture(GL_RGB16F, width_patches, height_patches, num_layers);
+
+               // And draw.
+               {
+                       ScopedTimer timer("Motion search", &level_timer);
+                       motion_search.exec(tex_view, grad_tex, prev_level_flow_tex, flow_out_tex, level_width, level_height, prev_level_width, prev_level_height, width_patches, height_patches, num_layers);
+               }
+               pool.release_texture(grad_tex);
+
+               // Densification.
+
+               // Set up an output texture (cleared in Densify).
+               GLuint dense_flow_tex = pool.get_texture(GL_RGB16F, level_width, level_height, num_layers);
+
+               // And draw.
+               {
+                       ScopedTimer timer("Densification", &level_timer);
+                       densify.exec(tex_view, flow_out_tex, dense_flow_tex, level_width, level_height, width_patches, height_patches, num_layers);
+               }
+               pool.release_texture(flow_out_tex);
+
+               // Everything below here in the loop belongs to variational refinement.
+               ScopedTimer varref_timer("Variational refinement", &level_timer);
+
+               // Prewarping; create I and I_t, and a normalized base flow (so we don't
+               // have to normalize it over and over again, and also save some bandwidth).
+               //
+               // During the entire rest of the variational refinement, flow will be measured
+               // in pixels, not 0..1 normalized OpenGL texture coordinates.
+               // This is because variational refinement depends so heavily on derivatives,
+               // which are measured in intensity levels per pixel.
+               GLuint I_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
+               GLuint I_t_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
+               GLuint base_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
+               {
+                       ScopedTimer timer("Prewarping", &varref_timer);
+                       prewarp.exec(tex_view, dense_flow_tex, I_tex, I_t_tex, base_flow_tex, level_width, level_height, num_layers);
+               }
+               pool.release_texture(dense_flow_tex);
+               glDeleteTextures(1, &tex_view);
+
+               // TODO: If we don't have variational refinement, we don't need I and I_t,
+               // so computing them is a waste.
+               if (op.variational_refinement) {
+                       // Calculate I_x and I_y. We're only calculating first derivatives;
+                       // the others will be taken on-the-fly in order to sample from fewer
+                       // textures overall, since sampling from the L1 cache is cheap.
+                       // (TODO: Verify that this is indeed faster than making separate
+                       // double-derivative textures.)
+                       GLuint I_x_y_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
+                       GLuint beta_0_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
+                       {
+                               ScopedTimer timer("First derivatives", &varref_timer);
+                               derivatives.exec(I_tex, I_x_y_tex, beta_0_tex, level_width, level_height, num_layers);
+                       }
+                       pool.release_texture(I_tex);
+
+                       // We need somewhere to store du and dv (the flow increment, relative
+                       // to the non-refined base flow u0 and v0). It's initially garbage,
+                       // but not read until we've written something sane to it.
+                       GLuint diff_flow_tex = pool.get_texture(GL_RG16F, level_width, level_height, num_layers);
+
+                       // And for diffusivity.
+                       GLuint diffusivity_tex = pool.get_texture(GL_R16F, level_width, level_height, num_layers);
+
+                       // And finally for the equation set. See SetupEquations for
+                       // the storage format.
+                       GLuint equation_red_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height, num_layers);
+                       GLuint equation_black_tex = pool.get_texture(GL_RGBA32UI, (level_width + 1) / 2, level_height, num_layers);
+
+                       for (int outer_idx = 0; outer_idx < level + 1; ++outer_idx) {
+                               // Calculate the diffusivity term for each pixel.
+                               {
+                                       ScopedTimer timer("Compute diffusivity", &varref_timer);
+                                       compute_diffusivity.exec(base_flow_tex, diff_flow_tex, diffusivity_tex, level_width, level_height, outer_idx == 0, num_layers);
+                               }
+
+                               // Set up the 2x2 equation system for each pixel.
+                               {
+                                       ScopedTimer timer("Set up equations", &varref_timer);
+                                       setup_equations.exec(I_x_y_tex, I_t_tex, diff_flow_tex, base_flow_tex, beta_0_tex, diffusivity_tex, equation_red_tex, equation_black_tex, level_width, level_height, outer_idx == 0, num_layers);
+                               }
+
+                               // Run a few SOR iterations. Note that these are to/from the same texture.
+                               {
+                                       ScopedTimer timer("SOR", &varref_timer);
+                                       sor.exec(diff_flow_tex, equation_red_tex, equation_black_tex, diffusivity_tex, level_width, level_height, 5, outer_idx == 0, num_layers, &timer);
+                               }
+                       }
+
+                       pool.release_texture(I_t_tex);
+                       pool.release_texture(I_x_y_tex);
+                       pool.release_texture(beta_0_tex);
+                       pool.release_texture(diffusivity_tex);
+                       pool.release_texture(equation_red_tex);
+                       pool.release_texture(equation_black_tex);
+
+                       // Add the differential flow found by the variational refinement to the base flow,
+                       // giving the final flow estimate for this level.
+                       // The output is in base_flow_tex; we don't need to make a new texture.
+                       {
+                               ScopedTimer timer("Add differential flow", &varref_timer);
+                               add_base_flow.exec(base_flow_tex, diff_flow_tex, level_width, level_height, num_layers);
+                       }
+                       pool.release_texture(diff_flow_tex);
+               }
+
+               if (prev_level_flow_tex != initial_flow_tex) {
+                       pool.release_texture(prev_level_flow_tex);
+               }
+               prev_level_flow_tex = base_flow_tex;
+               prev_level_width = level_width;
+               prev_level_height = level_height;
+       }
+       total_timer.end();
+
+       if (!in_warmup) {
+               timers.print();
+       }
+
+       // Scale up the flow to the final size (if needed).
+       if (op.finest_level == 0 || resize_strategy == DO_NOT_RESIZE_FLOW) {
+               return prev_level_flow_tex;
+       } else {
+               GLuint final_tex = pool.get_texture(GL_RG16F, width, height, num_layers);
+               resize_flow.exec(prev_level_flow_tex, final_tex, prev_level_width, prev_level_height, width, height, num_layers);
+               pool.release_texture(prev_level_flow_tex);
+               return final_tex;
+       }
+}
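+
+// A minimal usage sketch (hypothetical; assumes a current GL 4.5 context and
+// a two-layer GL_R8 grayscale texture with mipmaps, as produced by
+// GrayscaleConversion followed by glGenerateTextureMipmap):
+//
+//   DISComputeFlow compute_flow(1280, 720, operating_point2);
+//   GLuint flow_tex = compute_flow.exec(gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD,
+//                                       DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+//   // ... consume flow_tex (GL_RG16F; one layer per direction) ...
+//   compute_flow.release_texture(flow_tex);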
+
+Splat::Splat(const OperatingPoint &op)
+       : op(op)
+{
+       splat_vs_obj = compile_shader(read_file("splat.vert", _binary_splat_vert_data, _binary_splat_vert_size), GL_VERTEX_SHADER);
+       splat_fs_obj = compile_shader(read_file("splat.frag", _binary_splat_frag_data, _binary_splat_frag_size), GL_FRAGMENT_SHADER);
+       splat_program = link_program(splat_vs_obj, splat_fs_obj);
+
+       uniform_splat_size = glGetUniformLocation(splat_program, "splat_size");
+       uniform_alpha = glGetUniformLocation(splat_program, "alpha");
+       uniform_gray_tex = glGetUniformLocation(splat_program, "gray_tex");
+       uniform_flow_tex = glGetUniformLocation(splat_program, "flow_tex");
+       uniform_inv_flow_size = glGetUniformLocation(splat_program, "inv_flow_size");
+}
+
+void Splat::exec(GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha)
+{
+       glUseProgram(splat_program);
+
+       bind_sampler(splat_program, uniform_gray_tex, 0, gray_tex, linear_sampler);
+       bind_sampler(splat_program, uniform_flow_tex, 1, bidirectional_flow_tex, nearest_sampler);
+
+       glProgramUniform2f(splat_program, uniform_splat_size, op.splat_size / width, op.splat_size / height);
+       glProgramUniform1f(splat_program, uniform_alpha, alpha);
+       glProgramUniform2f(splat_program, uniform_inv_flow_size, 1.0f / width, 1.0f / height);
+
+       glViewport(0, 0, width, height);
+       glDisable(GL_BLEND);
+       glEnable(GL_DEPTH_TEST);
+       glDepthMask(GL_TRUE);
+       glDepthFunc(GL_LESS);  // We store the difference between I_0 and I_1, where less difference is good. (Default 1.0 is effectively +inf, which always loses.)
+
+       fbos.render_to(depth_rb, flow_tex);
+
+       // Evidently NVIDIA doesn't use fast clears for glClearTexImage, so clear now that
+       // we've got it bound.
+       glClearColor(1000.0f, 1000.0f, 0.0f, 1.0f);  // Invalid flow.
+       glClearDepth(1.0f);  // Effectively infinity.
+       glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+       glDrawArraysInstanced(GL_TRIANGLE_STRIP, 0, 4, width * height * 2);
+
+       glDisable(GL_DEPTH_TEST);
+}
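+
+// In other words, the depth buffer acts as a per-pixel quality ranking rather
+// than as geometry. Conceptually (a sketch; the details live in splat.frag),
+// every splat covering a pixel p competes as:
+//
+//   float cost = difference(I_0(p - alpha * flow), I_1(p + (1.0f - alpha) * flow));
+//   // GL_LESS keeps the lowest cost; the 1.0 depth clear loses to any splat.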
+
+HoleFill::HoleFill()
+{
+       fill_vs_obj = compile_shader(read_file("hole_fill.vert", _binary_hole_fill_vert_data, _binary_hole_fill_vert_size), GL_VERTEX_SHADER);
+       fill_fs_obj = compile_shader(read_file("hole_fill.frag", _binary_hole_fill_frag_data, _binary_hole_fill_frag_size), GL_FRAGMENT_SHADER);
+       fill_program = link_program(fill_vs_obj, fill_fs_obj);
+
+       uniform_tex = glGetUniformLocation(fill_program, "tex");
+       uniform_z = glGetUniformLocation(fill_program, "z");
+       uniform_sample_offset = glGetUniformLocation(fill_program, "sample_offset");
+}
+
+void HoleFill::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int width, int height)
+{
+       glUseProgram(fill_program);
+
+       bind_sampler(fill_program, uniform_tex, 0, flow_tex, nearest_sampler);
+
+       glProgramUniform1f(fill_program, uniform_z, 1.0f - 1.0f / 1024.0f);
+
+       glViewport(0, 0, width, height);
+       glDisable(GL_BLEND);
+       glEnable(GL_DEPTH_TEST);
+       glDepthFunc(GL_LESS);  // Only update the values > 0.999f (ie., only invalid pixels).
+
+       fbos.render_to(depth_rb, flow_tex);  // NOTE: Reading and writing to the same texture.
+
+       // Fill holes from the left, by shifting 1, 2, 4, 8, etc. pixels to the right.
+       for (int offs = 1; offs < width; offs *= 2) {
+               glProgramUniform2f(fill_program, uniform_sample_offset, -offs / float(width), 0.0f);
+               glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+               glTextureBarrier();
+       }
+       glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, temp_tex[0], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
+
+       // Similarly from the right; adjust Z down a bit, so that we re-fill the
+       // pixels that the left-fill pass above just wrote.
+       glProgramUniform1f(fill_program, uniform_z, 1.0f - 2.0f / 1024.0f);
+       for (int offs = 1; offs < width; offs *= 2) {
+               glProgramUniform2f(fill_program, uniform_sample_offset, offs / float(width), 0.0f);
+               glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+               glTextureBarrier();
+       }
+       glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, temp_tex[1], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
+
+       // Up.
+       glProgramUniform1f(fill_program, uniform_z, 1.0f - 3.0f / 1024.0f);
+       for (int offs = 1; offs < height; offs *= 2) {
+               glProgramUniform2f(fill_program, uniform_sample_offset, 0.0f, -offs / float(height));
+               glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+               glTextureBarrier();
+       }
+       glCopyImageSubData(flow_tex, GL_TEXTURE_2D, 0, 0, 0, 0, temp_tex[2], GL_TEXTURE_2D, 0, 0, 0, 0, width, height, 1);
+
+       // Down.
+       glProgramUniform1f(fill_program, uniform_z, 1.0f - 4.0f / 1024.0f);
+       for (int offs = 1; offs < height; offs *= 2) {
+               glProgramUniform2f(fill_program, uniform_sample_offset, 0.0f, offs / float(height));
+               glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+               glTextureBarrier();
+       }
+
+       glDisable(GL_DEPTH_TEST);
+}
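+
+// Why doubling offsets suffice: after the pass with offset o, every hole whose
+// nearest valid pixel (in the shifting direction) lies within 2*o - 1 pixels
+// has been filled, so ceil(log2(width)) passes cover an entire row. A scalar
+// sketch of the left-fill loop (assuming the (1000, 1000) invalid-flow marker
+// from Splat::exec):
+//
+//   for (int offs = 1; offs < w; offs *= 2) {
+//           vector<float> prev = row;  // Each pass reads the previous pass's output.
+//           for (int x = offs; x < w; ++x) {
+//                   if (prev[x] >= 1000.0f && prev[x - offs] < 1000.0f) {
+//                           row[x] = prev[x - offs];
+//                   }
+//           }
+//   }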
+
+HoleBlend::HoleBlend()
+{
+       blend_vs_obj = compile_shader(read_file("hole_fill.vert", _binary_hole_fill_vert_data, _binary_hole_fill_vert_size), GL_VERTEX_SHADER);  // Reuse the vertex shader from the fill.
+       blend_fs_obj = compile_shader(read_file("hole_blend.frag", _binary_hole_blend_frag_data, _binary_hole_blend_frag_size), GL_FRAGMENT_SHADER);
+       blend_program = link_program(blend_vs_obj, blend_fs_obj);
+
+       uniform_left_tex = glGetUniformLocation(blend_program, "left_tex");
+       uniform_right_tex = glGetUniformLocation(blend_program, "right_tex");
+       uniform_up_tex = glGetUniformLocation(blend_program, "up_tex");
+       uniform_down_tex = glGetUniformLocation(blend_program, "down_tex");
+       uniform_z = glGetUniformLocation(blend_program, "z");
+       uniform_sample_offset = glGetUniformLocation(blend_program, "sample_offset");
+}
+
+void HoleBlend::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int width, int height)
+{
+       glUseProgram(blend_program);
+
+       bind_sampler(blend_program, uniform_left_tex, 0, temp_tex[0], nearest_sampler);
+       bind_sampler(blend_program, uniform_right_tex, 1, temp_tex[1], nearest_sampler);
+       bind_sampler(blend_program, uniform_up_tex, 2, temp_tex[2], nearest_sampler);
+       bind_sampler(blend_program, uniform_down_tex, 3, flow_tex, nearest_sampler);
+
+       glProgramUniform1f(blend_program, uniform_z, 1.0f - 4.0f / 1024.0f);
+       glProgramUniform2f(blend_program, uniform_sample_offset, 0.0f, 0.0f);
+
+       glViewport(0, 0, width, height);
+       glDisable(GL_BLEND);
+       glEnable(GL_DEPTH_TEST);
+       glDepthFunc(GL_LEQUAL);  // Skip over all of the pixels that were never holes to begin with.
+
+       fbos.render_to(depth_rb, flow_tex);  // NOTE: Reading and writing to the same texture.
+
+       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+
+       glDisable(GL_DEPTH_TEST);
+}
+
+Blend::Blend(bool split_ycbcr_output)
+       : split_ycbcr_output(split_ycbcr_output)
+{
+       string frag_shader = read_file("blend.frag", _binary_blend_frag_data, _binary_blend_frag_size);
+       if (split_ycbcr_output) {
+               // Insert after the first #version line.
+               size_t offset = frag_shader.find('\n');
+               assert(offset != string::npos);
+               frag_shader = frag_shader.substr(0, offset + 1) + "#define SPLIT_YCBCR_OUTPUT 1\n" + frag_shader.substr(offset + 1);
+       }
+
+       blend_vs_obj = compile_shader(read_file("vs.vert", _binary_vs_vert_data, _binary_vs_vert_size), GL_VERTEX_SHADER);
+       blend_fs_obj = compile_shader(frag_shader, GL_FRAGMENT_SHADER);
+       blend_program = link_program(blend_vs_obj, blend_fs_obj);
+
+       uniform_image_tex = glGetUniformLocation(blend_program, "image_tex");
+       uniform_flow_tex = glGetUniformLocation(blend_program, "flow_tex");
+       uniform_alpha = glGetUniformLocation(blend_program, "alpha");
+       uniform_flow_consistency_tolerance = glGetUniformLocation(blend_program, "flow_consistency_tolerance");
+}
+
+void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int level_width, int level_height, float alpha)
+{
+       glUseProgram(blend_program);
+       bind_sampler(blend_program, uniform_image_tex, 0, image_tex, linear_sampler);
+       bind_sampler(blend_program, uniform_flow_tex, 1, flow_tex, linear_sampler);  // May be upsampled.
+       glProgramUniform1f(blend_program, uniform_alpha, alpha);
+
+       glViewport(0, 0, level_width, level_height);
+       if (split_ycbcr_output) {
+               fbos_split.render_to(output_tex, output2_tex);
+       } else {
+               fbos.render_to(output_tex);
+       }
+       glDisable(GL_BLEND);  // A bit ironic, perhaps.
+       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+}
+
+Interpolate::Interpolate(const OperatingPoint &op, bool split_ycbcr_output)
+       : flow_level(op.finest_level),
+         split_ycbcr_output(split_ycbcr_output),
+         splat(op),
+         blend(split_ycbcr_output)
+{
+       // Set up the vertex data that will be shared between all passes.
+       float vertices[] = {
+               0.0f, 1.0f,
+               0.0f, 0.0f,
+               1.0f, 1.0f,
+               1.0f, 0.0f,
+       };
+       glCreateBuffers(1, &vertex_vbo);
+       glNamedBufferData(vertex_vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
+
+       glCreateVertexArrays(1, &vao);
+       glBindVertexArray(vao);
+       glBindBuffer(GL_ARRAY_BUFFER, vertex_vbo);
+
+       GLint position_attrib = 0;  // Hard-coded in every vertex shader.
+       glEnableVertexArrayAttrib(vao, position_attrib);
+       glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+}
+
+pair<GLuint, GLuint> Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
+{
+       GPUTimers timers;
+
+       ScopedTimer total_timer("Interpolate", &timers);
+
+       glBindVertexArray(vao);
+       glDisable(GL_DITHER);
+
+       // Pick out the right level to test splatting results on.
+       GLuint tex_view;
+       glGenTextures(1, &tex_view);
+       glTextureView(tex_view, GL_TEXTURE_2D_ARRAY, gray_tex, GL_R8, flow_level, 1, 0, 2);
+
+       int flow_width = width >> flow_level;
+       int flow_height = height >> flow_level;
+
+       GLuint flow_tex = pool.get_texture(GL_RG16F, flow_width, flow_height);
+       GLuint depth_rb = pool.get_renderbuffer(GL_DEPTH_COMPONENT16, flow_width, flow_height);  // Used for ranking flows.
+
+       {
+               ScopedTimer timer("Splat", &total_timer);
+               splat.exec(tex_view, bidirectional_flow_tex, flow_tex, depth_rb, flow_width, flow_height, alpha);
+       }
+       glDeleteTextures(1, &tex_view);
+
+       GLuint temp_tex[3];
+       temp_tex[0] = pool.get_texture(GL_RG16F, flow_width, flow_height);
+       temp_tex[1] = pool.get_texture(GL_RG16F, flow_width, flow_height);
+       temp_tex[2] = pool.get_texture(GL_RG16F, flow_width, flow_height);
+
+       {
+               ScopedTimer timer("Fill holes", &total_timer);
+               hole_fill.exec(flow_tex, depth_rb, temp_tex, flow_width, flow_height);
+               hole_blend.exec(flow_tex, depth_rb, temp_tex, flow_width, flow_height);
+       }
+
+       pool.release_texture(temp_tex[0]);
+       pool.release_texture(temp_tex[1]);
+       pool.release_texture(temp_tex[2]);
+       pool.release_renderbuffer(depth_rb);
+
+       GLuint output_tex, output2_tex = 0;
+       if (split_ycbcr_output) {
+               output_tex = pool.get_texture(GL_R8, width, height);
+               output2_tex = pool.get_texture(GL_RG8, width, height);
+               {
+                       ScopedTimer timer("Blend", &total_timer);
+                       blend.exec(image_tex, flow_tex, output_tex, output2_tex, width, height, alpha);
+               }
+       } else {
+               output_tex = pool.get_texture(GL_RGBA8, width, height);
+               {
+                       ScopedTimer timer("Blend", &total_timer);
+                       blend.exec(image_tex, flow_tex, output_tex, 0, width, height, alpha);
+               }
+       }
+       pool.release_texture(flow_tex);
+       total_timer.end();
+       if (!in_warmup) {
+               timers.print();
+       }
+
+       return make_pair(output_tex, output2_tex);
+}
+
+GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers)
+{
+       {
+               lock_guard<mutex> lock(mu);
+               for (Texture &tex : textures) {
+                       if (!tex.in_use && !tex.is_renderbuffer && tex.format == format &&
+                           tex.width == width && tex.height == height && tex.num_layers == num_layers) {
+                               tex.in_use = true;
+                               return tex.tex_num;
+                       }
+               }
+       }
+
+       Texture tex;
+       if (num_layers == 0) {
+               glCreateTextures(GL_TEXTURE_2D, 1, &tex.tex_num);
+               glTextureStorage2D(tex.tex_num, 1, format, width, height);
+       } else {
+               glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex.tex_num);
+               glTextureStorage3D(tex.tex_num, 1, format, width, height, num_layers);
+       }
+       tex.format = format;
+       tex.width = width;
+       tex.height = height;
+       tex.num_layers = num_layers;
+       tex.in_use = true;
+       tex.is_renderbuffer = false;
+       {
+               lock_guard<mutex> lock(mu);
+               textures.push_back(tex);
+       }
+       return tex.tex_num;
+}
+
+GLuint TexturePool::get_renderbuffer(GLenum format, GLuint width, GLuint height)
+{
+       {
+               lock_guard<mutex> lock(mu);
+               for (Texture &tex : textures) {
+                       if (!tex.in_use && tex.is_renderbuffer && tex.format == format &&
+                           tex.width == width && tex.height == height) {
+                               tex.in_use = true;
+                               return tex.tex_num;
+                       }
+               }
+       }
+
+       Texture tex;
+       glCreateRenderbuffers(1, &tex.tex_num);
+       glNamedRenderbufferStorage(tex.tex_num, format, width, height);
+
+       tex.format = format;
+       tex.width = width;
+       tex.height = height;
+       tex.in_use = true;
+       tex.is_renderbuffer = true;
+       {
+               lock_guard<mutex> lock(mu);
+               textures.push_back(tex);
+       }
+       return tex.tex_num;
+}
+
+void TexturePool::release_texture(GLuint tex_num)
+{
+       lock_guard<mutex> lock(mu);
+       for (Texture &tex : textures) {
+               if (!tex.is_renderbuffer && tex.tex_num == tex_num) {
+                       assert(tex.in_use);
+                       tex.in_use = false;
+                       return;
+               }
+       }
+       assert(false);
+}
+
+void TexturePool::release_renderbuffer(GLuint tex_num)
+{
+       lock_guard<mutex> lock(mu);
+       for (Texture &tex : textures) {
+               if (tex.is_renderbuffer && tex.tex_num == tex_num) {
+                       assert(tex.in_use);
+                       tex.in_use = false;
+                       return;
+               }
+       }
+       //assert(false);
+}
diff --git a/futatabi/flow.h b/futatabi/flow.h
new file mode 100644 (file)
index 0000000..08b2590
--- /dev/null
@@ -0,0 +1,568 @@
+#ifndef _FLOW_H
+#define _FLOW_H 1
+
+// Code for computing optical flow between two images, and using it to interpolate
+// in-between frames. The main user interface is the DISComputeFlow and Interpolate
+// classes (also GrayscaleConversion can be useful).
+
+#include <array>
+#include <epoxy/gl.h>
+#include <map>
+#include <mutex>
+#include <stdint.h>
+#include <utility>
+#include <vector>
+
+class ScopedTimer;
+
+// Predefined operating points from the paper.
+struct OperatingPoint {
+       unsigned coarsest_level;  // TODO: Adjust dynamically based on the resolution?
+       unsigned finest_level;
+       unsigned search_iterations;  // Halved from the paper.
+       unsigned patch_size_pixels;
+       float patch_overlap_ratio;
+       bool variational_refinement;
+
+       // Not part of the original paper; used for interpolation.
+       // NOTE: Values much larger than 1.0 seem to trigger Haswell's “PMA stall”;
+       // the problem is not present on Broadwell and higher (there's a mitigation
+       // in the hardware, but Mesa doesn't enable it at the time of writing).
+       // Since we have hole filling, the holes from 1.0 are not critical,
+       // but larger values seem to do better than hole filling for large
+       // motion, blurs etc. since we have more candidates.
+       float splat_size;
+};
+
+// Operating point 1 (600 Hz on CPU, excluding preprocessing).
+static constexpr OperatingPoint operating_point1 = {
+       5,      // Coarsest level.
+       3,      // Finest level.
+       8,      // Search iterations.
+       8,      // Patch size (pixels).
+       0.30f,  // Overlap ratio.
+       false,  // Variational refinement.
+       1.0f    // Splat size (pixels).
+};
+
+// Operating point 2 (300 Hz on CPU, excluding preprocessing).
+static constexpr OperatingPoint operating_point2 = {
+       5,      // Coarsest level.
+       3,      // Finest level.
+       6,      // Search iterations.
+       8,      // Patch size (pixels).
+       0.40f,  // Overlap ratio.
+       true,   // Variational refinement.
+       1.0f    // Splat size (pixels).
+};
+
+// Operating point 3 (10 Hz on CPU, excluding preprocessing).
+// This is the only one that has been thoroughly tested.
+static constexpr OperatingPoint operating_point3 = {
+       5,      // Coarsest level.
+       1,      // Finest level.
+       8,      // Search iterations.
+       12,     // Patch size (pixels).
+       0.75f,  // Overlap ratio.
+       true,   // Variational refinement.
+       4.0f    // Splat size (pixels).
+};
+
+// Operating point 4 (0.5 Hz on CPU, excluding preprocessing).
+static constexpr OperatingPoint operating_point4 = {
+       5,      // Coarsest level.
+       0,      // Finest level.
+       128,    // Search iterations.
+       12,     // Patch size (pixels).
+       0.75f,  // Overlap ratio.
+       true,   // Variational refinement.
+       8.0f    // Splat size (pixels).
+};
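+
+// How these numbers interact (cf. DISComputeFlow::exec): the distance between
+// patch centers is patch_size_pixels * (1.0f - patch_overlap_ratio). As a
+// hypothetical helper, with a worked example for operating_point3:
+//
+//   constexpr float patch_spacing(const OperatingPoint &op)
+//   {
+//           return op.patch_size_pixels * (1.0f - op.patch_overlap_ratio);
+//   }
+//   static_assert(patch_spacing(operating_point3) == 3.0f,
+//                 "12 px patches at 75% overlap leave 3 px between centers");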
+
+int find_num_levels(int width, int height);
+
+// A class that caches FBOs that render to a given set of textures.
+// It never frees anything, so it is only suitable for rendering to
+// the same (small) set of textures over and over again.
+template<size_t num_elements>
+class PersistentFBOSet {
+public:
+       void render_to(const std::array<GLuint, num_elements> &textures);
+
+       // Convenience wrappers.
+       void render_to(GLuint texture0) {
+               render_to({{texture0}});
+       }
+
+       void render_to(GLuint texture0, GLuint texture1) {
+               render_to({{texture0, texture1}});
+       }
+
+       void render_to(GLuint texture0, GLuint texture1, GLuint texture2) {
+               render_to({{texture0, texture1, texture2}});
+       }
+
+       void render_to(GLuint texture0, GLuint texture1, GLuint texture2, GLuint texture3) {
+               render_to({{texture0, texture1, texture2, texture3}});
+       }
+
+private:
+       // TODO: Delete these on destruction.
+       std::map<std::array<GLuint, num_elements>, GLuint> fbos;
+};
+
+// Same, but with a depth texture.
+template<size_t num_elements>
+class PersistentFBOSetWithDepth {
+public:
+       void render_to(GLuint depth_rb, const std::array<GLuint, num_elements> &textures);
+
+       // Convenience wrappers.
+       void render_to(GLuint depth_rb, GLuint texture0) {
+               render_to(depth_rb, {{texture0}});
+       }
+
+       void render_to(GLuint depth_rb, GLuint texture0, GLuint texture1) {
+               render_to(depth_rb, {{texture0, texture1}});
+       }
+
+       void render_to(GLuint depth_rb, GLuint texture0, GLuint texture1, GLuint texture2) {
+               render_to(depth_rb, {{texture0, texture1, texture2}});
+       }
+
+       void render_to(GLuint depth_rb, GLuint texture0, GLuint texture1, GLuint texture2, GLuint texture3) {
+               render_to(depth_rb, {{texture0, texture1, texture2, texture3}});
+       }
+
+private:
+       // TODO: Delete these on destruction.
+       std::map<std::pair<GLuint, std::array<GLuint, num_elements>>, GLuint> fbos;
+};
+
+// Convert RGB to grayscale, using Rec. 709 coefficients.
+class GrayscaleConversion {
+public:
+       GrayscaleConversion();
+       void exec(GLint tex, GLint gray_tex, int width, int height, int num_layers);
+
+private:
+       PersistentFBOSet<1> fbos;
+       GLuint gray_vs_obj;
+       GLuint gray_fs_obj;
+       GLuint gray_program;
+       GLuint gray_vao;
+
+       GLuint uniform_tex;
+};
+
+// Compute gradients in every point, used for the motion search.
+// The DIS paper doesn't actually mention how these are computed,
+// but seemingly, a 3x3 Sobel operator is used here (at least in
+// later versions of the code), while a [1 -8 0 8 -1] kernel is
+// used for all the derivatives in the variational refinement part
+// (which borrows code from DeepFlow). This is inconsistent,
+// but I guess we're better off with staying with the original
+// decisions until we actually know having different ones would be better.
+class Sobel {
+public:
+       Sobel();
+       void exec(GLint tex_view, GLint grad_tex, int level_width, int level_height, int num_layers);
+
+private:
+       PersistentFBOSet<1> fbos;
+       GLuint sobel_vs_obj;
+       GLuint sobel_fs_obj;
+       GLuint sobel_program;
+
+       GLuint uniform_tex;
+};
+
+// Motion search to find the initial flow. See motion_search.frag for documentation.
+class MotionSearch {
+public:
+       MotionSearch(const OperatingPoint &op);
+       void exec(GLuint tex_view, GLuint grad_tex, GLuint flow_tex, GLuint flow_out_tex, int level_width, int level_height, int prev_level_width, int prev_level_height, int width_patches, int height_patches, int num_layers);
+
+private:
+       const OperatingPoint op;
+       PersistentFBOSet<1> fbos;
+
+       GLuint motion_vs_obj;
+       GLuint motion_fs_obj;
+       GLuint motion_search_program;
+
+       GLuint uniform_inv_image_size, uniform_inv_prev_level_size, uniform_out_flow_size;
+       GLuint uniform_image_tex, uniform_grad_tex, uniform_flow_tex;
+       GLuint uniform_patch_size, uniform_num_iterations;
+};
+
+// Do “densification”, ie., upsampling of the flow patches to the flow field
+// (the same size as the image at this level). We draw one quad per patch
+// over its entire covered area (using instancing in the vertex shader),
+// and then weight the contributions in the pixel shader by post-warp difference.
+// This is equation (3) in the paper.
+//
+// We accumulate the flow vectors in the R/G channels (for u/v) and the total
+// weight in the B channel. Dividing R and G by B gives the normalized values.
+class Densify {
+public:
+       Densify(const OperatingPoint &op);
+       void exec(GLuint tex_view, GLuint flow_tex, GLuint dense_flow_tex, int level_width, int level_height, int width_patches, int height_patches, int num_layers);
+
+private:
+       OperatingPoint op;
+       PersistentFBOSet<1> fbos;
+
+       GLuint densify_vs_obj;
+       GLuint densify_fs_obj;
+       GLuint densify_program;
+
+       GLuint uniform_patch_size;
+       GLuint uniform_image_tex, uniform_flow_tex;
+};
+
+// Warp I_1 to I_w, and then compute the mean (I) and difference (I_t) of
+// I_0 and I_w. The prewarping is what enables us to solve the variational
+// flow for du,dv instead of u,v.
+//
+// Also calculates the normalized flow, ie. divides by z (this is needed because
+// Densify works by additive blending) and multiplies by the image size.
+//
+// See variational_refinement.txt for more information.
+class Prewarp {
+public:
+       Prewarp();
+       void exec(GLuint tex_view, GLuint flow_tex, GLuint I_tex, GLuint I_t_tex, GLuint normalized_flow_tex, int level_width, int level_height, int num_layers);
+
+private:
+       PersistentFBOSet<3> fbos;
+
+       GLuint prewarp_vs_obj;
+       GLuint prewarp_fs_obj;
+       GLuint prewarp_program;
+
+       GLuint uniform_image_tex, uniform_flow_tex;
+};
+
+// From I, calculate the partial derivatives I_x and I_y. We use a four-tap
+// central difference filter, since apparently, that's tradition (I haven't
+// measured quality versus a more normal 0.5 (I[x+1] - I[x-1]).)
+// The coefficients come from
+//
+//   https://en.wikipedia.org/wiki/Finite_difference_coefficient
+//
+// Also computes β_0, since it depends only on I_x and I_y.
+class Derivatives {
+public:
+       Derivatives();
+       void exec(GLuint input_tex, GLuint I_x_y_tex, GLuint beta_0_tex, int level_width, int level_height, int num_layers);
+
+private:
+       PersistentFBOSet<2> fbos;
+
+       GLuint derivatives_vs_obj;
+       GLuint derivatives_fs_obj;
+       GLuint derivatives_program;
+
+       GLuint uniform_tex;
+};
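+
+// For concreteness, the four-tap filter named above as a scalar sketch
+// (coefficients [1 -8 0 8 -1], normalized by 1/12):
+//
+//   float I_x = (I[x - 2] - 8.0f * I[x - 1] + 8.0f * I[x + 1] - I[x + 2]) * (1.0f / 12.0f);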
+
+// Calculate the diffusivity for each pixel, g(x,y). Smoothness (s) will
+// be calculated in the shaders on-the-fly by sampling in-between two
+// neighboring g(x,y) pixels, plus a border tweak to make sure we get
+// zero smoothness at the border.
+//
+// See variational_refinement.txt for more information.
+class ComputeDiffusivity {
+public:
+       ComputeDiffusivity();
+       void exec(GLuint flow_tex, GLuint diff_flow_tex, GLuint diffusivity_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers);
+
+private:
+       PersistentFBOSet<1> fbos;
+
+       GLuint diffusivity_vs_obj;
+       GLuint diffusivity_fs_obj;
+       GLuint diffusivity_program;
+
+       GLuint uniform_flow_tex, uniform_diff_flow_tex;
+       GLuint uniform_alpha, uniform_zero_diff_flow;
+};
+
+// Set up the equations set (two equations in two unknowns, per pixel).
+// We store five floats; the three non-redundant elements of the 2x2 matrix (A)
+// as 32-bit floats, and the two elements on the right-hand side (b) as 16-bit
+// floats. (Actually, we store the inverse of the diagonal elements, because
+// we only ever need to divide by them.) This fits into four u32 values;
+// R, G, B for the matrix (the last element is symmetric) and A for the two b values.
+// All the values of the energy term (E_I, E_G, E_S), except the smoothness
+// terms that depend on other pixels, are calculated in one pass.
+//
+// The equation set is split in two; one contains only the pixels needed for
+// the red pass, and one only for the black pass (see sor.frag). This reduces
+// the amount of data the SOR shader has to pull in, at the cost of some
+// complexity when the equation texture ends up with half the size and we need
+// to adjust texture coordinates.  The contraction is done along the horizontal
+// axis, so that on even rows (0, 2, 4, ...), the “red” texture will contain
+// pixels 0, 2, 4, 6, etc., and on odd rows 1, 3, 5, etc.
+//
+// See variational_refinement.txt for more information about the actual
+// equations in use.
+class SetupEquations {
+public:
+       SetupEquations();
+       void exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex, GLuint flow_tex, GLuint beta_0_tex, GLuint diffusivity_tex, GLuint equation_red_tex, GLuint equation_black_tex, int level_width, int level_height, bool zero_diff_flow, int num_layers);
+
+private:
+       PersistentFBOSet<2> fbos;
+
+       GLuint equations_vs_obj;
+       GLuint equations_fs_obj;
+       GLuint equations_program;
+
+       GLuint uniform_I_x_y_tex, uniform_I_t_tex;
+       GLuint uniform_diff_flow_tex, uniform_base_flow_tex;
+       GLuint uniform_beta_0_tex;
+       GLuint uniform_diffusivity_tex;
+       GLuint uniform_gamma, uniform_delta, uniform_zero_diff_flow;
+};
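+
+// The packing described above, spelled out (sketch): pixel (x, y) lands in the
+// “red” texture if (x + y) is even, otherwise in the “black” one, at half the
+// horizontal index:
+//
+//   bool red = ((x + y) & 1) == 0;
+//   int packed_x = x / 2;  // Both textures are (level_width + 1) / 2 texels wide.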
+
+// Actually solve the equation sets made by SetupEquations, by means of
+// successive over-relaxation (SOR).
+//
+// See variational_refinement.txt for more information.
+class SOR {
+public:
+       SOR();
+       void exec(GLuint diff_flow_tex, GLuint equation_red_tex, GLuint equation_black_tex, GLuint diffusivity_tex, int level_width, int level_height, int num_iterations, bool zero_diff_flow, int num_layers, ScopedTimer *sor_timer);
+
+private:
+       PersistentFBOSet<1> fbos;
+
+       GLuint sor_vs_obj;
+       GLuint sor_fs_obj;
+       GLuint sor_program;
+
+       GLuint uniform_diff_flow_tex;
+       GLuint uniform_equation_red_tex, uniform_equation_black_tex;
+       GLuint uniform_diffusivity_tex;
+       GLuint uniform_phase, uniform_num_nonzero_phases;
+};
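+
+// One relaxation step on the per-pixel 2x2 system A [du dv]^T = b, as a scalar
+// sketch (omega is the over-relaxation factor; recall that SetupEquations
+// stores the inverted diagonal, so the divisions become multiplications):
+//
+//   du += omega * ((b0 - a01 * dv) * inv_a00 - du);
+//   dv += omega * ((b1 - a01 * du) * inv_a11 - dv);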
+
+// Simply add the differential flow found by the variational refinement to the base flow.
+// The output is in base_flow_tex; we don't need to make a new texture.
+class AddBaseFlow {
+public:
+       AddBaseFlow();
+       void exec(GLuint base_flow_tex, GLuint diff_flow_tex, int level_width, int level_height, int num_layers);
+
+private:
+       PersistentFBOSet<1> fbos;
+
+       GLuint add_flow_vs_obj;
+       GLuint add_flow_fs_obj;
+       GLuint add_flow_program;
+
+       GLuint uniform_diff_flow_tex;
+};
+
+// Take a copy of the flow, bilinearly interpolated and scaled up.
+class ResizeFlow {
+public:
+       ResizeFlow();
+       void exec(GLuint in_tex, GLuint out_tex, int input_width, int input_height, int output_width, int output_height, int num_layers);
+
+private:
+       PersistentFBOSet<1> fbos;
+
+       GLuint resize_flow_vs_obj;
+       GLuint resize_flow_fs_obj;
+       GLuint resize_flow_program;
+
+       GLuint uniform_flow_tex;
+       GLuint uniform_scale_factor;
+};
+
+// All operations, except construction and destruction, are thread-safe.
+class TexturePool {
+public:
+       GLuint get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers = 0);
+       void release_texture(GLuint tex_num);
+       GLuint get_renderbuffer(GLenum format, GLuint width, GLuint height);
+       void release_renderbuffer(GLuint tex_num);
+
+private:
+       struct Texture {
+               GLuint tex_num;
+               GLenum format;
+               GLuint width, height, num_layers;
+               bool in_use = false;
+               bool is_renderbuffer = false;
+       };
+       std::mutex mu;
+       std::vector<Texture> textures;  // Under mu.
+};
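+
+// Intended usage (sketch): textures are recycled on exact format/size match,
+// so the steady-state per-frame cost is a vector scan under a mutex, not a
+// GL allocation:
+//
+//   TexturePool pool;
+//   GLuint tmp = pool.get_texture(GL_RG16F, 1280, 720);   // Allocates on first use.
+//   pool.release_texture(tmp);                            // Marks it as free...
+//   GLuint tmp2 = pool.get_texture(GL_RG16F, 1280, 720);  // ...so this reuses it.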
+
+class DISComputeFlow {
+public:
+       DISComputeFlow(int width, int height, const OperatingPoint &op);
+
+       enum FlowDirection {
+               FORWARD,
+               FORWARD_AND_BACKWARD
+       };
+       enum ResizeStrategy {
+               DO_NOT_RESIZE_FLOW,
+               RESIZE_FLOW_TO_FULL_SIZE
+       };
+
+       // The texture must have two layers (first and second frame).
+       // Returns a texture that must be released with release_texture()
+       // after use.
+       GLuint exec(GLuint tex, FlowDirection flow_direction, ResizeStrategy resize_strategy);
+
+       void release_texture(GLuint tex)
+       {
+               pool.release_texture(tex);
+       }
+
+private:
+       int width, height;
+       GLuint initial_flow_tex;
+       GLuint vertex_vbo, vao;
+       TexturePool pool;
+       const OperatingPoint op;
+
+       // The various passes.
+       Sobel sobel;
+       MotionSearch motion_search;
+       Densify densify;
+       Prewarp prewarp;
+       Derivatives derivatives;
+       ComputeDiffusivity compute_diffusivity;
+       SetupEquations setup_equations;
+       SOR sor;
+       AddBaseFlow add_base_flow;
+       ResizeFlow resize_flow;
+};
+
+// Forward-warp the flow half-way (or rather, by alpha). A non-zero “splatting”
+// radius fills most of the holes.
+class Splat {
+public:
+       Splat(const OperatingPoint &op);
+
+       // alpha is the time of the interpolated frame (0..1).
+       void exec(GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint flow_tex, GLuint depth_rb, int width, int height, float alpha);
+
+private:
+       const OperatingPoint op;
+       PersistentFBOSetWithDepth<1> fbos;
+
+       GLuint splat_vs_obj;
+       GLuint splat_fs_obj;
+       GLuint splat_program;
+
+       GLuint uniform_splat_size, uniform_alpha;
+       GLuint uniform_gray_tex, uniform_flow_tex;
+       GLuint uniform_inv_flow_size;
+};
+
+// Doing good and fast hole-filling on a GPU is nontrivial. We choose an option
+// that's fairly simple (given that most holes are really small) and also hopefully
+// cheap should the holes not be so small. Conceptually, we look for the first
+// non-hole to the left of us (ie., shoot a ray until we hit something), then
+// the first non-hole to the right of us, then up and down, and then average them
+// all together. It's going to create “stars” if the holes are big, but OK, that's
+// a tradeoff.
+//
+// Our implementation here is efficient assuming that the hierarchical Z-buffer is
+// on even for shaders that do discard (this typically kills early Z, but hopefully
+// not hierarchical Z); we set up Z so that only holes are written to, which means
+// that as soon as a hole is filled, the rasterizer should just skip it. Most of the
+// fullscreen quads should just be discarded outright, really.
+class HoleFill {
+public:
+       HoleFill();
+
+       // Output will be in flow_tex and temp_tex[0, 1, 2], holding the fills from
+       // the down, left, right and up directions, respectively. Use HoleBlend to merge
+       // them into one.
+       void exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int width, int height);
+
+private:
+       PersistentFBOSetWithDepth<1> fbos;
+
+       GLuint fill_vs_obj;
+       GLuint fill_fs_obj;
+       GLuint fill_program;
+
+       GLuint uniform_tex;
+       GLuint uniform_z, uniform_sample_offset;
+};
+
+// Blend the four directions from HoleFill into one pixel, so that single-pixel
+// holes become the average of their four neighbors.
+class HoleBlend {
+public:
+       HoleBlend();
+
+       void exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int width, int height);
+
+private:
+       PersistentFBOSetWithDepth<1> fbos;
+
+       GLuint blend_vs_obj;
+       GLuint blend_fs_obj;
+       GLuint blend_program;
+
+       GLuint uniform_left_tex, uniform_right_tex, uniform_up_tex, uniform_down_tex;
+       GLuint uniform_z, uniform_sample_offset;
+};
+
+class Blend {
+public:
+       Blend(bool split_ycbcr_output);
+
+       // output2_tex is only used if split_ycbcr_output was true.
+       void exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int width, int height, float alpha);
+
+private:
+       bool split_ycbcr_output;
+       PersistentFBOSet<1> fbos;
+       PersistentFBOSet<2> fbos_split;
+       GLuint blend_vs_obj;
+       GLuint blend_fs_obj;
+       GLuint blend_program;
+
+       GLuint uniform_image_tex, uniform_flow_tex;
+       GLuint uniform_alpha, uniform_flow_consistency_tolerance;
+};
+
+class Interpolate {
+public:
+       Interpolate(const OperatingPoint &op, bool split_ycbcr_output);
+
+       // Returns a texture (or two, if split_ycbcr_output is true) that must
+       // be released with release_texture() after use. image_tex must be a
+       // two-layer RGBA8 texture with mipmaps (unless flow_level == 0).
+       std::pair<GLuint, GLuint> exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha);
+
+       void release_texture(GLuint tex)
+       {
+               pool.release_texture(tex);
+       }
+
+private:
+       int flow_level;
+       GLuint vertex_vbo, vao;
+       TexturePool pool;
+       const bool split_ycbcr_output;
+
+       Splat splat;
+       HoleFill hole_fill;
+       HoleBlend hole_blend;
+       Blend blend;
+};
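+
+// End-to-end sketch (hypothetical; GL context, texture upload and mipmap
+// generation are assumed) of how DISComputeFlow and Interpolate compose:
+//
+//   DISComputeFlow compute_flow(width, height, operating_point2);
+//   Interpolate interpolate(operating_point2, /*split_ycbcr_output=*/false);
+//   GLuint bidir_flow = compute_flow.exec(gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD,
+//                                         DISComputeFlow::DO_NOT_RESIZE_FLOW);
+//   std::pair<GLuint, GLuint> out = interpolate.exec(image_tex, gray_tex, bidir_flow,
+//                                                    width, height, /*alpha=*/0.5f);
+//   compute_flow.release_texture(bidir_flow);
+//   interpolate.release_texture(out.first);  // out.second is 0 here.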
+
+#endif  // !defined(_FLOW_H)
diff --git a/futatabi/flow_main.cpp b/futatabi/flow_main.cpp
new file mode 100644 (file)
index 0000000..dc82d22
--- /dev/null
@@ -0,0 +1,495 @@
+#define NO_SDL_GLEXT 1
+
+#include "flow.h"
+#include "gpu_timers.h"
+#include "util.h"
+
+#include <SDL2/SDL.h>
+#include <SDL2/SDL_error.h>
+#include <SDL2/SDL_events.h>
+#include <SDL2/SDL_image.h>
+#include <SDL2/SDL_keyboard.h>
+#include <SDL2/SDL_mouse.h>
+#include <SDL2/SDL_video.h>
+#include <algorithm>
+#include <assert.h>
+#include <deque>
+#include <epoxy/gl.h>
+#include <getopt.h>
+#include <map>
+#include <memory>
+#include <stack>
+#include <stdio.h>
+#include <unistd.h>
+#include <vector>
+
+#define BUFFER_OFFSET(i) ((char *)nullptr + (i))
+
+using namespace std;
+
+SDL_Window *window;
+
+bool enable_warmup = false;
+bool enable_variational_refinement = true;  // Just for debugging.
+bool enable_interpolation = false;
+
+extern float vr_alpha, vr_delta, vr_gamma;
+
+// Structures for asynchronous readback. We assume everything is the same size (and GL_RG16F).
+struct ReadInProgress {
+       GLuint pbo;
+       string filename0, filename1;
+       string flow_filename, ppm_filename;  // Either may be empty for no write.
+};
+stack<GLuint> spare_pbos;
+deque<ReadInProgress> reads_in_progress;
+
+enum MipmapPolicy {
+       WITHOUT_MIPMAPS,
+       WITH_MIPMAPS
+};
+
+GLuint load_texture(const char *filename, unsigned *width_ret, unsigned *height_ret, MipmapPolicy mipmaps)
+{
+       SDL_Surface *surf = IMG_Load(filename);
+       if (surf == nullptr) {
+               fprintf(stderr, "IMG_Load(%s): %s\n", filename, IMG_GetError());
+               exit(1);
+       }
+
+       // For whatever reason, SDL doesn't support converting to YUV surfaces
+       // For whatever reason, SDL doesn't support converting to YUV surfaces
+       // nor grayscale, so we convert to RGBA here and leave the grayscale
+       // conversion to the GPU (see GrayscaleConversion).
+       if (rgb_surf == nullptr) {
+               fprintf(stderr, "SDL_ConvertSurfaceFormat(%s): %s\n", filename, SDL_GetError());
+               exit(1);
+       }
+
+       SDL_FreeSurface(surf);
+
+       unsigned width = rgb_surf->w, height = rgb_surf->h;
+       const uint8_t *sptr = (uint8_t *)rgb_surf->pixels;
+       unique_ptr<uint8_t[]> pix(new uint8_t[width * height * 4]);
+
+       // Extract the Y component, and convert to bottom-left origin.
+       // Copy the RGBA data, converting from top-left to bottom-left origin.
+               unsigned y2 = height - 1 - y;
+               memcpy(pix.get() + y * width * 4, sptr + y2 * rgb_surf->pitch, width * 4);
+       }
+       SDL_FreeSurface(rgb_surf);
+
+       int num_levels = (mipmaps == WITH_MIPMAPS) ? find_num_levels(width, height) : 1;
+
+       GLuint tex;
+       glCreateTextures(GL_TEXTURE_2D, 1, &tex);
+       glTextureStorage2D(tex, num_levels, GL_RGBA8, width, height);
+       glTextureSubImage2D(tex, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, pix.get());
+
+       if (mipmaps == WITH_MIPMAPS) {
+               glGenerateTextureMipmap(tex);
+       }
+
+       *width_ret = width;
+       *height_ret = height;
+
+       return tex;
+}
+
+// OpenGL uses a bottom-left coordinate system, .flo files use a top-left coordinate system.
+void flip_coordinate_system(float *dense_flow, unsigned width, unsigned height)
+{
+       for (unsigned i = 0; i < width * height; ++i) {
+               dense_flow[i * 2 + 1] = -dense_flow[i * 2 + 1];
+       }
+}
+
+// Not relevant for RGB.
+void flip_coordinate_system(uint8_t *dense_flow, unsigned width, unsigned height)
+{
+}
+
+void write_flow(const char *filename, const float *dense_flow, unsigned width, unsigned height)
+{
+       FILE *flowfp = fopen(filename, "wb");
+       fprintf(flowfp, "PIEH");  // Middlebury .flo magic (the float 202021.25).
+       fwrite(&width, 4, 1, flowfp);
+       fwrite(&height, 4, 1, flowfp);
+       for (unsigned y = 0; y < height; ++y) {
+               int yy = height - y - 1;
+               fwrite(&dense_flow[yy * width * 2], width * 2 * sizeof(float), 1, flowfp);
+       }
+       fclose(flowfp);
+}
+
+// Not relevant for RGB.
+void write_flow(const char *filename, const uint8_t *dense_flow, unsigned width, unsigned height)
+{
+       assert(false);
+}
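+
+// For reference, a sketch of the matching .flo reader (the four-byte magic,
+// two 32-bit dimensions, then width * height (u, v) float pairs, top-left origin):
+//
+//   FILE *fp = fopen(filename, "rb");
+//   char magic[4];
+//   uint32_t w, h;
+//   fread(magic, 4, 1, fp);  // The four-byte magic written above.
+//   fread(&w, 4, 1, fp);
+//   fread(&h, 4, 1, fp);
+//   vector<float> uv(w * h * 2);
+//   fread(uv.data(), sizeof(float), uv.size(), fp);
+//   fclose(fp);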
+
+void write_ppm(const char *filename, const float *dense_flow, unsigned width, unsigned height)
+{
+       FILE *fp = fopen(filename, "wb");
+       fprintf(fp, "P6\n%d %d\n255\n", width, height);
+       for (unsigned y = 0; y < unsigned(height); ++y) {
+               int yy = height - y - 1;
+               for (unsigned x = 0; x < unsigned(width); ++x) {
+                       float du = dense_flow[(yy * width + x) * 2 + 0];
+                       float dv = dense_flow[(yy * width + x) * 2 + 1];
+
+                       uint8_t r, g, b;
+                       flow2rgb(du, dv, &r, &g, &b);
+                       putc(r, fp);
+                       putc(g, fp);
+                       putc(b, fp);
+               }
+       }
+       fclose(fp);
+}
+
+void write_ppm(const char *filename, const uint8_t *rgba, unsigned width, unsigned height)
+{
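+       // One spare byte, so the four-byte memcpy below may safely write one
+       // byte (the ignored alpha) past the RGB payload on the last pixel.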
+       unique_ptr<uint8_t[]> rgb_line(new uint8_t[width * 3 + 1]);
+
+       FILE *fp = fopen(filename, "wb");
+       fprintf(fp, "P6\n%d %d\n255\n", width, height);
+       for (unsigned y = 0; y < height; ++y) {
+               unsigned y2 = height - 1 - y;
+               for (size_t x = 0; x < width; ++x) {
+                       memcpy(&rgb_line[x * 3], &rgba[(y2 * width + x) * 4], 4);
+               }
+               fwrite(rgb_line.get(), width * 3, 1, fp);
+       }
+       fclose(fp);
+}
+
+struct FlowType {
+       using type = float;
+       static constexpr GLenum gl_format = GL_RG;
+       static constexpr GLenum gl_type = GL_FLOAT;
+       static constexpr int num_channels = 2;
+};
+
+struct RGBAType {
+       using type = uint8_t;
+       static constexpr GLenum gl_format = GL_RGBA;
+       static constexpr GLenum gl_type = GL_UNSIGNED_BYTE;
+       static constexpr int num_channels = 4;
+};
+
+template <class Type>
+void finish_one_read(GLuint width, GLuint height)
+{
+       using T = typename Type::type;
+       constexpr int bytes_per_pixel = Type::num_channels * sizeof(T);
+
+       assert(!reads_in_progress.empty());
+       ReadInProgress read = reads_in_progress.front();
+       reads_in_progress.pop_front();
+
+       unique_ptr<T[]> flow(new typename Type::type[width * height * Type::num_channels]);
+       void *buf = glMapNamedBufferRange(read.pbo, 0, width * height * bytes_per_pixel, GL_MAP_READ_BIT);  // Blocks if the read isn't done yet.
+       memcpy(flow.get(), buf, width * height * bytes_per_pixel);  // TODO: Unneeded for RGBAType, since flip_coordinate_system() does nothing there.
+       glUnmapNamedBuffer(read.pbo);
+       spare_pbos.push(read.pbo);
+
+       flip_coordinate_system(flow.get(), width, height);
+       if (!read.flow_filename.empty()) {
+               write_flow(read.flow_filename.c_str(), flow.get(), width, height);
+               fprintf(stderr, "%s %s -> %s\n", read.filename0.c_str(), read.filename1.c_str(), read.flow_filename.c_str());
+       }
+       if (!read.ppm_filename.empty()) {
+               write_ppm(read.ppm_filename.c_str(), flow.get(), width, height);
+       }
+}
+
+template <class Type>
+void schedule_read(GLuint tex, GLuint width, GLuint height, const char *filename0, const char *filename1, const char *flow_filename, const char *ppm_filename)
+{
+       using T = typename Type::type;
+       constexpr int bytes_per_pixel = Type::num_channels * sizeof(T);
+
+       if (spare_pbos.empty()) {
+               finish_one_read<Type>(width, height);
+       }
+       assert(!spare_pbos.empty());
+       reads_in_progress.emplace_back(ReadInProgress{ spare_pbos.top(), filename0, filename1, flow_filename, ppm_filename });
+       glBindBuffer(GL_PIXEL_PACK_BUFFER, spare_pbos.top());
+       spare_pbos.pop();
+       glGetTextureImage(tex, 0, Type::gl_format, Type::gl_type, width * height * bytes_per_pixel, nullptr);
+       glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+}
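+
+// The readback pattern above in miniature (sketch): start the transfer into a
+// PBO now, and pay for synchronization only when the buffer is mapped later:
+//
+//   GLuint pbo;
+//   glCreateBuffers(1, &pbo);
+//   glNamedBufferData(pbo, size, nullptr, GL_STREAM_READ);
+//   glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
+//   glGetTextureImage(tex, 0, GL_RG, GL_FLOAT, size, nullptr);  // Async copy into the PBO.
+//   glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+//   // ... other work ...
+//   void *buf = glMapNamedBufferRange(pbo, 0, size, GL_MAP_READ_BIT);  // Blocks only if not done.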
+
+void compute_flow_only(int argc, char **argv, int optind)
+{
+       const char *filename0 = argc >= (optind + 1) ? argv[optind] : "test1499.png";
+       const char *filename1 = argc >= (optind + 2) ? argv[optind + 1] : "test1500.png";
+       const char *flow_filename = argc >= (optind + 3) ? argv[optind + 2] : "flow.flo";
+
+       // Load pictures.
+       unsigned width1, height1, width2, height2;
+       GLuint tex0 = load_texture(filename0, &width1, &height1, WITHOUT_MIPMAPS);
+       GLuint tex1 = load_texture(filename1, &width2, &height2, WITHOUT_MIPMAPS);
+
+       if (width1 != width2 || height1 != height2) {
+               fprintf(stderr, "Image dimensions don't match (%dx%d versus %dx%d)\n",
+                       width1, height1, width2, height2);
+               exit(1);
+       }
+
+       // Move them into an array texture, since that's how the rest of the code
+       // would like them.
+       GLuint image_tex;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &image_tex);
+       glTextureStorage3D(image_tex, 1, GL_RGBA8, width1, height1, 2);
+       glCopyImageSubData(tex0, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 0, width1, height1, 1);
+       glCopyImageSubData(tex1, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 1, width1, height1, 1);
+       glDeleteTextures(1, &tex0);
+       glDeleteTextures(1, &tex1);
+
+       // Set up some PBOs to do asynchronous readback.
+       GLuint pbos[5];
+       glCreateBuffers(5, pbos);
+       for (int i = 0; i < 5; ++i) {
+               glNamedBufferData(pbos[i], width1 * height1 * 2 * 2 * sizeof(float), nullptr, GL_STREAM_READ);
+               spare_pbos.push(pbos[i]);
+       }
+
+       int levels = find_num_levels(width1, height1);
+
+       GLuint tex_gray;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex_gray);
+       glTextureStorage3D(tex_gray, levels, GL_R8, width1, height1, 2);
+
+       OperatingPoint op = operating_point3;
+       if (!enable_variational_refinement) {
+               op.variational_refinement = false;
+       }
+
+       DISComputeFlow compute_flow(width1, height1, op);  // Must be initialized before gray.
+       GrayscaleConversion gray;
+       gray.exec(image_tex, tex_gray, width1, height1, /*num_layers=*/2);
+       glGenerateTextureMipmap(tex_gray);
+
+       if (enable_warmup) {
+               in_warmup = true;
+               for (int i = 0; i < 10; ++i) {
+                       GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+                       compute_flow.release_texture(final_tex);
+               }
+               in_warmup = false;
+       }
+
+       GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+       //GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+
+       schedule_read<FlowType>(final_tex, width1, height1, filename0, filename1, flow_filename, "flow.ppm");
+       compute_flow.release_texture(final_tex);
+
+	// See if there are more flows on the command line (i.e., more than three arguments),
+       // and if so, process them.
+       int num_flows = (argc - optind) / 3;
+       for (int i = 1; i < num_flows; ++i) {
+               const char *filename0 = argv[optind + i * 3 + 0];
+               const char *filename1 = argv[optind + i * 3 + 1];
+               const char *flow_filename = argv[optind + i * 3 + 2];
+               GLuint width, height;
+               GLuint tex0 = load_texture(filename0, &width, &height, WITHOUT_MIPMAPS);
+               if (width != width1 || height != height1) {
+                       fprintf(stderr, "%s: Image dimensions don't match (%dx%d versus %dx%d)\n",
+                               filename0, width, height, width1, height1);
+                       exit(1);
+               }
+               glCopyImageSubData(tex0, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 0, width1, height1, 1);
+               glDeleteTextures(1, &tex0);
+
+               GLuint tex1 = load_texture(filename1, &width, &height, WITHOUT_MIPMAPS);
+               if (width != width1 || height != height1) {
+                       fprintf(stderr, "%s: Image dimensions don't match (%dx%d versus %dx%d)\n",
+                               filename1, width, height, width1, height1);
+                       exit(1);
+               }
+               glCopyImageSubData(tex1, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 1, width1, height1, 1);
+               glDeleteTextures(1, &tex1);
+
+               gray.exec(image_tex, tex_gray, width1, height1, /*num_layers=*/2);
+               glGenerateTextureMipmap(tex_gray);
+
+               GLuint final_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD, DISComputeFlow::RESIZE_FLOW_TO_FULL_SIZE);
+
+               schedule_read<FlowType>(final_tex, width1, height1, filename0, filename1, flow_filename, "");
+               compute_flow.release_texture(final_tex);
+       }
+       glDeleteTextures(1, &tex_gray);
+
+       while (!reads_in_progress.empty()) {
+               finish_one_read<FlowType>(width1, height1);
+       }
+}
+
+// Interpolate images based on
+//
+//   Herbst, Seitz, Baker: “Occlusion Reasoning for Temporal Interpolation
+//   Using Optical Flow”
+//
+// or at least a reasonable subset thereof. Unfinished.
+void interpolate_image(int argc, char **argv, int optind)
+{
+       const char *filename0 = argc >= (optind + 1) ? argv[optind] : "test1499.png";
+       const char *filename1 = argc >= (optind + 2) ? argv[optind + 1] : "test1500.png";
+       //const char *out_filename = argc >= (optind + 3) ? argv[optind + 2] : "interpolated.png";
+
+       // Load pictures.
+       unsigned width1, height1, width2, height2;
+       GLuint tex0 = load_texture(filename0, &width1, &height1, WITH_MIPMAPS);
+       GLuint tex1 = load_texture(filename1, &width2, &height2, WITH_MIPMAPS);
+
+       if (width1 != width2 || height1 != height2) {
+               fprintf(stderr, "Image dimensions don't match (%dx%d versus %dx%d)\n",
+                       width1, height1, width2, height2);
+               exit(1);
+       }
+
+       // Move them into an array texture, since that's how the rest of the code
+       // would like them.
+       int levels = find_num_levels(width1, height1);
+       GLuint image_tex;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &image_tex);
+       glTextureStorage3D(image_tex, levels, GL_RGBA8, width1, height1, 2);
+       glCopyImageSubData(tex0, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 0, width1, height1, 1);
+       glCopyImageSubData(tex1, GL_TEXTURE_2D, 0, 0, 0, 0, image_tex, GL_TEXTURE_2D_ARRAY, 0, 0, 0, 1, width1, height1, 1);
+       glDeleteTextures(1, &tex0);
+       glDeleteTextures(1, &tex1);
+       glGenerateTextureMipmap(image_tex);
+
+       // Set up some PBOs to do asynchronous readback.
+       GLuint pbos[5];
+       glCreateBuffers(5, pbos);
+       for (int i = 0; i < 5; ++i) {
+               glNamedBufferData(pbos[i], width1 * height1 * 4 * sizeof(uint8_t), nullptr, GL_STREAM_READ);
+               spare_pbos.push(pbos[i]);
+       }
+
+       OperatingPoint op = operating_point3;
+       if (!enable_variational_refinement) {
+               op.variational_refinement = false;
+       }
+       DISComputeFlow compute_flow(width1, height1, op);
+       GrayscaleConversion gray;
+       Interpolate interpolate(op, /*split_ycbcr_output=*/false);
+
+       GLuint tex_gray;
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex_gray);
+       glTextureStorage3D(tex_gray, levels, GL_R8, width1, height1, 2);
+       gray.exec(image_tex, tex_gray, width1, height1, /*num_layers=*/2);
+       glGenerateTextureMipmap(tex_gray);
+
+       if (enable_warmup) {
+               in_warmup = true;
+               for (int i = 0; i < 10; ++i) {
+                       GLuint bidirectional_flow_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
+                       GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, 0.5f).first;
+                       compute_flow.release_texture(bidirectional_flow_tex);
+                       interpolate.release_texture(interpolated_tex);
+               }
+               in_warmup = false;
+       }
+
+       GLuint bidirectional_flow_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
+
+       for (int frameno = 1; frameno < 60; ++frameno) {
+               char ppm_filename[256];
+               snprintf(ppm_filename, sizeof(ppm_filename), "interp%04d.ppm", frameno);
+
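+		// Interpolation point between the two inputs; frameno
+		// 1..59 gives alpha = 1/60 .. 59/60 (neither endpoint).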
+               float alpha = frameno / 60.0f;
+               GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, alpha).first;
+
+               schedule_read<RGBAType>(interpolated_tex, width1, height1, filename0, filename1, "", ppm_filename);
+               interpolate.release_texture(interpolated_tex);
+       }
+
+       while (!reads_in_progress.empty()) {
+               finish_one_read<RGBAType>(width1, height1);
+       }
+}
+
+int main(int argc, char **argv)
+{
+       static const option long_options[] = {
+               { "smoothness-relative-weight", required_argument, 0, 's' },  // alpha.
+               { "intensity-relative-weight", required_argument, 0, 'i' },  // delta.
+               { "gradient-relative-weight", required_argument, 0, 'g' },  // gamma.
+               { "disable-timing", no_argument, 0, 1000 },
+               { "detailed-timing", no_argument, 0, 1003 },
+               { "disable-variational-refinement", no_argument, 0, 1001 },
+               { "interpolate", no_argument, 0, 1002 },
+		{ "warmup", no_argument, 0, 1004 },
+		{ 0, 0, 0, 0 }
+       };
+
+       enable_timing = true;
+
+       for ( ;; ) {
+               int option_index = 0;
+               int c = getopt_long(argc, argv, "s:i:g:", long_options, &option_index);
+
+               if (c == -1) {
+                       break;
+               }
+               switch (c) {
+               case 's':
+                       vr_alpha = atof(optarg);
+                       break;
+               case 'i':
+                       vr_delta = atof(optarg);
+                       break;
+               case 'g':
+                       vr_gamma = atof(optarg);
+                       break;
+               case 1000:
+                       enable_timing = false;
+                       break;
+               case 1001:
+                       enable_variational_refinement = false;
+                       break;
+               case 1002:
+                       enable_interpolation = true;
+                       break;
+               case 1003:
+                       detailed_timing = true;
+                       break;
+               case 1004:
+                       enable_warmup = true;
+                       break;
+               default:
+			fprintf(stderr, "Unknown option '%s'\n", argv[optind - 1]);
+			exit(1);
+		}
+       }
+
+	if (SDL_Init(SDL_INIT_EVERYTHING) < 0) {
+               fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
+               exit(1);
+       }
+       SDL_GL_SetAttribute(SDL_GL_ALPHA_SIZE, 8);
+       SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
+       SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
+       SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
+
+       SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
+       SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
+       SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
+       // SDL_GL_SetAttribute(SDL_GL_CONTEXT_FLAGS, SDL_GL_CONTEXT_DEBUG_FLAG);
+       window = SDL_CreateWindow("OpenGL window",
+               SDL_WINDOWPOS_UNDEFINED,
+               SDL_WINDOWPOS_UNDEFINED,
+               64, 64,
+               SDL_WINDOW_OPENGL | SDL_WINDOW_HIDDEN);
+       SDL_GLContext context = SDL_GL_CreateContext(window);
+       assert(context != nullptr);
+
+       if (enable_interpolation) {
+               interpolate_image(argc, argv, optind);
+       } else {
+               compute_flow_only(argc, argv, optind);
+       }
+}
diff --git a/futatabi/frame.proto b/futatabi/frame.proto
new file mode 100644 (file)
index 0000000..c8807fd
--- /dev/null
@@ -0,0 +1,25 @@
+syntax = "proto3";
+
+// Used as header before each frame in a .frames file:
+//
+//  1. "Ftbifrm0" (8 bytes, ASCII -- note that no byte repeats)
+//  2. Length of upcoming FrameHeaderProto (uint32, binary, big endian)
+//  3. The FrameHeaderProto itself
+//  4. The actual frame
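+//
+// For illustration (hypothetical sizes), a frame whose FrameHeaderProto
+// serializes to 14 bytes would be laid out as:
+//
+//   "Ftbifrm0"           8 bytes, magic
+//   00 00 00 0e          4 bytes, header length (14, big endian)
+//   <14 bytes of proto>  the FrameHeaderProto
+//   <file_size bytes>    the compressed frame itself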
+
+message FrameHeaderProto {
+       int32 stream_idx = 1;
+       int64 pts = 2;
+       int64 file_size = 3;  // In bytes of compressed frame. TODO: rename to size.
+}
+
+message StreamContentsProto {
+       int32 stream_idx = 1;
+       repeated int64 pts = 2 [packed=true];
+       repeated int64 file_size = 3 [packed=true];
+       repeated int64 offset = 4 [packed=true];
+}
+
+message FileContentsProto {
+       repeated StreamContentsProto stream = 1;  // Typically only one.
+}
diff --git a/futatabi/frame_on_disk.cpp b/futatabi/frame_on_disk.cpp
new file mode 100644 (file)
index 0000000..b496b3d
--- /dev/null
@@ -0,0 +1,53 @@
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "frame_on_disk.h"
+
+using namespace std;
+
+FrameReader::~FrameReader()
+{
+       if (fd != -1) {
+               close(fd);
+       }
+}
+
+string FrameReader::read_frame(FrameOnDisk frame)
+{
+       if (int(frame.filename_idx) != last_filename_idx) {
+               if (fd != -1) {
+                       close(fd);  // Ignore errors.
+               }
+
+               string filename;
+               {
+                       lock_guard<mutex> lock(frame_mu);
+                       filename = frame_filenames[frame.filename_idx];
+               }
+
+               fd = open(filename.c_str(), O_RDONLY);
+               if (fd == -1) {
+                       perror(filename.c_str());
+                       exit(1);
+               }
+
+               // We want readahead. (Ignore errors.)
+               posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+
+               last_filename_idx = frame.filename_idx;
+       }
+
+       string str;
+       str.resize(frame.size);
+       off_t offset = 0;
+       while (offset < frame.size) {
+               int ret = pread(fd, &str[offset], frame.size - offset, frame.offset + offset);
+               if (ret <= 0) {
+                       perror("pread");
+                       exit(1);
+               }
+
+               offset += ret;
+       }
+       return str;
+}
diff --git a/futatabi/frame_on_disk.h b/futatabi/frame_on_disk.h
new file mode 100644 (file)
index 0000000..1843857
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef _FRAME_ON_DISK_H
+#define _FRAME_ON_DISK_H 1
+
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "defs.h"
+
+extern std::mutex frame_mu;
+struct FrameOnDisk {
+	int64_t pts = -1;  // -1 means empty.
+	off_t offset;
+	unsigned filename_idx;
+	uint32_t size;  // Not using size_t saves a few bytes; we can have a very large number of frames.
+};
+extern std::vector<FrameOnDisk> frames[MAX_STREAMS];  // Under frame_mu.
+extern std::vector<std::string> frame_filenames;  // Under frame_mu.
+
+// A helper class to read frames from disk. It caches the file descriptor
+// so that the kernel has a better chance of doing readahead when it sees
+// the sequential reads. (For this reason, each display has a private
+// FrameReader. Thus, we can easily keep multiple open file descriptors around
+// for a single .frames file.)
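+//
+// A minimal usage sketch (hypothetical indices):
+//
+//   FrameReader reader;
+//   FrameOnDisk frame;
+//   {
+//           std::lock_guard<std::mutex> lock(frame_mu);
+//           frame = frames[stream_idx][frame_idx];
+//   }
+//   std::string jpeg = reader.read_frame(frame);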
+class FrameReader {
+public:
+       ~FrameReader();
+       std::string read_frame(FrameOnDisk frame);
+
+private:
+       int fd = -1;
+       int last_filename_idx = -1;
+};
+
+#endif  // !defined(_FRAME_ON_DISK_H)
diff --git a/futatabi/gpu_timers.cpp b/futatabi/gpu_timers.cpp
new file mode 100644 (file)
index 0000000..ad747ae
--- /dev/null
@@ -0,0 +1,72 @@
+#include "gpu_timers.h"
+
+#include <epoxy/gl.h>
+
+using namespace std;
+
+bool enable_timing = false;
+bool detailed_timing = false;
+bool in_warmup = false;
+
+pair<GLuint, GLuint> GPUTimers::begin_timer(const string &name, int level)
+{
+       if (!enable_timing) {
+               return make_pair(0, 0);
+       }
+
+       GLuint queries[2];
+       glGenQueries(2, queries);
+       glQueryCounter(queries[0], GL_TIMESTAMP);
+
+       Timer timer;
+       timer.name = name;
+       timer.level = level;
+       timer.query.first = queries[0];
+       timer.query.second = queries[1];
+       timers.push_back(timer);
+       return timer.query;
+}
+
+GLint64 find_elapsed(pair<GLuint, GLuint> queries)
+{
+       // NOTE: This makes the CPU wait for the GPU.
+       GLuint64 time_start, time_end;
+       glGetQueryObjectui64v(queries.first, GL_QUERY_RESULT, &time_start);
+       glGetQueryObjectui64v(queries.second, GL_QUERY_RESULT, &time_end);
+       return time_end - time_start;
+}
+
+void GPUTimers::print()
+{
+       for (size_t i = 0; i < timers.size(); ++i) {
+               if (timers[i].level >= 4 && !detailed_timing) {
+                       // In practice, only affects the SOR sub-timers.
+                       continue;
+               }
+
+               GLint64 time_elapsed = find_elapsed(timers[i].query);
+               for (int j = 0; j < timers[i].level * 2; ++j) {
+                       fprintf(stderr, " ");
+               }
+
+               if (detailed_timing) {
+                       // Look for any immediate subtimers, and see if they sum to the large one.
+                       size_t num_subtimers = 0;
+                       GLint64 sum_subtimers = 0;
+                       for (size_t j = i + 1; j < timers.size() && timers[j].level > timers[i].level; ++j) {
+                               if (timers[j].level != timers[i].level + 1)
+                                       continue;
+                               ++num_subtimers;
+                               sum_subtimers += find_elapsed(timers[j].query);
+                       }
+
+                       if (num_subtimers > 0 && (time_elapsed - sum_subtimers) / 1e6 >= 0.01) {
+                               fprintf(stderr, "%-30s %4.3f ms [%4.3f ms unaccounted for]\n", timers[i].name.c_str(), time_elapsed / 1e6, (time_elapsed - sum_subtimers) / 1e6);
+                       } else {
+                               fprintf(stderr, "%-30s %4.3f ms\n", timers[i].name.c_str(), time_elapsed / 1e6);
+                       }
+               } else {
+                       fprintf(stderr, "%-30s %4.1f ms\n", timers[i].name.c_str(), time_elapsed / 1e6);
+               }
+       }
+}
diff --git a/futatabi/gpu_timers.h b/futatabi/gpu_timers.h
new file mode 100644 (file)
index 0000000..a8c626e
--- /dev/null
@@ -0,0 +1,63 @@
+#ifndef _GPU_TIMERS_H
+#define _GPU_TIMERS_H 1
+
+#include <epoxy/gl.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+extern bool enable_timing;
+extern bool detailed_timing;
+extern bool in_warmup;
+
+class GPUTimers {
+public:
+       void print();
+       std::pair<GLuint, GLuint> begin_timer(const std::string &name, int level);
+
+private:
+       struct Timer {
+               std::string name;
+               int level;
+               std::pair<GLuint, GLuint> query;
+       };
+       std::vector<Timer> timers;
+};
+
+// A simple RAII class for timing until the end of the scope.
+class ScopedTimer {
+public:
+       ScopedTimer(const std::string &name, GPUTimers *timers)
+               : timers(timers), level(0)
+       {
+               query = timers->begin_timer(name, level);
+       }
+
+       ScopedTimer(const std::string &name, ScopedTimer *parent_timer)
+               : timers(parent_timer->timers),
+                 level(parent_timer->level + 1)
+       {
+               query = timers->begin_timer(name, level);
+       }
+
+       ~ScopedTimer()
+       {
+               end();
+       }
+
+       void end()
+       {
+               if (enable_timing && !ended) {
+                       glQueryCounter(query.second, GL_TIMESTAMP);
+                       ended = true;
+               }
+       }
+
+private:
+       GPUTimers *timers;
+       int level;
+       std::pair<GLuint, GLuint> query;
+       bool ended = false;
+};
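+
+// A minimal usage sketch (hypothetical timer names):
+//
+//   GPUTimers timers;
+//   {
+//           ScopedTimer total("Flow", &timers);
+//           ScopedTimer sub("Motion search", &total);  // Nested one level deeper.
+//           // ... issue GL commands ...
+//   }  // End timestamps for both timers are recorded here.
+//   timers.print();  // Waits for the GPU, then prints the elapsed times.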
+
+#endif  // !defined(_GPU_TIMERS_H)
diff --git a/futatabi/gray.frag b/futatabi/gray.frag
new file mode 100644 (file)
index 0000000..57a6891
--- /dev/null
@@ -0,0 +1,13 @@
+#version 450 core
+
+in vec3 tc;
+out vec4 gray;
+
+uniform sampler2DArray tex;
+
+void main()
+{
+       vec4 color = texture(tex, tc);
+       gray.rgb = vec3(dot(color.rgb, vec3(0.2126f, 0.7152f, 0.0722f)));  // Rec. 709.
+       gray.a = color.a;
+}
diff --git a/futatabi/hole_blend.frag b/futatabi/hole_blend.frag
new file mode 100644 (file)
index 0000000..d7b217f
--- /dev/null
@@ -0,0 +1,48 @@
+#version 450 core
+
+in vec2 tc;
+out vec2 out_flow;
+
+uniform sampler2D left_tex, right_tex, up_tex, down_tex;
+
+void main()
+{
+       // Some of these may contain “junk”, in the sense that they were
+       // not written in the given pass, if they came from an edge.
+       // Most of the time, this is benign, since it means we'll get
+       // the previous value (left/right/up) again. However, if it were
+       // bogus on the very first pass, we need to exclude it.
+       // Thus the test for 100.0f (invalid flows are initialized to 1000,
+       // all valid ones are less than 1).
+       vec2 left = texture(left_tex, tc).xy;
+       vec2 right = texture(right_tex, tc).xy;
+       vec2 up = texture(up_tex, tc).xy;
+       vec2 down = texture(down_tex, tc).xy;
+
+       vec2 sum = vec2(0.0f);
+       float num = 0.0f;
+       if (left.x < 100.0f) {
+               sum = left;
+               num = 1.0f;
+       }
+       if (right.x < 100.0f) {
+               sum += right;
+               num += 1.0f;
+       }
+       if (up.x < 100.0f) {
+               sum += up;
+               num += 1.0f;
+       }
+       if (down.x < 100.0f) {
+               sum += down;
+               num += 1.0f;
+       }
+
+	// If _none_ of them were valid, the entire row _and_ column
+	// must be devoid of flow. If so, the zero flow is fine for our purposes.
+       if (num == 0.0f) {
+               out_flow = vec2(0.0f);
+       } else {
+               out_flow = sum / num;
+       }
+}
diff --git a/futatabi/hole_fill.frag b/futatabi/hole_fill.frag
new file mode 100644 (file)
index 0000000..bec50d8
--- /dev/null
@@ -0,0 +1,16 @@
+#version 450 core
+
+in vec2 tc;
+out vec2 out_flow;
+
+uniform sampler2D tex;
+
+void main()
+{
+       vec2 flow = texture(tex, tc).xy;
+       if (flow.x > 100.0f) {
+               // Don't copy unset flows around.
+               discard;
+       }
+       out_flow = flow;
+}
diff --git a/futatabi/hole_fill.vert b/futatabi/hole_fill.vert
new file mode 100644 (file)
index 0000000..97098b6
--- /dev/null
@@ -0,0 +1,24 @@
+#version 450 core
+
+layout(location=0) in vec2 position;
+out vec2 tc;
+
+uniform float z;
+uniform vec2 sample_offset;
+
+void main()
+{
+       // Moving the position is equivalent to moving the texture coordinate,
+       // but cheaper -- as it means some of the fullscreen quad can be clipped away.
+       vec2 adjusted_pos = position - sample_offset;
+
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * adjusted_pos.x - 1.0, 2.0 * adjusted_pos.y - 1.0, 2.0f * (z - 0.5f), 1.0);
+
+       tc = position;
+}
diff --git a/futatabi/httpd.cpp b/futatabi/httpd.cpp
new file mode 100644 (file)
index 0000000..36d2874
--- /dev/null
@@ -0,0 +1,264 @@
+#include "httpd.h"
+
+#include <assert.h>
+#include <byteswap.h>
+#include <endian.h>
+#include <memory>
+#include <microhttpd.h>
+#include <netinet/in.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+extern "C" {
+#include <libavutil/avutil.h>
+}
+
+#include "defs.h"
+#include "metacube2.h"
+
+struct MHD_Connection;
+struct MHD_Response;
+
+using namespace std;
+
+HTTPD::HTTPD()
+{
+}
+
+HTTPD::~HTTPD()
+{
+       stop();
+}
+
+void HTTPD::start(int port)
+{
+       mhd = MHD_start_daemon(MHD_USE_THREAD_PER_CONNECTION | MHD_USE_POLL_INTERNALLY | MHD_USE_DUAL_STACK,
+                              port,
+                              nullptr, nullptr,
+                              &answer_to_connection_thunk, this,
+                              MHD_OPTION_NOTIFY_COMPLETED, nullptr, this,
+                              MHD_OPTION_END);
+       if (mhd == nullptr) {
+               fprintf(stderr, "Warning: Could not open HTTP server. (Port already in use?)\n");
+       }
+}
+
+void HTTPD::stop()
+{
+       if (mhd) {
+               MHD_quiesce_daemon(mhd);
+               for (Stream *stream : streams) {
+                       stream->stop();
+               }
+               MHD_stop_daemon(mhd);
+               mhd = nullptr;
+       }
+}
+
+void HTTPD::add_data(const char *buf, size_t size, bool keyframe, int64_t time, AVRational timebase)
+{
+       unique_lock<mutex> lock(streams_mutex);
+       for (Stream *stream : streams) {
+               stream->add_data(buf, size, keyframe ? Stream::DATA_TYPE_KEYFRAME : Stream::DATA_TYPE_OTHER, time, timebase);
+       }
+}
+
+int HTTPD::answer_to_connection_thunk(void *cls, MHD_Connection *connection,
+                                      const char *url, const char *method,
+                                      const char *version, const char *upload_data,
+                                      size_t *upload_data_size, void **con_cls)
+{
+       HTTPD *httpd = (HTTPD *)cls;
+       return httpd->answer_to_connection(connection, url, method, version, upload_data, upload_data_size, con_cls);
+}
+
+int HTTPD::answer_to_connection(MHD_Connection *connection,
+                                const char *url, const char *method,
+                                const char *version, const char *upload_data,
+                                size_t *upload_data_size, void **con_cls)
+{
+	// See if the URL ends in “.metacube”. (strstr() finds the _first_
+	// occurrence, so check the suffix directly instead.)
+	HTTPD::Stream::Framing framing;
+	size_t url_len = strlen(url), suffix_len = strlen(".metacube");
+	if (url_len >= suffix_len && strcmp(url + url_len - suffix_len, ".metacube") == 0) {
+               framing = HTTPD::Stream::FRAMING_METACUBE;
+       } else {
+               framing = HTTPD::Stream::FRAMING_RAW;
+       }
+
+       if (endpoints.count(url)) {
+               pair<string, string> contents_and_type = endpoints[url].callback();
+               MHD_Response *response = MHD_create_response_from_buffer(
+                       contents_and_type.first.size(), &contents_and_type.first[0], MHD_RESPMEM_MUST_COPY);
+               MHD_add_response_header(response, "Content-type", contents_and_type.second.c_str());
+               if (endpoints[url].cors_policy == ALLOW_ALL_ORIGINS) {
+                       MHD_add_response_header(response, "Access-Control-Allow-Origin", "*");
+               }
+               int ret = MHD_queue_response(connection, MHD_HTTP_OK, response);
+               MHD_destroy_response(response);  // Only decreases the refcount; actual free is after the request is done.
+               return ret;
+       }
+
+       // Small hack; reject unknown /channels/foo.
+       if (string(url).find("/channels/") == 0) {
+               string contents = "Not found.";
+               MHD_Response *response = MHD_create_response_from_buffer(
+                       contents.size(), &contents[0], MHD_RESPMEM_MUST_COPY);
+               MHD_add_response_header(response, "Content-type", "text/plain");
+               int ret = MHD_queue_response(connection, MHD_HTTP_NOT_FOUND, response);
+               MHD_destroy_response(response);  // Only decreases the refcount; actual free is after the request is done.
+               return ret;
+       }
+
+       HTTPD::Stream *stream = new HTTPD::Stream(this, framing);
+       stream->add_data(header.data(), header.size(), Stream::DATA_TYPE_HEADER, AV_NOPTS_VALUE, AVRational{ 1, 0 });
+       {
+               unique_lock<mutex> lock(streams_mutex);
+               streams.insert(stream);
+       }
+       ++metric_num_connected_clients;
+       *con_cls = stream;
+
+       // Does not strictly have to be equal to MUX_BUFFER_SIZE.
+       MHD_Response *response = MHD_create_response_from_callback(
+               (size_t)-1, MUX_BUFFER_SIZE, &HTTPD::Stream::reader_callback_thunk, stream, &HTTPD::free_stream);
+       // TODO: Content-type?
+       if (framing == HTTPD::Stream::FRAMING_METACUBE) {
+               MHD_add_response_header(response, "Content-encoding", "metacube");
+       }
+
+       int ret = MHD_queue_response(connection, MHD_HTTP_OK, response);
+       MHD_destroy_response(response);  // Only decreases the refcount; actual free is after the request is done.
+
+       return ret;
+}
+
+void HTTPD::free_stream(void *cls)
+{
+       HTTPD::Stream *stream = (HTTPD::Stream *)cls;
+       HTTPD *httpd = stream->get_parent();
+       {
+               unique_lock<mutex> lock(httpd->streams_mutex);
+		httpd->streams.erase(stream);
+		delete stream;
+       }
+       --httpd->metric_num_connected_clients;
+}
+
+ssize_t HTTPD::Stream::reader_callback_thunk(void *cls, uint64_t pos, char *buf, size_t max)
+{
+       HTTPD::Stream *stream = (HTTPD::Stream *)cls;
+       return stream->reader_callback(pos, buf, max);
+}
+
+ssize_t HTTPD::Stream::reader_callback(uint64_t pos, char *buf, size_t max)
+{
+       unique_lock<mutex> lock(buffer_mutex);
+       has_buffered_data.wait(lock, [this] { return should_quit || !buffered_data.empty(); });
+       if (should_quit) {
+               return 0;
+       }
+
+       ssize_t ret = 0;
+       while (max > 0 && !buffered_data.empty()) {
+               const string &s = buffered_data.front();
+               assert(s.size() > used_of_buffered_data);
+               size_t len = s.size() - used_of_buffered_data;
+               if (max >= len) {
+                       // Consume the entire (rest of the) string.
+                       memcpy(buf, s.data() + used_of_buffered_data, len);
+                       buf += len;
+                       ret += len;
+                       max -= len;
+                       buffered_data.pop_front();
+                       used_of_buffered_data = 0;
+               } else {
+                       // We don't need the entire string; just use the first part of it.
+                       memcpy(buf, s.data() + used_of_buffered_data, max);
+                       buf += max;
+                       used_of_buffered_data += max;
+                       ret += max;
+                       max = 0;
+               }
+       }
+
+       return ret;
+}
+
+void HTTPD::Stream::add_data(const char *buf, size_t buf_size, HTTPD::Stream::DataType data_type, int64_t time, AVRational timebase)
+{
+       if (buf_size == 0) {
+               return;
+       }
+       if (data_type == DATA_TYPE_KEYFRAME) {
+               seen_keyframe = true;
+       } else if (data_type == DATA_TYPE_OTHER && !seen_keyframe) {
+               // Start sending only once we see a keyframe.
+               return;
+       }
+
+       unique_lock<mutex> lock(buffer_mutex);
+
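+	// With Metacube framing, each block of data is preceded by a
+	// metacube2_block_header carrying a sync marker, the payload size,
+	// flags and a checksum; keyframes additionally get pts and
+	// wall-clock timestamp metadata blocks, as seen below.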
+       if (framing == FRAMING_METACUBE) {
+               int flags = 0;
+               if (data_type == DATA_TYPE_HEADER) {
+                       flags |= METACUBE_FLAGS_HEADER;
+               } else if (data_type == DATA_TYPE_OTHER) {
+                       flags |= METACUBE_FLAGS_NOT_SUITABLE_FOR_STREAM_START;
+               }
+
+               // If we're about to send a keyframe, send a pts metadata block
+               // to mark its time.
+               if ((flags & METACUBE_FLAGS_NOT_SUITABLE_FOR_STREAM_START) == 0 && time != AV_NOPTS_VALUE) {
+                       metacube2_pts_packet packet;
+                       packet.type = htobe64(METACUBE_METADATA_TYPE_NEXT_BLOCK_PTS);
+                       packet.pts = htobe64(time);
+                       packet.timebase_num = htobe64(timebase.num);
+                       packet.timebase_den = htobe64(timebase.den);
+
+                       metacube2_block_header hdr;
+                       memcpy(hdr.sync, METACUBE2_SYNC, sizeof(hdr.sync));
+                       hdr.size = htonl(sizeof(packet));
+                       hdr.flags = htons(METACUBE_FLAGS_METADATA);
+                       hdr.csum = htons(metacube2_compute_crc(&hdr));
+                       buffered_data.emplace_back((char *)&hdr, sizeof(hdr));
+                       buffered_data.emplace_back((char *)&packet, sizeof(packet));
+               }
+
+               metacube2_block_header hdr;
+               memcpy(hdr.sync, METACUBE2_SYNC, sizeof(hdr.sync));
+               hdr.size = htonl(buf_size);
+               hdr.flags = htons(flags);
+               hdr.csum = htons(metacube2_compute_crc(&hdr));
+               buffered_data.emplace_back((char *)&hdr, sizeof(hdr));
+       }
+       buffered_data.emplace_back(buf, buf_size);
+
+       // Send a Metacube2 timestamp every keyframe.
+       if (framing == FRAMING_METACUBE && data_type == DATA_TYPE_KEYFRAME) {
+               timespec now;
+               clock_gettime(CLOCK_REALTIME, &now);
+
+               metacube2_timestamp_packet packet;
+               packet.type = htobe64(METACUBE_METADATA_TYPE_ENCODER_TIMESTAMP);
+               packet.tv_sec = htobe64(now.tv_sec);
+               packet.tv_nsec = htobe64(now.tv_nsec);
+
+               metacube2_block_header hdr;
+               memcpy(hdr.sync, METACUBE2_SYNC, sizeof(hdr.sync));
+               hdr.size = htonl(sizeof(packet));
+               hdr.flags = htons(METACUBE_FLAGS_METADATA);
+               hdr.csum = htons(metacube2_compute_crc(&hdr));
+               buffered_data.emplace_back((char *)&hdr, sizeof(hdr));
+               buffered_data.emplace_back((char *)&packet, sizeof(packet));
+       }
+
+       has_buffered_data.notify_all();
+}
+
+void HTTPD::Stream::stop()
+{
+       unique_lock<mutex> lock(buffer_mutex);
+       should_quit = true;
+       has_buffered_data.notify_all();
+}
diff --git a/futatabi/httpd.h b/futatabi/httpd.h
new file mode 100644 (file)
index 0000000..9901814
--- /dev/null
@@ -0,0 +1,118 @@
+#ifndef _HTTPD_H
+#define _HTTPD_H 1
+
+// A class dealing with stream output to HTTP.
+
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <functional>
+#include <mutex>
+#include <set>
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <sys/types.h>
+#include <unordered_map>
+#include <utility>
+
+extern "C" {
+#include <libavutil/rational.h>
+}
+
+struct MHD_Connection;
+struct MHD_Daemon;
+
+class HTTPD {
+public:
+       // Returns a pair of content and content-type.
+       using EndpointCallback = std::function<std::pair<std::string, std::string>()>;
+
+       HTTPD();
+       ~HTTPD();
+
+       // Should be called before start().
+       void set_header(const std::string &data)
+       {
+               header = data;
+       }
+
+       // Should be called before start() (due to threading issues).
+       enum CORSPolicy {
+               NO_CORS_POLICY,
+               ALLOW_ALL_ORIGINS
+       };
+       void add_endpoint(const std::string &url, const EndpointCallback &callback, CORSPolicy cors_policy)
+       {
+               endpoints[url] = Endpoint{ callback, cors_policy };
+       }
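+
+	// A hypothetical example, serving a plain-text status page to any origin:
+	//
+	//   httpd.add_endpoint("/status", [] {
+	//           return std::make_pair(std::string("OK"), std::string("text/plain"));
+	//   }, HTTPD::ALLOW_ALL_ORIGINS);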
+
+       void start(int port);
+       void stop();
+       void add_data(const char *buf, size_t size, bool keyframe, int64_t time, AVRational timebase);
+       int64_t get_num_connected_clients() const
+       {
+               return metric_num_connected_clients.load();
+       }
+
+private:
+       static int answer_to_connection_thunk(void *cls, MHD_Connection *connection,
+                                             const char *url, const char *method,
+                                             const char *version, const char *upload_data,
+                                             size_t *upload_data_size, void **con_cls);
+
+       int answer_to_connection(MHD_Connection *connection,
+                                const char *url, const char *method,
+                                const char *version, const char *upload_data,
+                                size_t *upload_data_size, void **con_cls);
+
+       static void free_stream(void *cls);
+
+       class Stream {
+       public:
+               enum Framing {
+                       FRAMING_RAW,
+                       FRAMING_METACUBE
+               };
+               Stream(HTTPD *parent, Framing framing) : parent(parent), framing(framing) {}
+
+               static ssize_t reader_callback_thunk(void *cls, uint64_t pos, char *buf, size_t max);
+               ssize_t reader_callback(uint64_t pos, char *buf, size_t max);
+
+               enum DataType {
+                       DATA_TYPE_HEADER,
+                       DATA_TYPE_KEYFRAME,
+                       DATA_TYPE_OTHER
+               };
+               void add_data(const char *buf, size_t size, DataType data_type, int64_t time, AVRational timebase);
+               void stop();
+               HTTPD *get_parent() const { return parent; }
+
+       private:
+               HTTPD *parent;
+               Framing framing;
+
+               std::mutex buffer_mutex;
+               bool should_quit = false;  // Under <buffer_mutex>.
+               std::condition_variable has_buffered_data;
+               std::deque<std::string> buffered_data;  // Protected by <buffer_mutex>.
+		size_t used_of_buffered_data = 0;  // How many bytes of the first element of <buffered_data> have already been consumed. Protected by <buffer_mutex>.
+		bool seen_keyframe = false;
+       };
+
+       MHD_Daemon *mhd = nullptr;
+       std::mutex streams_mutex;
+       std::set<Stream *> streams;  // Not owned.
+       struct Endpoint {
+               EndpointCallback callback;
+               CORSPolicy cors_policy;
+       };
+       std::unordered_map<std::string, Endpoint> endpoints;
+       std::string header;
+
+       // Metrics.
+       std::atomic<int64_t> metric_num_connected_clients{0};
+};
+
+#endif  // !defined(_HTTPD_H)
diff --git a/futatabi/jpeg_destroyer.h b/futatabi/jpeg_destroyer.h
new file mode 100644 (file)
index 0000000..5fc5c95
--- /dev/null
@@ -0,0 +1,19 @@
+#ifndef _JPEG_DESTROYER_H
+#define _JPEG_DESTROYER_H 1
+
+#include <jpeglib.h>
+
+class JPEGDestroyer {
+public:
+       JPEGDestroyer(jpeg_decompress_struct *dinfo)
+               : dinfo(dinfo) {}
+
+       ~JPEGDestroyer() {
+               jpeg_destroy_decompress(dinfo);
+       }
+
+private:
+       jpeg_decompress_struct *dinfo;
+};
+
+#endif  // !defined(_JPEG_DESTROYER_H)
diff --git a/futatabi/jpeg_frame.h b/futatabi/jpeg_frame.h
new file mode 100644 (file)
index 0000000..eb73e13
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef _JPEG_FRAME_H
+#define _JPEG_FRAME_H 1
+
+#include <memory>
+#include <stdint.h>
+
+struct Frame {
+       bool is_semiplanar = false;
+       std::unique_ptr<uint8_t[]> y;
+       std::unique_ptr<uint8_t[]> cb, cr; // For planar.
+       std::unique_ptr<uint8_t[]> cbcr;  // For semiplanar.
+       unsigned width, height;
+       unsigned chroma_subsampling_x, chroma_subsampling_y;
+       unsigned pitch_y, pitch_chroma;
+};
+
+#endif   // !defined(_JPEG_FRAME_H)
diff --git a/futatabi/jpeg_frame_view.cpp b/futatabi/jpeg_frame_view.cpp
new file mode 100644 (file)
index 0000000..3d3383f
--- /dev/null
@@ -0,0 +1,462 @@
+#include "jpeg_frame_view.h"
+
+#include "defs.h"
+#include "jpeg_destroyer.h"
+#include "post_to_main_thread.h"
+#include "video_stream.h"
+#include "ycbcr_converter.h"
+
+#include <QMouseEvent>
+#include <QScreen>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <jpeglib.h>
+#include <movit/init.h>
+#include <movit/resource_pool.h>
+#include <movit/util.h>
+#include <mutex>
+#include <stdint.h>
+#include <thread>
+#include <unistd.h>
+#include <utility>
+
+// Must come after the Qt stuff.
+#include "vaapi_jpeg_decoder.h"
+
+using namespace movit;
+using namespace std;
+
+namespace {
+
+// Just an arbitrary order for std::map.
+struct FrameOnDiskLexicalOrder
+{
+       bool operator() (const FrameOnDisk &a, const FrameOnDisk &b) const
+       {
+               if (a.pts != b.pts)
+                       return a.pts < b.pts;
+               if (a.offset != b.offset)
+                       return a.offset < b.offset;
+               if (a.filename_idx != b.filename_idx)
+                       return a.filename_idx < b.filename_idx;
+               assert(a.size == b.size);
+               return false;
+       }
+};
+
+inline size_t frame_size(const Frame &frame)
+{
+       size_t y_size = frame.width * frame.height;
+       size_t cbcr_size = y_size / frame.chroma_subsampling_x / frame.chroma_subsampling_y;
+       return y_size + cbcr_size * 2;
+}
+
+struct LRUFrame {
+       shared_ptr<Frame> frame;
+       size_t last_used;
+};
+
+struct PendingDecode {
+       JPEGFrameView *destination;
+
+       // For actual decodes (only if frame below is nullptr).
+       FrameOnDisk primary, secondary;
+       float fade_alpha;  // Irrelevant if secondary.stream_idx == -1.
+
+       // Already-decoded frames are also sent through PendingDecode,
+       // so that they get drawn in the right order. If frame is nullptr,
+       // it's a real decode.
+       shared_ptr<Frame> frame;
+};
+
+}  // namespace
+
+thread JPEGFrameView::jpeg_decoder_thread;
+mutex cache_mu;
+map<FrameOnDisk, LRUFrame, FrameOnDiskLexicalOrder> cache;  // Under cache_mu.
+size_t cache_bytes_used = 0;  // Under cache_mu.
+condition_variable any_pending_decodes;
+deque<PendingDecode> pending_decodes;  // Under cache_mu.
+atomic<size_t> event_counter{0};
+extern QGLWidget *global_share_widget;
+extern atomic<bool> should_quit;
+
+shared_ptr<Frame> decode_jpeg(const string &filename)
+{
+       shared_ptr<Frame> frame;
+       if (vaapi_jpeg_decoding_usable) {
+               frame = decode_jpeg_vaapi(filename);
+               if (frame != nullptr) {
+                       return frame;
+               }
+               fprintf(stderr, "VA-API hardware decoding failed; falling back to software.\n");
+       }
+
+       frame.reset(new Frame);
+
+       jpeg_decompress_struct dinfo;
+       jpeg_error_mgr jerr;
+       dinfo.err = jpeg_std_error(&jerr);
+       jpeg_create_decompress(&dinfo);
+       JPEGDestroyer destroy_dinfo(&dinfo);
+
+       FILE *fp = fopen(filename.c_str(), "rb");
+       if (fp == nullptr) {
+               perror(filename.c_str());
+               exit(1);
+       }
+       jpeg_stdio_src(&dinfo, fp);
+
+       jpeg_read_header(&dinfo, true);
+
+       if (dinfo.num_components != 3) {
+               fprintf(stderr, "Not a color JPEG. (%d components, Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+                       dinfo.num_components,
+                       dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+                       dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+                       dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+               exit(1);
+       }
+       if (dinfo.comp_info[0].h_samp_factor != dinfo.max_h_samp_factor ||
+           dinfo.comp_info[0].v_samp_factor != dinfo.max_v_samp_factor ||  // Y' must not be subsampled.
+           dinfo.comp_info[1].h_samp_factor != dinfo.comp_info[2].h_samp_factor ||
+           dinfo.comp_info[1].v_samp_factor != dinfo.comp_info[2].v_samp_factor ||  // Cb and Cr must be identically subsampled.
+           (dinfo.max_h_samp_factor % dinfo.comp_info[1].h_samp_factor) != 0 ||
+           (dinfo.max_v_samp_factor % dinfo.comp_info[1].v_samp_factor) != 0) {  // No 2:3 subsampling or other weirdness.
+               fprintf(stderr, "Unsupported subsampling scheme. (Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+                       dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+                       dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+                       dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+               exit(1);
+       }
+       dinfo.raw_data_out = true;
+
+       jpeg_start_decompress(&dinfo);
+
+       frame->width = dinfo.output_width;
+       frame->height = dinfo.output_height;
+       frame->chroma_subsampling_x = dinfo.max_h_samp_factor / dinfo.comp_info[1].h_samp_factor;
+       frame->chroma_subsampling_y = dinfo.max_v_samp_factor / dinfo.comp_info[1].v_samp_factor;
+
+       unsigned h_mcu_size = DCTSIZE * dinfo.max_h_samp_factor;
+       unsigned v_mcu_size = DCTSIZE * dinfo.max_v_samp_factor;
+       unsigned mcu_width_blocks = (dinfo.output_width + h_mcu_size - 1) / h_mcu_size;
+       unsigned mcu_height_blocks = (dinfo.output_height + v_mcu_size - 1) / v_mcu_size;
+
+       unsigned luma_width_blocks = mcu_width_blocks * dinfo.comp_info[0].h_samp_factor;
+       unsigned chroma_width_blocks = mcu_width_blocks * dinfo.comp_info[1].h_samp_factor;
+       unsigned luma_height_blocks = mcu_height_blocks * dinfo.comp_info[0].v_samp_factor;
+       unsigned chroma_height_blocks = mcu_height_blocks * dinfo.comp_info[1].v_samp_factor;
+
+       // TODO: Decode into a PBO.
+       frame->y.reset(new uint8_t[luma_width_blocks * luma_height_blocks * DCTSIZE2]);
+       frame->cb.reset(new uint8_t[chroma_width_blocks * chroma_height_blocks * DCTSIZE2]);
+       frame->cr.reset(new uint8_t[chroma_width_blocks * chroma_height_blocks * DCTSIZE2]);
+       frame->pitch_y = luma_width_blocks * DCTSIZE;
+       frame->pitch_chroma = chroma_width_blocks * DCTSIZE;
+
+       JSAMPROW yptr[v_mcu_size], cbptr[v_mcu_size], crptr[v_mcu_size];
+       JSAMPARRAY data[3] = { yptr, cbptr, crptr };
+       for (unsigned y = 0; y < mcu_height_blocks; ++y) {
+               // NOTE: The last elements of cbptr/crptr will be unused for vertically subsampled chroma.
+               for (unsigned yy = 0; yy < v_mcu_size; ++yy) {
+                       yptr[yy] = frame->y.get() + (y * DCTSIZE * dinfo.max_v_samp_factor + yy) * frame->pitch_y;
+                       cbptr[yy] = frame->cb.get() + (y * DCTSIZE * dinfo.comp_info[1].v_samp_factor + yy) * frame->pitch_chroma;
+                       crptr[yy] = frame->cr.get() + (y * DCTSIZE * dinfo.comp_info[1].v_samp_factor + yy) * frame->pitch_chroma;
+               }
+
+               jpeg_read_raw_data(&dinfo, data, v_mcu_size);
+       }
+
+       (void)jpeg_finish_decompress(&dinfo);
+       fclose(fp);
+
+       return frame;
+}
+
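+// Evict the least recently used frames until the cache is down to 90% of
+// its maximum size, so that a single insert does not immediately trigger
+// the next prune.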
+void prune_cache()
+{
+       // Assumes cache_mu is held.
+       int64_t bytes_still_to_remove = cache_bytes_used - (size_t(CACHE_SIZE_MB) * 1024 * 1024) * 9 / 10;
+       if (bytes_still_to_remove <= 0) return;
+
+       vector<pair<size_t, size_t>> lru_timestamps_and_size;
+       for (const auto &key_and_value : cache) {
+               lru_timestamps_and_size.emplace_back(
+                       key_and_value.second.last_used,
+                       frame_size(*key_and_value.second.frame));
+       }
+       sort(lru_timestamps_and_size.begin(), lru_timestamps_and_size.end());
+
+       // Remove the oldest ones until we are below 90% of the cache used.
+       size_t lru_cutoff_point = 0;
+       for (const pair<size_t, size_t> &it : lru_timestamps_and_size) {
+               lru_cutoff_point = it.first;
+               bytes_still_to_remove -= it.second;
+               if (bytes_still_to_remove <= 0) break;
+       }
+
+       for (auto it = cache.begin(); it != cache.end(); ) {
+               if (it->second.last_used <= lru_cutoff_point) {
+                       cache_bytes_used -= frame_size(*it->second.frame);
+                       it = cache.erase(it);
+               } else {
+                       ++it;
+               }
+       }
+}
+
+shared_ptr<Frame> decode_jpeg_with_cache(FrameOnDisk frame_spec, CacheMissBehavior cache_miss_behavior, FrameReader *frame_reader, bool *did_decode)
+{
+       *did_decode = false;
+       {
+               unique_lock<mutex> lock(cache_mu);
+               auto it = cache.find(frame_spec);
+               if (it != cache.end()) {
+                       it->second.last_used = event_counter++;
+                       return it->second.frame;
+               }
+       }
+
+       if (cache_miss_behavior == RETURN_NULLPTR_IF_NOT_IN_CACHE) {
+               return nullptr;
+       }
+
+       *did_decode = true;
+       shared_ptr<Frame> frame = decode_jpeg(frame_reader->read_frame(frame_spec));
+
+       unique_lock<mutex> lock(cache_mu);
+       cache_bytes_used += frame_size(*frame);
+       cache[frame_spec] = LRUFrame{ frame, event_counter++ };
+
+       if (cache_bytes_used > size_t(CACHE_SIZE_MB) * 1024 * 1024) {
+               prune_cache();
+       }
+       return frame;
+}
+
+void JPEGFrameView::jpeg_decoder_thread_func()
+{
+       size_t num_decoded = 0, num_dropped = 0;
+
+       pthread_setname_np(pthread_self(), "JPEGDecoder");
+       while (!should_quit.load()) {
+               PendingDecode decode;
+               CacheMissBehavior cache_miss_behavior = DECODE_IF_NOT_IN_CACHE;
+               {
+                       unique_lock<mutex> lock(cache_mu);  // TODO: Perhaps under another lock?
+                       any_pending_decodes.wait(lock, [] {
+                               return !pending_decodes.empty() || should_quit.load();
+                       });
+                       if (should_quit.load())
+                               break;
+                       decode = pending_decodes.front();
+                       pending_decodes.pop_front();
+
+                       size_t num_pending = 0;
+                       for (const PendingDecode &other_decode : pending_decodes) {
+                               if (other_decode.destination == decode.destination) {
+                                       ++num_pending;
+                               }
+                       }
+                       if (num_pending > 3) {
+                               cache_miss_behavior = RETURN_NULLPTR_IF_NOT_IN_CACHE;
+                       }
+               }
+
+               if (decode.frame != nullptr) {
+                       // Already decoded, so just show it.
+                       decode.destination->setDecodedFrame(decode.frame, nullptr, 1.0f);
+                       continue;
+               }
+
+               shared_ptr<Frame> primary_frame, secondary_frame;
+               bool drop = false;
+               for (int subframe_idx = 0; subframe_idx < 2; ++subframe_idx) {
+                       const FrameOnDisk &frame_spec = (subframe_idx == 0 ? decode.primary : decode.secondary);
+                       if (frame_spec.pts == -1) {
+                               // No secondary frame.
+                               continue;
+                       }
+
+			bool did_decode;
+			shared_ptr<Frame> frame = decode_jpeg_with_cache(frame_spec, cache_miss_behavior, &decode.destination->frame_reader, &did_decode);
+
+                       if (frame == nullptr) {
+                               assert(cache_miss_behavior == RETURN_NULLPTR_IF_NOT_IN_CACHE);
+                               drop = true;
+                               break;
+                       }
+
+			if (did_decode) {
+                               ++num_decoded;
+                               if (num_decoded % 1000 == 0) {
+                                       fprintf(stderr, "Decoded %zu images, dropped %zu (%.2f%% dropped)\n",
+                                               num_decoded, num_dropped, (100.0 * num_dropped) / (num_decoded + num_dropped));
+                               }
+                       }
+                       if (subframe_idx == 0) {
+                               primary_frame = std::move(frame);
+                       } else {
+                               secondary_frame = std::move(frame);
+                       }
+               }
+               if (drop) {
+                       ++num_dropped;
+                       continue;
+               }
+
+               // TODO: Could we get jitter between non-interpolated and interpolated frames here?
+               decode.destination->setDecodedFrame(primary_frame, secondary_frame, decode.fade_alpha);
+       }
+}
+
+void JPEGFrameView::shutdown()
+{
+       any_pending_decodes.notify_all();
+       jpeg_decoder_thread.join();
+}
+
+JPEGFrameView::JPEGFrameView(QWidget *parent)
+       : QGLWidget(parent, global_share_widget)
+{
+}
+
+void JPEGFrameView::setFrame(unsigned stream_idx, FrameOnDisk frame, FrameOnDisk secondary_frame, float fade_alpha)
+{
+       current_stream_idx = stream_idx;  // TODO: Does this interact with fades?
+
+       unique_lock<mutex> lock(cache_mu);
+       PendingDecode decode;
+       decode.primary = frame;
+       decode.secondary = secondary_frame;
+       decode.fade_alpha = fade_alpha;
+       decode.destination = this;
+       pending_decodes.push_back(decode);
+       any_pending_decodes.notify_all();
+}
+
+void JPEGFrameView::setFrame(shared_ptr<Frame> frame)
+{
+       unique_lock<mutex> lock(cache_mu);
+       PendingDecode decode;
+       decode.frame = std::move(frame);
+       decode.destination = this;
+       pending_decodes.push_back(decode);
+       any_pending_decodes.notify_all();
+}
+
+ResourcePool *resource_pool = nullptr;
+
+void JPEGFrameView::initializeGL()
+{
+       glDisable(GL_BLEND);
+       glDisable(GL_DEPTH_TEST);
+       check_error();
+
+       static once_flag once;
+       call_once(once, [] {
+               resource_pool = new ResourcePool;
+               jpeg_decoder_thread = std::thread(jpeg_decoder_thread_func);
+       });
+
+       ycbcr_converter.reset(new YCbCrConverter(YCbCrConverter::OUTPUT_TO_RGBA, resource_pool));
+
+       ImageFormat inout_format;
+       inout_format.color_space = COLORSPACE_sRGB;
+       inout_format.gamma_curve = GAMMA_sRGB;
+
+       overlay_chain.reset(new EffectChain(overlay_base_width, overlay_base_height, resource_pool));
+       overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(inout_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height));
+
+       overlay_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+       overlay_chain->finalize();
+}
+
+void JPEGFrameView::resizeGL(int width, int height)
+{
+       check_error();
+       glViewport(0, 0, width, height);
+       check_error();
+
+       // Save these, as width() and height() will lie with DPI scaling.
+       gl_width = width;
+       gl_height = height;
+}
+
+void JPEGFrameView::paintGL()
+{
+       glViewport(0, 0, gl_width, gl_height);
+       if (current_frame == nullptr) {
+               glClearColor(0.0f, 0.0f, 0.0f, 1.0f);
+               glClear(GL_COLOR_BUFFER_BIT);
+               return;
+       }
+
+       check_error();
+       current_chain->render_to_screen();
+
+       if (overlay_image != nullptr) {
+		if (overlay_input_needs_refresh) {
+			overlay_input->set_width(overlay_width);
+			overlay_input->set_height(overlay_height);
+			overlay_input->set_pixel_data(overlay_image->bits());
+			overlay_input_needs_refresh = false;
+		}
+               glViewport(gl_width - overlay_width, 0, overlay_width, overlay_height);
+               overlay_chain->render_to_screen();
+       }
+}
+
+void JPEGFrameView::setDecodedFrame(shared_ptr<Frame> frame, shared_ptr<Frame> secondary_frame, float fade_alpha)
+{
+       post_to_main_thread([this, frame, secondary_frame, fade_alpha] {
+               current_frame = frame;
+               current_secondary_frame = secondary_frame;
+
+               if (secondary_frame != nullptr) {
+                       current_chain = ycbcr_converter->prepare_chain_for_fade(frame, secondary_frame, fade_alpha);
+               } else {
+                       current_chain = ycbcr_converter->prepare_chain_for_conversion(frame);
+               }
+               update();
+       });
+}
+
+void JPEGFrameView::mousePressEvent(QMouseEvent *event)
+{
+       if (event->type() == QEvent::MouseButtonPress && event->button() == Qt::LeftButton) {
+               emit clicked();
+       }
+}
+
+void JPEGFrameView::set_overlay(const string &text)
+{
+       if (text.empty()) {
+               overlay_image.reset();
+               return;
+       }
+
+       float dpr = QGuiApplication::primaryScreen()->devicePixelRatio();
+       overlay_width = lrint(overlay_base_width * dpr);
+       overlay_height = lrint(overlay_base_height * dpr);
+
+       overlay_image.reset(new QImage(overlay_width, overlay_height, QImage::Format_Grayscale8));
+       overlay_image->setDevicePixelRatio(dpr);
+       overlay_image->fill(0);
+       QPainter painter(overlay_image.get());
+
+       painter.setPen(Qt::white);
+       QFont font = painter.font();
+       font.setPointSize(12);
+       painter.setFont(font);
+
+       painter.drawText(QRectF(0, 0, overlay_base_width, overlay_base_height), Qt::AlignCenter, QString::fromStdString(text));
+
+       // Don't refresh immediately; we might not have an OpenGL context here.
+       overlay_input_needs_refresh = true;
+}
diff --git a/futatabi/jpeg_frame_view.h b/futatabi/jpeg_frame_view.h
new file mode 100644 (file)
index 0000000..38ffd41
--- /dev/null
@@ -0,0 +1,78 @@
+#ifndef _JPEG_FRAME_VIEW_H
+#define _JPEG_FRAME_VIEW_H 1
+
+#include "frame_on_disk.h"
+#include "jpeg_frame.h"
+#include "ycbcr_converter.h"
+
+#include <QGLWidget>
+#include <epoxy/gl.h>
+#include <memory>
+#include <movit/effect_chain.h>
+#include <movit/flat_input.h>
+#include <movit/mix_effect.h>
+#include <movit/ycbcr_input.h>
+#include <stdint.h>
+#include <thread>
+
+enum CacheMissBehavior {
+       DECODE_IF_NOT_IN_CACHE,
+       RETURN_NULLPTR_IF_NOT_IN_CACHE
+};
+
+std::shared_ptr<Frame> decode_jpeg(const std::string &filename);
+std::shared_ptr<Frame> decode_jpeg_with_cache(FrameOnDisk id, CacheMissBehavior cache_miss_behavior, FrameReader *frame_reader, bool *did_decode);
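+
+// A hypothetical call site (a sketch only; frame_on_disk, frame_reader and
+// did_decode are assumed names, not part of this header):
+//
+//   bool did_decode;
+//   std::shared_ptr<Frame> frame = decode_jpeg_with_cache(
+//           frame_on_disk, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode);
+//
+// did_decode reports whether an actual (costly) JPEG decode happened,
+// as opposed to a cache hit.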
+
+class JPEGFrameView : public QGLWidget {
+       Q_OBJECT
+
+public:
+       JPEGFrameView(QWidget *parent);
+
+       void setFrame(unsigned stream_idx, FrameOnDisk frame, FrameOnDisk secondary_frame = {}, float fade_alpha = 0.0f);
+       void setFrame(std::shared_ptr<Frame> frame);
+
+       void mousePressEvent(QMouseEvent *event) override;
+
+       unsigned get_stream_idx() const { return current_stream_idx; }
+
+       void setDecodedFrame(std::shared_ptr<Frame> frame, std::shared_ptr<Frame> secondary_frame, float fade_alpha);
+       void set_overlay(const std::string &text);  // Blank for none.
+
+       static void shutdown();
+
+signals:
+       void clicked();
+
+protected:
+       void initializeGL() override;
+       void resizeGL(int width, int height) override;
+       void paintGL() override;
+
+private:
+       static void jpeg_decoder_thread_func();
+
+       FrameReader frame_reader;
+
+       // The stream index of the latest frame we displayed.
+       unsigned current_stream_idx = 0;
+
+       std::unique_ptr<YCbCrConverter> ycbcr_converter;
+       movit::EffectChain *current_chain = nullptr;  // Owned by ycbcr_converter.
+
+       std::shared_ptr<Frame> current_frame;  // So that we hold on to the pixels.
+       std::shared_ptr<Frame> current_secondary_frame;  // Same.
+
+       static constexpr int overlay_base_width = 16, overlay_base_height = 16;
+       int overlay_width = overlay_base_width, overlay_height = overlay_base_height;
+       std::unique_ptr<QImage> overlay_image;  // If nullptr, no overlay.
+       std::unique_ptr<movit::EffectChain> overlay_chain;  // Just to get the overlay on screen in the easiest way possible.
+       movit::FlatInput *overlay_input;
+       bool overlay_input_needs_refresh = false;
+
+       int gl_width, gl_height;
+
+       static std::thread jpeg_decoder_thread;
+};
+
+#endif  // !defined(_JPEG_FRAME_VIEW_H)
diff --git a/futatabi/main.cpp b/futatabi/main.cpp
new file mode 100644 (file)
index 0000000..e0518af
--- /dev/null
@@ -0,0 +1,502 @@
+#include <assert.h>
+#include <arpa/inet.h>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <dirent.h>
+#include <getopt.h>
+#include <memory>
+#include <mutex>
+#include <stdint.h>
+#include <stdio.h>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <thread>
+#include <vector>
+
+extern "C" {
+#include <libavformat/avformat.h>
+}
+
+#include "clip_list.h"
+#include "context.h"
+#include "defs.h"
+#include "disk_space_estimator.h"
+#include "ffmpeg_raii.h"
+#include "flags.h"
+#include "frame_on_disk.h"
+#include "frame.pb.h"
+#include "httpd.h"
+#include "mainwindow.h"
+#include "player.h"
+#include "post_to_main_thread.h"
+#include "ref_counted_gl_sync.h"
+#include "timebase.h"
+#include "ui_mainwindow.h"
+#include "vaapi_jpeg_decoder.h"
+
+#include <QApplication>
+#include <QGLFormat>
+#include <QSurfaceFormat>
+#include <QProgressDialog>
+#include <movit/init.h>
+#include <movit/util.h>
+
+using namespace std;
+using namespace std::chrono;
+
+constexpr char frame_magic[] = "Ftbifrm0";
+constexpr size_t frame_magic_len = 8;
+
+mutex RefCountedGLsync::fence_lock;
+atomic<bool> should_quit{false};
+
+int64_t start_pts = -1;
+
+// TODO: Replace by some sort of GUI control, I guess.
+int64_t current_pts = 0;
+
+struct FrameFile {
+       FILE *fp = nullptr;
+       unsigned filename_idx;
+       size_t frames_written_so_far = 0;
+};
+std::map<int, FrameFile> open_frame_files;
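+// One FrameFile per stream; write_frame() below rotates to a new file after
+// 1000 frames, presumably to keep individual .frames files manageable and to
+// give the SQLite index and the disk-space cleanup a natural per-file granularity.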
+
+mutex frame_mu;
+vector<FrameOnDisk> frames[MAX_STREAMS];  // Under frame_mu.
+vector<string> frame_filenames;  // Under frame_mu.
+
+namespace {
+
+FrameOnDisk write_frame(int stream_idx, int64_t pts, const uint8_t *data, size_t size, DB *db)
+{
+       if (open_frame_files.count(stream_idx) == 0) {
+               char filename[256];
+               snprintf(filename, sizeof(filename), "%s/frames/cam%d-pts%09ld.frames",
+                       global_flags.working_directory.c_str(), stream_idx, pts);
+               FILE *fp = fopen(filename, "wb");
+               if (fp == nullptr) {
+                       perror(filename);
+                       exit(1);
+               }
+
+               lock_guard<mutex> lock(frame_mu);
+               unsigned filename_idx = frame_filenames.size();
+               frame_filenames.push_back(filename);
+               open_frame_files[stream_idx] = FrameFile{ fp, filename_idx, 0 };
+       }
+
+       FrameFile &file = open_frame_files[stream_idx];
+       unsigned filename_idx = file.filename_idx;
+       string filename;
+       {
+               lock_guard<mutex> lock(frame_mu);
+               filename = frame_filenames[filename_idx];
+       }
+
+       FrameHeaderProto hdr;
+       hdr.set_stream_idx(stream_idx);
+       hdr.set_pts(pts);
+       hdr.set_file_size(size);
+
+       string serialized;
+       if (!hdr.SerializeToString(&serialized)) {
+               fprintf(stderr, "Frame header serialization failed.\n");
+               exit(1);
+       }
+       uint32_t len = htonl(serialized.size());
+
+       if (fwrite(frame_magic, frame_magic_len, 1, file.fp) != 1) {
+               perror("fwrite");
+               exit(1);
+       }
+       if (fwrite(&len, sizeof(len), 1, file.fp) != 1) {
+               perror("fwrite");
+               exit(1);
+       }
+       if (fwrite(serialized.data(), serialized.size(), 1, file.fp) != 1) {
+               perror("fwrite");
+               exit(1);
+       }
+       off_t offset = ftell(file.fp);
+       if (fwrite(data, size, 1, file.fp) != 1) {
+               perror("fwrite");
+               exit(1);
+       }
+       fflush(file.fp);  // No fsync(), though. We can accept losing a few frames.
+       global_disk_space_estimator->report_write(filename, frame_magic_len + sizeof(len) + serialized.size() + size, pts);
+
+       FrameOnDisk frame;
+       frame.pts = pts;
+       frame.filename_idx = filename_idx;
+       frame.offset = offset;
+       frame.size = size;
+
+       {
+               lock_guard<mutex> lock(frame_mu);
+               assert(stream_idx < MAX_STREAMS);
+               frames[stream_idx].push_back(frame);
+       }
+
+       if (++file.frames_written_so_far >= 1000) {
+               size_t file_size = ftell(file.fp);
+
+               // Start a new file next time.
+               if (fclose(file.fp) != 0) {
+                       perror("fclose");
+                       exit(1);
+               }
+               open_frame_files.erase(stream_idx);
+
+               // Write information about all frames in the finished file to SQLite.
+               // (If we crash before getting to do this, we'll be scanning through
+               // the file on next startup, and adding it to the database then.)
+               // NOTE: Since we don't fsync(), we could in theory get broken data
+               // that still has the right size, but that seems unlikely.
+               vector<DB::FrameOnDiskAndStreamIdx> frames_this_file;
+               {
+                       lock_guard<mutex> lock(frame_mu);
+                       for (size_t stream_idx = 0; stream_idx < MAX_STREAMS; ++stream_idx) {
+                               for (const FrameOnDisk &frame : frames[stream_idx]) {
+                                       if (frame.filename_idx == filename_idx) {
+                                               frames_this_file.emplace_back(DB::FrameOnDiskAndStreamIdx{ frame, unsigned(stream_idx) });
+                                       }
+                               }
+                       }
+               }
+
+               // Skip past every '/' to get the last path component (the basename).
+               const char *basename = filename.c_str();
+               while (strchr(basename, '/') != nullptr) {
+                       basename = strchr(basename, '/') + 1;
+               }
+               db->store_frame_file(basename, file_size, frames_this_file);
+       }
+
+       return frame;
+}
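+
+// For reference, the record layout produced by the writes above (and parsed
+// back by load_frame_file() further down in this file) is:
+//
+//    8 bytes  magic "Ftbifrm0"
+//    4 bytes  length of the serialized FrameHeaderProto, big-endian (htonl)
+//    N bytes  FrameHeaderProto (stream_idx, pts, file_size)
+//    M bytes  raw JPEG data, where M == file_size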
+
+} // namespace
+
+HTTPD *global_httpd;
+
+void load_existing_frames();
+int record_thread_func();
+
+int main(int argc, char **argv)
+{
+       parse_flags(argc, argv);
+       if (optind == argc) {
+               global_flags.stream_source = "multiangle.mp4";
+               global_flags.slow_down_input = true;
+       } else if (optind + 1 == argc) {
+               global_flags.stream_source = argv[optind];
+       } else {
+               usage();
+               exit(1);
+       }
+
+       string frame_dir = global_flags.working_directory + "/frames";
+
+       struct stat st;
+       if (stat(frame_dir.c_str(), &st) == -1) {
+               fprintf(stderr, "%s does not exist, creating it.\n", frame_dir.c_str());
+               if (mkdir(frame_dir.c_str(), 0777) == -1) {
+                       perror(global_flags.working_directory.c_str());
+                       exit(1);
+               }
+       }
+
+       avformat_network_init();
+       global_httpd = new HTTPD;
+
+       QCoreApplication::setAttribute(Qt::AA_ShareOpenGLContexts, true);
+
+       QSurfaceFormat fmt;
+       fmt.setDepthBufferSize(0);
+       fmt.setStencilBufferSize(0);
+       fmt.setProfile(QSurfaceFormat::CoreProfile);
+       fmt.setMajorVersion(4);
+       fmt.setMinorVersion(5);
+
+       // Turn off vsync, since Qt otherwise gives us at most a frame rate of
+       // (display frequency) / (number of active QGLWidgets).
+       fmt.setSwapInterval(0);
+
+       QSurfaceFormat::setDefaultFormat(fmt);
+
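+       // Mirror the format into the legacy QGLFormat default as well, since
+       // JPEGFrameView is still a QGLWidget.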
+       QGLFormat::setDefaultFormat(QGLFormat::fromSurfaceFormat(fmt));
+
+       QApplication app(argc, argv);
+       global_share_widget = new QGLWidget();
+       if (!global_share_widget->isValid()) {
+               fprintf(stderr, "Failed to initialize OpenGL. Futatabi needs at least OpenGL 4.5 to function properly.\n");
+               exit(1);
+       }
+
+       // Initialize Movit.
+       {
+               QSurface *surface = create_surface();
+               QOpenGLContext *context = create_context(surface);
+               make_current(context, surface);
+               CHECK(movit::init_movit(MOVIT_SHADER_DIR, movit::MOVIT_DEBUG_OFF));
+               delete_context(context);
+               // TODO: Delete the surface, too.
+       }
+
+       load_existing_frames();
+
+       MainWindow main_window;
+       main_window.show();
+
+       global_httpd->add_endpoint("/queue_status", bind(&MainWindow::get_queue_status, &main_window), HTTPD::NO_CORS_POLICY);
+       global_httpd->start(global_flags.http_port);
+
+       init_jpeg_vaapi();
+
+       thread record_thread(record_thread_func);
+
+       int ret = app.exec();
+
+       should_quit = true;
+       record_thread.join();
+       JPEGFrameView::shutdown();
+
+       return ret;
+}
+
+void load_frame_file(const char *filename, const string &basename, unsigned filename_idx, DB *db)
+{
+       struct stat st;
+       if (stat(filename, &st) == -1) {
+               perror(filename);
+               exit(1);
+       }
+
+       vector<DB::FrameOnDiskAndStreamIdx> all_frames = db->load_frame_file(basename, st.st_size, filename_idx);
+       if (!all_frames.empty()) {
+               // We already had this cached in the database, so no need to look in the file.
+               for (const DB::FrameOnDiskAndStreamIdx &frame : all_frames) {
+                       if (frame.stream_idx >= 0 && frame.stream_idx < MAX_STREAMS) {
+                               frames[frame.stream_idx].push_back(frame.frame);
+                               start_pts = max(start_pts, frame.frame.pts);
+                       }
+               }
+               return;
+       }
+
+       FILE *fp = fopen(filename, "rb");
+       if (fp == nullptr) {
+               perror(filename);
+               exit(1);
+       }
+
+       size_t magic_offset = 0;
+       size_t skipped_bytes = 0;
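+       // Scan byte by byte for the 8-byte magic, so that startup survives
+       // truncated or partially corrupted .frames files.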
+       while (!feof(fp) && !ferror(fp)) {
+               int ch = getc(fp);
+               if (ch == -1) {
+                       break;
+               }
+               if (ch != frame_magic[magic_offset++]) {
+                       skipped_bytes += magic_offset;
+                       magic_offset = 0;
+                       continue;
+               }
+               if (magic_offset < frame_magic_len) {
+                       // Still reading the magic (hopefully).
+                       continue;
+               }
+
+               // OK, found the magic. Try to parse the frame header.
+               magic_offset = 0;
+
+               if (skipped_bytes > 0) {
+                       fprintf(stderr, "WARNING: %s: Skipped %zu garbage bytes in the middle.\n",
+                               filename, skipped_bytes);
+                       skipped_bytes = 0;
+               }
+
+               uint32_t len;
+               if (fread(&len, sizeof(len), 1, fp) != 1) {
+                       fprintf(stderr, "WARNING: %s: Short read when getting length.\n", filename);
+                       break;
+               }
+
+               string serialized;
+               serialized.resize(ntohl(len));
+               if (fread(&serialized[0], serialized.size(), 1, fp) != 1) {
+                       fprintf(stderr, "WARNING: %s: Short read when reading frame header (%zu bytes).\n", filename, serialized.size());
+                       break;
+               }
+
+               FrameHeaderProto hdr;
+               if (!hdr.ParseFromString(serialized)) {
+                       fprintf(stderr, "WARNING: %s: Corrupted frame header.\n", filename);
+                       continue;
+               }
+
+               FrameOnDisk frame;
+               frame.pts = hdr.pts();
+               frame.offset = ftell(fp);
+               frame.filename_idx = filename_idx;
+               frame.size = hdr.file_size();
+
+               if (fseek(fp, frame.offset + frame.size, SEEK_SET) == -1) {
+                       fprintf(stderr, "WARNING: %s: Could not seek past frame (probably truncated).\n", filename);
+                       continue;
+               }
+
+               if (hdr.stream_idx() >= 0 && hdr.stream_idx() < MAX_STREAMS) {
+                       frames[hdr.stream_idx()].push_back(frame);
+                       start_pts = max(start_pts, hdr.pts());
+               }
+               all_frames.emplace_back(DB::FrameOnDiskAndStreamIdx{ frame, unsigned(hdr.stream_idx()) });
+       }
+
+       if (skipped_bytes > 0) {
+               fprintf(stderr, "WARNING: %s: Skipped %zu garbage bytes at the end.\n",
+                       filename, skipped_bytes);
+       }
+
+       size_t size = ftell(fp);
+       fclose(fp);
+
+       db->store_frame_file(basename, size, all_frames);
+}
+
+void load_existing_frames()
+{
+       QProgressDialog progress("Scanning frame directory...", "Abort", 0, 1);
+       progress.setWindowTitle("Futatabi");
+       progress.setWindowModality(Qt::WindowModal);
+       progress.setMinimumDuration(1000);
+       progress.setMaximum(1);
+       progress.setValue(0);
+
+       string frame_dir = global_flags.working_directory + "/frames";
+       DIR *dir = opendir(frame_dir.c_str());
+       if (dir == nullptr) {
+               perror("frames/");
+               start_pts = 0;
+               return;
+       }
+
+       vector<string> frame_basenames;
+       for ( ;; ) {
+               errno = 0;
+               dirent *de = readdir(dir);
+               if (de == nullptr) {
+                       if (errno != 0) {
+                               perror("readdir");
+                               exit(1);
+                       }
+                       break;
+               }
+
+               if (de->d_type == DT_REG || de->d_type == DT_LNK) {
+                       string filename = frame_dir + "/" + de->d_name;
+                       frame_filenames.push_back(filename);
+                       frame_basenames.push_back(de->d_name);
+               }
+
+               if (progress.wasCanceled()) {
+                       exit(1);
+               }
+       }
+       closedir(dir);
+
+       progress.setMaximum(frame_filenames.size() + 2);
+       progress.setValue(1);
+
+       progress.setLabelText("Opening database...");
+       DB db(global_flags.working_directory + "/futatabi.db");
+
+       progress.setLabelText("Reading frame files...");
+       progress.setValue(2);
+
+       for (size_t i = 0; i < frame_filenames.size(); ++i) {
+               load_frame_file(frame_filenames[i].c_str(), frame_basenames[i], i, &db);
+               progress.setValue(i + 3);
+               if (progress.wasCanceled()) {
+                       exit(1);
+               }
+       }
+
+       if (start_pts == -1) {
+               start_pts = 0;
+       } else {
+               // Add a gap of one second from the old frames to the new ones.
+               start_pts += TIMEBASE;
+       }
+
+       for (int stream_idx = 0; stream_idx < MAX_STREAMS; ++stream_idx) {
+               sort(frames[stream_idx].begin(), frames[stream_idx].end(),
+                       [](const auto &a, const auto &b) { return a.pts < b.pts; });
+       }
+
+       db.clean_unused_frame_files(frame_basenames);
+}
+
+int record_thread_func()
+{
+       auto format_ctx = avformat_open_input_unique(global_flags.stream_source.c_str(), nullptr, nullptr);
+       if (format_ctx == nullptr) {
+               fprintf(stderr, "%s: Error opening file\n", global_flags.stream_source.c_str());
+               return 1;
+       }
+
+       int64_t last_pts = -1;
+       int64_t pts_offset;
+       DB db(global_flags.working_directory + "/futatabi.db");
+
+       while (!should_quit.load()) {
+               AVPacket pkt;
+               unique_ptr<AVPacket, decltype(av_packet_unref)*> pkt_cleanup(
+                       &pkt, av_packet_unref);
+               av_init_packet(&pkt);
+               pkt.data = nullptr;
+               pkt.size = 0;
+
+               // TODO: Make it possible to abort av_read_frame() (use an interrupt callback);
+               // right now, should_quit will be ignored if it's hung on I/O.
+               if (av_read_frame(format_ctx.get(), &pkt) != 0) {
+                       break;
+               }
+
+               // Convert pts to our own timebase.
+               AVRational stream_timebase = format_ctx->streams[pkt.stream_index]->time_base;
+               int64_t pts = av_rescale_q(pkt.pts, stream_timebase, AVRational{ 1, TIMEBASE });
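+               // (E.g., with a stream timebase of 1/90000, a packet pts of 4500
+               // = 50 ms becomes 6000, assuming TIMEBASE = 120000 from timebase.h.)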
+
+               // Translate offset into our stream.
+               if (last_pts == -1) {
+                       pts_offset = start_pts - pts;
+               }
+               pts = std::max(pts + pts_offset, start_pts);
+
+               //fprintf(stderr, "Got a frame from camera %d, pts = %ld, size = %d\n",
+               //      pkt.stream_index, pts, pkt.size);
+               FrameOnDisk frame = write_frame(pkt.stream_index, pts, pkt.data, pkt.size, &db);
+
+               post_to_main_thread([pkt, frame] {
+                       if (pkt.stream_index == 0) {
+                               global_mainwindow->ui->input1_display->setFrame(pkt.stream_index, frame);
+                       } else if (pkt.stream_index == 1) {
+                               global_mainwindow->ui->input2_display->setFrame(pkt.stream_index, frame);
+                       } else if (pkt.stream_index == 2) {
+                               global_mainwindow->ui->input3_display->setFrame(pkt.stream_index, frame);
+                       } else if (pkt.stream_index == 3) {
+                               global_mainwindow->ui->input4_display->setFrame(pkt.stream_index, frame);
+                       }
+               });
+
+               if (last_pts != -1 && global_flags.slow_down_input) {
+                       this_thread::sleep_for(microseconds((pts - last_pts) * 1000000 / TIMEBASE));
+               }
+               last_pts = pts;
+               current_pts = pts;
+       }
+
+       return 0;
+}
diff --git a/futatabi/mainwindow.cpp b/futatabi/mainwindow.cpp
new file mode 100644 (file)
index 0000000..dd3c827
--- /dev/null
@@ -0,0 +1,773 @@
+#include "mainwindow.h"
+
+#include "clip_list.h"
+#include "disk_space_estimator.h"
+#include "flags.h"
+#include "frame_on_disk.h"
+#include "player.h"
+#include "post_to_main_thread.h"
+#include "timebase.h"
+#include "ui_mainwindow.h"
+
+#include <QMouseEvent>
+#include <QShortcut>
+#include <QTimer>
+#include <QWheelEvent>
+#include <future>
+#include <sqlite3.h>
+#include <string>
+#include <vector>
+
+using namespace std;
+using namespace std::placeholders;
+
+MainWindow *global_mainwindow = nullptr;
+static ClipList *cliplist_clips;
+static PlayList *playlist_clips;
+
+extern int64_t current_pts;
+
+MainWindow::MainWindow()
+       : ui(new Ui::MainWindow),
+         db(global_flags.working_directory + "/futatabi.db")
+{
+       global_mainwindow = this;
+       ui->setupUi(this);
+
+       // The menus.
+       connect(ui->exit_action, &QAction::triggered, this, &MainWindow::exit_triggered);
+
+       global_disk_space_estimator = new DiskSpaceEstimator(bind(&MainWindow::report_disk_space, this, _1, _2));
+       disk_free_label = new QLabel(this);
+       disk_free_label->setStyleSheet("QLabel {padding-right: 5px;}");
+       ui->menuBar->setCornerWidget(disk_free_label);
+
+       StateProto state = db.get_state();
+
+       cliplist_clips = new ClipList(state.clip_list());
+       ui->clip_list->setModel(cliplist_clips);
+       connect(cliplist_clips, &ClipList::any_content_changed, this, &MainWindow::content_changed);
+
+       playlist_clips = new PlayList(state.play_list());
+       ui->playlist->setModel(playlist_clips);
+       connect(playlist_clips, &PlayList::any_content_changed, this, &MainWindow::content_changed);
+
+       // For un-highlighting when we lose focus.
+       ui->clip_list->installEventFilter(this);
+
+       // For scrubbing in the pts columns.
+       ui->clip_list->viewport()->installEventFilter(this);
+       ui->playlist->viewport()->installEventFilter(this);
+
+       QShortcut *cue_in = new QShortcut(QKeySequence(Qt::Key_A), this);
+       connect(cue_in, &QShortcut::activated, ui->cue_in_btn, &QPushButton::click);
+       connect(ui->cue_in_btn, &QPushButton::clicked, this, &MainWindow::cue_in_clicked);
+
+       QShortcut *cue_out = new QShortcut(QKeySequence(Qt::Key_S), this);
+       connect(cue_out, &QShortcut::activated, ui->cue_out_btn, &QPushButton::click);
+       connect(ui->cue_out_btn, &QPushButton::clicked, this, &MainWindow::cue_out_clicked);
+
+       QShortcut *queue = new QShortcut(QKeySequence(Qt::Key_Q), this);
+       connect(queue, &QShortcut::activated, ui->queue_btn, &QPushButton::click);
+       connect(ui->queue_btn, &QPushButton::clicked, this, &MainWindow::queue_clicked);
+
+       QShortcut *preview = new QShortcut(QKeySequence(Qt::Key_W), this);
+       connect(preview, &QShortcut::activated, ui->preview_btn, &QPushButton::click);
+       connect(ui->preview_btn, &QPushButton::clicked, this, &MainWindow::preview_clicked);
+
+       QShortcut *play = new QShortcut(QKeySequence(Qt::Key_Space), this);
+       connect(play, &QShortcut::activated, ui->play_btn, &QPushButton::click);
+       connect(ui->play_btn, &QPushButton::clicked, this, &MainWindow::play_clicked);
+
+       QShortcut *preview_1 = new QShortcut(QKeySequence(Qt::Key_1), this);
+       connect(preview_1, &QShortcut::activated, ui->preview_1_btn, &QPushButton::click);
+       connect(ui->input1_display, &JPEGFrameView::clicked, ui->preview_1_btn, &QPushButton::click);
+       connect(ui->preview_1_btn, &QPushButton::clicked, [this]{ preview_angle_clicked(0); });
+       ui->input1_display->set_overlay("1");
+
+       QShortcut *preview_2 = new QShortcut(QKeySequence(Qt::Key_2), this);
+       connect(preview_2, &QShortcut::activated, ui->preview_2_btn, &QPushButton::click);
+       connect(ui->input2_display, &JPEGFrameView::clicked, ui->preview_2_btn, &QPushButton::click);
+       connect(ui->preview_2_btn, &QPushButton::clicked, [this]{ preview_angle_clicked(1); });
+       ui->input2_display->set_overlay("2");
+
+       QShortcut *preview_3 = new QShortcut(QKeySequence(Qt::Key_3), this);
+       connect(preview_3, &QShortcut::activated, ui->preview_3_btn, &QPushButton::click);
+       connect(ui->input3_display, &JPEGFrameView::clicked, ui->preview_3_btn, &QPushButton::click);
+       connect(ui->preview_3_btn, &QPushButton::clicked, [this]{ preview_angle_clicked(2); });
+       ui->input3_display->set_overlay("3");
+
+       QShortcut *preview_4 = new QShortcut(QKeySequence(Qt::Key_4), this);
+       connect(preview_4, &QShortcut::activated, ui->preview_4_btn, &QPushButton::click);
+       connect(ui->input4_display, &JPEGFrameView::clicked, ui->preview_4_btn, &QPushButton::click);
+       connect(ui->preview_4_btn, &QPushButton::clicked, [this]{ preview_angle_clicked(3); });
+       ui->input4_display->set_overlay("4");
+
+       connect(ui->playlist_duplicate_btn, &QPushButton::clicked, this, &MainWindow::playlist_duplicate);
+
+       connect(ui->playlist_remove_btn, &QPushButton::clicked, this, &MainWindow::playlist_remove);
+       QShortcut *delete_key = new QShortcut(QKeySequence(Qt::Key_Delete), ui->playlist);
+       connect(delete_key, &QShortcut::activated, [this] {
+               if (ui->playlist->hasFocus()) {
+                       playlist_remove();
+               }
+       });
+
+       // TODO: support drag-and-drop.
+       connect(ui->playlist_move_up_btn, &QPushButton::clicked, [this]{ playlist_move(-1); });
+       connect(ui->playlist_move_down_btn, &QPushButton::clicked, [this]{ playlist_move(1); });
+
+       connect(ui->playlist->selectionModel(), &QItemSelectionModel::selectionChanged,
+               this, &MainWindow::playlist_selection_changed);
+       playlist_selection_changed();  // First time set-up.
+
+       preview_player = new Player(ui->preview_display, /*also_output_to_stream=*/false);
+       live_player = new Player(ui->live_display, /*also_output_to_stream=*/true);
+       live_player->set_done_callback([this]{
+               post_to_main_thread([this]{
+                       live_player_clip_done();
+               });
+       });
+       live_player->set_next_clip_callback(bind(&MainWindow::live_player_get_next_clip, this));
+       live_player->set_progress_callback([this](const map<size_t, double> &progress) {
+               post_to_main_thread([this, progress] {
+                       live_player_clip_progress(progress);
+               });
+       });
+       set_output_status("paused");
+
+       defer_timeout = new QTimer(this);
+       defer_timeout->setSingleShot(true);
+       connect(defer_timeout, &QTimer::timeout, this, &MainWindow::defer_timer_expired);
+
+       connect(ui->clip_list->selectionModel(), &QItemSelectionModel::currentChanged,
+               this, &MainWindow::clip_list_selection_changed);
+}
+
+void MainWindow::cue_in_clicked()
+{
+       if (!cliplist_clips->empty() && cliplist_clips->back()->pts_out < 0) {
+               cliplist_clips->mutable_back()->pts_in = current_pts;
+               return;
+       }
+       Clip clip;
+       clip.pts_in = current_pts;
+       cliplist_clips->add_clip(clip);
+       playlist_selection_changed();
+}
+
+void MainWindow::cue_out_clicked()
+{
+       if (!cliplist_clips->empty()) {
+               cliplist_clips->mutable_back()->pts_out = current_pts;
+               // TODO: select the row in the clip list?
+       }
+}
+
+void MainWindow::queue_clicked()
+{
+       if (cliplist_clips->empty()) {
+               return;
+       }
+
+       QItemSelectionModel *selected = ui->clip_list->selectionModel();
+       if (!selected->hasSelection()) {
+               Clip clip = *cliplist_clips->back();
+               clip.stream_idx = 0;
+               if (clip.pts_out != -1) {
+                       playlist_clips->add_clip(clip);
+                       playlist_selection_changed();
+               }
+               return;
+       }
+
+       QModelIndex index = selected->currentIndex();
+       Clip clip = *cliplist_clips->clip(index.row());
+       if (index.column() >= int(ClipList::Column::CAMERA_1) &&
+           index.column() <= int(ClipList::Column::CAMERA_4)) {
+               clip.stream_idx = index.column() - int(ClipList::Column::CAMERA_1);
+       } else {
+               clip.stream_idx = ui->preview_display->get_stream_idx();
+       }
+
+       if (clip.pts_out != -1) {
+               playlist_clips->add_clip(clip);
+               playlist_selection_changed();
+       }
+}
+
+void MainWindow::preview_clicked()
+{
+       if (ui->playlist->hasFocus()) {
+               // Allow previewing from the playlist iff it has focus and something is selected.
+               QItemSelectionModel *selected = ui->playlist->selectionModel();
+               if (selected->hasSelection()) {
+                       QModelIndex index = selected->currentIndex();
+                       const Clip &clip = *playlist_clips->clip(index.row());
+                       preview_player->play_clip(clip, index.row(), clip.stream_idx);
+                       return;
+               }
+       }
+
+       if (cliplist_clips->empty())
+               return;
+
+       QItemSelectionModel *selected = ui->clip_list->selectionModel();
+       if (!selected->hasSelection()) {
+               preview_player->play_clip(*cliplist_clips->back(), cliplist_clips->size() - 1, 0);
+               return;
+       }
+
+       QModelIndex index = selected->currentIndex();
+       unsigned stream_idx;
+       if (index.column() >= int(ClipList::Column::CAMERA_1) &&
+           index.column() <= int(ClipList::Column::CAMERA_4)) {
+               stream_idx = index.column() - int(ClipList::Column::CAMERA_1);
+       } else {
+               stream_idx = ui->preview_display->get_stream_idx();
+       }
+       preview_player->play_clip(*cliplist_clips->clip(index.row()), index.row(), stream_idx);
+}
+
+void MainWindow::preview_angle_clicked(unsigned stream_idx)
+{
+       preview_player->override_angle(stream_idx);
+
+       // Change the selection if we were previewing a clip from the clip list.
+       // (The only other thing we could be showing is a pts scrub, and if so,
+       // that would be selected.)
+       QItemSelectionModel *selected = ui->clip_list->selectionModel();
+       if (selected->hasSelection()) {
+               QModelIndex cell = selected->selectedIndexes()[0];
+               int column = int(ClipList::Column::CAMERA_1) + stream_idx;
+               selected->setCurrentIndex(cell.sibling(cell.row(), column), QItemSelectionModel::ClearAndSelect);
+       }
+}
+
+void MainWindow::playlist_duplicate()
+{
+       QItemSelectionModel *selected = ui->playlist->selectionModel();
+       if (!selected->hasSelection()) {
+               // Should have been grayed out, but OK.
+               return;
+       }
+       QModelIndexList rows = selected->selectedRows();
+       int first = rows.front().row(), last = rows.back().row();
+       playlist_clips->duplicate_clips(first, last);
+       playlist_selection_changed();
+}
+
+void MainWindow::playlist_remove()
+{
+       QItemSelectionModel *selected = ui->playlist->selectionModel();
+       if (!selected->hasSelection()) {
+               // Should have been grayed out, but OK.
+               return;
+       }
+       QModelIndexList rows = selected->selectedRows();
+       int first = rows.front().row(), last = rows.back().row();
+       playlist_clips->erase_clips(first, last);
+
+       // TODO: select the next one in the list?
+
+       playlist_selection_changed();
+}
+
+void MainWindow::playlist_move(int delta)
+{
+       QItemSelectionModel *selected = ui->playlist->selectionModel();
+       if (!selected->hasSelection()) {
+               // Should have been grayed out, but OK.
+               return;
+       }
+
+       QModelIndexList rows = selected->selectedRows();
+       int first = rows.front().row(), last = rows.back().row();
+       if ((delta == -1 && first == 0) ||
+           (delta == 1 && size_t(last) == playlist_clips->size() - 1)) {
+               // Should have been grayed out, but OK.
+               return;
+       }
+
+       playlist_clips->move_clips(first, last, delta);
+       playlist_selection_changed();
+}
+
+void MainWindow::defer_timer_expired()
+{
+       state_changed(deferred_state);
+}
+
+void MainWindow::content_changed()
+{
+       if (defer_timeout->isActive() &&
+           (!currently_deferring_model_changes || deferred_change_id != current_change_id)) {
+               // There's some deferred event waiting, but this event is unrelated.
+               // So it's time to short-circuit that timer and do the work it wanted to do.
+               defer_timeout->stop();
+               state_changed(deferred_state);
+       }
+       StateProto state;
+       *state.mutable_clip_list() = cliplist_clips->serialize();
+       *state.mutable_play_list() = playlist_clips->serialize();
+       if (currently_deferring_model_changes) {
+               deferred_change_id = current_change_id;
+               deferred_state = std::move(state);
+               defer_timeout->start(200);
+               return;
+       }
+       state_changed(state);
+}
+
+void MainWindow::state_changed(const StateProto &state)
+{
+       db.store_state(state);
+}
+
+void MainWindow::play_clicked()
+{
+       if (playlist_clips->empty())
+               return;
+
+       QItemSelectionModel *selected = ui->playlist->selectionModel();
+       int row;
+       if (!selected->hasSelection()) {
+               row = 0;
+       } else {
+               row = selected->selectedRows(0)[0].row();
+       }
+
+       const Clip &clip = *playlist_clips->clip(row);
+       live_player->play_clip(clip, row, clip.stream_idx);
+       playlist_clips->set_progress({{ row, 0.0f }});
+       playlist_clips->set_currently_playing(row, 0.0f);
+       playlist_selection_changed();
+}
+
+void MainWindow::live_player_clip_done()
+{
+       int row = playlist_clips->get_currently_playing();
+       if (row == -1 || row == int(playlist_clips->size()) - 1) {
+               set_output_status("paused");
+               playlist_clips->set_progress({});
+               playlist_clips->set_currently_playing(-1, 0.0f);
+       } else {
+               playlist_clips->set_progress({{ row + 1, 0.0f }});
+               playlist_clips->set_currently_playing(row + 1, 0.0f);
+       }
+}
+
+pair<Clip, size_t> MainWindow::live_player_get_next_clip()
+{
+       // playlist_clips can only be accessed on the main thread.
+       // Hopefully, we won't have to wait too long for this to come back.
+       promise<pair<Clip, size_t>> clip_promise;
+       future<pair<Clip, size_t>> clip = clip_promise.get_future();
+       post_to_main_thread([this, &clip_promise] {
+               int row = playlist_clips->get_currently_playing();
+               if (row != -1 && row < int(playlist_clips->size()) - 1) {
+                       clip_promise.set_value(make_pair(*playlist_clips->clip(row + 1), row + 1));
+               } else {
+                       clip_promise.set_value(make_pair(Clip(), 0));
+               }
+       });
+       return clip.get();
+}
+
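+// Formats a duration in seconds as m:ss.mmm; e.g., format_duration(83.5)
+// returns "1:23.500".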
+static string format_duration(double t)
+{
+       int t_ms = lrint(t * 1e3);
+
+       int ms = t_ms % 1000;
+       t_ms /= 1000;
+       int s = t_ms % 60;
+       t_ms /= 60;
+       int m = t_ms;
+
+       char buf[256];
+       snprintf(buf, sizeof(buf), "%d:%02d.%03d", m, s, ms);
+       return buf;
+}
+
+void MainWindow::live_player_clip_progress(const map<size_t, double> &progress)
+{
+       playlist_clips->set_progress(progress);
+
+       // Look at the last clip and then start counting from there.
+       assert(!progress.empty());
+       auto last_it = progress.end();
+       --last_it;
+       double remaining = 0.0;
+       double last_fade_time_seconds = 0.0;
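+       // (Sketch: two clips of effective length 10 s each, the first 40% played,
+       // with a 2 s fade between them, give 10 * 0.6 + (10 - 2) = 14 s left.)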
+       for (size_t row = last_it->first; row < playlist_clips->size(); ++row) {
+               const Clip clip = *playlist_clips->clip(row);
+               double clip_length = double(clip.pts_out - clip.pts_in) / TIMEBASE / 0.5;  // FIXME: stop hardcoding speed.
+               if (row == last_it->first) {
+                       // A clip we're playing: Subtract the part we've already played.
+                       remaining = clip_length * (1.0 - last_it->second);
+               } else {
+                       // A clip we haven't played yet: Subtract the part that's overlapping
+                       // with a previous clip (due to fade).
+                       remaining += max(clip_length - last_fade_time_seconds, 0.0);
+               }
+               last_fade_time_seconds = min(clip_length, clip.fade_time_seconds);
+       }
+       set_output_status(format_duration(remaining) + " left");
+}
+
+void MainWindow::resizeEvent(QResizeEvent *event)
+{
+       QMainWindow::resizeEvent(event);
+
+       // Ask for a relayout, but only after the event loop is done doing relayout
+       // on everything else.
+       QMetaObject::invokeMethod(this, "relayout", Qt::QueuedConnection);
+}
+
+void MainWindow::relayout()
+{
+       ui->live_display->setMinimumWidth(ui->live_display->height() * 16 / 9);
+       ui->preview_display->setMinimumWidth(ui->preview_display->height() * 16 / 9);
+}
+
+void set_pts_in(int64_t pts, int64_t current_pts, ClipProxy &clip)
+{
+       pts = std::max<int64_t>(pts, 0);
+       if (clip->pts_out == -1) {
+               pts = std::min(pts, current_pts);
+       } else {
+               pts = std::min(pts, clip->pts_out);
+       }
+       clip->pts_in = pts;
+}
+
+bool MainWindow::eventFilter(QObject *watched, QEvent *event)
+{
+       constexpr int dead_zone_pixels = 3;  // So that simple clicks are not misinterpreted as scrubs.
+       constexpr int scrub_sensitivity = 100;  // pts units per pixel.
+       constexpr int wheel_sensitivity = 100;  // pts units per angleDelta() unit (an eighth of a degree).
+       constexpr int camera_degrees_per_pixel = 15;  // 15 degrees = one click of most mice.
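+       // (Example: a 53-pixel drag minus the 3-pixel dead zone scrubs by
+       // 50 * 100 = 5000 pts units, i.e., about 42 ms at the assumed
+       // TIMEBASE of 120000 from timebase.h.)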
+
+       unsigned stream_idx = ui->preview_display->get_stream_idx();
+
+       if (watched == ui->clip_list) {
+               if (event->type() == QEvent::FocusOut) {
+                       highlight_camera_input(-1);
+               }
+               return false;
+       }
+
+       if (event->type() != QEvent::Wheel) {
+               last_mousewheel_camera_row = -1;
+       }
+
+       if (event->type() == QEvent::MouseButtonPress) {
+               QMouseEvent *mouse = (QMouseEvent *)event;
+
+               QTableView *destination;
+               ScrubType type;
+
+               if (watched == ui->clip_list->viewport()) {
+                       destination = ui->clip_list;
+                       type = SCRUBBING_CLIP_LIST;
+               } else if (watched == ui->playlist->viewport()) {
+                       destination = ui->playlist;
+                       type = SCRUBBING_PLAYLIST;
+               } else {
+                       return false;
+               }
+               int column = destination->columnAt(mouse->x());
+               int row = destination->rowAt(mouse->y());
+               if (column == -1 || row == -1)
+                       return false;
+
+               if (type == SCRUBBING_CLIP_LIST) {
+                       if (ClipList::Column(column) == ClipList::Column::IN) {
+                               scrub_pts_origin = cliplist_clips->clip(row)->pts_in;
+                               preview_single_frame(scrub_pts_origin, stream_idx, FIRST_AT_OR_AFTER);
+                       } else if (ClipList::Column(column) == ClipList::Column::OUT) {
+                               scrub_pts_origin = cliplist_clips->clip(row)->pts_out;
+                               preview_single_frame(scrub_pts_origin, stream_idx, LAST_BEFORE);
+                       } else {
+                               return false;
+                       }
+               } else {
+                       if (PlayList::Column(column) == PlayList::Column::IN) {
+                               scrub_pts_origin = playlist_clips->clip(row)->pts_in;
+                               preview_single_frame(scrub_pts_origin, stream_idx, FIRST_AT_OR_AFTER);
+                       } else if (PlayList::Column(column) == PlayList::Column::OUT) {
+                               scrub_pts_origin = playlist_clips->clip(row)->pts_out;
+                               preview_single_frame(scrub_pts_origin, stream_idx, LAST_BEFORE);
+                       } else {
+                               return false;
+                       }
+               }
+
+               scrubbing = true;
+               scrub_row = row;
+               scrub_column = column;
+               scrub_x_origin = mouse->x();
+               scrub_type = type;
+       } else if (event->type() == QEvent::MouseMove) {
+               if (scrubbing) {
+                       QMouseEvent *mouse = (QMouseEvent *)event;
+                       int offset = mouse->x() - scrub_x_origin;
+                       int adjusted_offset;
+                       if (offset >= dead_zone_pixels) {
+                               adjusted_offset = offset - dead_zone_pixels;
+                       } else if (offset < -dead_zone_pixels) {
+                               adjusted_offset = offset + dead_zone_pixels;
+                       } else {
+                               adjusted_offset = 0;
+                       }
+
+                       int64_t pts = scrub_pts_origin + adjusted_offset * scrub_sensitivity;
+                       currently_deferring_model_changes = true;
+                       if (scrub_type == SCRUBBING_CLIP_LIST) {
+                               ClipProxy clip = cliplist_clips->mutable_clip(scrub_row);
+                               if (scrub_column == int(ClipList::Column::IN)) {
+                                       current_change_id = "cliplist:in:" + to_string(scrub_row);
+                                       set_pts_in(pts, current_pts, clip);
+                                       preview_single_frame(pts, stream_idx, FIRST_AT_OR_AFTER);
+                               } else {
+                                       current_change_id = "cliplist:out" + to_string(scrub_row);
+                                       pts = std::max(pts, clip->pts_in);
+                                       pts = std::min(pts, current_pts);
+                                       clip->pts_out = pts;
+                                       preview_single_frame(pts, stream_idx, LAST_BEFORE);
+                               }
+                       } else {
+                               ClipProxy clip = playlist_clips->mutable_clip(scrub_row);
+                               if (scrub_column == int(PlayList::Column::IN)) {
+                                       current_change_id = "playlist:in:" + to_string(scrub_row);
+                                       set_pts_in(pts, current_pts, clip);
+                                       preview_single_frame(pts, clip->stream_idx, FIRST_AT_OR_AFTER);
+                               } else {
+                                       current_change_id = "playlist:out:" + to_string(scrub_row);
+                                       pts = std::max(pts, clip->pts_in);
+                                       pts = std::min(pts, current_pts);
+                                       clip->pts_out = pts;
+                                       preview_single_frame(pts, clip->stream_idx, LAST_BEFORE);
+                               }
+                       }
+                       currently_deferring_model_changes = false;
+
+                       return true;  // Don't use this mouse movement for selecting things.
+               }
+       } else if (event->type() == QEvent::Wheel) {
+               QWheelEvent *wheel = (QWheelEvent *)event;
+
+               QTableView *destination;
+               int in_column, out_column, camera_column;
+               if (watched == ui->clip_list->viewport()) {
+                       destination = ui->clip_list;
+                       in_column = int(ClipList::Column::IN);
+                       out_column = int(ClipList::Column::OUT);
+                       camera_column = -1;
+                       last_mousewheel_camera_row = -1;
+               } else if (watched == ui->playlist->viewport()) {
+                       destination = ui->playlist;
+                       in_column = int(PlayList::Column::IN);
+                       out_column = int(PlayList::Column::OUT);
+                       camera_column = int(PlayList::Column::CAMERA);
+               } else {
+                       last_mousewheel_camera_row = -1;
+                       return false;
+               }
+               int column = destination->columnAt(wheel->x());
+               int row = destination->rowAt(wheel->y());
+               if (column == -1 || row == -1) return false;
+
+               // Only adjust pts with the wheel if the given row is selected.
+               if (!destination->hasFocus() ||
+                   row != destination->selectionModel()->currentIndex().row()) {
+                       return false;
+               }
+
+               currently_deferring_model_changes = true;
+               {
+                       current_change_id = (watched == ui->clip_list->viewport()) ? "cliplist:" : "playlist:";
+                       ClipProxy clip = (watched == ui->clip_list->viewport()) ?
+                               cliplist_clips->mutable_clip(row) : playlist_clips->mutable_clip(row);
+                       if (watched == ui->playlist->viewport()) {
+                               stream_idx = clip->stream_idx;
+                       }
+
+                       if (column != camera_column) {
+                               last_mousewheel_camera_row = -1;
+                       }
+                       if (column == in_column) {
+                               current_change_id += "in:" + to_string(row);
+                               int64_t pts = clip->pts_in + wheel->angleDelta().y() * wheel_sensitivity;
+                               set_pts_in(pts, current_pts, clip);
+                               preview_single_frame(pts, stream_idx, FIRST_AT_OR_AFTER);
+                       } else if (column == out_column) {
+                               current_change_id += "out:" + to_string(row);
+                               int64_t pts = clip->pts_out + wheel->angleDelta().y() * wheel_sensitivity;
+                               pts = std::max(pts, clip->pts_in);
+                               pts = std::min(pts, current_pts);
+                               clip->pts_out = pts;
+                               preview_single_frame(pts, stream_idx, LAST_BEFORE);
+                       } else if (column == camera_column) {
+                               current_change_id += "camera:" + to_string(row);
+                               int angle_degrees = wheel->angleDelta().y();
+                               if (last_mousewheel_camera_row == row) {
+                                       angle_degrees += leftover_angle_degrees;
+                               }
+
+                               int stream_idx = clip->stream_idx + angle_degrees / camera_degrees_per_pixel;
+                               stream_idx = std::max(stream_idx, 0);
+                               stream_idx = std::min(stream_idx, NUM_CAMERAS - 1);
+                               clip->stream_idx = stream_idx;
+
+                               last_mousewheel_camera_row = row;
+                               leftover_angle_degrees = angle_degrees % camera_degrees_per_pixel;
+
+                               // Don't update the live view, that's rarely what the operator wants.
+                       }
+               }
+               currently_deferring_model_changes = false;
+               return true;  // Don't scroll.
+       } else if (event->type() == QEvent::MouseButtonRelease) {
+               scrubbing = false;
+       }
+       return false;
+}
+
+void MainWindow::preview_single_frame(int64_t pts, unsigned stream_idx, MainWindow::Rounding rounding)
+{
+       if (rounding == LAST_BEFORE) {
+               lock_guard<mutex> lock(frame_mu);
+               if (frames[stream_idx].empty())
+                       return;
+               // lower_bound() finds the first frame at or after pts; step back
+               // one to get the last frame strictly before it.
+               auto it = lower_bound(frames[stream_idx].begin(), frames[stream_idx].end(), pts,
+                       [](const FrameOnDisk &frame, int64_t pts) { return frame.pts < pts; });
+               if (it != frames[stream_idx].begin()) {
+                       pts = (it - 1)->pts;
+               }
+       } else {
+               assert(rounding == FIRST_AT_OR_AFTER);
+               lock_guard<mutex> lock(frame_mu);
+               if (frames[stream_idx].empty())
+                       return;
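+               // upper_bound() on pts - 1 yields the first frame with frame.pts >= pts.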
+               auto it = upper_bound(frames[stream_idx].begin(), frames[stream_idx].end(), pts - 1,
+                       [](int64_t pts, const FrameOnDisk &frame) { return pts < frame.pts; });
+               if (it != frames[stream_idx].end()) {
+                       pts = it->pts;
+               }
+       }
+
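+       // Build a minimal one-frame clip around the chosen pts; pts_out = pts + 1
+       // makes the preview show just that single frame.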
+       Clip fake_clip;
+       fake_clip.pts_in = pts;
+       fake_clip.pts_out = pts + 1;
+       preview_player->play_clip(fake_clip, 0, stream_idx);
+}
+
+void MainWindow::playlist_selection_changed()
+{
+       QItemSelectionModel *selected = ui->playlist->selectionModel();
+       bool any_selected = selected->hasSelection();
+       ui->playlist_duplicate_btn->setEnabled(any_selected);
+       ui->playlist_remove_btn->setEnabled(any_selected);
+       ui->playlist_move_up_btn->setEnabled(
+               any_selected && selected->selectedRows().front().row() > 0);
+       ui->playlist_move_down_btn->setEnabled(
+               any_selected && selected->selectedRows().back().row() < int(playlist_clips->size()) - 1);
+       ui->play_btn->setEnabled(!playlist_clips->empty());
+
+       if (!any_selected) {
+               set_output_status("paused");
+       } else {
+               double remaining = 0.0;
+               for (int row = selected->selectedRows().front().row(); row < int(playlist_clips->size()); ++row) {
+                       const Clip clip = *playlist_clips->clip(row);
+                       remaining += double(clip.pts_out - clip.pts_in) / TIMEBASE / 0.5;  // FIXME: stop hardcoding speed.
+               }
+               set_output_status(format_duration(remaining) + " ready");
+       }
+}
+
+void MainWindow::clip_list_selection_changed(const QModelIndex &current, const QModelIndex &)
+{
+       int camera_selected = -1;
+       if (current.column() >= int(ClipList::Column::CAMERA_1) &&
+           current.column() <= int(ClipList::Column::CAMERA_4)) {
+               camera_selected = current.column() - int(ClipList::Column::CAMERA_1);
+       }
+       highlight_camera_input(camera_selected);
+}
+
+void MainWindow::report_disk_space(off_t free_bytes, double estimated_seconds_left)
+{
+       char time_str[256];
+       if (estimated_seconds_left < 60.0) {
+               strcpy(time_str, "<font color=\"red\">Less than a minute</font>");
+       } else if (estimated_seconds_left < 1800.0) {  // Less than half an hour: Xm Ys (red).
+               int s = lrintf(estimated_seconds_left);
+               int m = s / 60;
+               s %= 60;
+               snprintf(time_str, sizeof(time_str), "<font color=\"red\">%dm %ds</font>", m, s);
+       } else if (estimated_seconds_left < 3600.0) {  // Less than an hour: Xm.
+               int m = lrintf(estimated_seconds_left / 60.0);
+               snprintf(time_str, sizeof(time_str), "%dm", m);
+       } else if (estimated_seconds_left < 36000.0) {  // Less than ten hours: Xh Ym.
+               int m = lrintf(estimated_seconds_left / 60.0);
+               int h = m / 60;
+               m %= 60;
+               snprintf(time_str, sizeof(time_str), "%dh %dm", h, m);
+       } else {  // More than ten hours: Xh.
+               int h = lrintf(estimated_seconds_left / 3600.0);
+               snprintf(time_str, sizeof(time_str), "%dh", h);
+       }
+       char buf[256];
+       snprintf(buf, sizeof(buf), "Disk free: %'.0f MB (approx. %s)", free_bytes / 1048576.0, time_str);
+
+       std::string label = buf;
+
+       post_to_main_thread([this, label] {
+               disk_free_label->setText(QString::fromStdString(label));
+               ui->menuBar->setCornerWidget(disk_free_label);  // Need to set this again to get the sizing right.
+       });
+}
+
+void MainWindow::exit_triggered()
+{
+       close();
+}
+
+void MainWindow::highlight_camera_input(int stream_idx)
+{
+       if (stream_idx == 0) {
+               ui->input1_frame->setStyleSheet("background: rgb(0,255,0)");
+       } else {
+               ui->input1_frame->setStyleSheet("");
+       }
+       if (stream_idx == 1) {
+               ui->input2_frame->setStyleSheet("background: rgb(0,255,0)");
+       } else {
+               ui->input2_frame->setStyleSheet("");
+       }
+       if (stream_idx == 2) {
+               ui->input3_frame->setStyleSheet("background: rgb(0,255,0)");
+       } else {
+               ui->input3_frame->setStyleSheet("");
+       }
+       if (stream_idx == 3) {
+               ui->input4_frame->setStyleSheet("background: rgb(0,255,0)");
+       } else {
+               ui->input4_frame->setStyleSheet("");
+       }
+}
+
+void MainWindow::set_output_status(const string &status)
+{
+       ui->live_label->setText(QString::fromStdString("Current output (" + status + ")"));
+
+       lock_guard<mutex> lock(queue_status_mu);
+       queue_status = status;
+}
+
+pair<string, string> MainWindow::get_queue_status() const
+{
+       lock_guard<mutex> lock(queue_status_mu);
+       return {queue_status, "text/plain"};
+}
diff --git a/futatabi/mainwindow.h b/futatabi/mainwindow.h
new file mode 100644 (file)
index 0000000..7f8c57a
--- /dev/null
@@ -0,0 +1,112 @@
+#ifndef MAINWINDOW_H
+#define MAINWINDOW_H
+
+#include "clip_list.h"
+#include "db.h"
+#include "state.pb.h"
+
+#include <mutex>
+#include <QLabel>
+#include <QMainWindow>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <string>
+#include <utility>
+
+namespace Ui {
+class MainWindow;
+}  // namespace Ui
+
+class Player;
+
+class MainWindow : public QMainWindow {
+       Q_OBJECT
+
+public:
+       MainWindow();
+
+       // HTTP callback. TODO: Perhaps this does not belong in MainWindow?
+       std::pair<std::string, std::string> get_queue_status() const;
+
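+       // Effectively public for now (note the commented-out private:), since
+       // main.cpp reaches into ui->inputN_display directly.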
+//private:
+       Ui::MainWindow *ui;
+
+private:
+       QLabel *disk_free_label;
+       Player *preview_player, *live_player;
+       DB db;
+
+       // State when doing a scrub operation on a timestamp with the mouse.
+       bool scrubbing = false;
+       int scrub_x_origin;  // In pixels on the viewport.
+       int64_t scrub_pts_origin;
+
+       // Which element (e.g. pts_in on clip 4) we are scrubbing.
+       enum ScrubType { SCRUBBING_CLIP_LIST, SCRUBBING_PLAYLIST } scrub_type;
+       int scrub_row;
+       int scrub_column;
+
+       // Used to keep track of small mouse wheel motions on the camera index in the playlist.
+       int last_mousewheel_camera_row = -1;
+       int leftover_angle_degrees = 0;
+
+       // Some operations, notably scrubbing and scrolling, happen in such rapid
+       // increments that we want to group them instead of saving to disk every single time.
+       // If they happen (i.e., we get a callback from the model that it's changed) while
+       // currently_deferring_model_changes is set, we fire off this timer. If it manages
+       // to elapse before some other event happens, we commit the deferred state. (If the
+       // other event is of the same kind, we just restart the timer instead of taking
+       // any action.)
+       QTimer *defer_timeout;
+       std::string deferred_change_id;
+       StateProto deferred_state;
+
+       // Before a change that should be deferred (see above), currently_deferring_model_changes
+       // must be set to true, and current_change_id must be set to a string describing
+       // the change, so that unrelated changes are not accidentally grouped.
+       bool currently_deferring_model_changes = false;
+       std::string current_change_id;
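+
+       // A sketch of the intended use (illustrative only; the member names are
+       // the real ones above, the actual edit is elided):
+       //
+       //   currently_deferring_model_changes = true;
+       //   current_change_id = "cue_in:4";  // Some description of the change.
+       //   ... edit the clip list or playlist through the model ...
+       //   currently_deferring_model_changes = false;
+       //
+       // content_changed() then restarts defer_timeout if the change id matches
+       // the deferred one, and otherwise commits the pending deferred_state first.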
+
+       mutable std::mutex queue_status_mu;
+       std::string queue_status;  // Under queue_status_mu.
+
+       void cue_in_clicked();
+       void cue_out_clicked();
+       void queue_clicked();
+       void preview_clicked();
+       void preview_angle_clicked(unsigned stream_idx);
+       void play_clicked();
+       void live_player_clip_done();
+       std::pair<Clip, size_t> live_player_get_next_clip();
+       void live_player_clip_progress(const std::map<size_t, double> &progress);
+       void set_output_status(const std::string &status);
+       void playlist_duplicate();
+       void playlist_remove();
+       void playlist_move(int delta);
+
+       void defer_timer_expired();
+       void content_changed();  // In the clip list or the playlist.
+       void state_changed(const StateProto &state);  // Called post-filtering.
+
+       enum Rounding { FIRST_AT_OR_AFTER, LAST_BEFORE };
+       void preview_single_frame(int64_t pts, unsigned stream_idx, Rounding rounding);
+
+       // Also covers when the playlist itself changes.
+       void playlist_selection_changed();
+
+       void clip_list_selection_changed(const QModelIndex &current, const QModelIndex &previous);
+
+       void resizeEvent(QResizeEvent *event) override;
+       bool eventFilter(QObject *watched, QEvent *event) override;
+
+       void report_disk_space(off_t free_bytes, double estimated_seconds_left);
+       void exit_triggered();
+
+       void highlight_camera_input(int stream_idx);
+
+private slots:
+       void relayout();
+};
+
+extern MainWindow *global_mainwindow;
+
+#endif
diff --git a/futatabi/mainwindow.ui b/futatabi/mainwindow.ui
new file mode 100644 (file)
index 0000000..dbdb622
--- /dev/null
@@ -0,0 +1,472 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>MainWindow</class>
+ <widget class="QMainWindow" name="MainWindow">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>1038</width>
+    <height>600</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>Futatabi</string>
+  </property>
+  <widget class="QWidget" name="centralwidget">
+   <layout class="QGridLayout" name="gridLayout">
+    <item row="0" column="0">
+     <widget class="QSplitter" name="splitter">
+      <property name="orientation">
+       <enum>Qt::Horizontal</enum>
+      </property>
+      <widget class="QWidget" name="horizontalLayoutWidget">
+       <layout class="QVBoxLayout" name="clip_and_play_lists">
+        <item>
+         <widget class="QTableView" name="clip_list"/>
+        </item>
+        <item>
+         <layout class="QHBoxLayout" name="clip_list_buttons">
+          <item>
+           <widget class="QPushButton" name="queue_btn">
+            <property name="text">
+             <string>Queue (&amp;Q)</string>
+            </property>
+            <property name="icon">
+             <iconset theme="list-add">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QPushButton" name="preview_btn">
+            <property name="text">
+             <string>Preview (&amp;W)</string>
+            </property>
+            <property name="icon">
+             <iconset theme="media-playback-start">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QPushButton" name="cue_in_btn">
+            <property name="text">
+             <string>Cue in (&amp;A)</string>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QPushButton" name="cue_out_btn">
+            <property name="text">
+             <string>Cue out (&amp;S)</string>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <spacer name="horizontalSpacer_2">
+            <property name="orientation">
+             <enum>Qt::Horizontal</enum>
+            </property>
+            <property name="sizeHint" stdset="0">
+             <size>
+              <width>40</width>
+              <height>20</height>
+             </size>
+            </property>
+           </spacer>
+          </item>
+         </layout>
+        </item>
+        <item>
+         <widget class="QTableView" name="playlist">
+          <property name="selectionMode">
+           <enum>QAbstractItemView::ContiguousSelection</enum>
+          </property>
+          <property name="selectionBehavior">
+           <enum>QAbstractItemView::SelectRows</enum>
+          </property>
+         </widget>
+        </item>
+        <item>
+         <layout class="QHBoxLayout" name="playlist_buttons">
+          <item>
+           <widget class="QPushButton" name="playlist_duplicate_btn">
+            <property name="text">
+             <string>Duplicate</string>
+            </property>
+            <property name="icon">
+             <iconset theme="list-add">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QPushButton" name="playlist_remove_btn">
+            <property name="sizePolicy">
+             <sizepolicy hsizetype="Fixed" vsizetype="Fixed">
+              <horstretch>0</horstretch>
+              <verstretch>0</verstretch>
+             </sizepolicy>
+            </property>
+            <property name="text">
+             <string>Remove</string>
+            </property>
+            <property name="icon">
+             <iconset theme="list-remove">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QPushButton" name="playlist_move_up_btn">
+            <property name="text">
+             <string>Move up</string>
+            </property>
+            <property name="icon">
+             <iconset theme="go-up">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <widget class="QPushButton" name="playlist_move_down_btn">
+            <property name="text">
+             <string>Move down</string>
+            </property>
+            <property name="icon">
+             <iconset theme="go-down">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+          <item>
+           <spacer name="horizontalSpacer">
+            <property name="orientation">
+             <enum>Qt::Horizontal</enum>
+            </property>
+            <property name="sizeHint" stdset="0">
+             <size>
+              <width>40</width>
+              <height>20</height>
+             </size>
+            </property>
+           </spacer>
+          </item>
+          <item>
+           <widget class="QPushButton" name="play_btn">
+            <property name="text">
+             <string>Play (space)</string>
+            </property>
+            <property name="icon">
+             <iconset theme="media-playback-start">
+              <normaloff>.</normaloff>.</iconset>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </item>
+       </layout>
+      </widget>
+      <widget class="QWidget" name="verticalLayoutWidget_4">
+       <layout class="QVBoxLayout" name="video_displays" stretch="1,2">
+        <item>
+         <layout class="QHBoxLayout" name="preview_and_live_panes">
+          <item>
+           <layout class="QVBoxLayout" name="preview_pane" stretch="1,0">
+            <item>
+             <widget class="JPEGFrameView" name="preview_display" native="true"/>
+            </item>
+            <item>
+             <layout class="QHBoxLayout" name="horizontalLayout_3">
+              <property name="spacing">
+               <number>0</number>
+              </property>
+              <item>
+               <widget class="QLabel" name="label_2">
+                <property name="text">
+                 <string>Preview output</string>
+                </property>
+                <property name="alignment">
+                 <set>Qt::AlignCenter</set>
+                </property>
+               </widget>
+              </item>
+              <item>
+               <widget class="QPushButton" name="preview_1_btn">
+                <property name="sizePolicy">
+                 <sizepolicy hsizetype="Maximum" vsizetype="Fixed">
+                  <horstretch>0</horstretch>
+                  <verstretch>0</verstretch>
+                 </sizepolicy>
+                </property>
+                <property name="maximumSize">
+                 <size>
+                  <width>20</width>
+                  <height>17</height>
+                 </size>
+                </property>
+                <property name="text">
+                 <string>1</string>
+                </property>
+               </widget>
+              </item>
+              <item>
+               <widget class="QPushButton" name="preview_2_btn">
+                <property name="sizePolicy">
+                 <sizepolicy hsizetype="Maximum" vsizetype="Fixed">
+                  <horstretch>0</horstretch>
+                  <verstretch>0</verstretch>
+                 </sizepolicy>
+                </property>
+                <property name="maximumSize">
+                 <size>
+                  <width>20</width>
+                  <height>17</height>
+                 </size>
+                </property>
+                <property name="text">
+                 <string>2</string>
+                </property>
+               </widget>
+              </item>
+              <item>
+               <widget class="QPushButton" name="preview_3_btn">
+                <property name="maximumSize">
+                 <size>
+                  <width>20</width>
+                  <height>17</height>
+                 </size>
+                </property>
+                <property name="text">
+                 <string>3</string>
+                </property>
+               </widget>
+              </item>
+              <item>
+               <widget class="QPushButton" name="preview_4_btn">
+                <property name="maximumSize">
+                 <size>
+                  <width>20</width>
+                  <height>17</height>
+                 </size>
+                </property>
+                <property name="text">
+                 <string>4</string>
+                </property>
+               </widget>
+              </item>
+             </layout>
+            </item>
+           </layout>
+          </item>
+          <item>
+           <layout class="QVBoxLayout" name="live_pane" stretch="1,0">
+            <item>
+             <widget class="JPEGFrameView" name="live_display" native="true"/>
+            </item>
+            <item>
+             <widget class="QLabel" name="live_label">
+              <property name="text">
+               <string>Current output (paused)</string>
+              </property>
+              <property name="alignment">
+               <set>Qt::AlignCenter</set>
+              </property>
+             </widget>
+            </item>
+           </layout>
+          </item>
+         </layout>
+        </item>
+        <item>
+         <layout class="QVBoxLayout" name="input_pane" stretch="1,0">
+          <item>
+           <layout class="QGridLayout" name="input_displays">
+            <property name="spacing">
+             <number>0</number>
+            </property>
+            <item row="0" column="0">
+             <widget class="QFrame" name="input1_frame">
+              <property name="frameShape">
+               <enum>QFrame::Box</enum>
+              </property>
+              <property name="frameShadow">
+               <enum>QFrame::Plain</enum>
+              </property>
+              <property name="lineWidth">
+               <number>0</number>
+              </property>
+              <layout class="QGridLayout" name="gridLayout_2">
+               <property name="leftMargin">
+                <number>3</number>
+               </property>
+               <property name="topMargin">
+                <number>3</number>
+               </property>
+               <property name="rightMargin">
+                <number>3</number>
+               </property>
+               <property name="bottomMargin">
+                <number>3</number>
+               </property>
+               <item row="0" column="0">
+                <widget class="JPEGFrameView" name="input1_display" native="true">
+                 <property name="autoFillBackground">
+                  <bool>true</bool>
+                 </property>
+                </widget>
+               </item>
+              </layout>
+             </widget>
+            </item>
+            <item row="1" column="0">
+             <widget class="QFrame" name="input3_frame">
+              <property name="frameShape">
+               <enum>QFrame::Box</enum>
+              </property>
+              <property name="frameShadow">
+               <enum>QFrame::Plain</enum>
+              </property>
+              <property name="lineWidth">
+               <number>0</number>
+              </property>
+              <layout class="QGridLayout" name="gridLayout_4">
+               <property name="leftMargin">
+                <number>3</number>
+               </property>
+               <property name="topMargin">
+                <number>3</number>
+               </property>
+               <property name="rightMargin">
+                <number>3</number>
+               </property>
+               <property name="bottomMargin">
+                <number>3</number>
+               </property>
+               <item row="0" column="0">
+                <widget class="JPEGFrameView" name="input3_display" native="true"/>
+               </item>
+              </layout>
+             </widget>
+            </item>
+            <item row="0" column="1">
+             <widget class="QFrame" name="input2_frame">
+              <property name="frameShape">
+               <enum>QFrame::Box</enum>
+              </property>
+              <property name="frameShadow">
+               <enum>QFrame::Plain</enum>
+              </property>
+              <property name="lineWidth">
+               <number>0</number>
+              </property>
+              <layout class="QGridLayout" name="gridLayout_3">
+               <property name="leftMargin">
+                <number>3</number>
+               </property>
+               <property name="topMargin">
+                <number>3</number>
+               </property>
+               <property name="rightMargin">
+                <number>3</number>
+               </property>
+               <property name="bottomMargin">
+                <number>3</number>
+               </property>
+               <item row="0" column="0">
+                <widget class="JPEGFrameView" name="input2_display" native="true">
+                 <property name="autoFillBackground">
+                  <bool>true</bool>
+                 </property>
+                </widget>
+               </item>
+              </layout>
+             </widget>
+            </item>
+            <item row="1" column="1">
+             <widget class="QFrame" name="input4_frame">
+              <property name="autoFillBackground">
+               <bool>true</bool>
+              </property>
+              <property name="frameShape">
+               <enum>QFrame::Box</enum>
+              </property>
+              <property name="frameShadow">
+               <enum>QFrame::Plain</enum>
+              </property>
+              <property name="lineWidth">
+               <number>0</number>
+              </property>
+              <layout class="QGridLayout" name="gridLayout_5">
+               <property name="leftMargin">
+                <number>3</number>
+               </property>
+               <property name="topMargin">
+                <number>3</number>
+               </property>
+               <property name="rightMargin">
+                <number>3</number>
+               </property>
+               <property name="bottomMargin">
+                <number>3</number>
+               </property>
+               <item row="0" column="0">
+                <widget class="JPEGFrameView" name="input4_display" native="true"/>
+               </item>
+              </layout>
+             </widget>
+            </item>
+           </layout>
+          </item>
+          <item>
+           <widget class="QLabel" name="label">
+            <property name="text">
+             <string>Current inputs</string>
+            </property>
+            <property name="alignment">
+             <set>Qt::AlignCenter</set>
+            </property>
+           </widget>
+          </item>
+         </layout>
+        </item>
+       </layout>
+      </widget>
+     </widget>
+    </item>
+   </layout>
+  </widget>
+  <widget class="QMenuBar" name="menuBar">
+   <property name="geometry">
+    <rect>
+     <x>0</x>
+     <y>0</y>
+     <width>1038</width>
+     <height>22</height>
+    </rect>
+   </property>
+   <widget class="QMenu" name="menuFile">
+    <property name="title">
+     <string>&amp;File</string>
+    </property>
+    <addaction name="exit_action"/>
+   </widget>
+   <addaction name="menuFile"/>
+  </widget>
+  <action name="exit_action">
+   <property name="text">
+    <string>E&amp;xit</string>
+   </property>
+  </action>
+ </widget>
+ <customwidgets>
+  <customwidget>
+   <class>JPEGFrameView</class>
+   <extends>QWidget</extends>
+   <header>jpeg_frame_view.h</header>
+  </customwidget>
+ </customwidgets>
+ <resources/>
+ <connections/>
+</ui>
diff --git a/futatabi/memcpy_interleaved.cpp b/futatabi/memcpy_interleaved.cpp
new file mode 100644 (file)
index 0000000..9634fd2
--- /dev/null
@@ -0,0 +1,136 @@
+#include <algorithm>
+#include <assert.h>
+#include <cstdint>
+#if __SSE2__
+#include <immintrin.h>
+#endif
+
+using namespace std;
+
+// TODO: Support stride.
+void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       assert(n % 2 == 0);
+       uint8_t *dptr1 = dest1;
+       uint8_t *dptr2 = dest2;
+
+       for (size_t i = 0; i < n; i += 2) {
+               *dptr1++ = *src++;
+               *dptr2++ = *src++;
+       }
+}
+
+#ifdef __SSE2__
+
+// Returns the number of bytes consumed.
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       const uint8_t *limit = src + n;
+       size_t consumed = 0;
+
+       // Align end to 32 bytes.
+       limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+       if (src >= limit) {
+               return 0;
+       }
+
+       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+       const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+       if (aligned_src != src) {
+               size_t n2 = aligned_src - src;
+               memcpy_interleaved_slow(dest1, dest2, src, n2);
+               dest1 += n2 / 2;
+               dest2 += n2 / 2;
+               if (n2 % 2) {
+                       swap(dest1, dest2);
+               }
+               src = aligned_src;
+               consumed += n2;
+       }
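+       // (For example, if src was 16-byte but not 32-byte aligned, n2 = 16;
+       // the slow path then hands 8 bytes to each of dest1/dest2, and since
+       // n2 is even, the swap above is not taken.)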
+
+       // Make the length a multiple of 64.
+       if (((limit - src) % 64) != 0) {
+               limit -= 32;
+       }
+       assert(((limit - src) % 64) == 0);
+
+#if __AVX2__
+       const __m256i *__restrict in = (const __m256i *)src;
+       __m256i *__restrict out1 = (__m256i *)dest1;
+       __m256i *__restrict out2 = (__m256i *)dest2;
+
+       __m256i shuffle_cw = _mm256_set_epi8(
+               15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+               15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+       while (in < (const __m256i *)limit) {
+               // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+               __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+               __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
+
+               data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+               data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+
+               data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+               data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+               __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+               __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+               _mm256_storeu_si256(out1, lo);
+               _mm256_storeu_si256(out2, hi);
+
+               in += 2;
+               ++out1;
+               ++out2;
+               consumed += 64;
+       }
+#else
+       const __m128i * __restrict in = (const __m128i *)src;
+       __m128i * __restrict out1 = (__m128i *)dest1;
+       __m128i * __restrict out2 = (__m128i *)dest2;
+
+       __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+       while (in < (const __m128i *)limit) {
+               __m128i data1 = _mm_load_si128(in);
+               __m128i data2 = _mm_load_si128(in + 1);
+               __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+               __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+               __m128i data1_hi = _mm_srli_epi16(data1, 8);
+               __m128i data2_hi = _mm_srli_epi16(data2, 8);
+               __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+               _mm_storeu_si128(out1, lo);
+               __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+               _mm_storeu_si128(out2, hi);
+
+               in += 2;
+               ++out1;
+               ++out2;
+               consumed += 32;
+       }
+#endif
+
+       return consumed;
+}
+
+#endif  // defined(__SSE2__)
+
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+#ifdef __SSE2__
+       size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
+       src += consumed;
+       dest1 += consumed / 2;
+       dest2 += consumed / 2;
+       if (consumed % 2) {
+               swap(dest1, dest2);
+       }
+       n -= consumed;
+
+       if (n > 0) {
+               memcpy_interleaved_slow(dest1, dest2, src, n);
+       }
+#else
+       memcpy_interleaved_slow(dest1, dest2, src, n);
+#endif
+}
diff --git a/futatabi/memcpy_interleaved.h b/futatabi/memcpy_interleaved.h
new file mode 100644 (file)
index 0000000..a7f8994
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef _MEMCPY_INTERLEAVED_H
+#define _MEMCPY_INTERLEAVED_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Copies every other byte from src to dest1 and dest2.
+// TODO: Support stride.
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n);
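+//
+// For example (illustrative): src = {A0, B0, A1, B1, A2, B2} with n = 6
+// yields dest1 = {A0, A1, A2} and dest2 = {B0, B1, B2}, i.e., a simple
+// packed-to-planar deinterleave.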
+
+#endif  // !defined(_MEMCPY_INTERLEAVED_H)
diff --git a/futatabi/meson.build b/futatabi/meson.build
new file mode 100644 (file)
index 0000000..fdcc446
--- /dev/null
@@ -0,0 +1,63 @@
+qt5 = import('qt5')
+protoc = find_program('protoc')
+
+epoxydep = dependency('epoxy')
+libavcodecdep = dependency('libavcodec')
+libavformatdep = dependency('libavformat')
+libavutildep = dependency('libavutil')
+libjpegdep = dependency('libjpeg')
+libmicrohttpddep = dependency('libmicrohttpd')
+libswscaledep = dependency('libswscale')
+movitdep = dependency('movit')
+protobufdep = dependency('protobuf')
+qt5deps = dependency('qt5', modules: ['Core', 'Gui', 'Widgets', 'OpenGLExtensions', 'OpenGL', 'PrintSupport'])
+sdl2_imagedep = dependency('SDL2_image')
+sdl2dep = dependency('sdl2')
+sqlite3dep = dependency('sqlite3')
+vadrmdep = dependency('libva-drm')
+vax11dep = dependency('libva-x11')
+x11dep = dependency('x11')
+
+# Protobuf compilation.
+gen = generator(protoc, \
+  output    : ['@BASENAME@.pb.cc', '@BASENAME@.pb.h'],
+  arguments : ['--proto_path=@CURRENT_SOURCE_DIR@', '--cpp_out=@BUILD_DIR@', '@INPUT@'])
+proto_generated = gen.process('state.proto', 'frame.proto')
+
+# Preprocess Qt as needed.
+moc_files = qt5.preprocess(
+  moc_headers: ['mainwindow.h', 'jpeg_frame_view.h', 'clip_list.h'],
+  ui_files: ['mainwindow.ui'],
+  dependencies: qt5deps)
+
+# Flow objects.
+srcs = ['flow.cpp', 'gpu_timers.cpp']
+
+# All the other files.
+srcs += ['ffmpeg_raii.cpp', 'main.cpp', 'player.cpp', 'httpd.cpp', 'mux.cpp', 'metacube2.cpp', 'video_stream.cpp', 'context.cpp', 'chroma_subsampler.cpp']
+srcs += ['vaapi_jpeg_decoder.cpp', 'memcpy_interleaved.cpp', 'db.cpp', 'disk_space_estimator.cpp', 'ycbcr_converter.cpp', 'flags.cpp']
+srcs += ['mainwindow.cpp', 'jpeg_frame_view.cpp', 'clip_list.cpp', 'frame_on_disk.cpp']
+srcs += moc_files
+srcs += proto_generated
+
+# Shaders needed at runtime.
+shaders = ['chroma_subsample.vert', 'densify.vert', 'equations.vert', 'hole_fill.vert', 'motion_search.vert', 'sor.vert', 'splat.vert', 'vs.vert']
+shaders += ['add_base_flow.frag', 'blend.frag', 'chroma_subsample.frag', 'densify.frag', 'derivatives.frag', 'diffusivity.frag',
+  'equations.frag', 'gray.frag', 'hole_blend.frag', 'hole_fill.frag', 'motion_search.frag', 'prewarp.frag', 'resize_flow.frag',
+  'sobel.frag', 'sor.frag', 'splat.frag']
+
+foreach shader : shaders
+  run_command('ln', '-s', join_paths(meson.current_source_dir(), shader), meson.current_build_dir())
+endforeach
+
+bin2h = executable('bin2h', 'bin2h.cpp')
+bin2h_gen = generator(bin2h, \
+  output    : ['@PLAINNAME@.cpp'],
+  arguments : ['@INPUT@', '@PLAINNAME@', '@OUTPUT@'])
+shader_srcs = bin2h_gen.process(shaders)
+srcs += shader_srcs
+
+executable('futatabi', srcs, dependencies: [qt5deps, libjpegdep, movitdep, libmicrohttpddep, protobufdep, sqlite3dep, vax11dep, vadrmdep, x11dep, libavformatdep, libavcodecdep, libavutildep, libswscaledep])
+executable('flow', 'flow_main.cpp', 'flow.cpp', 'gpu_timers.cpp', shader_srcs, dependencies: [epoxydep, sdl2dep, sdl2_imagedep])
+executable('eval', 'eval.cpp', 'util.cpp')
+executable('vis', 'vis.cpp', 'util.cpp')
diff --git a/futatabi/metacube2.cpp b/futatabi/metacube2.cpp
new file mode 100644 (file)
index 0000000..6b68132
--- /dev/null
@@ -0,0 +1,60 @@
+/*
+ * Implementation of Metacube2 utility functions.
+ *
+ * Note: This file is meant to compile as both C and C++, for easier inclusion
+ * in other projects.
+ */
+
+#include "metacube2.h"
+
+#include <byteswap.h>
+#include <netinet/in.h>
+
+/*
+ * https://www.ece.cmu.edu/~koopman/pubs/KoopmanCRCWebinar9May2012.pdf
+ * recommends this for messages as short as ours (see table at page 34).
+ */
+#define METACUBE2_CRC_POLYNOMIAL 0x8FDB
+
+/* Semi-random starting value to make sure all-zero won't pass. */
+#define METACUBE2_CRC_START 0x1234
+
+/* This code is based on code generated by pycrc. */
+uint16_t metacube2_compute_crc(const struct metacube2_block_header *hdr)
+{
+       static const int data_len = sizeof(hdr->size) + sizeof(hdr->flags);
+       const uint8_t *data = (uint8_t *)&hdr->size;
+       uint16_t crc = METACUBE2_CRC_START;
+       int i, j;
+
+       for (i = 0; i < data_len; ++i) {
+               uint8_t c = data[i];
+               for (j = 0; j < 8; j++) {
+                       int bit = crc & 0x8000;
+                       crc = (crc << 1) | ((c >> (7 - j)) & 0x01);
+                       if (bit) {
+                               crc ^= METACUBE2_CRC_POLYNOMIAL;
+                       }
+               }
+       }
+
+       /* Finalize. */
+       for (i = 0; i < 16; i++) {
+               int bit = crc & 0x8000;
+               crc = crc << 1;
+               if (bit) {
+                       crc ^= METACUBE2_CRC_POLYNOMIAL;
+               }
+       }
+
+       /*
+        * Invert the checksum for metadata packets, so that clients that
+        * don't understand metadata will ignore it as broken. There will
+        * probably be logging, but apart from that, it's harmless.
+        */
+       if (ntohs(hdr->flags) & METACUBE_FLAGS_METADATA) {
+               crc ^= 0xffff;
+       }
+
+       return crc;
+}
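+
+/*
+ * Usage sketch (illustrative only, not called from this file): filling in
+ * a header before sending a block of <len> data bytes:
+ *
+ *   struct metacube2_block_header hdr;
+ *   memcpy(hdr.sync, METACUBE2_SYNC, sizeof(hdr.sync));
+ *   hdr.size = htonl(len);
+ *   hdr.flags = htons(0);
+ *   hdr.csum = htons(metacube2_compute_crc(&hdr));
+ */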
diff --git a/futatabi/metacube2.h b/futatabi/metacube2.h
new file mode 100644 (file)
index 0000000..4f232c8
--- /dev/null
@@ -0,0 +1,71 @@
+#ifndef _METACUBE2_H
+#define _METACUBE2_H
+
+/*
+ * Definitions for the Metacube2 protocol, used to communicate with Cubemap.
+ *
+ * Note: This file is meant to compile as both C and C++, for easier inclusion
+ * in other projects.
+ */
+
+#include <stdint.h>
+
+#define METACUBE2_SYNC "cube!map"  /* 8 bytes long. */
+#define METACUBE_FLAGS_HEADER 0x1
+#define METACUBE_FLAGS_NOT_SUITABLE_FOR_STREAM_START 0x2
+
+/*
+ * Metadata packets; should not be counted as data, but rather
+ * parsed (or ignored if you don't understand them).
+ *
+ * Metadata packets start with a uint64_t (network byte order)
+ * that describes the type; the rest is defined by the type.
+ */
+#define METACUBE_FLAGS_METADATA 0x4
+
+struct metacube2_block_header {
+       char sync[8];    /* METACUBE2_SYNC */
+       uint32_t size;   /* Network byte order. Does not include header. */
+       uint16_t flags;  /* Network byte order. METACUBE_FLAGS_*. */
+       uint16_t csum;   /* Network byte order. CRC16 of size and flags.
+                            If METACUBE_FLAGS_METADATA is set, inverted
+                            so that older clients will ignore it as broken. */
+};
+
+uint16_t metacube2_compute_crc(const struct metacube2_block_header *hdr);
+
+/*
+ * Set by the encoder, and can be measured for latency purposes (e.g., if the
+ * network can't keep up, the latency will tend to increase).
+ */
+#define METACUBE_METADATA_TYPE_ENCODER_TIMESTAMP 0x1
+
+struct metacube2_timestamp_packet {
+       uint64_t type;  /* METACUBE_METADATA_TYPE_ENCODER_TIMESTAMP, in network byte order. */
+
+       /*
+        * Time since the UTC epoch. Basically a struct timespec.
+        * Both are in network byte order.
+        */
+       uint64_t tv_sec;
+       uint64_t tv_nsec;
+};
+
+/*
+ * Sent before a block to mark its presentation timestamp (i.e., it counts
+ * only for the next Metacube block). Used so that the reflector can know
+ * the length (in seconds) of fragments.
+ */
+#define METACUBE_METADATA_TYPE_NEXT_BLOCK_PTS 0x2
+
+struct metacube2_pts_packet {
+       uint64_t type;  /* METACUBE_METADATA_TYPE_NEXT_BLOCK_PTS, in network byte order. */
+
+       /* The timestamp of the first packet in the next block, in network byte order. */
+       int64_t pts;
+
+       /* Timebase "pts" is expressed in, as a fraction. Network byte order. */
+       uint64_t timebase_num, timebase_den;
+};
+
+#endif  /* !defined(_METACUBE2_H) */
diff --git a/futatabi/motion_search.frag b/futatabi/motion_search.frag
new file mode 100644 (file)
index 0000000..eb4f7c7
--- /dev/null
@@ -0,0 +1,184 @@
+#version 450 core
+
+/*
+  The motion search is one of the two major components of DIS. It works more or less
+  like you'd expect; there's a bunch of overlapping patches (8x8 or 12x12 pixels) in
+  a grid, and for each patch, there's a search to try to find the most similar patch
+  in the other frame.
+
+  Unlike in a typical video codec, the DIS patch search is based on gradient descent;
+  conceptually, you start with an initial guess (the value from the previous level,
+  or the zero flow for the very first level), subtract the reference (“template”)
+  patch from the candidate, look at the gradient to see in what direction there is
+  a lower difference, and then inch a bit toward that direction. (There is seemingly
+  nothing like Adam, momentum or similar, but the value being searched for is only
+  two-dimensional, so perhaps it doesn't matter as much.)
+
+  DIS does a tweak to this concept. Since the procedure as outlined above requires
+  computing the gradient of the candidate patch, it uses the reference patch as
+  candidate (thus the “inverse” name), and thus uses _its_ gradient to understand
+  in which direction to move. (This is a bit dodgy, but not _that_ dodgy; after
+  all, the two patches are supposed to be quite similar, so their surroundings and
+  thus also gradients should also be quite similar.) It's not entirely clear whether
+  this is still a win on GPU, where calculations are much cheaper, especially
+  the way we parallelize the search, but we've kept it around for now.
+
+  The inverse search is explained and derived in the supplementary material of the
+  paper, section A. Do note that there's a typo; the text under equation 9 claims
+  that the matrix H is n x n (where presumably n is the patch size), while in reality,
+  it's 2x2.
+
+  Our GPU parallelization is fairly dumb right now; we do one patch per fragment
+  (i.e., we parallelize only over patches, not within each patch), which may not
+  be optimal. In particular, in the initial level, we only have 40 patches,
+  which is on the low side for a GPU, and the memory access patterns may also not
+  be ideal.
+ */
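+
+/*
+  As a compact restatement of the above (the same math as in main() below,
+  nothing new): with template patch T, per-pixel template gradients S, and
+  warped search patch W(u) at flow offset u, each iteration computes
+
+    H  = sum over the patch of S S^T           (the 2x2 pseudo-Hessian)
+    du = sum over the patch of S * (W(u) - T)  (adjusted by the mean-normalization
+                                                term derived further down)
+    u <- u - H^-1 du
+ */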
+
+in vec3 flow_tc;
+in vec2 patch_center;
+flat in int ref_layer, search_layer;
+out vec3 out_flow;
+
+uniform sampler2DArray flow_tex, image_tex;
+uniform usampler2DArray grad_tex;  // Also contains the corresponding reference image.
+uniform vec2 inv_image_size, inv_prev_level_size;
+uniform uint patch_size;
+uniform uint num_iterations;
+
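+// The packed layout, as implied by the shifts below: bits 0-7 hold the pixel
+// value, bits 8-19 the x gradient and bits 20-31 the y gradient, with the
+// gradients quantized to [0, 4095] over the range [-0.5, 0.5].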
+vec3 unpack_gradients(uint v)
+{
+       uint vi = v & 0xffu;
+       uint xi = (v >> 8) & 0xfffu;
+       uint yi = v >> 20;
+       vec3 r = vec3(xi * (1.0f / 4095.0f) - 0.5f, yi * (1.0f / 4095.0f) - 0.5f, vi * (1.0f / 255.0f));
+       return r;
+}
+
+// Note: The third variable is the actual pixel value.
+vec3 get_gradients(vec3 tc)
+{
+       vec3 grad = unpack_gradients(texture(grad_tex, tc).x);
+
+       // Zero gradients outside the image. (We'd do this with a sampler,
+       // but we want the repeat behavior for the actual texels, in the
+       // z channel.)
+       if (any(lessThan(tc.xy, vec2(0.0f))) || any(greaterThan(tc.xy, vec2(1.0f)))) {
+               grad.xy = vec2(0.0f);
+       }
+
+       return grad;
+}
+
+void main()
+{
+       vec2 image_size = textureSize(grad_tex, 0).xy;
+
+       // Lock the patch center to an integer, so that we never get
+       // any bilinear artifacts for the gradient. (NOTE: This assumes an
+       // even patch size.) Then calculate the bottom-left texel of the patch.
+       vec2 base = (round(patch_center * image_size) - (0.5f * patch_size - 0.5f))
+               * inv_image_size;
+
+       // First, precompute the pseudo-Hessian for the template patch.
+       // This is the part where we really save by the inverse search
+       // (ie., we can compute it up-front instead of anew for each
+       // patch).
+       //
+       //  H = sum(S^T S)
+       //
+       // where S is the gradient at each point in the patch. Note that
+       // this is an outer product, so we get a (symmetric) 2x2 matrix,
+       // not a scalar.
+       mat2 H = mat2(0.0f);
+       vec2 grad_sum = vec2(0.0f);  // Used for patch normalization.
+       float template_sum = 0.0f;
+       for (uint y = 0; y < patch_size; ++y) {
+               for (uint x = 0; x < patch_size; ++x) {
+                       vec2 tc = base + uvec2(x, y) * inv_image_size;
+                       vec3 grad = get_gradients(vec3(tc, ref_layer));
+                       H[0][0] += grad.x * grad.x;
+                       H[1][1] += grad.y * grad.y;
+                       H[0][1] += grad.x * grad.y;
+
+                       template_sum += grad.z;  // The actual template pixel value.
+                       grad_sum += grad.xy;
+               }
+       }
+       H[1][0] = H[0][1];
+
+       // Make sure we don't get a singular matrix even if e.g. the picture is
+       // all black. (The paper doesn't mention this, but the reference code
+       // does it, and it seems like a reasonable hack to avoid NaNs. With such
+       // a H, we'll go out-of-bounds pretty soon, though.)
+       if (determinant(H) < 1e-6) {
+               H[0][0] += 1e-6;
+               H[1][1] += 1e-6;
+       }
+
+       mat2 H_inv = inverse(H);
+
+       // Fetch the initial guess for the flow, and convert from the previous size to this one.
+       vec2 initial_u = texture(flow_tex, flow_tc).xy * (image_size * inv_prev_level_size);
+       vec2 u = initial_u;
+       float mean_diff, first_mean_diff;
+
+       for (uint i = 0; i < num_iterations; ++i) {
+               vec2 du = vec2(0.0, 0.0);
+               float warped_sum = 0.0f;
+               vec2 u_norm = u * inv_image_size;  // In [0..1] coordinates instead of pixels.
+               for (uint y = 0; y < patch_size; ++y) {
+                       for (uint x = 0; x < patch_size; ++x) {
+                               vec2 tc = base + uvec2(x, y) * inv_image_size;
+                               vec3 grad = get_gradients(vec3(tc, ref_layer));
+                               float t = grad.z;
+                               float warped = texture(image_tex, vec3(tc + u_norm, search_layer)).x;
+                               du += grad.xy * (warped - t);
+                               warped_sum += warped;
+                       }
+               }
+
+               // Subtract the mean for patch normalization. We've done our
+               // sums without subtracting the means (because we didn't know them
+               // beforehand), i.e.:
+               //
+               //   sum(S^T * ((x + µ1) - (y + µ2))) = sum(S^T * (x - y)) + (µ1 - µ2) sum(S^T)
+               //
+               // which gives trivially
+               //
+               //   sum(S^T * (x - y)) = [what we calculated] - (µ1 - µ2) sum(S^T)
+               //
+               // so we can just subtract away the mean difference here.
+               mean_diff = (warped_sum - template_sum) * (1.0 / float(patch_size * patch_size));
+               du -= grad_sum * mean_diff;
+
+               if (i == 0) {
+                       first_mean_diff = mean_diff;
+               }
+
+               // Do the actual update.
+               u -= H_inv * du;
+       }
+
+       // Reject if we moved too far. Note that the paper says “too far” is the
+       // patch size, but the DIS code uses half of a patch size. The latter seems
+       // to give much better overall results.
+       //
+       // Also reject if the patch goes out-of-bounds (the paper does not mention this,
+       // but the code does, and it seems to be critical to avoid really bad behavior
+       // at the edges).
+       vec2 patch_center = (base * image_size - 0.5f) + patch_size * 0.5f + u;
+       if (length(u - initial_u) > (patch_size * 0.5f) ||
+           patch_center.x < -(patch_size * 0.5f) ||
+           image_size.x - patch_center.x < -(patch_size * 0.5f) ||
+           patch_center.y < -(patch_size * 0.5f) ||
+           image_size.y - patch_center.y < -(patch_size * 0.5f)) {
+               u = initial_u;
+               mean_diff = first_mean_diff;
+       }
+
+       // NOTE: The mean patch diff is computed for the second-to-last value of u,
+       // not for the final position after the last update. But hopefully, it will be very close.
+       u *= inv_image_size;
+       out_flow = vec3(u.x, u.y, mean_diff);
+}
diff --git a/futatabi/motion_search.vert b/futatabi/motion_search.vert
new file mode 100644 (file)
index 0000000..d023276
--- /dev/null
@@ -0,0 +1,47 @@
+#version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
+
+layout(location=0) in vec2 position;
+out vec3 flow_tc;
+out vec2 patch_center;
+flat out int ref_layer, search_layer;
+
+uniform sampler2DArray flow_tex;
+uniform vec2 out_flow_size;
+
+void main()
+{
+       // Patch placement: We want the outermost patches to have centers exactly in the
+       // image corners, so that the bottom-left patch has center (0,0) and the
+       // upper-right patch has center (1,1). The position we get in is _almost_ there;
+       // since the quad's corners are in (0,0) and (1,1), the fragment shader will get
+       // centers in x=0.5/w, x=1.5/w and so on (and similar for y).
+       //
+       // In other words, find some f(x) = ax + b so that
+       //
+       //   a 0.5 / w + b = 0
+       //   a (1.0 - 0.5 / w) + b = 1
+       //
+       // which gives
+       //
+       //   a = w / (w - 1)
+       //   b = -1 / (2 (w - 1))
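+       //
+       // (Sanity check with w = 4: a = 4/3 and b = -1/6, so f(0.125) = 0
+       // and f(0.875) = 1, as intended.)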
+       vec2 a = out_flow_size / (out_flow_size - 1);
+       vec2 b = -1.0 / (2 * (out_flow_size - 1.0));
+       patch_center = a * position + b;
+
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       flow_tc = vec3(position, gl_InstanceID);
+
+       gl_Layer = gl_InstanceID;
+
+       // Forward flow (0) goes from 0 to 1. Backward flow (1) goes from 1 to 0.
+       ref_layer = gl_InstanceID;
+       search_layer = 1 - gl_InstanceID;
+}
diff --git a/futatabi/mux.cpp b/futatabi/mux.cpp
new file mode 100644 (file)
index 0000000..bcbbef3
--- /dev/null
@@ -0,0 +1,269 @@
+#include "mux.h"
+
+#include <algorithm>
+#include <assert.h>
+#include <mutex>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <utility>
+#include <vector>
+
+extern "C" {
+#include <libavformat/avio.h>
+#include <libavutil/avutil.h>
+#include <libavutil/dict.h>
+#include <libavutil/mathematics.h>
+#include <libavutil/mem.h>
+#include <libavutil/pixfmt.h>
+#include <libavutil/rational.h>
+}
+
+#include "defs.h"
+#include "timebase.h"
+
+using namespace std;
+
+struct PacketBefore {
+       PacketBefore(const AVFormatContext *ctx) : ctx(ctx) {}
+
+       bool operator() (const Mux::QueuedPacket &a_qp, const Mux::QueuedPacket &b_qp) const {
+               const AVPacket *a = a_qp.pkt;
+               const AVPacket *b = b_qp.pkt;
+               int64_t a_dts = (a->dts == AV_NOPTS_VALUE ? a->pts : a->dts);
+               int64_t b_dts = (b->dts == AV_NOPTS_VALUE ? b->pts : b->dts);
+               AVRational a_timebase = ctx->streams[a->stream_index]->time_base;
+               AVRational b_timebase = ctx->streams[b->stream_index]->time_base;
+               if (av_compare_ts(a_dts, a_timebase, b_dts, b_timebase) != 0) {
+                       return av_compare_ts(a_dts, a_timebase, b_dts, b_timebase) < 0;
+               } else {
+                       return av_compare_ts(a->pts, a_timebase, b->pts, b_timebase) < 0;
+               }
+       }
+
+       const AVFormatContext * const ctx;
+};
+
+Mux::Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const string &video_extradata, const AVCodecParameters *audio_codecpar, int time_base, std::function<void(int64_t)> write_callback, WriteStrategy write_strategy, const vector<MuxMetrics *> &metrics)
+       : write_strategy(write_strategy), avctx(avctx), write_callback(write_callback), metrics(metrics)
+{
+       avstream_video = avformat_new_stream(avctx, nullptr);
+       if (avstream_video == nullptr) {
+               fprintf(stderr, "avformat_new_stream() failed\n");
+               exit(1);
+       }
+       avstream_video->time_base = AVRational{1, time_base};
+       avstream_video->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
+       if (video_codec == CODEC_H264) {
+               avstream_video->codecpar->codec_id = AV_CODEC_ID_H264;
+       } else if (video_codec == CODEC_NV12) {
+               avstream_video->codecpar->codec_id = AV_CODEC_ID_RAWVIDEO;
+               avstream_video->codecpar->codec_tag = avcodec_pix_fmt_to_codec_tag(AV_PIX_FMT_NV12);
+       } else {
+               assert(video_codec == CODEC_MJPEG);
+               avstream_video->codecpar->codec_id = AV_CODEC_ID_MJPEG;
+       }
+       avstream_video->codecpar->width = width;
+       avstream_video->codecpar->height = height;
+
+       // Colorspace details. Closely correspond to settings in EffectChain_finalize,
+       // as noted in each comment.
+       // Note that the H.264 stream also contains this information and depending on the
+       // mux, this might simply get ignored. See sps_rbsp().
+       // Note that there's no way to change this per-frame as the H.264 stream
+       // would like to be able to.
+       avstream_video->codecpar->color_primaries = AVCOL_PRI_BT709;  // RGB colorspace (inout_format.color_space).
+       avstream_video->codecpar->color_trc = AVCOL_TRC_IEC61966_2_1;  // Gamma curve (inout_format.gamma_curve).
+       // YUV colorspace (output_ycbcr_format.luma_coefficients).
+       avstream_video->codecpar->color_space = AVCOL_SPC_BT709;
+       avstream_video->codecpar->color_range = AVCOL_RANGE_MPEG;  // Full vs. limited range (output_ycbcr_format.full_range).
+       avstream_video->codecpar->chroma_location = AVCHROMA_LOC_LEFT;  // Chroma sample location. See chroma_offset_0[] in Mixer::subsample_chroma().
+       avstream_video->codecpar->field_order = AV_FIELD_PROGRESSIVE;
+
+       if (!video_extradata.empty()) {
+               avstream_video->codecpar->extradata = (uint8_t *)av_malloc(video_extradata.size());
+               avstream_video->codecpar->extradata_size = video_extradata.size();
+               memcpy(avstream_video->codecpar->extradata, video_extradata.data(), video_extradata.size());
+       }
+
+       avstream_audio = nullptr;
+#if 0
+       avstream_audio = avformat_new_stream(avctx, nullptr);
+       if (avstream_audio == nullptr) {
+               fprintf(stderr, "avformat_new_stream() failed\n");
+               exit(1);
+       }
+       avstream_audio->time_base = AVRational{1, time_base};
+       if (avcodec_parameters_copy(avstream_audio->codecpar, audio_codecpar) < 0) {
+               fprintf(stderr, "avcodec_parameters_copy() failed\n");
+               exit(1);
+       }
+#endif
+
+       AVDictionary *options = NULL;
+       vector<pair<string, string>> opts = MUX_OPTS;
+       for (pair<string, string> opt : opts) {
+               av_dict_set(&options, opt.first.c_str(), opt.second.c_str(), 0);
+       }
+       if (avformat_write_header(avctx, &options) < 0) {
+               fprintf(stderr, "avformat_write_header() failed\n");
+               exit(1);
+       }
+       for (MuxMetrics *metric : metrics) {
+               metric->metric_written_bytes += avctx->pb->pos;
+       }
+
+       // Make sure the header is written before the constructor exits.
+       avio_flush(avctx->pb);
+
+       if (write_strategy == WRITE_BACKGROUND) {
+               writer_thread = thread(&Mux::thread_func, this);
+       }
+}
+
+Mux::~Mux()
+{
+       assert(plug_count == 0);
+       if (write_strategy == WRITE_BACKGROUND) {
+               writer_thread_should_quit = true;
+               packet_queue_ready.notify_all();
+               writer_thread.join();
+       }
+       int64_t old_pos = avctx->pb->pos;
+       av_write_trailer(avctx);
+       for (MuxMetrics *metric : metrics) {
+               metric->metric_written_bytes += avctx->pb->pos - old_pos;
+       }
+
+       if (!(avctx->oformat->flags & AVFMT_NOFILE) &&
+           !(avctx->flags & AVFMT_FLAG_CUSTOM_IO)) {
+               avio_closep(&avctx->pb);
+       }
+       avformat_free_context(avctx);
+}
+
+void Mux::add_packet(const AVPacket &pkt, int64_t pts, int64_t dts, AVRational timebase, int stream_index_override)
+{
+       AVPacket pkt_copy;
+       av_init_packet(&pkt_copy);
+       if (av_packet_ref(&pkt_copy, &pkt) < 0) {
+               fprintf(stderr, "av_packet_ref() failed\n");
+               exit(1);
+       }
+       if (stream_index_override != -1) {
+               pkt_copy.stream_index = stream_index_override;
+       }
+       if (pkt_copy.stream_index == 0) {
+               pkt_copy.pts = av_rescale_q(pts, timebase, avstream_video->time_base);
+               pkt_copy.dts = av_rescale_q(dts, timebase, avstream_video->time_base);
+               pkt_copy.duration = av_rescale_q(pkt.duration, timebase, avstream_video->time_base);
+       } else if (pkt_copy.stream_index == 1) {
+               pkt_copy.pts = av_rescale_q(pts, timebase, avstream_audio->time_base);
+               pkt_copy.dts = av_rescale_q(dts, timebase, avstream_audio->time_base);
+               pkt_copy.duration = av_rescale_q(pkt.duration, timebase, avstream_audio->time_base);
+       } else {
+               assert(false);
+       }
+
+       {
+               lock_guard<mutex> lock(mu);
+               if (write_strategy == WriteStrategy::WRITE_BACKGROUND) {
+                       packet_queue.push_back(QueuedPacket{ av_packet_clone(&pkt_copy), pts });
+                       if (plug_count == 0)
+                               packet_queue_ready.notify_all();
+               } else if (plug_count > 0) {
+                       packet_queue.push_back(QueuedPacket{ av_packet_clone(&pkt_copy), pts });
+               } else {
+                       write_packet_or_die(pkt_copy, pts);
+               }
+       }
+
+       av_packet_unref(&pkt_copy);
+}
+
+void Mux::write_packet_or_die(const AVPacket &pkt, int64_t unscaled_pts)
+{
+       for (MuxMetrics *metric : metrics) {
+               if (pkt.stream_index == 0) {
+                       metric->metric_video_bytes += pkt.size;
+               } else if (pkt.stream_index == 1) {
+                       metric->metric_audio_bytes += pkt.size;
+               } else {
+                       assert(false);
+               }
+       }
+       int64_t old_pos = avctx->pb->pos;
+       if (av_interleaved_write_frame(avctx, const_cast<AVPacket *>(&pkt)) < 0) {
+               fprintf(stderr, "av_interleaved_write_frame() failed\n");
+               abort();
+       }
+       avio_flush(avctx->pb);
+       for (MuxMetrics *metric : metrics) {
+               metric->metric_written_bytes += avctx->pb->pos - old_pos;
+       }
+
+       if (pkt.stream_index == 0 && write_callback != nullptr) {
+               write_callback(unscaled_pts);
+       }
+}
+
+void Mux::plug()
+{
+       lock_guard<mutex> lock(mu);
+       ++plug_count;
+}
+
+void Mux::unplug()
+{
+       lock_guard<mutex> lock(mu);
+       if (--plug_count > 0) {
+               return;
+       }
+       assert(plug_count >= 0);
+
+       sort(packet_queue.begin(), packet_queue.end(), PacketBefore(avctx));
+
+       if (write_strategy == WRITE_BACKGROUND) {
+               packet_queue_ready.notify_all();
+       } else {
+               for (QueuedPacket &qp : packet_queue) {
+                       write_packet_or_die(*qp.pkt, qp.unscaled_pts);
+                       av_packet_free(&qp.pkt);
+               }
+               packet_queue.clear();
+       }
+}
+
+void Mux::thread_func()
+{
+       pthread_setname_np(pthread_self(), "Mux");
+
+       unique_lock<mutex> lock(mu);
+       for ( ;; ) {
+               packet_queue_ready.wait(lock, [this]() {
+                       return writer_thread_should_quit || (!packet_queue.empty() && plug_count == 0);
+               });
+               if (writer_thread_should_quit && packet_queue.empty()) {
+                       // All done.
+                       break;
+               }
+
+               assert(!packet_queue.empty() && plug_count == 0);
+               vector<QueuedPacket> packets;
+               swap(packets, packet_queue);
+
+               lock.unlock();
+               for (QueuedPacket &qp : packets) {
+                       write_packet_or_die(*qp.pkt, qp.unscaled_pts);
+                       av_packet_free(&qp.pkt);
+               }
+               lock.lock();
+       }
+}
+
+void MuxMetrics::init(const vector<pair<string, string>> &labels)
+{
+       // TODO: See if we want to reintroduce these.
+}
diff --git a/futatabi/mux.h b/futatabi/mux.h
new file mode 100644 (file)
index 0000000..53e5539
--- /dev/null
@@ -0,0 +1,112 @@
+#ifndef _MUX_H
+#define _MUX_H 1
+
+// Wrapper around an AVFormat mux.
+
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+}
+
+#include <sys/types.h>
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <string>
+#include <utility>
+#include <thread>
+#include <vector>
+
+#include "timebase.h"
+
+struct MuxMetrics {
+       // “written” will usually be equal to video + audio + mux overhead,
+       // except that there could be buffered packets that count in audio or video
+       // but not yet in written.
+       std::atomic<int64_t> metric_video_bytes{0}, metric_audio_bytes{0}, metric_written_bytes{0};
+
+       // Registers in global_metrics.
+       void init(const std::vector<std::pair<std::string, std::string>> &labels);
+
+       void reset()
+       {
+               metric_video_bytes = 0;
+               metric_audio_bytes = 0;
+               metric_written_bytes = 0;
+       }
+};
+
+class Mux {
+public:
+       enum Codec {
+               CODEC_H264,
+               CODEC_NV12,  // Uncompressed 4:2:0.
+               CODEC_MJPEG
+       };
+       enum WriteStrategy {
+               // add_packet() will write the packet immediately, unless plugged.
+               WRITE_FOREGROUND,
+
+               // All writes will happen on a separate thread, so add_packet()
+               // won't block. Use this if writing to a file and you might be
+               // holding a mutex (because blocking I/O with a mutex held is
+               // not good). Note that this will clone every packet, so it has
+               // higher overhead.
+               WRITE_BACKGROUND,
+       };
+
+       // Takes ownership of avctx. <write_callback> will be called every time
+       // a write has been made to the video stream (id 0), with the pts of
+       // the just-written frame. (write_callback can be nullptr.)
+       // Does not take ownership of <metrics>; elements in there, if any,
+       // will be added to.
+       Mux(AVFormatContext *avctx, int width, int height, Codec video_codec, const std::string &video_extradata, const AVCodecParameters *audio_codecpar, int time_base, std::function<void(int64_t)> write_callback, WriteStrategy write_strategy, const std::vector<MuxMetrics *> &metrics);
+       ~Mux();
+       void add_packet(const AVPacket &pkt, int64_t pts, int64_t dts, AVRational timebase = { 1, TIMEBASE }, int stream_index_override = -1);
+
+       // As long as the mux is plugged, it will not actually write anything to disk,
+       // just queue the packets. Once it is unplugged, the packets are reordered by pts
+       // and written. This is primarily useful if you might have two different encoders
+       // writing to the mux at the same time (because one is shutting down), so that
+       // pts might otherwise come out-of-order.
+       //
+       // You can plug and unplug multiple times; only when the plug count reaches zero,
+       // something will actually happen.
+       void plug();
+       void unplug();
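+       //
+       // A typical sequence (illustrative) when cutting over between two encoders:
+       //
+       //   mux->plug();
+       //   ... let both encoders write their final packets ...
+       //   mux->unplug();  // Queued packets are sorted by pts and written out.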
+
+private:
+       // If write_strategy == WRITE_FOREGROUND, must be called with <mu> held.
+       void write_packet_or_die(const AVPacket &pkt, int64_t unscaled_pts);
+       void thread_func();
+
+       WriteStrategy write_strategy;
+
+       std::mutex mu;
+
+       // These are only in use if write_strategy == WRITE_BACKGROUND.
+       std::atomic<bool> writer_thread_should_quit{false};
+       std::thread writer_thread;
+
+       AVFormatContext *avctx;  // Protected by <mu>, iff write_strategy == WRITE_BACKGROUND.
+       int plug_count = 0;  // Protected by <mu>.
+
+       // Protected by <mu>. If write_strategy == WRITE_FOREGROUND,
+       // this is only in use when plugging.
+       struct QueuedPacket {
+               AVPacket *pkt;
+               int64_t unscaled_pts;
+       };
+       std::vector<QueuedPacket> packet_queue;
+       std::condition_variable packet_queue_ready;
+
+       AVStream *avstream_video, *avstream_audio;
+
+       std::function<void(int64_t)> write_callback;
+       std::vector<MuxMetrics *> metrics;
+
+       friend struct PacketBefore;
+};
+
+#endif  // !defined(_MUX_H)
diff --git a/futatabi/player.cpp b/futatabi/player.cpp
new file mode 100644 (file)
index 0000000..b0e862d
--- /dev/null
@@ -0,0 +1,451 @@
+#include "player.h"
+
+#include "clip_list.h"
+#include "context.h"
+#include "defs.h"
+#include "ffmpeg_raii.h"
+#include "frame_on_disk.h"
+#include "httpd.h"
+#include "jpeg_frame_view.h"
+#include "mux.h"
+#include "timebase.h"
+#include "video_stream.h"
+
+#include <algorithm>
+#include <chrono>
+#include <condition_variable>
+#include <map>
+#include <memory>
+#include <movit/util.h>
+#include <mutex>
+#include <pthread.h>
+#include <stdio.h>
+#include <thread>
+#include <tuple>
+#include <vector>
+
+using namespace std;
+using namespace std::chrono;
+
+extern HTTPD *global_httpd;
+
+void Player::thread_func(bool also_output_to_stream)
+{
+       pthread_setname_np(pthread_self(), "Player");
+
+       QSurface *surface = create_surface();
+       QOpenGLContext *context = create_context(surface);
+       if (!make_current(context, surface)) {
+               printf("oops\n");
+               exit(1);
+       }
+
+       check_error();
+
+       // Create the VideoStream object, now that we have an OpenGL context.
+       if (also_output_to_stream) {
+               video_stream.reset(new VideoStream);
+               video_stream->start();
+       }
+
+       check_error();
+
+       constexpr double output_framerate = 60000.0 / 1001.0;  // FIXME: make configurable
+       int64_t pts = 0;
+       Clip next_clip;
+       size_t next_clip_idx = size_t(-1);
+       bool got_next_clip = false;
+       double next_clip_fade_time = -1.0;
+
+       for ( ;; ) {
+wait_for_clip:
+               bool clip_ready;
+               steady_clock::time_point before_sleep = steady_clock::now();
+
+               // Wait until we're supposed to play something.
+               {
+                       unique_lock<mutex> lock(queue_state_mu);
+                       clip_ready = new_clip_changed.wait_for(lock, milliseconds(100), [this] {
+                               return new_clip_ready && current_clip.pts_in != -1;
+                       });
+                       new_clip_ready = false;
+                       playing = true;
+               }
+
+               steady_clock::duration time_slept = steady_clock::now() - before_sleep;
+               pts += duration_cast<duration<size_t, TimebaseRatio>>(time_slept).count();
+
+               if (!clip_ready) {
+                       if (video_stream != nullptr) {
+                               video_stream->schedule_refresh_frame(steady_clock::now(), pts, /*display_func=*/nullptr, QueueSpotHolder());
+                       }
+                       continue;
+               }
+
+               Clip clip;
+               size_t clip_idx;
+               unsigned stream_idx;
+               {
+                       lock_guard<mutex> lock(mu);
+                       clip = current_clip;
+                       clip_idx = current_clip_idx;
+                       stream_idx = current_stream_idx;
+               }
+               steady_clock::time_point origin = steady_clock::now();  // TODO: Add a 100 ms buffer for ramp-up?
+               int64_t in_pts_origin = clip.pts_in;
+got_clip:
+               int64_t out_pts_origin = pts;
+
+               // Start playing exactly at a frame.
+               // TODO: Snap secondary (fade-to) clips in the same fashion
+               // so that we don't get jank here.
+               {
+                       lock_guard<mutex> lock(frame_mu);
+
+                       // Find the first frame such that frame.pts >= in_pts.
+                       auto it = lower_bound(frames[stream_idx].begin(),
+                               frames[stream_idx].end(),
+                               in_pts_origin,
+                               [](const FrameOnDisk &frame, int64_t pts) { return frame.pts < pts; });
+                       if (it != frames[stream_idx].end()) {
+                               in_pts_origin = it->pts;
+                       }
+               }
+
+               // TODO: Lock to a rational multiple of the frame rate if possible.
+               double speed = 0.5;
+
+               int64_t in_pts_start_next_clip = -1;
+               for (int frameno = 0; ; ++frameno) {  // Ends when the clip ends.
+                       double out_pts = out_pts_origin + TIMEBASE * frameno / output_framerate;
+                       steady_clock::time_point next_frame_start =
+                               origin + microseconds(lrint((out_pts - out_pts_origin) * 1e6 / TIMEBASE));
+                       int64_t in_pts = lrint(in_pts_origin + TIMEBASE * frameno * speed / output_framerate);
+                       pts = lrint(out_pts);
+
+                       if (in_pts >= clip.pts_out) {
+                               break;
+                       }
+
+                       steady_clock::duration time_behind = steady_clock::now() - next_frame_start;
+                       if (time_behind >= milliseconds(200)) {
+                               fprintf(stderr, "WARNING: %ld ms behind, dropping a frame (no matter the type).\n",
+                                       lrint(1e3 * duration<double>(time_behind).count()));
+                               continue;
+                       }
+
+                       double time_left_this_clip = double(clip.pts_out - in_pts) / TIMEBASE / speed;
+                       if (!got_next_clip && next_clip_callback != nullptr && time_left_this_clip <= clip.fade_time_seconds) {
+                               // Find the next clip so that we can begin a fade.
+                               tie(next_clip, next_clip_idx) = next_clip_callback();
+                               if (next_clip.pts_in != -1) {
+                                       got_next_clip = true;
+
+                                       double duration_next_clip = (next_clip.pts_out - next_clip.pts_in) / TIMEBASE / speed;
+                                       next_clip_fade_time = std::min(time_left_this_clip, duration_next_clip);
+                                       in_pts_start_next_clip = next_clip.pts_in + lrint(next_clip_fade_time * TIMEBASE * speed);
+                               }
+                       }
+
+                       // pts values used for progress reporting; not affected by the swapping below.
+                       int64_t in_pts_for_progress = in_pts, in_pts_secondary_for_progress = -1;
+
+                       int primary_stream_idx = stream_idx;
+                       FrameOnDisk secondary_frame;
+                       int secondary_stream_idx = -1;
+                       float fade_alpha = 0.0f;
+                       if (got_next_clip && time_left_this_clip <= next_clip_fade_time) {
+                               secondary_stream_idx = next_clip.stream_idx;
+                               int64_t in_pts_secondary = lrint(next_clip.pts_in + (next_clip_fade_time - time_left_this_clip) * TIMEBASE * speed);
+                               in_pts_secondary_for_progress = in_pts_secondary;
+                               fade_alpha = 1.0f - time_left_this_clip / next_clip_fade_time;
+
+                               // If more than half-way through the fade, interpolate the next clip
+                               // instead of the current one, since it's more visible.
+                               if (fade_alpha >= 0.5f) {
+                                       swap(primary_stream_idx, secondary_stream_idx);
+                                       swap(in_pts, in_pts_secondary);
+                                       fade_alpha = 1.0f - fade_alpha;
+                               }
+
+                               FrameOnDisk frame_lower, frame_upper;
+                               bool ok = find_surrounding_frames(in_pts_secondary, secondary_stream_idx, &frame_lower, &frame_upper);
+                               if (ok) {
+                                       secondary_frame = frame_lower;
+                               }
+                       }
+
+                       if (progress_callback != nullptr) {
+                               // NOTE: None of this will take into account any snapping done below.
+                               double played_this_clip = double(in_pts_for_progress - clip.pts_in) / TIMEBASE / speed;
+                               double total_length = double(clip.pts_out - clip.pts_in) / TIMEBASE / speed;
+                               map<size_t, double> progress{{ clip_idx, played_this_clip / total_length }};
+
+                               if (got_next_clip && time_left_this_clip <= next_clip_fade_time) {
+                                       double played_next_clip = double(in_pts_secondary_for_progress - next_clip.pts_in) / TIMEBASE / speed;
+                                       double total_next_length = double(next_clip.pts_out - next_clip.pts_in) / TIMEBASE / speed;
+                                       progress[next_clip_idx] = played_next_clip / total_next_length;
+                               }
+                               progress_callback(progress);
+                       }
+
+                       FrameOnDisk frame_lower, frame_upper;
+                       bool ok = find_surrounding_frames(in_pts, primary_stream_idx, &frame_lower, &frame_upper);
+                       if (!ok) {
+                               break;
+                       }
+
+                       {
+                               unique_lock<mutex> lock(queue_state_mu);
+                               if (video_stream == nullptr) {
+                                       // No queue, just wait until the right time and then show the frame.
+                                       new_clip_changed.wait_until(lock, next_frame_start, [this]{
+                                               return new_clip_ready || override_stream_idx != -1;
+                                       });
+                               } else {
+                                       // If the queue is full (which is really the state we'd like to be in),
+                                       // wait until there's room for one more frame (ie., one was output from
+                                       // VideoStream), or until there's a new clip we're supposed to play.
+                                       //
+                                       // In this case, we don't sleep until next_frame_start; the displaying is
+                                       // done by the queue.
+                                       new_clip_changed.wait(lock, [this]{
+                                               if (num_queued_frames < max_queued_frames) {
+                                                       return true;
+                                               }
+                                               return new_clip_ready || override_stream_idx != -1;
+                                       });
+                               }
+                               if (new_clip_ready) {
+                                       if (video_stream != nullptr) {
+                                               lock.unlock();  // Urg.
+                                               video_stream->clear_queue();
+                                               lock.lock();
+                                       }
+                                       got_next_clip = false;
+                                       goto wait_for_clip;
+                               }
+                               if (override_stream_idx != -1) {
+                                       stream_idx = override_stream_idx;
+                                       override_stream_idx = -1;
+                                       continue;
+                               }
+                       }
+
+                       if (frame_lower.pts == frame_upper.pts) {
+                               auto display_func = [this, primary_stream_idx, frame_lower, secondary_frame, fade_alpha]{
+                                       destination->setFrame(primary_stream_idx, frame_lower, secondary_frame, fade_alpha);
+                               };
+                               if (video_stream == nullptr) {
+                                       display_func();
+                               } else {
+                                       if (secondary_stream_idx == -1) {
+                                               video_stream->schedule_original_frame(
+                                                       next_frame_start, pts, display_func, QueueSpotHolder(this),
+                                                       frame_lower);
+                                       } else {
+                                               assert(secondary_frame.pts != -1);
+                                               video_stream->schedule_faded_frame(next_frame_start, pts, display_func,
+                                                       QueueSpotHolder(this), frame_lower,
+                                                       secondary_frame, fade_alpha);
+                                       }
+                               }
+                               continue;
+                       }
+
+                       // Snap to input frame: If we can do so with less than 1% jitter
+                       // (ie., move less than 1% of an _output_ frame), do so.
+                       // TODO: Snap secondary (fade-to) clips in the same fashion.
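+                       // (At ~60 fps output, 1% of a frame period is about 0.17 ms.)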
+                       bool snapped = false;
+                       for (int64_t snap_pts : { frame_lower.pts, frame_upper.pts }) {
+                               double snap_pts_as_frameno = (snap_pts - in_pts_origin) * output_framerate / TIMEBASE / speed;
+                               if (fabs(snap_pts_as_frameno - frameno) < 0.01) {
+                                       FrameOnDisk snap_frame = frame_lower;
+                                       snap_frame.pts = snap_pts;
+                                       auto display_func = [this, primary_stream_idx, snap_frame, secondary_frame, fade_alpha]{
+                                               destination->setFrame(primary_stream_idx, snap_frame, secondary_frame, fade_alpha);
+                                       };
+                                       if (video_stream == nullptr) {
+                                               display_func();
+                                       } else {
+                                               if (secondary_stream_idx == -1) {
+                                                       video_stream->schedule_original_frame(
+                                                               next_frame_start, pts, display_func,
+                                                               QueueSpotHolder(this), snap_frame);
+                                               } else {
+                                                       assert(secondary_frame.pts != -1);
+                                                       video_stream->schedule_faded_frame(
+                                                               next_frame_start, pts, display_func, QueueSpotHolder(this),
+                                                               snap_frame, secondary_frame, fade_alpha);
+                                               }
+                                       }
+                                       in_pts_origin += snap_pts - in_pts;
+                                       snapped = true;
+                                       break;
+                               }
+                       }
+                       if (snapped) {
+                               continue;
+                       }
+
+                       if (time_behind >= milliseconds(100)) {
+                               fprintf(stderr, "WARNING: %ld ms behind, dropping an interpolated frame.\n",
+                                       lrint(1e3 * duration<double>(time_behind).count()));
+                               continue;
+                       }
+
+                       double alpha = double(in_pts - frame_lower.pts) / (frame_upper.pts - frame_lower.pts);
+
+                       if (video_stream == nullptr) {
+                               // Previews don't do any interpolation.
+                               assert(secondary_stream_idx == -1);
+                               destination->setFrame(primary_stream_idx, frame_lower);
+                       } else {
+                               auto display_func = [this](shared_ptr<Frame> frame) {
+                                       destination->setFrame(frame);
+                               };
+                               video_stream->schedule_interpolated_frame(
+                                       next_frame_start, pts, display_func, QueueSpotHolder(this),
+                                       frame_lower, frame_upper, alpha,
+                                       secondary_frame, fade_alpha);
+                       }
+               }
+
+               // The clip ended.
+
+               // Last-ditch effort to get the next clip (if e.g. the fade time was zero seconds).
+               if (!got_next_clip && next_clip_callback != nullptr) {
+                       tie(next_clip, next_clip_idx) = next_clip_callback();
+                       if (next_clip.pts_in != -1) {
+                               got_next_clip = true;
+                               in_pts_start_next_clip = next_clip.pts_in;
+                       }
+               }
+
+               // Switch to next clip if we got it.
+               if (got_next_clip) {
+                       clip = next_clip;
+                       clip_idx = next_clip_idx;
+                       stream_idx = next_clip.stream_idx;  // Override is used for previews only, and next_clip is used for live only.
+                       if (done_callback != nullptr) {
+                               done_callback();
+                       }
+                       got_next_clip = false;
+
+                       // Start the next clip from the point where the fade went out.
+                       origin = steady_clock::now();
+                       in_pts_origin = in_pts_start_next_clip;
+                       goto got_clip;
+               }
+
+               {
+                       unique_lock<mutex> lock(queue_state_mu);
+                       playing = false;
+               }
+               if (done_callback != nullptr) {
+                       done_callback();
+               }
+       }
+}
+
+// Find the frame immediately before and after this point.
+bool Player::find_surrounding_frames(int64_t pts, int stream_idx, FrameOnDisk *frame_lower, FrameOnDisk *frame_upper)
+{
+       lock_guard<mutex> lock(frame_mu);
+
+       // Find the first frame such that frame.pts >= pts.
+       auto it = lower_bound(frames[stream_idx].begin(),
+               frames[stream_idx].end(),
+               pts,
+               [](const FrameOnDisk &frame, int64_t pts) { return frame.pts < pts; });
+       if (it == frames[stream_idx].end()) {
+               return false;
+       }
+       *frame_upper = *it;
+
+       // Find the last frame such that frame.pts <= pts (if any).
+       if (it == frames[stream_idx].begin()) {
+               *frame_lower = *it;
+       } else {
+               *frame_lower = *(it - 1);
+       }
+       assert(pts >= frame_lower->pts);
+       assert(pts <= frame_upper->pts);
+       return true;
+}
+
+Player::Player(JPEGFrameView *destination, bool also_output_to_stream)
+       : destination(destination)
+{
+       thread(&Player::thread_func, this, also_output_to_stream).detach();
+}
+
+void Player::play_clip(const Clip &clip, size_t clip_idx, unsigned stream_idx)
+{
+       {
+               lock_guard<mutex> lock(mu);
+               current_clip = clip;
+               current_stream_idx = stream_idx;
+               current_clip_idx = clip_idx;
+       }
+
+       {
+               lock_guard<mutex> lock(queue_state_mu);
+               new_clip_ready = true;
+               override_stream_idx = -1;
+               new_clip_changed.notify_all();
+       }
+}
+
+void Player::override_angle(unsigned stream_idx)
+{
+       // Corner case: If a new clip is waiting to be played, change its stream and then we're done.
+       {
+               unique_lock<mutex> lock(queue_state_mu);
+               if (new_clip_ready) {
+                       lock_guard<mutex> lock2(mu);
+                       current_stream_idx = stream_idx;
+                       return;
+               }
+       }
+
+       // If we are playing a clip, set override_stream_idx, and the player thread will
+       // pick it up and change its internal index.
+       {
+               unique_lock<mutex> lock(queue_state_mu);
+               if (playing) {
+                       override_stream_idx = stream_idx;
+                       new_clip_changed.notify_all();
+               }
+       }
+
+       // OK, so we're standing still, presumably at the end of a clip.
+       // Look at the current pts_out (if it exists), and show the closest
+       // thing we've got.
+       int64_t pts_out;
+       {
+               lock_guard<mutex> lock(mu);
+               if (current_clip.pts_out < 0) {
+                       return;
+               }
+               pts_out = current_clip.pts_out;
+       }
+
+       lock_guard<mutex> lock(frame_mu);
+       auto it = upper_bound(frames[stream_idx].begin(), frames[stream_idx].end(), pts_out,
+               [](int64_t pts, const FrameOnDisk &frame) { return pts < frame.pts; });
+       if (it == frames[stream_idx].end()) {
+               return;
+       }
+       destination->setFrame(stream_idx, *it);
+}
+
+void Player::take_queue_spot()
+{
+       unique_lock<mutex> lock(queue_state_mu);
+       ++num_queued_frames;
+}
+
+void Player::release_queue_spot()
+{
+       unique_lock<mutex> lock(queue_state_mu);
+       assert(num_queued_frames > 0);
+       --num_queued_frames;
+       new_clip_changed.notify_all();
+}
diff --git a/futatabi/player.h b/futatabi/player.h
new file mode 100644 (file)
index 0000000..c7f8e07
--- /dev/null
@@ -0,0 +1,82 @@
+#ifndef _PLAYER_H
+#define _PLAYER_H 1
+
+#include "clip_list.h"
+#include "frame_on_disk.h"
+#include "queue_spot_holder.h"
+
+extern "C" {
+#include <libavformat/avio.h>
+}
+
+#include <condition_variable>
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <utility>
+
+class JPEGFrameView;
+class VideoStream;
+class QSurface;
+class QSurfaceFormat;
+
+class Player : public QueueInterface {
+public:
+       Player(JPEGFrameView *destination, bool also_output_to_stream);
+
+       void play_clip(const Clip &clip, size_t clip_idx, unsigned stream_idx);
+       void override_angle(unsigned stream_idx);  // For the current clip only.
+
+       // Not thread-safe to set concurrently with playing.
+       // Will be called back from the player thread.
+       using done_callback_func = std::function<void()>;
+       void set_done_callback(done_callback_func cb) { done_callback = cb; }
+
+       // Not thread-safe to set concurrently with playing.
+       // Will be called back from the player thread.
+       // The second element of the returned pair is the clip's position in the play list.
+       using next_clip_callback_func = std::function<std::pair<Clip, size_t>()>;
+       void set_next_clip_callback(next_clip_callback_func cb) { next_clip_callback = cb; }
+
+       // Not thread-safe to set concurrently with playing.
+       // Will be called back from the player thread.
+       using progress_callback_func = std::function<void(const std::map<size_t, double> &progress)>;
+       void set_progress_callback(progress_callback_func cb) { progress_callback = cb; }
+
+       // QueueInterface.
+       void take_queue_spot() override;
+       void release_queue_spot() override;
+
+private:
+       void thread_func(bool also_output_to_stream);
+       void open_output_stream();
+       static int write_packet2_thunk(void *opaque, uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
+       int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
+
+       // Find the frame immediately before and after this point.
+       // Returns false if pts is after the last frame.
+       bool find_surrounding_frames(int64_t pts, int stream_idx, FrameOnDisk *frame_lower, FrameOnDisk *frame_upper);
+
+       JPEGFrameView *destination;
+       done_callback_func done_callback;
+       next_clip_callback_func next_clip_callback;
+       progress_callback_func progress_callback;
+
+       std::mutex mu;
+       Clip current_clip;  // Under mu. Can have pts_in = -1 for no clip.
+       size_t current_clip_idx;  // Under mu.
+       unsigned current_stream_idx;  // Under mu.
+
+       std::mutex queue_state_mu;
+       std::condition_variable new_clip_changed;
+       bool new_clip_ready = false;  // Under queue_state_mu.
+       bool playing = false;  // Under queue_state_mu.
+       int override_stream_idx = -1;  // Under queue_state_mu.
+
+       std::unique_ptr<VideoStream> video_stream;  // Can be nullptr.
+
+       // Under queue_state_mu. Part of Player instead of VideoStream, so that
+       // we own its lock and can sleep on it.
+       size_t num_queued_frames = 0;
+       static constexpr size_t max_queued_frames = 10;
+};
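+
+// Hedged usage sketch (UI-side wiring; preview_view and update_playlist_ui
+// are made-up names):
+//
+//   Player *player = new Player(preview_view, /*also_output_to_stream=*/false);
+//   player->set_done_callback([]{ post_to_main_thread([]{ update_playlist_ui(); }); });
+//   player->play_clip(clip, /*clip_idx=*/0, clip.stream_idx);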
+
+#endif  // !defined(_PLAYER_H)
diff --git a/futatabi/post_to_main_thread.h b/futatabi/post_to_main_thread.h
new file mode 100644 (file)
index 0000000..0462c7b
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef _POST_TO_MAIN_THREAD_H
+#define _POST_TO_MAIN_THREAD_H 1
+
+#include <QApplication>
+#include <QObject>
+#include <memory>
+
+// http://stackoverflow.com/questions/21646467/how-to-execute-a-functor-in-a-given-thread-in-qt-gcd-style
+template<typename F>
+static inline void post_to_main_thread(F &&fun)
+{
+       QObject signalSource;
+       QObject::connect(&signalSource, &QObject::destroyed, qApp, std::move(fun));
+}
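+
+// Hedged usage sketch (label is a made-up widget pointer):
+//
+//   post_to_main_thread([label] {
+//           label->setText("Done");  // Runs on the Qt main (GUI) thread.
+//   });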
+
+#endif  // !defined(_POST_TO_MAIN_THREAD_H)
diff --git a/futatabi/prewarp.frag b/futatabi/prewarp.frag
new file mode 100644 (file)
index 0000000..baf24d1
--- /dev/null
@@ -0,0 +1,22 @@
+#version 450 core
+
+// Warps I_1 according to the flow, then computes the mean and difference to I_0.
+
+in vec3 tc;
+out float I, I_t;
+out vec2 normalized_flow;
+
+uniform sampler2DArray image_tex, flow_tex;
+
+void main()
+{
+       vec3 flow = texture(flow_tex, tc).xyz;
+       flow.xy /= flow.z;  // Normalize the sum coming out of the densification.
+
+       float I_0 = texture(image_tex, tc).x;
+
+       // NOTE: This is effectively a reverse warp, since texture() is a gather
+       // operation while the flow is conceptually a scatter.
+       float I_w = texture(image_tex, vec3(tc.xy + flow.xy, 1.0f - tc.z)).x;
+
+       I = 0.5f * (I_0 + I_w);
+       I_t = I_w - I_0;
+       normalized_flow = flow.xy * textureSize(image_tex, 0).xy;
+}
diff --git a/futatabi/queue_spot_holder.h b/futatabi/queue_spot_holder.h
new file mode 100644 (file)
index 0000000..b9dee06
--- /dev/null
@@ -0,0 +1,46 @@
+#ifndef _QUEUE_SPOT_HOLDER
+#define _QUEUE_SPOT_HOLDER 1
+
+// A RAII class to hold a shared resource, in our case an (unordered!) spot in a queue,
+// for as long as a frame is under computation.
+
+class QueueInterface {
+public:
+       virtual ~QueueInterface() {}
+       virtual void take_queue_spot() = 0;
+       virtual void release_queue_spot() = 0;
+};
+
+class QueueSpotHolder {
+public:
+       QueueSpotHolder() : queue(nullptr) {}
+
+       explicit QueueSpotHolder(QueueInterface *queue) : queue(queue) {
+               queue->take_queue_spot();
+       }
+
+       QueueSpotHolder(QueueSpotHolder &&other) : queue(other.queue) {
+               other.queue = nullptr;
+       }
+
+       QueueSpotHolder &operator=(QueueSpotHolder &&other) {
+               if (queue != nullptr) {
+                       queue->release_queue_spot();  // Don't leak the spot we already hold.
+               }
+               queue = other.queue;
+               other.queue = nullptr;
+               return *this;
+       }
+
+       ~QueueSpotHolder() {
+               if (queue != nullptr) {
+                       queue->release_queue_spot();
+               }
+       }
+
+       // Movable only.
+       QueueSpotHolder(QueueSpotHolder &) = delete;
+       QueueSpotHolder &operator=(QueueSpotHolder &) = delete;
+
+private:
+       QueueInterface *queue;
+};
+
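+// Hedged usage sketch: take a spot when scheduling a frame, and release it
+// automatically when the holder dies (e.g. together with the lambda that
+// captured it); schedule() is a made-up stand-in:
+//
+//   QueueSpotHolder spot(player);  // Calls player->take_queue_spot().
+//   schedule([spot = std::move(spot)] {
+//           // ... produce the frame ...
+//   });  // player->release_queue_spot() runs when the lambda is destroyed.
+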
+#endif  // !defined(_QUEUE_SPOT_HOLDER)
diff --git a/futatabi/ref_counted_gl_sync.h b/futatabi/ref_counted_gl_sync.h
new file mode 100644 (file)
index 0000000..5604e9e
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef _REF_COUNTED_GL_SYNC_H
+#define _REF_COUNTED_GL_SYNC_H 1
+
+// A wrapper around GLsync (OpenGL fences) that is automatically refcounted.
+// Useful since we sometimes want to use the same fence two entirely different
+// places. (We could set two fences at the same time, but they are not an
+// unlimited hardware resource, so it would be a bit wasteful.)
+
+#include <epoxy/gl.h>
+#include <memory>
+#include <mutex>
+
+typedef std::shared_ptr<__GLsync> RefCountedGLsyncBase;
+
+class RefCountedGLsync : public RefCountedGLsyncBase {
+public:
+       RefCountedGLsync() {}
+
+       RefCountedGLsync(GLenum condition, GLbitfield flags)
+               : RefCountedGLsyncBase(locked_glFenceSync(condition, flags), locked_glDeleteSync) {}
+
+private:
+       // These are to work around apitrace bug #446.
+       static GLsync locked_glFenceSync(GLenum condition, GLbitfield flags)
+       {
+               std::lock_guard<std::mutex> lock(fence_lock);
+               return glFenceSync(condition, flags);
+       }
+
+       static void locked_glDeleteSync(GLsync sync)
+       {
+               std::lock_guard<std::mutex> lock(fence_lock);
+               glDeleteSync(sync);
+       }
+
+       static std::mutex fence_lock;
+};
+
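+// Hedged usage sketch:
+//
+//   RefCountedGLsync fence(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
+//   glFlush();  // Make sure the fence command reaches the GPU.
+//   RefCountedGLsync copy = fence;  // Shares the GLsync; deleted with the last copy.
+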
+#endif  // !defined(_REF_COUNTED_GL_SYNC_H)
diff --git a/futatabi/resize_flow.frag b/futatabi/resize_flow.frag
new file mode 100644 (file)
index 0000000..4efc975
--- /dev/null
@@ -0,0 +1,12 @@
+#version 450 core
+
+in vec3 tc;
+out vec2 flow;
+
+uniform sampler2DArray flow_tex;
+uniform vec2 scale_factor;
+
+void main()
+{
+       flow = texture(flow_tex, tc).xy * scale_factor;
+}
diff --git a/futatabi/sobel.frag b/futatabi/sobel.frag
new file mode 100644 (file)
index 0000000..8c5c6ee
--- /dev/null
@@ -0,0 +1,64 @@
+#version 450 core
+
+in vec3 tc;
+out uint packed_gradients;
+
+uniform sampler2DArray tex;
+
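+// Pack (x, y, v) into one 32-bit uint: v (the plain pixel value) goes in
+// bits 0-7, x in bits 8-19 and y in bits 20-31, the latter two as 12-bit
+// fixed point biased by +0.5 so that [-0.5, 0.5] maps onto [0, 4095].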
+uint pack_gradients(float x, float y, float v)
+{
+       x = clamp(x, -0.5f, 0.5f);
+       y = clamp(y, -0.5f, 0.5f);
+
+       uint vi = uint(round(v * 255.0f));
+       uint xi = uint(round((x + 0.5f) * 4095.0f));
+       uint yi = uint(round((y + 0.5f) * 4095.0f));
+       return vi | (xi << 8) | (yi << 20);
+}
+
+void main()
+{
+       // There are two common Sobel filters, horizontal and vertical
+       // (see e.g. Wikipedia, or the OpenCV documentation):
+       //
+       //  [1 0 -1]     [-1 -2 -1]
+       //  [2 0 -2]     [ 0  0  0]
+       //  [1 0 -1]     [ 1  2  1]
+       // Horizontal     Vertical
+       //
+       // Note that Wikipedia and OpenCV gives entirely opposite definitions
+       // with regards to sign! This appears to be an error in the OpenCV
+       // documentation, forgetting that for convolution, the filters must be
+       // flipped. We have to flip the vertical matrix again comparing to
+       // Wikipedia, though, since we have bottom-left origin (y = up)
+       // and they define y as pointing downwards.
+       //
+       // Computing both directions at once allows us to get away with eight
+       // texture samples instead of twelve.
+
+       float top_left     = textureOffset(tex, tc, ivec2(-1,  1)).x;  // Note the bottom-left coordinate system.
+       float left         = textureOffset(tex, tc, ivec2(-1,  0)).x;
+       float bottom_left  = textureOffset(tex, tc, ivec2(-1, -1)).x;
+
+       float top          = textureOffset(tex, tc, ivec2( 0,  1)).x;
+       float bottom       = textureOffset(tex, tc, ivec2( 0, -1)).x;
+
+       float top_right    = textureOffset(tex, tc, ivec2( 1,  1)).x;
+       float right        = textureOffset(tex, tc, ivec2( 1,  0)).x;
+       float bottom_right = textureOffset(tex, tc, ivec2( 1, -1)).x;
+
+       vec2 gradients;
+       gradients.x = (top_right + 2.0f * right + bottom_right) - (top_left + 2.0f * left + bottom_left);
+       gradients.y = (top_left + 2.0f * top + top_right) - (bottom_left + 2.0f * bottom + bottom_right);
+
+       // Normalize so that the result is in units of intensity levels per pixel.
+       gradients.x *= 0.125f;
+       gradients.y *= 0.125f;
+
+       // Also store the actual pixel value, so that we get it “for free”
+       // when we sample the gradients in motion_search.frag later.
+       float center = texture(tex, tc).x;
+
+       // Pack everything into a single 32-bit value, using simple fixed-point.
+       packed_gradients = pack_gradients(gradients.x, gradients.y, center);
+}
diff --git a/futatabi/sor.frag b/futatabi/sor.frag
new file mode 100644 (file)
index 0000000..9a8e1e4
--- /dev/null
@@ -0,0 +1,103 @@
+#version 450 core
+
+in vec3 tc, tc_left, tc_down;
+in vec3 equation_tc_assuming_left, equation_tc_assuming_right;
+in float element_x_idx, element_sum_idx;
+out vec2 diff_flow;
+
+uniform sampler2DArray diff_flow_tex, diffusivity_tex;
+uniform usampler2DArray equation_red_tex, equation_black_tex;
+uniform int phase;
+
+uniform int num_nonzero_phases;
+
+// See pack_floats_shared() in equations.frag.
+vec2 unpack_floats_shared(uint c)
+{
+       // Recover the exponent, and multiply it in. Add one because
+       // we have denormalized mantissas, then another one because we
+       // already reduced the exponent by one. Then subtract 20, because
+       // we are going to shift up the number by 20 below to recover the sign bits.
+       float normalizer = uintBitsToFloat(((c >> 1) & 0x7f800000u) - (18 << 23));
+       normalizer *= (1.0 / 2047.0);
+
+       // Shift the values up so that we recover the sign bit, then normalize.
+       float a = int(uint(c & 0x000fffu) << 20) * normalizer;
+       float b = int(uint(c & 0xfff000u) << 8) * normalizer;
+
+       return vec2(a, b);
+}
+
+float zero_if_outside_border(vec4 val)
+{
+       if (val.w < 1.0f) {
+               // We hit the border (or more like half-way to it), so zero smoothness.
+               return 0.0f;
+       } else {
+               return val.x;
+       }
+}
+
+void main()
+{
+       // Red-black SOR: Every other pass, we update every other element in a
+       // checkerboard pattern. This is rather suboptimal for the GPU, as it
+       // just immediately throws away half of the warp, but it helps convergence
+       // a _lot_ (rough testing indicates that five iterations of SOR is as good
+       // as ~50 iterations of Jacobi). We could probably do better by reorganizing
+       // the data into two-values-per-pixel, so-called “twinned buffering”;
+       // seemingly, it helps Haswell by ~15% on the SOR code, but GTX 950 not at all
+       // (at least not on 720p). Presumably the latter is already bandwidth bound.
+       int color = int(round(element_sum_idx)) & 1;
+       if (color != phase) discard;
+
+       uvec4 equation;
+       vec3 equation_tc;
+       if ((int(round(element_x_idx)) & 1) == 0) {
+               equation_tc = equation_tc_assuming_left;
+       } else {
+               equation_tc = equation_tc_assuming_right;
+       }
+       if (phase == 0) {
+               equation = texture(equation_red_tex, equation_tc);
+       } else {
+               equation = texture(equation_black_tex, equation_tc);
+       }
+       float inv_A11 = uintBitsToFloat(equation.x);
+       float A12 = uintBitsToFloat(equation.y);
+       float inv_A22 = uintBitsToFloat(equation.z);
+       vec2 b = unpack_floats_shared(equation.w);
+
+       const float omega = 1.8;  // Marginally better than 1.6, it seems.
+
+       if (num_nonzero_phases == 0) {
+               // Simplified version of the code below, assuming diff_flow == 0.0f everywhere.
+               diff_flow.x = omega * b.x * inv_A11;
+               diff_flow.y = omega * b.y * inv_A22;
+       } else {
+               // Subtract the missing terms from the right-hand side
+               // (it couldn't be done earlier, because we didn't know
+               // the values of the neighboring pixels; they change for
+               // each SOR iteration).
+               float smooth_l = zero_if_outside_border(texture(diffusivity_tex, tc_left));
+               float smooth_r = zero_if_outside_border(textureOffset(diffusivity_tex, tc_left, ivec2(1, 0)));
+               float smooth_d = zero_if_outside_border(texture(diffusivity_tex, tc_down));
+               float smooth_u = zero_if_outside_border(textureOffset(diffusivity_tex, tc_down, ivec2(0, 1)));
+               b += smooth_l * textureOffset(diff_flow_tex, tc, ivec2(-1,  0)).xy;
+               b += smooth_r * textureOffset(diff_flow_tex, tc, ivec2( 1,  0)).xy;
+               b += smooth_d * textureOffset(diff_flow_tex, tc, ivec2( 0, -1)).xy;
+               b += smooth_u * textureOffset(diff_flow_tex, tc, ivec2( 0,  1)).xy;
+
+               if (num_nonzero_phases == 1) {
+                       diff_flow = vec2(0.0f);
+               } else {
+                       diff_flow = texture(diff_flow_tex, tc).xy;
+               }
+
+               // From https://en.wikipedia.org/wiki/Successive_over-relaxation.
+               float sigma_u = A12 * diff_flow.y;
+               diff_flow.x += omega * ((b.x - sigma_u) * inv_A11 - diff_flow.x);
+               float sigma_v = A12 * diff_flow.x;
+               diff_flow.y += omega * ((b.y - sigma_v) * inv_A22 - diff_flow.y);
+       }
+}
diff --git a/futatabi/sor.vert b/futatabi/sor.vert
new file mode 100644 (file)
index 0000000..c68b1db
--- /dev/null
@@ -0,0 +1,43 @@
+#version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
+
+layout(location=0) in vec2 position;
+out vec3 tc, tc_left, tc_down;
+out vec3 equation_tc_assuming_left, equation_tc_assuming_right;
+out float element_x_idx;
+out float element_sum_idx;
+
+uniform sampler2DArray diff_flow_tex, diffusivity_tex;
+uniform usampler2DArray equation_red_tex;
+
+void main()
+{
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       gl_Layer = gl_InstanceID;
+
+       tc = vec3(position, gl_InstanceID);
+       tc_left = vec3(tc.x - 0.5f / textureSize(diffusivity_tex, 0).x, tc.y, gl_InstanceID);
+       tc_down = vec3(tc.x, tc.y - 0.5f / textureSize(diffusivity_tex, 0).y, gl_InstanceID);
+
+       // The equation textures have half the horizontal width, so we need to adjust the texel centers.
+       // It becomes extra tricky since the SOR texture might be of odd size, and then
+       // the equation texture is not exactly half the size.
+       vec2 element_idx = position * textureSize(diff_flow_tex, 0).xy - 0.5f;
+       float equation_texel_number_assuming_left = element_idx.x / 2.0f;
+       float equation_texel_number_assuming_right = (element_idx.x - 1.0f) / 2.0f;
+       equation_tc_assuming_left.x = (equation_texel_number_assuming_left + 0.5f) / textureSize(equation_red_tex, 0).x;
+       equation_tc_assuming_right.x = (equation_texel_number_assuming_right + 0.5f) / textureSize(equation_red_tex, 0).x;
+       equation_tc_assuming_left.y = tc.y;
+       equation_tc_assuming_right.y = tc.y;
+       equation_tc_assuming_left.z = gl_InstanceID;
+       equation_tc_assuming_right.z = gl_InstanceID;
+
+       element_x_idx = element_idx.x;
+       element_sum_idx = element_idx.x + element_idx.y;
+}
diff --git a/futatabi/splat.frag b/futatabi/splat.frag
new file mode 100644 (file)
index 0000000..6e873bc
--- /dev/null
@@ -0,0 +1,18 @@
+#version 450 core
+
+in vec2 image_pos;
+flat in vec2 flow, I_0_check_offset, I_1_check_offset;
+out vec2 out_flow;
+
+uniform sampler2DArray gray_tex;
+
+void main()
+{
+       out_flow = flow;
+
+       // TODO: Check if we are sampling out-of-image.
+       float I_0 = texture(gray_tex, vec3(image_pos + I_0_check_offset, 0)).r;
+       float I_1 = texture(gray_tex, vec3(image_pos + I_1_check_offset, 1)).r;
+       float diff = abs(I_1 - I_0);
+       gl_FragDepth = 0.125 * diff;  // Make sure we stay well under the 1.0 maximum.
+}
diff --git a/futatabi/splat.vert b/futatabi/splat.vert
new file mode 100644 (file)
index 0000000..0846231
--- /dev/null
@@ -0,0 +1,51 @@
+#version 450 core
+
+layout(location=0) in vec2 position;
+out vec2 image_pos;
+flat out vec2 flow, I_0_check_offset, I_1_check_offset;
+
+uniform vec2 splat_size;  // In 0..1 coordinates.
+uniform vec2 inv_flow_size;
+uniform float alpha;
+uniform sampler2DArray flow_tex;  // 0 = forward flow, 1 = backward flow.
+
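+// One instance is rendered per flow-texture texel per direction: with a WxH
+// flow texture, instances [0, W*H) splat the forward flow and instances
+// [W*H, 2*W*H) splat the backward flow.
+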
+void main()
+{
+       int instance = gl_InstanceID;
+       int num_pixels_per_layer = textureSize(flow_tex, 0).x * textureSize(flow_tex, 0).y;
+       int src_layer;
+       if (instance >= num_pixels_per_layer) {
+               instance -= num_pixels_per_layer;
+               src_layer = 1;
+       } else {
+               src_layer = 0;
+       }
+       int x = instance % textureSize(flow_tex, 0).x;
+       int y = instance / textureSize(flow_tex, 0).x;
+
+       // Find out where to splat this to.
+       vec2 full_flow = texelFetch(flow_tex, ivec3(x, y, src_layer), 0).xy;
+       float splat_alpha;
+       if (src_layer == 1) {  // Reverse flow.
+               full_flow = -full_flow;
+               splat_alpha = 1.0f - alpha;
+       } else {
+               splat_alpha = alpha;
+       }
+       full_flow *= inv_flow_size;
+
+       vec2 patch_center = (ivec2(x, y) + 0.5) * inv_flow_size + full_flow * splat_alpha;
+       image_pos = patch_center + splat_size * (position - 0.5);
+
+       flow = full_flow;
+       I_0_check_offset = full_flow * -alpha;
+       I_1_check_offset = full_flow * (1.0f - alpha);
+
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * image_pos.x - 1.0, 2.0 * image_pos.y - 1.0, -1.0, 1.0);
+}
diff --git a/futatabi/state.proto b/futatabi/state.proto
new file mode 100644 (file)
index 0000000..d76bf02
--- /dev/null
@@ -0,0 +1,19 @@
+syntax = "proto3";
+
+// Corresponds to struct Clip.
+message ClipProto {
+       int64 pts_in = 1;
+       int64 pts_out = 2;
+       repeated string description = 3;
+       int64 stream_idx = 4;
+       double fade_time_seconds = 5;
+}
+
+message ClipListProto {
+       repeated ClipProto clip = 1;
+}
+
+message StateProto {
+       ClipListProto clip_list = 1;
+       ClipListProto play_list = 2;
+}
diff --git a/futatabi/timebase.h b/futatabi/timebase.h
new file mode 100644 (file)
index 0000000..532ec86
--- /dev/null
@@ -0,0 +1,29 @@
+#ifndef _TIMEBASE_H
+#define _TIMEBASE_H 1
+
+#include <ratio>
+
+// Common timebase that allows us to represent one frame exactly in all the
+// relevant frame rates:
+//
+//   Timebase:                1/120000
+//   Frame at 50fps:       2400/120000
+//   Frame at 60fps:       2000/120000
+//   Frame at 59.94fps:    2002/120000
+//   Frame at 23.976fps:   5005/120000
+//
+// If we also wanted to represent one sample at 48000 Hz, we'd need
+// to go to 300000. Also supporting one sample at 44100 Hz would mean
+// going to 44100000; probably a bit excessive.
+#define TIMEBASE 120000
+
+// Some muxes, like MP4 (or at least avformat's implementation of it),
+// are not too fond of values above 2^31. At timebase 120000, that's only
+// about five hours or so, so we define a coarser timebase that doesn't
+// get 59.94 precisely (so there will be a marginal amount of pts jitter),
+// but can do at least 50 and 60 precisely, and months of streaming.
+#define COARSE_TIMEBASE 300
+
+using TimebaseRatio = std::ratio<1, TIMEBASE>;
+
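+// Hedged conversion sketch (not part of this header): rescaling a pts from
+// TIMEBASE to COARSE_TIMEBASE, rounding to nearest, as av_rescale_q() would
+// do for these timebases:
+//
+//   int64_t coarse_pts = (pts * COARSE_TIMEBASE + TIMEBASE / 2) / TIMEBASE;
+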
+#endif  // !defined(_TIMEBASE_H)
diff --git a/futatabi/util.cpp b/futatabi/util.cpp
new file mode 100644 (file)
index 0000000..061408d
--- /dev/null
@@ -0,0 +1,25 @@
+#include "util.h"
+
+#include <assert.h>
+#include <memory>
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+
+Flow read_flow(const char *filename)
+{
+       FILE *flowfp = fopen(filename, "rb");
+       if (flowfp == nullptr) {
+               perror(filename);
+               exit(1);
+       }
+
+       uint32_t hdr, width, height;
+       fread(&hdr, sizeof(hdr), 1, flowfp);
+       fread(&width, sizeof(width), 1, flowfp);
+       fread(&height, sizeof(height), 1, flowfp);
+
+       unique_ptr<Vec2[]> flow(new Vec2[width * height]);
+       fread(flow.get(), width * height * sizeof(Vec2), 1, flowfp);
+       fclose(flowfp);
+
+       Flow ret;
+       ret.width = width;
+       ret.height = height;
+       ret.flow = move(flow);
+       return ret;
+}
diff --git a/futatabi/util.h b/futatabi/util.h
new file mode 100644 (file)
index 0000000..4a0aed8
--- /dev/null
@@ -0,0 +1,64 @@
+#ifndef _UTIL_H
+#define _UTIL_H 1
+
+#include <algorithm>
+#include <math.h>
+#include <memory>
+#include <stdint.h>
+
+struct Vec2 {
+       float du, dv;
+};
+
+struct Flow {
+       uint32_t width, height;
+       std::unique_ptr<Vec2[]> flow;
+};
+
+Flow read_flow(const char *filename);
+
+// du and dv are in pixels.
+inline void flow2rgb(float du, float dv, uint8_t *rr, uint8_t *gg, uint8_t *bb)
+{
+       float angle = atan2(dv, du);
+       float magnitude = std::min(hypot(du, dv) / 20.0, 1.0);
+
+       // HSV to RGB (from Wikipedia). Saturation is 1.
+       float c = magnitude;
+       float h = (angle + M_PI) * 6.0 / (2.0 * M_PI);
+       float X = c * (1.0 - fabs(fmod(h, 2.0) - 1.0));
+       float r = 0.0f, g = 0.0f, b = 0.0f;
+       if (h <= 1.0f) {
+               r = c;
+               g = X;
+       } else if (h <= 2.0f) {
+               r = X;
+               g = c;
+       } else if (h <= 3.0f) {
+               g = c;
+               b = X;
+       } else if (h <= 4.0f) {
+               g = X;
+               b = c;
+       } else if (h <= 5.0f) {
+               r = X;
+               b = c;
+       } else if (h <= 6.0f) {
+               r = c;
+               b = X;
+       } else {
+               // h is NaN, so black is fine.
+       }
+       float m = magnitude - c;
+       r += m;
+       g += m;
+       b += m;
+       r = std::max(std::min(r, 1.0f), 0.0f);
+       g = std::max(std::min(g, 1.0f), 0.0f);
+       b = std::max(std::min(b, 1.0f), 0.0f);
+       *rr = lrintf(r * 255.0f);
+       *gg = lrintf(g * 255.0f);
+       *bb = lrintf(b * 255.0f);
+}
+
+#endif  // !defined(_UTIL_H)
diff --git a/futatabi/vaapi_jpeg_decoder.cpp b/futatabi/vaapi_jpeg_decoder.cpp
new file mode 100644 (file)
index 0000000..12db78b
--- /dev/null
@@ -0,0 +1,568 @@
+#include "vaapi_jpeg_decoder.h"
+
+#include "jpeg_destroyer.h"
+#include "jpeg_frame.h"
+#include "memcpy_interleaved.h"
+
+#include <X11/Xlib.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <glob.h>
+#include <jpeglib.h>
+#include <list>
+#include <mutex>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <unistd.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#include <va/va_x11.h>
+
+using namespace std;
+
+static unique_ptr<VADisplayWithCleanup> va_dpy;
+static VAConfigID config_id;
+static VAImageFormat uyvy_format;
+bool vaapi_jpeg_decoding_usable = false;
+
+struct VAResources {
+       unsigned width, height;
+       VASurfaceID surface;
+       VAContextID context;
+       VAImage image;
+};
+static list<VAResources> va_resources_freelist;
+static mutex va_resources_mutex;
+
+#define CHECK_VASTATUS(va_status, func)                                 \
+    if (va_status != VA_STATUS_SUCCESS) {                               \
+        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+        exit(1);                                                        \
+    }
+
+#define CHECK_VASTATUS_RET(va_status, func)                             \
+    if (va_status != VA_STATUS_SUCCESS) {                               \
+        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+        return nullptr;                                                 \
+    }
+
+// From libjpeg (although it's of course identical between implementations).
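+// Maps position in zigzag scan order to the natural (row-major) coefficient index.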
+static const int jpeg_natural_order[DCTSIZE2] = {
+        0,  1,  8, 16,  9,  2,  3, 10,
+       17, 24, 32, 25, 18, 11,  4,  5,
+       12, 19, 26, 33, 40, 48, 41, 34,
+       27, 20, 13,  6,  7, 14, 21, 28,
+       35, 42, 49, 56, 57, 50, 43, 36,
+       29, 22, 15, 23, 30, 37, 44, 51,
+       58, 59, 52, 45, 38, 31, 39, 46,
+       53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+VAResources get_va_resources(unsigned width, unsigned height)
+{
+       {
+               lock_guard<mutex> lock(va_resources_mutex);
+               for (auto it = va_resources_freelist.begin(); it != va_resources_freelist.end(); ++it) {
+                       if (it->width == width && it->height == height) {
+                               VAResources ret = *it;
+                               va_resources_freelist.erase(it);
+                               return ret;
+                       }
+               }
+       }
+
+       VAResources ret;
+
+       ret.width = width;
+       ret.height = height;
+
+       VAStatus va_status = vaCreateSurfaces(va_dpy->va_dpy, VA_RT_FORMAT_YUV422,
+               width, height,
+               &ret.surface, 1, nullptr, 0);
+       CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+       va_status = vaCreateContext(va_dpy->va_dpy, config_id, width, height, 0, &ret.surface, 1, &ret.context);
+       CHECK_VASTATUS(va_status, "vaCreateContext");
+
+       va_status = vaCreateImage(va_dpy->va_dpy, &uyvy_format, width, height, &ret.image);
+       CHECK_VASTATUS(va_status, "vaCreateImage");
+
+       return ret;
+}
+
+void release_va_resources(VAResources resources)
+{
+       lock_guard<mutex> lock(va_resources_mutex);
+       if (va_resources_freelist.size() > 10) {
+               auto it = va_resources_freelist.end();
+               --it;
+
+               VAStatus va_status = vaDestroyImage(va_dpy->va_dpy, it->image.image_id);
+               CHECK_VASTATUS(va_status, "vaDestroyImage");
+
+               va_status = vaDestroyContext(va_dpy->va_dpy, it->context);
+               CHECK_VASTATUS(va_status, "vaDestroyContext");
+
+               va_status = vaDestroySurfaces(va_dpy->va_dpy, &it->surface, 1);
+               CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+
+               va_resources_freelist.erase(it);
+       }
+
+       va_resources_freelist.push_front(resources);
+}
+
+// RAII wrapper to release VAResources on return (even on error).
+class ReleaseVAResources {
+public:
+       ReleaseVAResources(const VAResources &resources)
+               : resources(resources) {}
+       ~ReleaseVAResources()
+       {
+               if (!committed) {
+                       release_va_resources(resources);
+               }
+       }
+
+       void commit() { committed = true; }
+
+private:
+       const VAResources &resources;
+       bool committed = false;
+};
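+
+// Hedged usage sketch:
+//
+//   VAResources resources = get_va_resources(width, height);
+//   ReleaseVAResources release(resources);  // Back on the freelist at scope exit...
+//   // ... decode into resources.surface/resources.image ...
+//   release.commit();  // ...unless committed, e.g. if ownership is handed off.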
+
+VADisplayWithCleanup::~VADisplayWithCleanup()
+{
+       if (va_dpy != nullptr) {
+               vaTerminate(va_dpy);
+       }
+       if (x11_display != nullptr) {
+               XCloseDisplay(x11_display);
+       }
+       if (drm_fd != -1) {
+               close(drm_fd);
+       }
+}
+
+unique_ptr<VADisplayWithCleanup> va_open_display(const string &va_display)
+{
+       if (va_display.empty() || va_display[0] != '/') {  // An X display.
+               Display *x11_display = XOpenDisplay(va_display.empty() ? nullptr : va_display.c_str());
+               if (x11_display == nullptr) {
+                       fprintf(stderr, "error: can't connect to X server!\n");
+                       return nullptr;
+               }
+
+               unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+               ret->x11_display = x11_display;
+               ret->va_dpy = vaGetDisplay(x11_display);
+               if (ret->va_dpy == nullptr) {
+                       return nullptr;
+               }
+               return ret;
+       } else {  // A DRM node on the filesystem (e.g. /dev/dri/renderD128).
+               int drm_fd = open(va_display.c_str(), O_RDWR);
+               if (drm_fd == -1) {
+                       perror(va_display.c_str());
+                       return nullptr;
+               }
+               unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+               ret->drm_fd = drm_fd;
+               ret->va_dpy = vaGetDisplayDRM(drm_fd);
+               if (ret->va_dpy == nullptr) {
+                       return nullptr;
+               }
+               return ret;
+       }
+}
+
+unique_ptr<VADisplayWithCleanup> try_open_va(const string &va_display, string *error)
+{
+       unique_ptr<VADisplayWithCleanup> va_dpy = va_open_display(va_display);
+       if (va_dpy == nullptr) {
+               if (error)
+                       *error = "Opening VA display failed";
+               return nullptr;
+       }
+       int major_ver, minor_ver;
+       VAStatus va_status = vaInitialize(va_dpy->va_dpy, &major_ver, &minor_ver);
+       if (va_status != VA_STATUS_SUCCESS) {
+               char buf[256];
+               snprintf(buf, sizeof(buf), "vaInitialize() failed with status %d\n", va_status);
+               if (error != nullptr)
+                       *error = buf;
+               return nullptr;
+       }
+
+       int num_entrypoints = vaMaxNumEntrypoints(va_dpy->va_dpy);
+       unique_ptr<VAEntrypoint[]> entrypoints(new VAEntrypoint[num_entrypoints]);
+       if (entrypoints == nullptr) {
+               if (error != nullptr)
+                       *error = "Failed to allocate memory for VA entry points";
+               return nullptr;
+       }
+
+       vaQueryConfigEntrypoints(va_dpy->va_dpy, VAProfileJPEGBaseline, entrypoints.get(), &num_entrypoints);
+       for (int slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+               if (entrypoints[slice_entrypoint] != VAEntrypointVLD) {
+                       continue;
+               }
+
+               // We found a usable decode entry point, so return the display.
+               return va_dpy;
+       }
+
+       if (error != nullptr)
+               *error = "Can't find VAEntrypointVLD for the JPEG profile";
+       return nullptr;
+}
+
+string get_usable_va_display()
+{
+       // Reduce the amount of chatter while probing,
+       // unless the user has specified otherwise.
+       bool need_env_reset = false;
+       if (getenv("LIBVA_MESSAGING_LEVEL") == nullptr) {
+               setenv("LIBVA_MESSAGING_LEVEL", "0", true);
+               need_env_reset = true;
+       }
+
+       // First try the default (ie., whatever $DISPLAY is set to).
+       unique_ptr<VADisplayWithCleanup> va_dpy = try_open_va("", nullptr);
+       if (va_dpy != nullptr) {
+               if (need_env_reset) {
+                       unsetenv("LIBVA_MESSAGING_LEVEL");
+               }
+               return "";
+       }
+
+       fprintf(stderr, "The X11 display did not expose a VA-API JPEG decoder.\n");
+
+       // Try all /dev/dri/render* in turn. TODO: Accept /dev/dri/card*, too?
+       glob_t g;
+       int err = glob("/dev/dri/renderD*", 0, nullptr, &g);
+       if (err != 0) {
+               fprintf(stderr, "Couldn't list render nodes (%s) when trying to autodetect a replacement.\n", strerror(errno));
+       } else {
+               for (size_t i = 0; i < g.gl_pathc; ++i) {
+                       string path = g.gl_pathv[i];
+                       va_dpy = try_open_va(path, nullptr);
+                       if (va_dpy != nullptr) {
+                               fprintf(stderr, "Autodetected %s as a suitable replacement; using it.\n",
+                                       path.c_str());
+                               globfree(&g);
+                               if (need_env_reset) {
+                                       unsetenv("LIBVA_MESSAGING_LEVEL");
+                               }
+                               return path;
+                       }
+               }
+       }
+
+       fprintf(stderr, "No suitable VA-API JPEG decoders were found in /dev/dri; giving up.\n");
+       fprintf(stderr, "Note that if you are using an Intel CPU with an external GPU,\n");
+       fprintf(stderr, "you may need to enable the integrated Intel GPU in your BIOS\n");
+       fprintf(stderr, "to expose Quick Sync.\n");
+       return "none";
+}
+
+void init_jpeg_vaapi()
+{
+       string dpy = get_usable_va_display();
+       if (dpy == "none") {
+               return;
+       }
+
+       va_dpy = try_open_va(dpy, nullptr);
+       if (va_dpy == nullptr) {
+               return;
+       }
+
+       VAConfigAttrib attr = { VAConfigAttribRTFormat, VA_RT_FORMAT_YUV422 };
+
+       VAStatus va_status = vaCreateConfig(va_dpy->va_dpy, VAProfileJPEGBaseline, VAEntrypointVLD,
+               &attr, 1, &config_id);
+       CHECK_VASTATUS(va_status, "vaCreateConfig");
+
+       int num_formats = vaMaxNumImageFormats(va_dpy->va_dpy);
+       assert(num_formats > 0);
+
+       unique_ptr<VAImageFormat[]> formats(new VAImageFormat[num_formats]);
+       va_status = vaQueryImageFormats(va_dpy->va_dpy, formats.get(), &num_formats);
+       CHECK_VASTATUS(va_status, "vaQueryImageFormats");
+
+       bool found = false;
+       for (int i = 0; i < num_formats; ++i) {
+               // Seemingly VA_FOURCC_422H is no good for vaGetImage(). :-/
+               if (formats[i].fourcc == VA_FOURCC_UYVY) {
+                       memcpy(&uyvy_format, &formats[i], sizeof(VAImageFormat));
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               return;
+       }
+
+       fprintf(stderr, "VA-API JPEG decoding initialized.\n");
+       vaapi_jpeg_decoding_usable = true;
+}
+
+class VABufferDestroyer {
+public:
+       VABufferDestroyer(VADisplay dpy, VABufferID buf)
+               : dpy(dpy), buf(buf) {}
+
+       ~VABufferDestroyer() {
+               VAStatus va_status = vaDestroyBuffer(dpy, buf);
+               CHECK_VASTATUS(va_status, "vaDestroyBuffer");
+       }
+
+private:
+       VADisplay dpy;
+       VABufferID buf;
+};
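+
+// VABufferDestroyer is used RAII-style below: right after each
+// vaCreateBuffer(), a stack-allocated destroyer takes ownership of the
+// buffer, so it is freed on every return path (including the early returns
+// hidden in CHECK_VASTATUS_RET).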
+
+shared_ptr<Frame> decode_jpeg_vaapi(const string &jpeg)
+{
+       jpeg_decompress_struct dinfo;
+       jpeg_error_mgr jerr;
+       dinfo.err = jpeg_std_error(&jerr);
+       jpeg_create_decompress(&dinfo);
+       JPEGDestroyer destroy_dinfo(&dinfo);
+
+       jpeg_mem_src(&dinfo, reinterpret_cast<const unsigned char *>(jpeg.data()), jpeg.size());
+       jpeg_read_header(&dinfo, true);
+
+       if (dinfo.num_components != 3) {
+               fprintf(stderr, "Not a color JPEG. (%d components, Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+                       dinfo.num_components,
+                       dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+                       dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+                       dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+               return nullptr;
+       }
+       if (dinfo.comp_info[0].h_samp_factor != 2 ||
+           dinfo.comp_info[1].h_samp_factor != 1 ||
+           dinfo.comp_info[1].v_samp_factor != dinfo.comp_info[0].v_samp_factor ||
+           dinfo.comp_info[2].h_samp_factor != 1 ||
+           dinfo.comp_info[2].v_samp_factor != dinfo.comp_info[0].v_samp_factor) {
+               fprintf(stderr, "Not 4:2:2. (Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+                       dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+                       dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+                       dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+               return nullptr;
+       }
+
+       // Picture parameters.
+       VAPictureParameterBufferJPEGBaseline pic_param;
+       memset(&pic_param, 0, sizeof(pic_param));
+       pic_param.picture_width = dinfo.image_width;
+       pic_param.picture_height = dinfo.image_height;
+       for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+               const jpeg_component_info *comp = &dinfo.comp_info[component_idx];
+               pic_param.components[component_idx].component_id = comp->component_id;
+               pic_param.components[component_idx].h_sampling_factor = comp->h_samp_factor;
+               pic_param.components[component_idx].v_sampling_factor = comp->v_samp_factor;
+               pic_param.components[component_idx].quantiser_table_selector = comp->quant_tbl_no;
+       }
+       pic_param.num_components = dinfo.num_components;
+       pic_param.color_space = 0;  // YUV.
+       pic_param.rotation = VA_ROTATION_NONE;
+
+       VABufferID pic_param_buffer;
+       VAStatus va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAPictureParameterBufferType, sizeof(pic_param), 1, &pic_param, &pic_param_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+       VABufferDestroyer destroy_pic_param(va_dpy->va_dpy, pic_param_buffer);
+
+       // Quantization matrices.
+       VAIQMatrixBufferJPEGBaseline iq;
+       memset(&iq, 0, sizeof(iq));
+
+       for (int quant_tbl_idx = 0; quant_tbl_idx < min(4, NUM_QUANT_TBLS); ++quant_tbl_idx) {
+               const JQUANT_TBL *qtbl = dinfo.quant_tbl_ptrs[quant_tbl_idx];
+               if (qtbl == nullptr) {
+                       iq.load_quantiser_table[quant_tbl_idx] = 0;
+               } else {
+                       iq.load_quantiser_table[quant_tbl_idx] = 1;
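+                       // libjpeg keeps quantval[] in natural (raster) order,
+                       // but VA-API wants the quantiser tables in the zig-zag
+                       // order of the DQT segment, so map back through
+                       // jpeg_natural_order.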
+                       for (int i = 0; i < 64; ++i) {
+                               if (qtbl->quantval[i] > 255) {
+                                       fprintf(stderr, "Baseline JPEG only!\n");
+                                       return nullptr;
+                               }
+                               iq.quantiser_table[quant_tbl_idx][i] = qtbl->quantval[jpeg_natural_order[i]];
+                       }
+               }
+       }
+
+       VABufferID iq_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAIQMatrixBufferType, sizeof(iq), 1, &iq, &iq_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+       VABufferDestroyer destroy_iq(va_dpy->va_dpy, iq_buffer);
+
+       // Huffman tables (arithmetic is not supported).
+       VAHuffmanTableBufferJPEGBaseline huff;
+       memset(&huff, 0, sizeof(huff));
+
+       for (int huff_tbl_idx = 0; huff_tbl_idx < min(2, NUM_HUFF_TBLS); ++huff_tbl_idx) {
+               const JHUFF_TBL *ac_hufftbl = dinfo.ac_huff_tbl_ptrs[huff_tbl_idx];
+               const JHUFF_TBL *dc_hufftbl = dinfo.dc_huff_tbl_ptrs[huff_tbl_idx];
+               if (ac_hufftbl == nullptr) {
+                       assert(dc_hufftbl == nullptr);
+                       huff.load_huffman_table[huff_tbl_idx] = 0;
+               } else {
+                       assert(dc_hufftbl != nullptr);
+                       huff.load_huffman_table[huff_tbl_idx] = 1;
+
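+                       // libjpeg's JHUFF_TBL::bits[] is 1-indexed (bits[0] is
+                       // unused), hence the i + 1 below.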
+                       for (int i = 0; i < 16; ++i) {
+                               huff.huffman_table[huff_tbl_idx].num_dc_codes[i] = dc_hufftbl->bits[i + 1];
+                       }
+                       for (int i = 0; i < 12; ++i) {
+                               huff.huffman_table[huff_tbl_idx].dc_values[i] = dc_hufftbl->huffval[i];
+                       }
+                       for (int i = 0; i < 16; ++i) {
+                               huff.huffman_table[huff_tbl_idx].num_ac_codes[i] = ac_hufftbl->bits[i + 1];
+                       }
+                       for (int i = 0; i < 162; ++i) {
+                               huff.huffman_table[huff_tbl_idx].ac_values[i] = ac_hufftbl->huffval[i];
+                       }
+               }
+       }
+
+       VABufferID huff_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAHuffmanTableBufferType, sizeof(huff), 1, &huff, &huff_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+       VABufferDestroyer destroy_huff(va_dpy->va_dpy, huff_buffer);
+
+       // Slice parameters (metadata about the slice).
+       VASliceParameterBufferJPEGBaseline parms;
+       memset(&parms, 0, sizeof(parms));
+       parms.slice_data_size = dinfo.src->bytes_in_buffer;
+       parms.slice_data_offset = 0;
+       parms.slice_data_flag = VA_SLICE_DATA_FLAG_ALL;
+       parms.slice_horizontal_position = 0;
+       parms.slice_vertical_position = 0;
+       for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+               const jpeg_component_info *comp = &dinfo.comp_info[component_idx];
+               parms.components[component_idx].component_selector = comp->component_id;
+               parms.components[component_idx].dc_table_selector = comp->dc_tbl_no;
+               parms.components[component_idx].ac_table_selector = comp->ac_tbl_no;
+               if (parms.components[component_idx].dc_table_selector > 1 ||
+                   parms.components[component_idx].ac_table_selector > 1) {
+                       fprintf(stderr, "Uses too many Huffman tables\n");
+                       return nullptr;
+               }
+       }
+       parms.num_components = dinfo.num_components;
+       parms.restart_interval = dinfo.restart_interval;
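+       // With 4:2:2 subsampling (Y has h_samp_factor=2, v_samp_factor=1),
+       // an MCU covers 16x8 pixels, so round up to whole MCUs in both
+       // directions.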
+       int horiz_mcus = (dinfo.image_width + (DCTSIZE * 2) - 1) / (DCTSIZE * 2);
+       int vert_mcus = (dinfo.image_height + DCTSIZE - 1) / DCTSIZE;
+       parms.num_mcus = horiz_mcus * vert_mcus;
+
+       VABufferID slice_param_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceParameterBufferType, sizeof(parms), 1, &parms, &slice_param_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+       VABufferDestroyer destroy_slice_param(va_dpy->va_dpy, slice_param_buffer);
+
+       // The actual data. VA-API does the destuffing (removing the stuffed
+       // 0x00 bytes after 0xFF) and the rest of the decoding for us.
+       VABufferID data_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceDataBufferType, dinfo.src->bytes_in_buffer, 1, const_cast<unsigned char *>(dinfo.src->next_input_byte), &data_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+       VABufferDestroyer destroy_data(va_dpy->va_dpy, data_buffer);
+
+       VAResources resources = get_va_resources(dinfo.image_width, dinfo.image_height);
+       ReleaseVAResources release(resources);
+
+       va_status = vaBeginPicture(va_dpy->va_dpy, resources.context, resources.surface);
+       CHECK_VASTATUS_RET(va_status, "vaBeginPicture");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &pic_param_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(pic_param)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &iq_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(iq)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &huff_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(huff)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &slice_param_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(slice_param)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &data_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(data)");
+       va_status = vaEndPicture(va_dpy->va_dpy, resources.context);
+       CHECK_VASTATUS_RET(va_status, "vaEndPicture");
+
+       // vaDeriveImage() works, but the resulting image seems to live in
+       // uncached memory, which makes copying data out from it very, very slow.
+       // Thanks to FFmpeg for the observation that you can vaGetImage() the
+       // surface onto your own image (although then, it can't be planar, which
+       // is unfortunate for us).
+#if 0
+       VAImage image;
+       va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image);
+       CHECK_VASTATUS_RET(va_status, "vaDeriveImage");
+#else
+       va_status = vaSyncSurface(va_dpy->va_dpy, resources.surface);
+       CHECK_VASTATUS_RET(va_status, "vaSyncSurface");
+
+       va_status = vaGetImage(va_dpy->va_dpy, resources.surface, 0, 0, dinfo.image_width, dinfo.image_height, resources.image.image_id);
+       CHECK_VASTATUS_RET(va_status, "vaGetImage");
+#endif
+
+       void *mapped;
+       va_status = vaMapBuffer(va_dpy->va_dpy, resources.image.buf, &mapped);
+       CHECK_VASTATUS_RET(va_status, "vaMapBuffer");
+
+       shared_ptr<Frame> frame(new Frame);
+#if 0
+       // 4:2:2 planar (for vaDeriveImage).
+       frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+       frame->cb.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+       frame->cr.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+       for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+               uint8_t *dptr;
+               size_t width;
+               if (component_idx == 0) {
+                       dptr = frame->y.get();
+                       width = dinfo.image_width;
+               } else if (component_idx == 1) {
+                       dptr = frame->cb.get();
+                       width = dinfo.image_width / 2;
+               } else if (component_idx == 2) {
+                       dptr = frame->cr.get();
+                       width = dinfo.image_width / 2;
+               } else {
+                       assert(false);
+               }
+               const uint8_t *sptr = (const uint8_t *)mapped + image.offsets[component_idx];
+               size_t spitch = image.pitches[component_idx];
+               for (size_t y = 0; y < dinfo.image_height; ++y) {
+                       memcpy(dptr + y * width, sptr + y * spitch, width);
+               }
+       }
+#else
+       // Split the interleaved UYVY into separate Y' and interleaved CbCr planes (i.e., semiplanar output).
+       frame->is_semiplanar = true;
+       frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+       frame->cbcr.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+       const uint8_t *src = (const uint8_t *)mapped + resources.image.offsets[0];
+       if (resources.image.pitches[0] == dinfo.image_width * 2) {
+               memcpy_interleaved(frame->cbcr.get(), frame->y.get(), src, dinfo.image_width * dinfo.image_height * 2);
+       } else {
+               for (unsigned y = 0; y < dinfo.image_height; ++y) {
+                       memcpy_interleaved(frame->cbcr.get() + y * dinfo.image_width, frame->y.get() + y * dinfo.image_width,
+                                          src + y * resources.image.pitches[0], dinfo.image_width * 2);
+               }
+       }
+#endif
+       frame->width = dinfo.image_width;
+       frame->height = dinfo.image_height;
+       frame->chroma_subsampling_x = 2;
+       frame->chroma_subsampling_y = 1;
+       frame->pitch_y = dinfo.image_width;
+       frame->pitch_chroma = dinfo.image_width / 2;
+
+       va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf);
+       CHECK_VASTATUS_RET(va_status, "vaUnmapBuffer");
+
+       return frame;
+}
diff --git a/futatabi/vaapi_jpeg_decoder.h b/futatabi/vaapi_jpeg_decoder.h
new file mode 100644 (file)
index 0000000..4182cfc
--- /dev/null
@@ -0,0 +1,26 @@
+#ifndef _VAAPI_JPEG_DECODER_H
+#define _VAAPI_JPEG_DECODER_H 1
+
+#include <X11/Xlib.h>
+#include <memory>
+#include <string>
+#include <va/va.h>
+
+struct Frame;
+
+struct VADisplayWithCleanup {
+       ~VADisplayWithCleanup();
+
+       VADisplay va_dpy;
+       Display *x11_display = nullptr;
+       int drm_fd = -1;
+};
+std::unique_ptr<VADisplayWithCleanup> va_open_display(const std::string &va_display);  // Can return nullptr on failure.
+std::string get_usable_va_display();
+
+void init_jpeg_vaapi();
+std::shared_ptr<Frame> decode_jpeg_vaapi(const std::string &jpeg);
+
+extern bool vaapi_jpeg_decoding_usable;
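+
+// Intended usage (a sketch):
+//
+//   init_jpeg_vaapi();
+//   if (vaapi_jpeg_decoding_usable) {
+//           std::shared_ptr<Frame> frame = decode_jpeg_vaapi(jpeg);
+//           if (frame == nullptr) {
+//                   // Unsupported JPEG (e.g., not 4:2:2 baseline);
+//                   // fall back to software decoding.
+//           }
+//   }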
+
+#endif  // !defined(_VAAPI_JPEG_DECODER_H)
diff --git a/futatabi/video_stream.cpp b/futatabi/video_stream.cpp
new file mode 100644 (file)
index 0000000..d425ed1
--- /dev/null
@@ -0,0 +1,656 @@
+#include "video_stream.h"
+
+extern "C" {
+#include <libavformat/avformat.h>
+#include <libavformat/avio.h>
+}
+
+#include "chroma_subsampler.h"
+#include "context.h"
+#include "flags.h"
+#include "flow.h"
+#include "httpd.h"
+#include "jpeg_frame_view.h"
+#include "movit/util.h"
+#include "mux.h"
+#include "player.h"
+#include "util.h"
+#include "ycbcr_converter.h"
+
+#include <epoxy/glx.h>
+#include <jpeglib.h>
+#include <pthread.h>
+#include <unistd.h>
+
+using namespace std;
+using namespace std::chrono;
+
+extern HTTPD *global_httpd;
+
+struct VectorDestinationManager {
+       jpeg_destination_mgr pub;
+       std::vector<uint8_t> dest;
+
+       VectorDestinationManager()
+       {
+               pub.init_destination = init_destination_thunk;
+               pub.empty_output_buffer = empty_output_buffer_thunk;
+               pub.term_destination = term_destination_thunk;
+       }
+
+       static void init_destination_thunk(j_compress_ptr ptr)
+       {
+               ((VectorDestinationManager *)(ptr->dest))->init_destination();
+       }
+
+       inline void init_destination()
+       {
+               make_room(0);
+       }
+
+       static boolean empty_output_buffer_thunk(j_compress_ptr ptr)
+       {
+               return ((VectorDestinationManager *)(ptr->dest))->empty_output_buffer();
+       }
+
+       inline bool empty_output_buffer()
+       {
+               make_room(dest.size());  // Should ignore pub.free_in_buffer!
+               return true;
+       }
+
+       inline void make_room(size_t bytes_used)
+       {
+               dest.resize(bytes_used + 4096);
+               dest.resize(dest.capacity());
+               pub.next_output_byte = dest.data() + bytes_used;
+               pub.free_in_buffer = dest.size() - bytes_used;
+       }
+
+       static void term_destination_thunk(j_compress_ptr ptr)
+       {
+               ((VectorDestinationManager *)(ptr->dest))->term_destination();
+       }
+
+       inline void term_destination()
+       {
+               dest.resize(dest.size() - pub.free_in_buffer);
+       }
+};
+static_assert(std::is_standard_layout<VectorDestinationManager>::value, "");
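+// The thunks above recover the full object from ptr->dest, which libjpeg
+// only knows as a jpeg_destination_mgr *. That cast is valid only because
+// the struct is standard-layout with pub as its first member (so both
+// pointers share the same address) -- hence the static_assert.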
+
+vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const uint8_t *cr_data, unsigned width, unsigned height)
+{
+       VectorDestinationManager dest;
+
+       jpeg_compress_struct cinfo;
+       jpeg_error_mgr jerr;
+       cinfo.err = jpeg_std_error(&jerr);
+       jpeg_create_compress(&cinfo);
+
+       cinfo.dest = (jpeg_destination_mgr *)&dest;
+       cinfo.input_components = 3;
+       cinfo.in_color_space = JCS_RGB;
+       jpeg_set_defaults(&cinfo);
+       constexpr int quality = 90;
+       jpeg_set_quality(&cinfo, quality, /*force_baseline=*/false);
+
+       cinfo.image_width = width;
+       cinfo.image_height = height;
+       cinfo.raw_data_in = true;
+       jpeg_set_colorspace(&cinfo, JCS_YCbCr);
+       cinfo.comp_info[0].h_samp_factor = 2;
+       cinfo.comp_info[0].v_samp_factor = 1;
+       cinfo.comp_info[1].h_samp_factor = 1;
+       cinfo.comp_info[1].v_samp_factor = 1;
+       cinfo.comp_info[2].h_samp_factor = 1;
+       cinfo.comp_info[2].v_samp_factor = 1;
+       cinfo.CCIR601_sampling = true;  // Seems to be mostly ignored by libjpeg, though.
+       jpeg_start_compress(&cinfo, true);
+
+       JSAMPROW yptr[8], cbptr[8], crptr[8];
+       JSAMPARRAY data[3] = { yptr, cbptr, crptr };
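+       // With raw (pre-subsampled) input and v_samp_factor = 1 for every
+       // component, jpeg_write_raw_data() consumes DCTSIZE (8) rows per
+       // call from each plane, so feed it 8 rows of Y' and 8 rows of
+       // half-width Cb/Cr at a time. (height is assumed to be a multiple
+       // of 8 here.)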
+       for (unsigned y = 0; y < height; y += 8) {
+               for (unsigned yy = 0; yy < 8; ++yy) {
+                       yptr[yy] = const_cast<JSAMPROW>(&y_data[(y + yy) * width]);
+                       cbptr[yy] = const_cast<JSAMPROW>(&cb_data[(y + yy) * width / 2]);
+                       crptr[yy] = const_cast<JSAMPROW>(&cr_data[(y + yy) * width / 2]);
+               }
+
+               jpeg_write_raw_data(&cinfo, data, /*num_lines=*/8);
+       }
+
+       jpeg_finish_compress(&cinfo);
+       jpeg_destroy_compress(&cinfo);
+
+       return move(dest.dest);
+}
+
+VideoStream::VideoStream()
+{
+       ycbcr_converter.reset(new YCbCrConverter(YCbCrConverter::OUTPUT_TO_DUAL_YCBCR, /*resource_pool=*/nullptr));
+       ycbcr_semiplanar_converter.reset(new YCbCrConverter(YCbCrConverter::OUTPUT_TO_SEMIPLANAR, /*resource_pool=*/nullptr));
+
+       GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots];
+       GLuint fade_y_output_tex[num_interpolate_slots], fade_cbcr_output_tex[num_interpolate_slots];
+       GLuint cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots];
+
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, num_interpolate_slots, input_tex);
+       glCreateTextures(GL_TEXTURE_2D_ARRAY, num_interpolate_slots, gray_tex);
+       glCreateTextures(GL_TEXTURE_2D, num_interpolate_slots, fade_y_output_tex);
+       glCreateTextures(GL_TEXTURE_2D, num_interpolate_slots, fade_cbcr_output_tex);
+       glCreateTextures(GL_TEXTURE_2D, num_interpolate_slots, cb_tex);
+       glCreateTextures(GL_TEXTURE_2D, num_interpolate_slots, cr_tex);
+       check_error();
+
+       constexpr size_t width = 1280, height = 720;  // FIXME: adjustable width, height
+       int levels = find_num_levels(width, height);
+       for (size_t i = 0; i < num_interpolate_slots; ++i) {
+               glTextureStorage3D(input_tex[i], levels, GL_RGBA8, width, height, 2);
+               check_error();
+               glTextureStorage3D(gray_tex[i], levels, GL_R8, width, height, 2);
+               check_error();
+               glTextureStorage2D(fade_y_output_tex[i], 1, GL_R8, width, height);
+               check_error();
+               glTextureStorage2D(fade_cbcr_output_tex[i], 1, GL_RG8, width, height);
+               check_error();
+               glTextureStorage2D(cb_tex[i], 1, GL_R8, width / 2, height);
+               check_error();
+               glTextureStorage2D(cr_tex[i], 1, GL_R8, width / 2, height);
+               check_error();
+
+               unique_ptr<InterpolatedFrameResources> resource(new InterpolatedFrameResources);
+               resource->owner = this;
+               resource->input_tex = input_tex[i];
+               resource->gray_tex = gray_tex[i];
+               resource->fade_y_output_tex = fade_y_output_tex[i];
+               resource->fade_cbcr_output_tex = fade_cbcr_output_tex[i];
+               resource->cb_tex = cb_tex[i];
+               resource->cr_tex = cr_tex[i];
+               glCreateFramebuffers(2, resource->input_fbos);
+               check_error();
+               glCreateFramebuffers(1, &resource->fade_fbo);
+               check_error();
+
+               glNamedFramebufferTextureLayer(resource->input_fbos[0], GL_COLOR_ATTACHMENT0, input_tex[i], 0, 0);
+               check_error();
+               glNamedFramebufferTextureLayer(resource->input_fbos[0], GL_COLOR_ATTACHMENT1, gray_tex[i], 0, 0);
+               check_error();
+               glNamedFramebufferTextureLayer(resource->input_fbos[1], GL_COLOR_ATTACHMENT0, input_tex[i], 0, 1);
+               check_error();
+               glNamedFramebufferTextureLayer(resource->input_fbos[1], GL_COLOR_ATTACHMENT1, gray_tex[i], 0, 1);
+               check_error();
+               glNamedFramebufferTexture(resource->fade_fbo, GL_COLOR_ATTACHMENT0, fade_y_output_tex[i], 0);
+               check_error();
+               glNamedFramebufferTexture(resource->fade_fbo, GL_COLOR_ATTACHMENT1, fade_cbcr_output_tex[i], 0);
+               check_error();
+
+               GLuint bufs[] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
+               glNamedFramebufferDrawBuffers(resource->input_fbos[0], 2, bufs);
+               check_error();
+               glNamedFramebufferDrawBuffers(resource->input_fbos[1], 2, bufs);
+               check_error();
+               glNamedFramebufferDrawBuffers(resource->fade_fbo, 2, bufs);
+               check_error();
+
+               glCreateBuffers(1, &resource->pbo);
+               check_error();
+               glNamedBufferStorage(resource->pbo, width * height * 4, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+               check_error();
+               resource->pbo_contents = glMapNamedBufferRange(resource->pbo, 0, width * height * 4, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+               interpolate_resources.push_back(move(resource));
+       }
+
+       check_error();
+
+       OperatingPoint op;
+       if (global_flags.interpolation_quality == 1) {
+               op = operating_point1;
+       } else if (global_flags.interpolation_quality == 2) {
+               op = operating_point2;
+       } else if (global_flags.interpolation_quality == 3) {
+               op = operating_point3;
+       } else if (global_flags.interpolation_quality == 4) {
+               op = operating_point4;
+       } else {
+               assert(false);
+       }
+
+       compute_flow.reset(new DISComputeFlow(width, height, op));
+       interpolate.reset(new Interpolate(op, /*split_ycbcr_output=*/true));
+       interpolate_no_split.reset(new Interpolate(op, /*split_ycbcr_output=*/false));
+       chroma_subsampler.reset(new ChromaSubsampler);
+       check_error();
+
+       // The “last frame” is initially black.
+       unique_ptr<uint8_t[]> y(new uint8_t[1280 * 720]);
+       unique_ptr<uint8_t[]> cb_or_cr(new uint8_t[640 * 720]);
+       memset(y.get(), 16, 1280 * 720);
+       memset(cb_or_cr.get(), 128, 640 * 720);
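+       // (Y'=16, Cb=Cr=128 is black for limited-range Y'CbCr, matching the
+       // mostly limited-range signals we transport; see ycbcr_converter.cpp.)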
+       last_frame = encode_jpeg(y.get(), cb_or_cr.get(), cb_or_cr.get(), 1280, 720);
+}
+
+VideoStream::~VideoStream() {}
+
+void VideoStream::start()
+{
+       AVFormatContext *avctx = avformat_alloc_context();
+       avctx->oformat = av_guess_format("nut", nullptr, nullptr);
+
+       uint8_t *buf = (uint8_t *)av_malloc(MUX_BUFFER_SIZE);
+       avctx->pb = avio_alloc_context(buf, MUX_BUFFER_SIZE, 1, this, nullptr, nullptr, nullptr);
+       avctx->pb->write_data_type = &VideoStream::write_packet2_thunk;
+       avctx->pb->ignore_boundary_point = 1;
+
+       Mux::Codec video_codec = Mux::CODEC_MJPEG;
+
+       avctx->flags = AVFMT_FLAG_CUSTOM_IO;
+
+       string video_extradata;
+
+       constexpr int width = 1280, height = 720;  // Doesn't matter for MJPEG.
+       stream_mux.reset(new Mux(avctx, width, height, video_codec, video_extradata, /*audio_codec_parameters=*/nullptr, COARSE_TIMEBASE,
+               /*write_callback=*/nullptr, Mux::WRITE_FOREGROUND, {}));
+
+       encode_thread = thread(&VideoStream::encode_thread_func, this);
+}
+
+void VideoStream::stop()
+{
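+       // NOTE: encode_thread_func() currently loops forever, so this join
+       // will never return; a quit flag would be needed for a clean shutdown.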
+       encode_thread.join();
+}
+
+void VideoStream::clear_queue()
+{
+       deque<QueuedFrame> q;
+
+       {
+               unique_lock<mutex> lock(queue_lock);
+               q = move(frame_queue);
+       }
+
+       // These are not RAII-ed, unfortunately, so we'll need to clean them ourselves.
+       // Note that release_texture() is thread-safe.
+       for (const QueuedFrame &qf : q) {
+               if (qf.type == QueuedFrame::INTERPOLATED ||
+                   qf.type == QueuedFrame::FADED_INTERPOLATED) {
+                       compute_flow->release_texture(qf.flow_tex);
+               }
+               if (qf.type == QueuedFrame::INTERPOLATED) {
+                       interpolate->release_texture(qf.output_tex);
+                       interpolate->release_texture(qf.cbcr_tex);
+               }
+       }
+
+       // Destroy q outside the mutex; the borrowed resources it holds are
+       // returned to the pool under queue_lock, so destroying it while
+       // holding the lock would deadlock.
+}
+
+void VideoStream::schedule_original_frame(steady_clock::time_point local_pts,
+                                          int64_t output_pts, function<void()> &&display_func,
+                                          QueueSpotHolder &&queue_spot_holder,
+                                          FrameOnDisk frame)
+{
+       fprintf(stderr, "output_pts=%ld  original      input_pts=%ld\n", output_pts, frame.pts);
+
+       // Preload the file from disk, so that the encoder thread does not get stalled.
+       // TODO: Consider sending it through the queue instead.
+       (void)frame_reader.read_frame(frame);
+
+       QueuedFrame qf;
+       qf.local_pts = local_pts;
+       qf.type = QueuedFrame::ORIGINAL;
+       qf.output_pts = output_pts;
+       qf.frame1 = frame;
+       qf.display_func = move(display_func);
+       qf.queue_spot_holder = move(queue_spot_holder);
+
+       unique_lock<mutex> lock(queue_lock);
+       frame_queue.push_back(move(qf));
+       queue_changed.notify_all();
+}
+
+void VideoStream::schedule_faded_frame(steady_clock::time_point local_pts, int64_t output_pts,
+                                       function<void()> &&display_func,
+                                       QueueSpotHolder &&queue_spot_holder,
+                                       FrameOnDisk frame1_spec, FrameOnDisk frame2_spec,
+                                       float fade_alpha)
+{
+       fprintf(stderr, "output_pts=%ld  faded         input_pts=%ld,%ld  fade_alpha=%.2f\n", output_pts, frame1_spec.pts, frame2_spec.pts, fade_alpha);
+
+       // Get the temporary OpenGL resources we need for doing the fade.
+       // (We share these with interpolated frames, which is slightly
+       // overkill, but there's no need to waste resources on keeping
+       // separate pools around.)
+       BorrowedInterpolatedFrameResources resources;
+       {
+               unique_lock<mutex> lock(queue_lock);
+               if (interpolate_resources.empty()) {
+                       fprintf(stderr, "WARNING: Too many interpolated frames already in transit; dropping one.\n");
+                       return;
+               }
+               resources = BorrowedInterpolatedFrameResources(interpolate_resources.front().release());
+               interpolate_resources.pop_front();
+       }
+
+       bool did_decode;
+
+       shared_ptr<Frame> frame1 = decode_jpeg_with_cache(frame1_spec, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode);
+       shared_ptr<Frame> frame2 = decode_jpeg_with_cache(frame2_spec, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode);
+
+       ycbcr_semiplanar_converter->prepare_chain_for_fade(frame1, frame2, fade_alpha)->render_to_fbo(resources->fade_fbo, 1280, 720);
+
+       QueuedFrame qf;
+       qf.local_pts = local_pts;
+       qf.type = QueuedFrame::FADED;
+       qf.output_pts = output_pts;
+       qf.frame1 = frame1_spec;
+       qf.display_func = move(display_func);
+       qf.queue_spot_holder = move(queue_spot_holder);
+
+       qf.secondary_frame = frame2_spec;
+
+       // Subsample and split Cb/Cr.
+       chroma_subsampler->subsample_chroma(resources->fade_cbcr_output_tex, 1280, 720, resources->cb_tex, resources->cr_tex);
+
+       // Read it down (asynchronously) to the CPU.
+       glPixelStorei(GL_PACK_ROW_LENGTH, 0);
+       glBindBuffer(GL_PIXEL_PACK_BUFFER, resources->pbo);
+       check_error();
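+       // The PBO is laid out as full-resolution Y', then half-width Cb,
+       // then Cr. Note that the size argument to glGetTextureImage() is
+       // the capacity remaining in the buffer from the given offset, not
+       // the number of bytes actually written.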
+       glGetTextureImage(resources->fade_y_output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
+       check_error();
+       glGetTextureImage(resources->cb_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
+       check_error();
+       glGetTextureImage(resources->cr_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3 - 640 * 720, BUFFER_OFFSET(1280 * 720 + 640 * 720));
+       check_error();
+       glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+
+       // Set a fence we can wait for to make sure the CPU sees the read.
+       glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
+       check_error();
+       qf.fence = RefCountedGLsync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
+       check_error();
+       qf.resources = move(resources);
+       qf.local_pts = local_pts;
+
+       unique_lock<mutex> lock(queue_lock);
+       frame_queue.push_back(move(qf));
+       queue_changed.notify_all();
+}
+
+void VideoStream::schedule_interpolated_frame(steady_clock::time_point local_pts,
+                                              int64_t output_pts, function<void(shared_ptr<Frame>)> &&display_func,
+                                              QueueSpotHolder &&queue_spot_holder,
+                                              FrameOnDisk frame1, FrameOnDisk frame2,
+                                              float alpha, FrameOnDisk secondary_frame, float fade_alpha)
+{
+       if (secondary_frame.pts != -1) {
+               fprintf(stderr, "output_pts=%ld  interpolated  input_pts1=%ld input_pts2=%ld alpha=%.3f  secondary_pts=%ld  fade_alpha=%.2f\n", output_pts, frame1.pts, frame2.pts, alpha, secondary_frame.pts, fade_alpha);
+       } else {
+               fprintf(stderr, "output_pts=%ld  interpolated  input_pts1=%ld input_pts2=%ld alpha=%.3f\n", output_pts, frame1.pts, frame2.pts, alpha);
+       }
+
+       // Get the temporary OpenGL resources we need for doing the interpolation.
+       BorrowedInterpolatedFrameResources resources;
+       {
+               unique_lock<mutex> lock(queue_lock);
+               if (interpolate_resources.empty()) {
+                       fprintf(stderr, "WARNING: Too many interpolated frames already in transit; dropping one.\n");
+                       return;
+               }
+               resources = BorrowedInterpolatedFrameResources(interpolate_resources.front().release());
+               interpolate_resources.pop_front();
+       }
+
+       QueuedFrame qf;
+       qf.type = (secondary_frame.pts == -1) ? QueuedFrame::INTERPOLATED : QueuedFrame::FADED_INTERPOLATED;
+       qf.output_pts = output_pts;
+       qf.display_decoded_func = move(display_func);
+       qf.queue_spot_holder = move(queue_spot_holder);
+       qf.local_pts = local_pts;
+
+       check_error();
+
+       // Convert frame1 and frame2 to OpenGL textures.
+       for (size_t frame_no = 0; frame_no < 2; ++frame_no) {
+               FrameOnDisk frame_spec = frame_no == 1 ? frame2 : frame1;
+               bool did_decode;
+               shared_ptr<Frame> frame = decode_jpeg_with_cache(frame_spec, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode);
+               ycbcr_converter->prepare_chain_for_conversion(frame)->render_to_fbo(resources->input_fbos[frame_no], 1280, 720);
+       }
+
+       glGenerateTextureMipmap(resources->input_tex);
+       check_error();
+       glGenerateTextureMipmap(resources->gray_tex);
+       check_error();
+
+       // Compute the interpolated frame.
+       qf.flow_tex = compute_flow->exec(resources->gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
+       check_error();
+
+       if (secondary_frame.pts != -1) {
+               // Fade. First kick off the interpolation.
+               tie(qf.output_tex, ignore) = interpolate_no_split->exec(resources->input_tex, resources->gray_tex, qf.flow_tex, 1280, 720, alpha);
+               check_error();
+
+               // Now decode the image we are fading against.
+               bool did_decode;
+               shared_ptr<Frame> frame2 = decode_jpeg_with_cache(secondary_frame, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode);
+
+               // Then fade against it, putting it into the fade Y' and CbCr textures.
+               ycbcr_semiplanar_converter->prepare_chain_for_fade_from_texture(qf.output_tex, frame2, fade_alpha)->render_to_fbo(resources->fade_fbo, 1280, 720);
+
+               // Subsample and split Cb/Cr.
+               chroma_subsampler->subsample_chroma(resources->fade_cbcr_output_tex, 1280, 720, resources->cb_tex, resources->cr_tex);
+
+               interpolate_no_split->release_texture(qf.output_tex);
+       } else {
+               tie(qf.output_tex, qf.cbcr_tex) = interpolate->exec(resources->input_tex, resources->gray_tex, qf.flow_tex, 1280, 720, alpha);
+               check_error();
+
+               // Subsample and split Cb/Cr.
+               chroma_subsampler->subsample_chroma(qf.cbcr_tex, 1280, 720, resources->cb_tex, resources->cr_tex);
+       }
+
+       // We could have released qf.flow_tex here, but to make sure we don't cause a stall
+       // when trying to reuse it for the next frame, we can just as well hold on to it
+       // and release it only when the readback is done.
+
+       // Read it down (asynchronously) to the CPU.
+       glPixelStorei(GL_PACK_ROW_LENGTH, 0);
+       glBindBuffer(GL_PIXEL_PACK_BUFFER, resources->pbo);
+       check_error();
+       if (secondary_frame.pts != -1) {
+               glGetTextureImage(resources->fade_y_output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
+       } else {
+               glGetTextureImage(qf.output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
+       }
+       check_error();
+       glGetTextureImage(resources->cb_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
+       check_error();
+       glGetTextureImage(resources->cr_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3 - 640 * 720, BUFFER_OFFSET(1280 * 720 + 640 * 720));
+       check_error();
+       glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+
+       // Set a fence we can wait for to make sure the CPU sees the read.
+       glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
+       check_error();
+       qf.fence = RefCountedGLsync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
+       check_error();
+       qf.resources = move(resources);
+
+       unique_lock<mutex> lock(queue_lock);
+       frame_queue.push_back(move(qf));
+       queue_changed.notify_all();
+}
+
+void VideoStream::schedule_refresh_frame(steady_clock::time_point local_pts,
+                                         int64_t output_pts, function<void()> &&display_func,
+                                         QueueSpotHolder &&queue_spot_holder)
+{
+       QueuedFrame qf;
+       qf.type = QueuedFrame::REFRESH;
+       qf.output_pts = output_pts;
+       qf.display_func = move(display_func);
+       qf.queue_spot_holder = move(queue_spot_holder);
+
+       unique_lock<mutex> lock(queue_lock);
+       frame_queue.push_back(move(qf));
+       queue_changed.notify_all();
+}
+
+namespace {
+
+shared_ptr<Frame> frame_from_pbo(void *contents, size_t width, size_t height)
+{
+       size_t chroma_width = width / 2;
+
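+       // The PBO contents follow the readback layout used by the schedule_*
+       // functions: full-resolution Y', then half-width Cb, then Cr.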
+       const uint8_t *y = (const uint8_t *)contents;
+       const uint8_t *cb = (const uint8_t *)contents + width * height;
+       const uint8_t *cr = (const uint8_t *)contents + width * height + chroma_width * height;
+
+       shared_ptr<Frame> frame(new Frame);
+       frame->y.reset(new uint8_t[width * height]);
+       frame->cb.reset(new uint8_t[chroma_width * height]);
+       frame->cr.reset(new uint8_t[chroma_width * height]);
+       for (unsigned yy = 0; yy < height; ++yy) {
+               memcpy(frame->y.get() + width * yy, y + width * yy, width);
+               memcpy(frame->cb.get() + chroma_width * yy, cb + chroma_width * yy, chroma_width);
+               memcpy(frame->cr.get() + chroma_width * yy, cr + chroma_width * yy, chroma_width);
+       }
+       frame->is_semiplanar = false;
+       frame->width = width;
+       frame->height = height;
+       frame->chroma_subsampling_x = 2;
+       frame->chroma_subsampling_y = 1;
+       frame->pitch_y = width;
+       frame->pitch_chroma = chroma_width;
+       return frame;
+}
+
+}  // namespace
+
+void VideoStream::encode_thread_func()
+{
+       pthread_setname_np(pthread_self(), "VideoStream");
+       QSurface *surface = create_surface();
+       QOpenGLContext *context = create_context(surface);
+       bool ok = make_current(context, surface);
+       if (!ok) {
+               fprintf(stderr, "Video stream couldn't get an OpenGL context\n");
+               exit(1);
+       }
+
+       for ( ;; ) {
+               QueuedFrame qf;
+               {
+                       unique_lock<mutex> lock(queue_lock);
+
+                       // Wait until we have a frame to play.
+                       queue_changed.wait(lock, [this]{
+                               return !frame_queue.empty();
+                       });
+                       steady_clock::time_point frame_start = frame_queue.front().local_pts;
+
+                       // Now sleep until the frame is supposed to start (the usual case),
+                       // _or_ clear_queue() happened.
+                       bool aborted = queue_changed.wait_until(lock, frame_start, [this, frame_start]{
+                               return frame_queue.empty() || frame_queue.front().local_pts != frame_start;
+                       });
+                       if (aborted) {
+                               // clear_queue() happened, so don't play this frame after all.
+                               continue;
+                       }
+                       qf = move(frame_queue.front());
+                       frame_queue.pop_front();
+               }
+
+               if (qf.type == QueuedFrame::ORIGINAL) {
+                       // Send the JPEG frame on, unchanged.
+                       string jpeg = frame_reader.read_frame(qf.frame1);
+                       AVPacket pkt;
+                       av_init_packet(&pkt);
+                       pkt.stream_index = 0;
+                       pkt.data = (uint8_t *)jpeg.data();
+                       pkt.size = jpeg.size();
+                       stream_mux->add_packet(pkt, qf.output_pts, qf.output_pts);
+
+                       last_frame.assign(&jpeg[0], &jpeg[0] + jpeg.size());
+               } else if (qf.type == QueuedFrame::FADED) {
+                       glClientWaitSync(qf.fence.get(), /*flags=*/0, GL_TIMEOUT_IGNORED);
+
+                       shared_ptr<Frame> frame = frame_from_pbo(qf.resources->pbo_contents, 1280, 720);
+
+                       // Now JPEG encode it, and send it on to the stream.
+                       vector<uint8_t> jpeg = encode_jpeg(frame->y.get(), frame->cb.get(), frame->cr.get(), 1280, 720);
+
+                       AVPacket pkt;
+                       av_init_packet(&pkt);
+                       pkt.stream_index = 0;
+                       pkt.data = (uint8_t *)jpeg.data();
+                       pkt.size = jpeg.size();
+                       stream_mux->add_packet(pkt, qf.output_pts, qf.output_pts);
+                       last_frame = move(jpeg);
+               } else if (qf.type == QueuedFrame::INTERPOLATED || qf.type == QueuedFrame::FADED_INTERPOLATED) {
+                       glClientWaitSync(qf.fence.get(), /*flags=*/0, GL_TIMEOUT_IGNORED);
+
+                       // Send it on to display.
+                       shared_ptr<Frame> frame = frame_from_pbo(qf.resources->pbo_contents, 1280, 720);
+                       if (qf.display_decoded_func != nullptr) {
+                               qf.display_decoded_func(frame);
+                       }
+
+                       // Now JPEG encode it, and send it on to the stream.
+                       vector<uint8_t> jpeg = encode_jpeg(frame->y.get(), frame->cb.get(), frame->cr.get(), 1280, 720);
+                       compute_flow->release_texture(qf.flow_tex);
+                       if (qf.type != QueuedFrame::FADED_INTERPOLATED) {
+                               interpolate->release_texture(qf.output_tex);
+                               interpolate->release_texture(qf.cbcr_tex);
+                       }
+
+                       AVPacket pkt;
+                       av_init_packet(&pkt);
+                       pkt.stream_index = 0;
+                       pkt.data = (uint8_t *)jpeg.data();
+                       pkt.size = jpeg.size();
+                       stream_mux->add_packet(pkt, qf.output_pts, qf.output_pts);
+                       last_frame = move(jpeg);
+               } else if (qf.type == QueuedFrame::REFRESH) {
+                       AVPacket pkt;
+                       av_init_packet(&pkt);
+                       pkt.stream_index = 0;
+                       pkt.data = (uint8_t *)last_frame.data();
+                       pkt.size = last_frame.size();
+                       stream_mux->add_packet(pkt, qf.output_pts, qf.output_pts);
+               } else {
+                       assert(false);
+               }
+               if (qf.display_func != nullptr) {
+                       qf.display_func();
+               }
+       }
+}
+
+int VideoStream::write_packet2_thunk(void *opaque, uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time)
+{
+       VideoStream *video_stream = (VideoStream *)opaque;
+       return video_stream->write_packet2(buf, buf_size, type, time);
+}
+
+int VideoStream::write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time)
+{
+       if (type == AVIO_DATA_MARKER_SYNC_POINT || type == AVIO_DATA_MARKER_BOUNDARY_POINT) {
+               seen_sync_markers = true;
+       } else if (type == AVIO_DATA_MARKER_UNKNOWN && !seen_sync_markers) {
+               // We don't know if this is a keyframe or not (the muxer could
+               // avoid marking it), so we just have to make the best of it.
+               type = AVIO_DATA_MARKER_SYNC_POINT;
+       }
+
+       if (type == AVIO_DATA_MARKER_HEADER) {
+               stream_mux_header.append((char *)buf, buf_size);
+               global_httpd->set_header(stream_mux_header);
+       } else {
+               global_httpd->add_data((char *)buf, buf_size, type == AVIO_DATA_MARKER_SYNC_POINT, time, AVRational{ AV_TIME_BASE, 1 });
+       }
+       return buf_size;
+}
diff --git a/futatabi/video_stream.h b/futatabi/video_stream.h
new file mode 100644 (file)
index 0000000..736a20f
--- /dev/null
@@ -0,0 +1,147 @@
+#ifndef _VIDEO_STREAM_H
+#define _VIDEO_STREAM_H 1
+
+#include <epoxy/gl.h>
+#include <stdint.h>
+
+extern "C" {
+#include <libavformat/avio.h>
+}
+
+#include "frame_on_disk.h"
+#include "jpeg_frame_view.h"
+#include "ref_counted_gl_sync.h"
+#include "queue_spot_holder.h"
+
+#include <chrono>
+#include <condition_variable>
+#include <deque>
+#include <functional>
+#include <movit/effect_chain.h>
+#include <movit/mix_effect.h>
+#include <movit/ycbcr_input.h>
+#include <mutex>
+#include <string>
+#include <thread>
+
+class ChromaSubsampler;
+class DISComputeFlow;
+class Interpolate;
+class Mux;
+class QSurface;
+class QSurfaceFormat;
+class YCbCrConverter;
+
+class VideoStream {
+public:
+       VideoStream();
+       ~VideoStream();
+       void start();
+       void stop();
+       void clear_queue();
+
+       // “display_func” is called after the frame has been calculated (if needed)
+       // and has gone out to the stream.
+       void schedule_original_frame(std::chrono::steady_clock::time_point,
+                                    int64_t output_pts, std::function<void()> &&display_func,
+                                    QueueSpotHolder &&queue_spot_holder,
+                                    FrameOnDisk frame);
+       void schedule_faded_frame(std::chrono::steady_clock::time_point, int64_t output_pts,
+                                 std::function<void()> &&display_func,
+                                 QueueSpotHolder &&queue_spot_holder,
+                                 FrameOnDisk frame1, FrameOnDisk frame2,
+                                 float fade_alpha);
+       void schedule_interpolated_frame(std::chrono::steady_clock::time_point, int64_t output_pts,
+                                 std::function<void(std::shared_ptr<Frame>)> &&display_func,
+                                 QueueSpotHolder &&queue_spot_holder,
+                                 FrameOnDisk frame1, FrameOnDisk frame2,
+                                 float alpha, FrameOnDisk secondary_frame = {},  // Empty = no secondary (fade) frame.
+                                 float fade_alpha = 0.0f);
+       void schedule_refresh_frame(std::chrono::steady_clock::time_point, int64_t output_pts,
+                                   std::function<void()> &&display_func,
+                                   QueueSpotHolder &&queue_spot_holder);
+
+private:
+       FrameReader frame_reader;
+
+       void encode_thread_func();
+       std::thread encode_thread;
+
+       static int write_packet2_thunk(void *opaque, uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
+       int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
+
+       // Allocated at the very start; if we're empty, we start dropping frames
+       // (so that we don't build up an infinite interpolation backlog).
+       struct InterpolatedFrameResources {
+               VideoStream *owner;  // Used only for IFRReleaser, below.
+
+               GLuint input_tex;  // Layered (contains both input frames), Y'CbCr.
+               GLuint gray_tex;  // Same, but Y only.
+               GLuint input_fbos[2];  // For rendering to the two layers of input_tex.
+
+               // Destination textures and FBO if there is a fade.
+               GLuint fade_y_output_tex, fade_cbcr_output_tex;
+               GLuint fade_fbo;
+
+               GLuint cb_tex, cr_tex;  // Subsampled, final output.
+
+               GLuint pbo;  // For reading the data back.
+               void *pbo_contents;  // Persistently mapped.
+       };
+       std::mutex queue_lock;
+       std::deque<std::unique_ptr<InterpolatedFrameResources>> interpolate_resources;  // Under <queue_lock>.
+       static constexpr size_t num_interpolate_slots = 15;  // Should be larger than Player::max_queued_frames, or we risk mass-dropping frames.
+
+       struct IFRReleaser {
+               void operator() (InterpolatedFrameResources *ifr) const
+               {
+                       if (ifr != nullptr) {
+                               std::unique_lock<std::mutex> lock(ifr->owner->queue_lock);
+                               ifr->owner->interpolate_resources.emplace_back(ifr);
+                       }
+               }
+       };
+       using BorrowedInterpolatedFrameResources = std::unique_ptr<InterpolatedFrameResources, IFRReleaser>;
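+       // The schedule_* functions “check out” a resource set from
+       // interpolate_resources; IFRReleaser returns it to the pool
+       // automatically when the owning QueuedFrame is destroyed.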
+
+       struct QueuedFrame {
+               std::chrono::steady_clock::time_point local_pts;
+
+               int64_t output_pts;
+               enum Type { ORIGINAL, FADED, INTERPOLATED, FADED_INTERPOLATED, REFRESH } type;
+               FrameOnDisk frame1;  // The only frame for original frames.
+
+               // For fades only (including fades against interpolated frames).
+               FrameOnDisk secondary_frame;
+
+               // For interpolated frames only.
+               FrameOnDisk frame2;
+               float alpha;
+               BorrowedInterpolatedFrameResources resources;
+               RefCountedGLsync fence;  // Set when the interpolated image is read back to the CPU.
+               GLuint flow_tex, output_tex, cbcr_tex;  // Released in the receiving thread; not really used for anything else.
+               FrameOnDisk id;
+
+               std::function<void()> display_func;  // Called when the image is done decoding.
+               std::function<void(std::shared_ptr<Frame>)> display_decoded_func;  // Same, except for INTERPOLATED and FADED_INTERPOLATED.
+
+               QueueSpotHolder queue_spot_holder;
+       };
+       std::deque<QueuedFrame> frame_queue;  // Under <queue_lock>.
+       std::condition_variable queue_changed;
+
+       std::unique_ptr<Mux> stream_mux;  // To HTTP.
+       std::string stream_mux_header;
+       bool seen_sync_markers = false;
+
+       std::unique_ptr<YCbCrConverter> ycbcr_converter;
+       std::unique_ptr<YCbCrConverter> ycbcr_semiplanar_converter;
+
+       // Frame interpolation.
+       std::unique_ptr<DISComputeFlow> compute_flow;
+       std::unique_ptr<Interpolate> interpolate, interpolate_no_split;
+       std::unique_ptr<ChromaSubsampler> chroma_subsampler;
+
+       std::vector<uint8_t> last_frame;
+};
+
+#endif  // !defined(_VIDEO_STREAM_H)
diff --git a/futatabi/vis.cpp b/futatabi/vis.cpp
new file mode 100644 (file)
index 0000000..c67a0cc
--- /dev/null
@@ -0,0 +1,35 @@
+// Visualize a .flo file.
+
+#include "util.h"
+
+#include <assert.h>
+#include <memory>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+
+int main(int argc, char **argv)
+{
+       if (argc != 3) {
+               fprintf(stderr, "Usage: ./vis input.flo out.ppm\n");
+               exit(1);
+       }
+
+       Flow flow = read_flow(argv[1]);
+
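+       // Write the visualization as a PPM image: P6 is the binary 8-bit RGB
+       // flavor, with the header "P6\n<width> <height>\n<maxval>\n" followed
+       // by raw RGB triples.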
+       FILE *fp = fopen(argv[2], "wb");
+       if (fp == nullptr) {
+               perror(argv[2]);
+               exit(1);
+       }
+       fprintf(fp, "P6\n%d %d\n255\n", flow.width, flow.height);
+       for (unsigned y = 0; y < unsigned(flow.height); ++y) {
+               for (unsigned x = 0; x < unsigned(flow.width); ++x) {
+                       float du = flow.flow[y * flow.width + x].du;
+                       float dv = flow.flow[y * flow.width + x].dv;
+
+                       uint8_t r, g, b;
+                       flow2rgb(du, dv, &r, &g, &b);
+                       putc(r, fp);
+                       putc(g, fp);
+                       putc(b, fp);
+               }
+       }
+       fclose(fp);
+}
diff --git a/futatabi/vs.vert b/futatabi/vs.vert
new file mode 100644 (file)
index 0000000..61ad91a
--- /dev/null
@@ -0,0 +1,20 @@
+#version 450 core
+#extension GL_ARB_shader_viewport_layer_array : require
+
+layout(location=0) in vec2 position;
+out vec3 tc;
+
+void main()
+{
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);
+       tc.xy = position;
+       tc.z = gl_InstanceID;
+
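+       // Writing gl_Layer from the vertex shader (via
+       // GL_ARB_shader_viewport_layer_array) sends instance N to layer N
+       // of the layered framebuffer, so a single instanced draw can fill
+       // both layers without needing a geometry shader.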
+       gl_Layer = gl_InstanceID;
+}
diff --git a/futatabi/ycbcr_converter.cpp b/futatabi/ycbcr_converter.cpp
new file mode 100644 (file)
index 0000000..694ba97
--- /dev/null
@@ -0,0 +1,189 @@
+#include "ycbcr_converter.h"
+
+#include "jpeg_frame.h"
+
+#include <movit/mix_effect.h>
+#include <movit/ycbcr_input.h>
+
+using namespace std;
+using namespace movit;
+
+namespace {
+
+void setup_outputs(YCbCrConverter::OutputMode output_mode, const ImageFormat &output_format, const YCbCrFormat &ycbcr_output_format, EffectChain *chain)
+{
+       if (output_mode == YCbCrConverter::OUTPUT_TO_RGBA) {
+               chain->add_output(output_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+               chain->set_output_origin(OUTPUT_ORIGIN_BOTTOM_LEFT);
+       } else if (output_mode == YCbCrConverter::OUTPUT_TO_SEMIPLANAR) {
+               chain->add_ycbcr_output(output_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR);
+               chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
+       } else {
+               assert(output_mode == YCbCrConverter::OUTPUT_TO_DUAL_YCBCR);
+
+               // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the
+               // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way
+               // of getting the gray data into a layered texture.
+               chain->add_ycbcr_output(output_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+               chain->add_ycbcr_output(output_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+               chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
+       }
+}
+
+}  // namespace
+
+YCbCrConverter::YCbCrConverter(YCbCrConverter::OutputMode output_mode, ResourcePool *resource_pool)
+{
+       ImageFormat inout_format;
+       inout_format.color_space = COLORSPACE_sRGB;
+       inout_format.gamma_curve = GAMMA_sRGB;
+
+       ycbcr_format.luma_coefficients = YCBCR_REC_709;
+       ycbcr_format.num_levels = 256;
+       ycbcr_format.chroma_subsampling_x = 2;
+       ycbcr_format.chroma_subsampling_y = 1;
+       ycbcr_format.cb_x_position = 0.0f;  // H.264 -- _not_ JPEG, even though our input is MJPEG-encoded
+       ycbcr_format.cb_y_position = 0.5f;  // Irrelevant.
+       ycbcr_format.cr_x_position = 0.0f;
+       ycbcr_format.cr_y_position = 0.5f;
+
+       // This is a hack. Even though we're sending MJPEG around, which is
+       // full-range, it's mostly transporting signals from limited-range
+       // sources with no conversion, so we ought to have had false here.
+       // However, in the off chance that we're actually getting real MJPEG,
+       // we don't want to crush its blacks (or whites) by clamping. All of
+       // our processing is fades, so if we're in limited-range input, we'll
+       // stay in limited-range output. (Fading between limited-range and
+       // full-range sources will be broken, of course.) There will be some
+       // slight confusion in the parts of the algorithms dealing with RGB,
+       // but they're small and we'll manage.
+       ycbcr_format.full_range = true;
+
+       YCbCrFormat ycbcr_output_format = ycbcr_format;
+       ycbcr_output_format.chroma_subsampling_x = 1;
+
+       // Planar Y'CbCr decoding chain.
+       planar_chain.reset(new EffectChain(1280, 720, resource_pool));
+       ycbcr_planar_input = (YCbCrInput *)planar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR));
+       setup_outputs(output_mode, inout_format, ycbcr_output_format, planar_chain.get());
+       planar_chain->set_dither_bits(8);
+       planar_chain->finalize();
+
+       // Semiplanar Y'CbCr decoding chain (for images coming from VA-API).
+       semiplanar_chain.reset(new EffectChain(1280, 720, resource_pool));
+       ycbcr_semiplanar_input = (YCbCrInput *)semiplanar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+       setup_outputs(output_mode, inout_format, ycbcr_output_format, semiplanar_chain.get());
+       semiplanar_chain->set_dither_bits(8);
+       semiplanar_chain->finalize();
+
+       // Fade chains.
+       for (bool first_input_is_semiplanar : { false, true }) {
+               for (bool second_input_is_semiplanar : { false, true }) {
+                       FadeChain &fade_chain = fade_chains[first_input_is_semiplanar][second_input_is_semiplanar];
+                       fade_chain.chain.reset(new EffectChain(1280, 720, resource_pool));
+                       fade_chain.input[0] = (movit::YCbCrInput *)fade_chain.chain->add_input(
+                               new YCbCrInput(inout_format, ycbcr_format, 1280, 720,
+                                       first_input_is_semiplanar ? YCBCR_INPUT_SPLIT_Y_AND_CBCR : YCBCR_INPUT_PLANAR));
+                       fade_chain.input[1] = (movit::YCbCrInput *)fade_chain.chain->add_input(
+                               new YCbCrInput(inout_format, ycbcr_format, 1280, 720,
+                                       second_input_is_semiplanar ? YCBCR_INPUT_SPLIT_Y_AND_CBCR : YCBCR_INPUT_PLANAR));
+                       fade_chain.mix_effect = (movit::MixEffect *)fade_chain.chain->add_effect(
+                               new MixEffect, fade_chain.input[0], fade_chain.input[1]);
+                       setup_outputs(output_mode, inout_format, ycbcr_output_format, fade_chain.chain.get());
+                       fade_chain.chain->set_dither_bits(8);
+                       fade_chain.chain->finalize();
+               }
+       }
+
+       // Fade from interleaved chain (ie., first input is interleaved, since it comes
+       // directly from the GPU anyway).
+       for (bool second_input_is_semiplanar : { false, true }) {
+               FadeChain &fade_chain = interleaved_fade_chains[second_input_is_semiplanar];
+               fade_chain.chain.reset(new EffectChain(1280, 720, resource_pool));
+
+               ycbcr_format.chroma_subsampling_x = 1;
+               fade_chain.input[0] = (movit::YCbCrInput *)fade_chain.chain->add_input(
+                       new YCbCrInput(inout_format, ycbcr_format, 1280, 720,
+                               YCBCR_INPUT_INTERLEAVED));
+
+               ycbcr_format.chroma_subsampling_x = 2;
+               fade_chain.input[1] = (movit::YCbCrInput *)fade_chain.chain->add_input(
+                       new YCbCrInput(inout_format, ycbcr_format, 1280, 720,
+                               second_input_is_semiplanar ? YCBCR_INPUT_SPLIT_Y_AND_CBCR : YCBCR_INPUT_PLANAR));
+
+               fade_chain.mix_effect = (movit::MixEffect *)fade_chain.chain->add_effect(
+                       new MixEffect, fade_chain.input[0], fade_chain.input[1]);
+               setup_outputs(output_mode, inout_format, ycbcr_output_format, fade_chain.chain.get());
+               fade_chain.chain->set_dither_bits(8);
+               fade_chain.chain->finalize();
+       }
+}
+
+EffectChain *YCbCrConverter::prepare_chain_for_conversion(shared_ptr<Frame> frame)
+{
+       if (frame->is_semiplanar) {
+               setup_input_for_frame(frame, ycbcr_format, ycbcr_semiplanar_input);
+               return semiplanar_chain.get();
+       } else {
+               setup_input_for_frame(frame, ycbcr_format, ycbcr_planar_input);
+               return planar_chain.get();
+       }
+}
+
+EffectChain *YCbCrConverter::prepare_chain_for_fade(shared_ptr<Frame> frame, shared_ptr<Frame> secondary_frame, float fade_alpha)
+{
+       const FadeChain &fade_chain = fade_chains[frame->is_semiplanar][secondary_frame->is_semiplanar];
+       setup_input_for_frame(frame, ycbcr_format, fade_chain.input[0]);
+       setup_input_for_frame(secondary_frame, ycbcr_format, fade_chain.input[1]);
+       bool ok = fade_chain.mix_effect->set_float("strength_first", 1.0f - fade_alpha);
+       ok &= fade_chain.mix_effect->set_float("strength_second", fade_alpha);
+       assert(ok);
+       return fade_chain.chain.get();
+}
+
+EffectChain *YCbCrConverter::prepare_chain_for_fade_from_texture(GLuint tex, std::shared_ptr<Frame> secondary_frame, float fade_alpha)
+{
+       const FadeChain &fade_chain = interleaved_fade_chains[secondary_frame->is_semiplanar];
+       {
+               YCbCrFormat format_copy = ycbcr_format;
+               format_copy.chroma_subsampling_x = 1;
+               format_copy.chroma_subsampling_y = 1;
+               fade_chain.input[0]->change_ycbcr_format(format_copy);
+
+               fade_chain.input[0]->set_width(1280);  // FIXME
+               fade_chain.input[0]->set_height(720);
+               fade_chain.input[0]->set_texture_num(0, tex);
+
+               glTextureParameteri(tex, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+               glTextureParameteri(tex, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+               glTextureParameteri(tex, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
+               glTextureParameteri(tex, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
+       }
+       setup_input_for_frame(secondary_frame, ycbcr_format, fade_chain.input[1]);
+       bool ok = fade_chain.mix_effect->set_float("strength_first", 1.0f - fade_alpha);
+       ok &= fade_chain.mix_effect->set_float("strength_second", fade_alpha);
+       assert(ok);
+       return fade_chain.chain.get();
+}
+
+void setup_input_for_frame(shared_ptr<Frame> frame, const YCbCrFormat &ycbcr_format, YCbCrInput *input)
+{
+       YCbCrFormat format_copy = ycbcr_format;
+       format_copy.chroma_subsampling_x = frame->chroma_subsampling_x;
+       format_copy.chroma_subsampling_y = frame->chroma_subsampling_y;
+       input->change_ycbcr_format(format_copy);
+
+       input->set_width(frame->width);
+       input->set_height(frame->height);
+       input->set_pixel_data(0, frame->y.get());
+       input->set_pitch(0, frame->pitch_y);
+       if (frame->is_semiplanar) {
+               input->set_pixel_data(1, frame->cbcr.get());
+               input->set_pitch(1, frame->pitch_chroma);
+       } else {
+               input->set_pixel_data(1, frame->cb.get());
+               input->set_pixel_data(2, frame->cr.get());
+               input->set_pitch(1, frame->pitch_chroma);
+               input->set_pitch(2, frame->pitch_chroma);
+       }
+}
diff --git a/futatabi/ycbcr_converter.h b/futatabi/ycbcr_converter.h
new file mode 100644 (file)
index 0000000..459377c
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _YCBCR_CONVERTER_H
+#define _YCBCR_CONVERTER_H 1
+
+#include <epoxy/gl.h>
+#include <memory>
+#include <movit/ycbcr_input.h>
+
+namespace movit {
+
+class EffectChain;
+class MixEffect;
+class ResourcePool;
+struct YCbCrFormat;
+
+}  // namespace movit
+
+struct Frame;
+
+class YCbCrConverter {
+public:
+       enum OutputMode {
+               OUTPUT_TO_RGBA,         // One texture (bottom-left origin): RGBA
+               OUTPUT_TO_SEMIPLANAR,   // Two textures (top-left origin):   Y, CbCr
+               OUTPUT_TO_DUAL_YCBCR    // Two textures (top-left origin):   Y'CbCr, Y'CbCr
+       };
+       YCbCrConverter(OutputMode output_mode, movit::ResourcePool *resource_pool);
+
+       // Returns the appropriate chain for rendering.
+       movit::EffectChain *prepare_chain_for_conversion(std::shared_ptr<Frame> frame);
+       movit::EffectChain *prepare_chain_for_fade(std::shared_ptr<Frame> frame, std::shared_ptr<Frame> secondary_frame, float fade_alpha);
+
+       // <tex> must be interleaved Y'CbCr.
+       movit::EffectChain *prepare_chain_for_fade_from_texture(GLuint tex, std::shared_ptr<Frame> secondary_frame, float fade_alpha);
+
+private:
+       movit::YCbCrFormat ycbcr_format;
+
+       // Effectively only converts from 4:2:2 to 4:4:4.
+       // TODO: Have a separate version with ResampleEffect, for scaling?
+       std::unique_ptr<movit::EffectChain> planar_chain, semiplanar_chain;
+       movit::YCbCrInput *ycbcr_planar_input, *ycbcr_semiplanar_input;
+
+       // These do fades, parametrized on whether the two inputs are planar
+       // or semiplanar.
+       struct FadeChain {
+               std::unique_ptr<movit::EffectChain> chain;
+               movit::YCbCrInput *input[2];
+               movit::MixEffect *mix_effect;
+       };
+       FadeChain fade_chains[2][2];
+
+       // These do fades, where the first input is interleaved and the second is
+       // either planar or semiplanar.
+       FadeChain interleaved_fade_chains[2];
+};
+
+// TODO: make private
+void setup_input_for_frame(std::shared_ptr<Frame> frame, const movit::YCbCrFormat &ycbcr_format, movit::YCbCrInput *input);
+
+#endif  // !defined(_YCBCR_CONVERTER_H)
diff --git a/make-example-video.sh b/make-example-video.sh
new file mode 100644 (file)
index 0000000..975c7a4
--- /dev/null
@@ -0,0 +1,8 @@
+youtube-dl 'https://www.youtube.com/watch?v=Wa2fBiCEzTc'
+FILE='MULTI ANGLE _ George Evans pops up with a 92nd-minute winner in Blackburn!-Wa2fBiCEzTc.mp4'
+ffmpeg -y -ss 0:03.290 -t 0:37 -i "$FILE" -c:v mjpeg -an angle1.mp4
+ffmpeg -y -ss 0:40 -t 0:40 -i "$FILE" -c:v mjpeg -an angle2.mp4
+ffmpeg -y -ss 1:12.880 -i "$FILE" -c:v mjpeg -an angle3.mp4
+ffmpeg -y -ss 0:07 -i ./angle3.mp4 -c:v copy -copyts -start_at_zero angle3-cut.mp4
+ffmpeg -y -copyts -i angle1.mp4 -i angle2.mp4 -i angle3-cut.mp4 -map 0:0 -map 1:0 -map 2:0 -c:v copy multiangle.mp4
+
index 15184b878e0298c647f8e78b99cb03a2f933567e..83570057d161a75573e691062b4c372cba7e2b5e 100644 (file)
@@ -1,3 +1,11 @@
 project('nageru', 'cpp', default_options: ['buildtype=debugoptimized'])
-subdir('nageru')
 
+# Add the right MOVIT_SHADER_DIR definition.
+r = run_command('pkg-config', '--variable=shaderdir', 'movit')
+if r.returncode() != 0
+       error('Movit pkg-config installation is broken.')
+endif
+add_project_arguments('-DMOVIT_SHADER_DIR="' + r.stdout().strip() + '"', language: 'cpp')
+
+subdir('nageru')
+subdir('futatabi')
index bc93a2839c35992353665a9c54e9cccf636316e8..70d5ab77a41a770b8458e8be7a4a78d5adc87b8d 100644 (file)
@@ -61,13 +61,6 @@ if cxx.has_argument('-Wno-deprecated-declarations')
        add_project_arguments('-Wno-deprecated-declarations', language: 'cpp')
 endif
 
-# Add the right MOVIT_SHADER_DIR definition.
-r = run_command('pkg-config', '--variable=shaderdir', 'movit')
-if r.returncode() != 0
-       error('Movit pkg-config installation is broken.')
-endif
-add_project_arguments('-DMOVIT_SHADER_DIR="' + r.stdout().strip() + '"', language: 'cpp')
-
 # CEF.
 exe_dir = join_paths(get_option('prefix'), 'lib/nageru')
 cef_dir = get_option('cef_dir')
diff --git a/variational_refinement.txt b/variational_refinement.txt
new file mode 100644 (file)
index 0000000..0392011
--- /dev/null
@@ -0,0 +1,537 @@
+Variational refinement -- an introduction and derivation
+
+The variational refinement is probably the most difficult part of the
+algorithm to understand, in part because the descriptions in most papers
+are very heavy on notation and rather light on exposition. I've tried
+to give a somewhat friendlier introduction to this specific algorithm
+below.
+
+The general idea is fairly simple; we try to optimize the flow field
+as a whole, by minimizing some mathematical notion of badness expressed
+as an energy function. The one used in the dense inverse search paper
+[Kroeger16; see references below] has this form:
+
+  E(U) = int( σ Ψ(E_I) + γ Ψ(E_G) + α Ψ(E_S) ) dx
+
+where Ψ(a²) = sqrt(a² + ε²) for some small constant ε = 0.001, and
+σ, γ, α are empirically set weighting constants. (We'll get to what the
+different energy terms are in a minute.) The integral is, for all practical
+purposes, just a sum over all the pixels in the flow.
+
+In general, such formulas are nonconvex and highly nonlinear, so we
+cannot hope to find a global minimum -- but if we start from the flow
+generated by the motion search, we can at least hope to make it somehow
+better by walking towards a local minimum. (In fact, there are many
+methods for optical flow that work _only_ by such minimization,
+so the word “refinement” is maybe not doing the method justice.
+One could just as well say that the motion search is a way of
+finding a reasonable starting point for the optimization.)
+
+The dense inverse search paper [Kroeger16] sets
+up the energy terms as described by some motion tensors and normalizations,
+then says simply that it is optimized by “θ_vo fixed point iterations
+and θ_vi iterations of Successive Over Relaxation (SOR) for the linear
+system”. It's not immediately obvious what this means, but it gives
+a reference to [Brox04]. However, that paper describes a numerical
+approximation scheme that is _far_ more complicated than what the DIS
+code actually does.
+
+Rather, one must look at the other main reference they are giving,
+which is [Weinzaepfel13], describing a system called DeepFlow.
+DIS borrows most of the exposition and code for its variational
+refinement from DeepFlow, just removing some terms and fixing up
+a few issues here and there. (There are some slight differences in
+the paper, like the use of ∂z instead of ∂t, but that looks mostly
+like an error to me.) Unfortunately, that paper in turn refers to
+[Brox11], which to me appears no more useful in clearing up the
+notation.
+
+However, digging down in the references, finally one finds [Zimmer11],
+which is where the tensor notation appears to come from. This allows
+us to look at the first term in the energy, E_I, which comes from the
+intensity constancy assumption. The basic idea is that optical flow,
+nearly by definition, should preserve intensity after the warp:
+
+  I_0(x + u) = I_1(x) 
+
+where I_0 is the first picture, I_1 is the second, x is any 2D
+coordinate and u is the flow at x (which we are optimizing over).
+In general, we'll be optimizing over the entire field of u
+(potentially hundreds of thousands of values), but we'll be looking
+mostly at individual points, so we'll skip the coordinates when we
+can (e.g. we write u instead of u(x, y)). u is of course the 2D
+flow, although often, we'll write its components separately as u and v
+instead of as a vector u.
+
+Before we go further, we need to add some more notation:
+
+  * I_x is the partial derivative of I with respect to x (at some
+    point), and similarly for I_y. These do not depend on u,
+    so they can be precalculated before the optimization.
+  * I_xx is the double partial derivative of I, and similarly for
+    I_yy and I_xy (the latter is the same as I_yx).
+  * I_t is the temporal derivative of I, ie. in practice just
+    I_t(x) = I_1(x) - I_0(x).
+
+Returning now to our original assertion:
+
+  I_0(x + u) = I_1(x)
+
+Classically in optical flow, one assumes that the flow is smooth
+and linear around the point x, which allows one to approximate this
+equation by its first-order Taylor expansion:
+
+  I_x u + I_y v + I_t = 0
+
+This is usually simply called “the optical flow constraint”,
+and gives rise to a very natural part of the energy:
+
+  E_I = I_x u + I_y v + I_t
+
+Remember that we send E_I through the function Ψ(a²) = sqrt(a² + ε²),
+so clearly Ψ(E_I) will be minimized if indeed E_I is zero.
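+
+(A quick numeric example of what Ψ does: with ε = 0.001, we get Ψ(0) = 0.001,
+while Ψ(0.1²) = sqrt(0.01 + 0.000001) ≈ 0.100005. So away from zero, Ψ behaves
+essentially like abs(E_I); it's a smooth, differentiable stand-in for the
+absolute value, which penalizes outliers less harshly than squaring would.)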
+
+At this point, many papers start talking about Euler-Lagrange
+multivariate equations, which is a fairly daunting concept
+(at least the Wikipedia page is suitable for scaring small children).
+However, for the first two terms, we don't need its general form,
+and it reduces to something much simpler; just differentiate the energy
+by u and equate the result to zero (finding some minimum; it can't be
+a maximum, since *wave hands intensely*). Then differentiate the energy
+by v and set that to zero, too; now you have two equations in two
+unknowns (or, since we're optimizing over a field, maybe 500k
+equations in 500k unknowns -- although the equation set will be
+very sparse), which is hopefully solvable using linear methods.
+We'll look at what this gives for E_I in a moment, then try to apply
+the same notions to E_G and E_S later.
+
+First we modify E_I a bit by adding some normalization:
+
+  E_I = β_0 (I_x u + I_y v + I_t)
+
+where β_0 = 1/(abs(∇I)² + 0.01). Note that β_0 depends on I only,
+so for the purposes of optimizing u, it's a constant and can be
+precomputed across I. (β_0 will, of course, depend on x, but so
+do all the other terms in the equation.)
+
+Now we give it to Maple, differentiating first by u and then by v:
+
+> M := (u,v) -> B_0 * (I_x * u + I_y * v + I_t);
+                   M := (u, v) -> B_0 (I_x u + I_y v + I_t)
+
+> diff(sqrt(M(u,v)^2 + e), u);                  
+                           2
+                        B_0  (I_x u + I_y v + I_t) I_x
+                     ------------------------------------
+                         2                      2     1/2
+                     (B_0  (I_x u + I_y v + I_t)  + e)
+
+> diff(sqrt(M(u,v)^2 + e), v);
+                           2
+                        B_0  (I_x u + I_y v + I_t) I_y
+                     ------------------------------------
+                         2                      2     1/2
+                     (B_0  (I_x u + I_y v + I_t)  + e)
+
+
+So these are the two expressions to be set to zero (for each
+point). We'll notice immediately that this isn't very linear
+in u and v, so here's where the “fixed point iterations” come in;
+we simply assume that our previous values for u and v are
+approximately good enough for the denominator, and optimize
+them in the numerator only. Then we get new values that are
+hopefully a bit closer, which we can then use for the
+denominator, and so on. (This is seemingly an old technique;
+[Brox05] cites [Ciarlet78]. It is justifiable in the sense
+that the only thing really held constant is the derivative
+of the penalizer.) In other words, if we define the constant
+
+  k1 = β_0² / sqrt(β_0² (I_x u' + I_y v' + I_t)² + ε²)
+
+(where u' and v' are the guesses for u and v from the previous
+iteration)
+
+we have the much more manageable
+
+  k1 I_x²    u + k1 I_x I_y v = - k1 I_t I_x
+  k1 I_x I_y u + k1 I_y²    v = - k1 I_t I_y
+
+ie., two linear equations in u and v. Now, you will notice two
+immediate problems with this equation set:
+
+  * The factor k1 is completely useless, since it's just multiplied
+    in everywhere.
+  * The set of equations is collinear (the determinant of the matrix
+    is zero; see the little computation below), and thus there are
+    infinitely many possible solutions. This is the so-called
+    “aperture problem”. It shouldn't be surprising, though, as we
+    cannot expect that starting with a single constraint should allow
+    us to solve for two unknowns.
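+
+(For the record: writing the left-hand side as a matrix,
+
+    | k1 I_x²     k1 I_x I_y |
+    | k1 I_x I_y  k1 I_y²    |
+
+the determinant is k1² (I_x² I_y² - (I_x I_y)²) = 0 identically, no matter
+what the image contents are.)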
+
+However, both problems will go away as soon as we start adding
+more terms, so let's look at the gradient constancy term E_G next.
+It is fairly similar to the brightness constancy term, except it
+uses the (spatial) gradient instead of intensity:
+
+  ∇I_0(x + u) = ∇I_1(x)
+
+or equivalently (by definition):
+
+  (∂I/∂x)_0(x + u) = (∂I/∂x)_1(x)
+  (∂I/∂y)_0(x + u) = (∂I/∂y)_1(x)
+
+The idea is that this is more robust to changes in lighting.
+It doesn't replace the intensity term, but augments it; the weighting
+constants σ and γ control their relative importance. Also note that
+this actually gives us two independent equations, unlike the brightness
+constancy term.
+
+However, it is not obvious at all how to discretize this. In particular,
+most papers, including [Brox04], appear to want _not_ to make any linear
+assumptions about the flow in this case, and end up with tons of terms.
+(The DIS and DeepFlow papers do, again, use some tensor notation that
+I do not understand, but I'm not convinced it actually contains any
+of the discretization.)
+
+Yet more paper searching eventually turns up [Fahad07], which simply
+states that the discretized versions of these equations are:
+
+  I_xx u + I_xy v + I_xt = 0
+  I_yx u + I_yy v + I_yt = 0.
+
+which seems to match well what the DIS code uses. Note that even though
+this is an equation set equal to zero, we can't just solve it directly;
+we need to make (penalized, normalized) energy terms out of it and add
+them to the other terms. This gives
+
+  E_G = β_x (I_xx u + I_xy v + I_xt) + β_y (I_yx u + I_yy v + I_yt)
+
+with normalization terms
+
+  β_x = 1 / (abs(∇(I_x))² + 0.01)  (∇(I_x) is the gradient of ∂I/∂x)
+  β_y = 1 / (abs(∇(I_y))² + 0.01)
+
+(The DIS paper writes ∇I_dx and ∇I_dy instead of ∇I_x and ∇I_y, but I believe
+that's a typo; the DeepFlow paper says ∇I_x and ∇I_y.)
+
+The papers both write that Ψ(E_G) is used, which would mean that the penalized
+term is
+
+  E_G = sqrt((β_x (I_xx u + I_xy v + I_xt) + β_y (I_yx u + I_yy v + I_yt))² + ε²)
+
+but that isn't what the code actually does. Instead, it seems that the two
+terms are squared independently:
+  
+  E_G = sqrt((β_x (I_xx u + I_xy v + I_xt))² + (β_y (I_yx u + I_yy v + I_yt))² + ε²)
+
+Both are solvable just fine, and it probably does not matter all that much
+which we use in practice (although [Zimmer11] suggests that if we are using
+multichannel images, we should penalize the three channels separately),
+but we follow what the code actually does here.
+
+We can differentiate them and equate them to zero as before:
+
+> M_x := (u,v) -> B_x * (I_xx * u + I_xy * v + I_xt);
+                      M_x := (u, v) -> B_x (I_xx u + I_xy v + I_xt)
+
+> M_y := (u,v) -> B_y * (I_xy * u + I_yy * v + I_yt);
+                      M_y := (u, v) -> B_y (I_xy u + I_yy v + I_yt)
+
+> diff(sqrt(M_x(u,v)^2 + M_y(u,v)^2 + e), u);        
+                                     2             2
+       2 (I_xx u + I_xy v + I_xt) B_x  I_xx + 2 B_y  (I_xy u + I_yy v + I_yt) I_xy
+       ---------------------------------------------------------------------------
+                                  2    2      2                         2     1/2
+       2 ((I_xx u + I_xy v + I_xt)  B_x  + B_y  (I_xy u + I_yy v + I_yt)  + e)
+
+> diff(sqrt(M_x(u,v)^2 + M_y(u,v)^2 + e), v);
+                                     2             2
+       2 (I_xx u + I_xy v + I_xt) B_x  I_xy + 2 B_y  (I_xy u + I_yy v + I_yt) I_yy
+       ---------------------------------------------------------------------------
+                                  2    2      2                         2     1/2
+       2 ((I_xx u + I_xy v + I_xt)  B_x  + B_y  (I_xy u + I_yy v + I_yt)  + e)
+
+Using the same fixed-point scheme where we hold the terms in the
+denominator constant and equal to last iteration's values, we get
+a new common constant
+
+  k2 = 1 / sqrt(β_x² (I_xx u' + I_xy v' + I_xt)² + β_y² (I_xy u' + I_yy v' + I_yt)² + ε²)
+
+and for brevity
+
+  k_x = k2 β_x²
+  k_y = k2 β_y²
+
+and thus, collecting terms for u and v, we get the two equations:
+
+  (k_x I_xx² + k_y I_xy²)         u + (k_x I_xx I_xy + k_y I_xy I_yy) v = - k_x I_xx I_xt - k_y I_xy I_yt
+  (k_x I_xx I_xy + k_y I_xy I_yy) u + (k_x I_xy² + k_y I_yy²)         v = - k_x I_xy I_xt - k_y I_yy I_yt
+
+which is linear in u and v, not collinear (unless we are extremely
+unlucky), and can be easily solved.
+
+Of course, for optimizing the weighted sum σ Ψ(E_I) + γ Ψ(E_G),
+we just add the two equation sets pairwise with appropriate weights.
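+
+To make that concrete, here is a minimal CPU sketch of assembling and solving
+the per-pixel 2x2 system; it is not the actual Futatabi code (which does this
+in a fragment shader), and all names are made up for illustration:
+
+  // Accumulates one linearized equation w (cu u + cv v + ct) = 0 into the
+  // 2x2 system A [u v]^T = b for a single pixel.
+  struct System2x2 { float a11 = 0, a12 = 0, a22 = 0, b1 = 0, b2 = 0; };
+
+  void add_equation(System2x2 *s, float w, float cu, float cv, float ct)
+  {
+          s->a11 += w * cu * cu;
+          s->a12 += w * cu * cv;
+          s->a22 += w * cv * cv;
+          s->b1 -= w * cu * ct;
+          s->b2 -= w * cv * ct;
+  }
+
+  // Per pixel: the E_I equation with weight σ k1, then the two E_G rows
+  // with weights γ k_x and γ k_y, exactly as in the equations above:
+  //   add_equation(&s, sigma * k1, I_x, I_y, I_t);
+  //   add_equation(&s, gamma * k_x, I_xx, I_xy, I_xt);
+  //   add_equation(&s, gamma * k_y, I_xy, I_yy, I_yt);
+  // The determinant is now nonzero (unless we are extremely unlucky),
+  // so Cramer's rule applies.
+  void solve(const System2x2 &s, float *u, float *v)
+  {
+          float det = s.a11 * s.a22 - s.a12 * s.a12;
+          *u = (s.b1 * s.a22 - s.a12 * s.b2) / det;
+          *v = (s.a11 * s.b2 - s.b1 * s.a12) / det;
+  }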
+
+There's a small discrepancy here: The equations suggest that we should
+be squaring the normalization terms β_0², β_x², β_y²; however, the
+code does not appear to do so. It's possible that they were intended to be
+added outside of the penalization, e.g. Ψ(a²) = sqrt(β a² + ε²), but given
+that these come from [Zimmer11], which mentions nothing of the sort,
+I'll just have to assume that this is an implementation mishap.
+
+The final smoothness term is the one that binds the flow field together as a
+whole, so that we don't have WxH completely independent equations (which has
+its positive and negative sides, of course). It is the simplest in terms of notation,
+but it requires the full power of the Euler-Lagrange equations to minimize,
+so we'll need to figure that part out.
+
+  E_S = abs(∇u)² + abs(∇v)²
+
+or
+
+  E_S = (u_x² + u_y²) + (v_x² + v_y²)
+
+The penalized form used in the DeepFlow notation, contrary to what you'd expect
+from the paper, is:
+
+  E_S = sqrt(u_x² + u_y² + v_x² + v_y² + ε²)
+
+How would one go about minimizing such an expression by u? (We'll deal with v
+later.) It's perhaps no big surprise that the expression involves double
+derivatives, but the full form involves the Euler-Lagrange equations.
+They allow us to minimize expressions that contain x, y, u(x, y) _and_ the partial
+derivatives u_x(x, y) and u_y(x, y), although the answer becomes a differential
+equation.
+
+The Wikipedia page is, unfortunately, not very beginner-friendly,
+but the general idea is: Differentiate the expression by u_x
+(yes, differentiating by a partial derivative!), negate it, and then
+differentiate the result by x. Then do the same thing by u_y and y,
+add the two results together and equate to zero. Mathematically
+(https://en.wikipedia.org/wiki/Euler%E2%80%93Lagrange_equation#Several_functions_of_several_variables_with_single_derivative):
+
+  ∂E/∂u - ∂/∂x (∂E/∂u_x) - ∂/∂y (∂E/∂u_y) = 0
+
+The first term disappears, since we don't have a non-differentiated
+u(x, y) in E_S. (Previously, the two _other_ terms would disappear,
+because we didn't have u_x or u_y in E_I or E_G.) Differentiating our
+E_S by u_x gives u_x / sqrt(u_x² + u_y² + v_x² + v_y² + ε²), and
+similarly for u_y, so we get
+
+  - ∂/∂x (u_x / sqrt(u_x² + u_y² + v_x² + v_y² + ε²)) - ∂/∂y (u_y / sqrt(u_x² + u_y² + v_x² + v_y² + ε²)) = 0
+
+(We don't remove the minus signs since this is supposed to be added to
+all the other terms.)
+
+This is what's called an _anisotropic diffusion_ (or Perona–Malik diffusion)
+equation, and is extensively described in literature. It has the effect of
+smoothing the flow more in some places than others; in particular, it does
+not smooth as strongly near edges, so it is edge-preserving. (It's a bit odd to
+call it anisotropic, since it does smooth equally in all directions;
+[Brox05] calls it vector-valued diffusion.)
+
+We'd love to use our usual trick of keeping the nonlinear terms in the denominator
+constant, but alas, we can't do that yet, since it's under the differentiation
+operator; this factor has to be discretized together with u before we can treat
+it as a constant. So instead, we'll define it as a function (called the
+_diffusivity_ at the given point):
+
+  g(x, y) = 1 / sqrt(u_x² + u_y² + v_x² + v_y² + ε²)
+
+which gives us
+
+  - ∂/∂x ( g(x, y) u_x ) - ∂/∂y ( g(x, y) u_y ) = 0
+
+We'll also have a similar equation for minimizing v, of course:
+
+  - ∂/∂x ( g(x, y) v_x ) - ∂/∂y ( g(x, y) v_y ) = 0
+
+There's no normalization term β here, unlike the other terms; DeepFlow2
+adds one, but we're not including it here.
+
+At this point, we make a tweak. This seemingly goes back to at least
+[Brox04], which also makes the same tweak to all the other terms
+(which we don't, but see below). We split u (and v) into something
+based on the original value plus a differential du (and dv), and then
+solve for du (or dv) instead. (In math-speak, we are moving to an
+implicit method, which is often more numerically stable.) In other words,
+
+  u(x, y) = u0(x, y) + du(x, y)
+
+where u0(x, y) is the initial guess for the flow. (It's not the value
+from the previous iteration but the very first one, for reasons that
+will become clear later. [Brox04] differs here, but it does a number of things
+differently in the numerics anyway.)
+
+This gives us:
+
+  - ∂/∂x ( g(x, y) (u0 + du)_x ) - ∂/∂y ( g(x, y) (u0 + du)_y ) = 0
+
+or
+
+  - ∂/∂x ( g(x, y) du_x ) - ∂/∂y ( g(x, y) du_y ) = ∂/∂x ( g(x, y) u0_x ) + ∂/∂y ( g(x, y) u0_y )
+
+where the right-hand side is effectively a constant for these purposes
+(although it still needs to be calculated anew for each iteration,
+since g(x, y) changes).
+
+Of course, now we have a different problem; all the other terms are
+formulated in terms of u and v, not du and dv. DeepFlow solves this
+by not searching for the flow between I_0 and I_1, but between I_0 and
+a pre-warped I_1. In other words, before any of the derivatives involving
+I_t are calculated, we calculate an I_w with bilinear interpolation:
+
+  I_w(x, y) = I_1(x + u0(x, y), y + v0(x, y))
+
+and then redefine I_t (occasionally called I_z) as
+
+  I_t(x, y) = I_w(x, y) - I_0(x, y)
+
+Note that the plus sign effectively means inverting the flow, so if
+the u0 and v0 were already correctly estimated, perfectly smooth and linear
+everywhere, I_w = I_0. (All spatial derivatives are calculated on the mean
+between I_0 and I_w; the paper doesn't mention this.) After this, all the
+equations for E_I and E_G earlier will still hold, they will just be
+calculating du and dv instead. Note that this means we have three values
+for the flow; there's u0 for the initial guess, du for the current guess
+of delta from u0 (which makes u0 + du the current guess of the flow),
+and du' for the previous guess of delta from u0. (The initial values for
+du' and dv' will be zero.)
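+
+A minimal sketch of the pre-warp, assuming a grayscale image stored as a
+row-major float array (the actual code does this on the GPU, where bilinear
+sampling comes for free from the texture units):
+
+  #include <algorithm>
+
+  // I_w(x, y) = I_1(x + u0, y + v0), with bilinear interpolation and
+  // clamping to the image border.
+  float warp_sample(const float *I1, int W, int H, float fx, float fy)
+  {
+          fx = std::min(std::max(fx, 0.0f), W - 1.001f);
+          fy = std::min(std::max(fy, 0.0f), H - 1.001f);
+          int x0 = (int)fx, y0 = (int)fy;
+          float ax = fx - x0, ay = fy - y0;
+          const float *row0 = I1 + y0 * W, *row1 = row0 + W;
+          return (1.0f - ay) * ((1.0f - ax) * row0[x0] + ax * row0[x0 + 1]) +
+                 ay * ((1.0f - ax) * row1[x0] + ax * row1[x0 + 1]);
+  }
+
+I_t for each pixel then becomes warp_sample(I1, W, H, x + u0, y + v0) minus
+I_0(x, y).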
+
+Now back to our equations, as we look at practical implementation:
+
+  - ∂/∂x ( g(x, y) du_x ) - ∂/∂y ( g(x, y) du_y ) = ∂/∂x ( g(x, y) u0_x ) + ∂/∂y ( g(x, y) u0_y )
+  - ∂/∂x ( g(x, y) dv_x ) - ∂/∂y ( g(x, y) dv_y ) = ∂/∂x ( g(x, y) v0_x ) + ∂/∂y ( g(x, y) v0_y )
+
+We can discretize the left-hand and right-hand side identically (they differ
+only in signs and in variable), so let's look only at
+
+  - ∂/∂x ( g(x, y) du_x ) - ∂/∂y ( g(x, y) du_y )
+
+[Brox05] equation (2.14) (which refers to a 1998 book, although I couldn't
+immediately find the equation in question in that book) discretizes this as
+
+  - 1/2 (g(x+1, y) + g(x, y)) (du(x+1, y) - du(x, y))
+  + 1/2 (g(x-1, y) + g(x, y)) (du(x, y) - du(x-1, y))
+  - 1/2 (g(x, y+1) + g(x, y)) (du(x, y+1) - du(x, y))
+  + 1/2 (g(x, y-1) + g(x, y)) (du(x, y) - du(x, y-1))
+
+It also mentions that it would be better to sample g at the half-way points,
+e.g. g(x+0.5, y), but that raises the question of exactly how we'd do that, and
+DeepFlow doesn't seem to care, so we stick with their version.
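+
+A minimal sketch of this stencil applied to the right-hand side (which, as
+noted above, differs only in sign and in using u0 instead of du); the names
+are made up for illustration, and the real version runs on the GPU:
+
+  // RHS = ∂/∂x (g u0_x) + ∂/∂y (g u0_y), with g and u0 as row-major,
+  // W-wide float arrays and (x, y) an interior pixel.
+  float smoothness_rhs(const float *g, const float *u0, int W, int x, int y)
+  {
+          int i = y * W + x;
+          return 0.5f * (g[i + 1] + g[i]) * (u0[i + 1] - u0[i])
+               - 0.5f * (g[i - 1] + g[i]) * (u0[i] - u0[i - 1])
+               + 0.5f * (g[i + W] + g[i]) * (u0[i + W] - u0[i])
+               - 0.5f * (g[i - W] + g[i]) * (u0[i] - u0[i - W]);
+  }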
+
+Now we can finally let g use the values of the flow (note that this is the
+actual flow u and v, not du and dv!) from the previous iteration, as before:
+
+  g(x, y) = 1 / sqrt(u'_x² + u'_y² + v'_x² + v'_y² + ε²)
+
+The single derivatives in g(x, y) are approximated by standard central differences
+(see https://en.wikipedia.org/wiki/Finite_difference_coefficient), e.g.
+
+  u_x(x, y) = 1/2 (u(x + 1, y) - u(x - 1, y))
+
+although the derivatives of I are using the fancier
+
+  I_x(x, y) = 1/12 (-I(x + 2, y) + 8 I(x + 1, y) - 8 I(x - 1, y) + I(x - 2, y))
+
+I assume this is because I_x derivatives are calculated only once, so we can
+afford more accurate derivatives (or possibly simply because of influence
+from earlier papers).
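+
+A minimal sketch of the diffusivity itself, under the same array layout
+assumptions as above (the actual implementation is a fragment shader):
+
+  #include <cmath>
+
+  // g(x, y) = 1 / sqrt(u'_x² + u'_y² + v'_x² + v'_y² + ε²), with the single
+  // derivatives taken as central differences of last iteration's flow.
+  float diffusivity(const float *u, const float *v, int W, int x, int y)
+  {
+          int i = y * W + x;
+          float ux = 0.5f * (u[i + 1] - u[i - 1]);
+          float uy = 0.5f * (u[i + W] - u[i - W]);
+          float vx = 0.5f * (v[i + 1] - v[i - 1]);
+          float vy = 0.5f * (v[i + W] - v[i - W]);
+          return 1.0f / std::sqrt(ux * ux + uy * uy + vx * vx + vy * vy + 1e-6f);
+  }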
+
+Let's now define a smoothness constant between the neighbors (x,y) and (x1,y1):
+
+  s(x1, y1) = 1/2 (g(x, y) + g(x1, y1))
+
+Collecting all the du(x, y) terms of the discretized equation above,
+ignoring the right-hand side, which is just a constant for us anyway:
+
+  - s(x+1, y) (du(x+1, y) - du(x, y))
+  + s(x-1, y) (du(x, y) - du(x-1, y))
+  - s(x, y+1) (du(x, y+1) - du(x, y))
+  + s(x, y-1) (du(x, y) - du(x, y-1)) = C
+
+  - s(x+1, y) du(x+1, y) + s(x+1, y) du(x, y)
+  + s(x-1, y) du(x, y) - s(x-1, y) du(x-1, y)
+  - s(x, y+1) du(x, y+1) + s(x, y+1) du(x, y)
+  + s(x, y-1) du(x, y) - s(x, y-1) du(x, y-1) = C
+
+  (s(x+1, y) + s(x-1, y) + s(x, y+1) + s(x, y-1)) du(x, y) =
+  s(x+1, y) du(x+1, y) + s(x-1, y) du(x-1, y) + s(x, y+1) du(x, y+1) + s(x, y-1) du(x, y-1) + C
+
+It is interesting to note that if s = 1 uniformly, which would be the case
+without our penalizer Ψ(a²), we would have the familiar discrete Laplacian,
+where du(x, y) would seek to simply become the average of its four immediate
+neighbors.
+
+Now our equation system is finally complete and linear, and the rest is
+fairly pedestrian. The last term connects all the unknowns together,
+but we still solve them mostly as 2x2 matrices. The most basic iterative
+method is Jacobi, where we solve du(x, y) and dv(x,y) using the
+previous iteration's value for all other du/dv values. (That this converges
+at all it beyond this text to prove, but it does. Not that we bother
+iterating until it converges; a few iterations is good enough.)
+Gauss-Seidel iterations improve on this by (surprisingly!) using this
+iteration's freshly computed du/dv values wherever they're already
+available; this improves convergence, but is hard to parallelize.
+
+Successive over-relaxation (SOR) improves further on this, in that it
+assumes that the solution moves towards the right value, so why not
+just go a bit further? That is, if Gauss-Seidel would tell you to increase
+the flow by 1.0 pixel to the right, perhaps go 1.5 pixels to the right
+instead (this value is called ω). Again, the convergence proof is beyond the
+scope here, but SOR converges for any ω between 1 and 2 (1 gives plain
+Gauss-Seidel, and over 2, we risk overshooting and never converging). Optimal
+ω depends on the equation system; DIS uses ω = 1.6, which presumably was
+measured, while we do ω = 1.8 (seems to be marginally better after some
+light testing).
+
+Efficient GPU implementation of SOR is not trivial; as noted before,
+Gauss-Seidel is inherently serial, which is a poor match for the GPU.
+Worse, doing SOR with Jacobi as base instead of Gauss-Seidel makes for
+an algorithm which simply does not converge. We solve this by using a
+method called red-black SOR (not to be confused with red-black binary
+trees). Conceptually, it assigns every unknown a color, alternating
+between red and black like a checkerboard. Since red values now
+only depend on black values and vice versa, one can do all red values
+in parallel, then all black values, and so on. (This is equivalent to
+reordering the equation set; different such orderings can have different
+convergence speeds.)
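+
+A minimal CPU sketch of one red-black half-sweep over du, considering only the
+smoothness term for clarity (the real solver folds the E_I/E_G data terms into
+the same per-pixel system, updates dv as well, and runs as a fragment shader);
+the names are again made up for illustration:
+
+  // parity = 0 updates the "red" pixels, parity = 1 the "black" ones.
+  void sor_half_sweep(float *du, const float *g, const float *rhs,
+                      int W, int H, float omega, int parity)
+  {
+          for (int y = 1; y < H - 1; ++y) {
+                  for (int x = 1 + ((y + parity) & 1); x < W - 1; x += 2) {
+                          int i = y * W + x;
+                          float s_px = 0.5f * (g[i] + g[i + 1]);
+                          float s_mx = 0.5f * (g[i] + g[i - 1]);
+                          float s_py = 0.5f * (g[i] + g[i + W]);
+                          float s_my = 0.5f * (g[i] + g[i - W]);
+                          float sum_s = s_px + s_mx + s_py + s_my;
+                          float nb = s_px * du[i + 1] + s_mx * du[i - 1] +
+                                     s_py * du[i + W] + s_my * du[i - W];
+                          // The Gauss-Seidel value, pushed ω times further
+                          // (ω = 1.8 in our case).
+                          du[i] += omega * ((nb + rhs[i]) / sum_s - du[i]);
+                  }
+          }
+  }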
+
+Our GPU SOR implementation is not overly efficient, so essentially one such
+half-iteration of red-black SOR costs the same as one full iteration of
+Jacobi, but convergence is so much faster that it's worth it. Generally
+speaking, Gauss-Seidel converges twice as fast as Jacobi (ie., if Jacobi
+converges in N iterations, Gauss-Seidel does so in N/2), but SOR converges
+_geometrically_ faster, ie., in O(√N) iterations.
+
+Do note that the DeepFlow code does not fully use SOR or even Gauss-Seidel;
+it solves every 2x2 block (ie., single du/dv pair) using Cramer's rule,
+and then pushes that vector 60% further, SOR-style. This would be clearly
+more accurate if we didn't have SOR in the mix (since du and dv would
+converge immediately relative to each other, bar Cramer's numerical issues),
+but I'm not sure whether it's better given SOR. (DIS changes this to a more
+traditional SOR formulation, which we also use. It doesn't seem to be much
+different in practical testing; perhaps minutely worse, but I haven't done
+a deep analysis here.)
+
+And that's it. References:
+
+[Brox04]: Brox, Bruhn, Papenberg, Weickert: “High Accuracy Optical Flow
+  Estimation Based on a Theory for Warping”, in Proceedings of the European
+  Conference on Computer Vision (ECCV), 2004
+[Brox05]: Brox: “From Pixels to Regions: Partial Differential Equations in
+  Image Analysis”, PhD thesis, 2005
+[Brox11]: Brox, Malik: “Large Displacement Optical Flow: Descriptor Matching in
+  Variational Motion Estimation”, IEEE Transactions on Pattern Analysis and
+  Machine Intelligence, 2011
+[Ciarlet78]: Ciarlet: “The Finite Element Method for Elliptic Problems”, 1978
+[Fahad07]: Fahad, Morris: “Multiple Combined Constraints for Optical Flow
+  Estimation”, in Proceedings of the 3rd International Conference on Advances
+  in Visual Computing (ISVC), 2007
+[Kroeger16]: Kroeger, Timofte, Dai, van Gool: “Fast Optical Flow using Dense
+  Inverse Search”, in Proceedings of the European Conference on Computer Vision
+  (ECCV), 2016
+[Weinzaepfel13]: Weinzaepfel, Revaud, Harchaoui, Schmid: “DeepFlow: Large
+  displacement optical flow with deep matching”, in IEEE International Conference
+  on Computer Vision (ICCV), 2013
+[Zimmer11]: Zimmer, Bruhn, Weickert: “Optic Flow in Harmony”, International
+  Journal of Computer Vision, 2011