git.sesse.net Git - nageru/commitdiff
Do deinterleaving on the GPU (subsampling still remains).
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 16 Sep 2018 15:35:45 +0000 (17:35 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 16 Sep 2018 15:36:13 +0000 (17:36 +0200)
blend.frag
flow.cpp
flow.h
flow_main.cpp
video_stream.cpp
video_stream.h

diff --git a/blend.frag b/blend.frag
index e88da50b929ce23516586d97a4960577ebafa824..eb3fc80753fa0ddec596b8c2fc1a686cd7973b61 100644 (file)
--- a/blend.frag
+++ b/blend.frag
@@ -1,7 +1,13 @@
 #version 450 core
 
 in vec3 tc;
+
+#ifdef SPLIT_YCBCR_OUTPUT
+out float Y;
+out vec2 CbCr;
+#else
 out vec4 rgba;
+#endif
 
 uniform sampler2DArray image_tex;
 uniform sampler2D flow_tex;
@@ -24,13 +30,20 @@ void main()
        // Same for d1.
        float d1 = (1.0f - alpha) * length(size * (texture(flow_tex, vec2(tc.xy + (1.0f - alpha) * flow)).xy - flow));
 
+       vec4 result;
        if (max(d0, d1) < 3.0f) {  // Arbitrary constant, not all that tuned. The UW paper says 1.0 is fine for ground truth.
                // Both are visible, so blend.
-               rgba = I_0 + alpha * (I_1 - I_0);
+               result = I_0 + alpha * (I_1 - I_0);
        } else if (d0 < d1) {
-               rgba = I_0;
+               result = I_0;
        } else {
-               rgba = I_1;
+               result = I_1;
        }
 
+#ifdef SPLIT_YCBCR_OUTPUT
+       Y = result.r;
+       CbCr = result.gb;
+#else
+       rgba = result;
+#endif
 }
diff --git a/flow.cpp b/flow.cpp
index 9f816e6f30171ac1c005594fed142b974e6bd0c1..71b0b23c6f85033f10cdbe93fb12798e2649778e 100644 (file)
--- a/flow.cpp
+++ b/flow.cpp
@@ -915,10 +915,19 @@ void HoleBlend::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int w
        glDisable(GL_DEPTH_TEST);
 }
 
-Blend::Blend()
+Blend::Blend(bool split_ycbcr_output)
+       : split_ycbcr_output(split_ycbcr_output)
 {
+       string frag_shader = read_file("blend.frag");
+       if (split_ycbcr_output) {
+               // Insert after the first #version line.
+               size_t offset = frag_shader.find('\n');
+               assert(offset != string::npos);
+               frag_shader = frag_shader.substr(0, offset + 1) + "#define SPLIT_YCBCR_OUTPUT 1\n" + frag_shader.substr(offset + 1);
+       }
+
        blend_vs_obj = compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER);
-       blend_fs_obj = compile_shader(read_file("blend.frag"), GL_FRAGMENT_SHADER);
+       blend_fs_obj = compile_shader(frag_shader, GL_FRAGMENT_SHADER);
        blend_program = link_program(blend_vs_obj, blend_fs_obj);
 
        uniform_image_tex = glGetUniformLocation(blend_program, "image_tex");
@@ -927,7 +936,7 @@ Blend::Blend()
        uniform_flow_consistency_tolerance = glGetUniformLocation(blend_program, "flow_consistency_tolerance");
 }
 
-void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int level_width, int level_height, float alpha)
+void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int level_width, int level_height, float alpha)
 {
        glUseProgram(blend_program);
        bind_sampler(blend_program, uniform_image_tex, 0, image_tex, linear_sampler);
@@ -935,13 +944,23 @@ void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int level
        glProgramUniform1f(blend_program, uniform_alpha, alpha);
 
        glViewport(0, 0, level_width, level_height);
-       fbos.render_to(output_tex);
+       if (split_ycbcr_output) {
+               fbos_split.render_to(output_tex, output2_tex);
+       } else {
+               fbos.render_to(output_tex);
+       }
        glDisable(GL_BLEND);  // A bit ironic, perhaps.
        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
 }
 
-Interpolate::Interpolate(int width, int height, const OperatingPoint &op)
-       : width(width), height(height), flow_level(op.finest_level), op(op), splat(op) {
+Interpolate::Interpolate(int width, int height, const OperatingPoint &op, bool split_ycbcr_output)
+       : width(width),
+         height(height),
+         flow_level(op.finest_level),
+         op(op),
+         split_ycbcr_output(split_ycbcr_output),
+         splat(op),
+         blend(split_ycbcr_output) {
        // Set up the vertex data that will be shared between all passes.
        float vertices[] = {
                0.0f, 1.0f,
@@ -961,7 +980,7 @@ Interpolate::Interpolate(int width, int height, const OperatingPoint &op)
        glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
 }
 
-GLuint Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
+pair<GLuint, GLuint> Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
 {
        GPUTimers timers;
 
@@ -1003,10 +1022,20 @@ GLuint Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional
        pool.release_texture(temp_tex[2]);
        pool.release_renderbuffer(depth_rb);
 
-       GLuint output_tex = pool.get_texture(GL_RGBA8, width, height);
-       {
-               ScopedTimer timer("Blend", &total_timer);
-               blend.exec(image_tex, flow_tex, output_tex, width, height, alpha);
+       GLuint output_tex, output2_tex = 0;
+       if (split_ycbcr_output) {
+               output_tex = pool.get_texture(GL_R8, width, height);
+               output2_tex = pool.get_texture(GL_RG8, width, height);
+               {
+                       ScopedTimer timer("Blend", &total_timer);
+                       blend.exec(image_tex, flow_tex, output_tex, output2_tex, width, height, alpha);
+               }
+       } else {
+               output_tex = pool.get_texture(GL_RGBA8, width, height);
+               {
+                       ScopedTimer timer("Blend", &total_timer);
+                       blend.exec(image_tex, flow_tex, output_tex, 0, width, height, alpha);
+               }
        }
        pool.release_texture(flow_tex);
        total_timer.end();
@@ -1014,7 +1043,7 @@ GLuint Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional
                timers.print();
        }
 
-       return output_tex;
+       return make_pair(output_tex, output2_tex);
 }
 
 GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers)
diff --git a/flow.h b/flow.h
index 31111f5bd42b17f1aa20453ac24694e744d9af79..9536a3808034868810f4ba3143d782b3315b11a4 100644 (file)
--- a/flow.h
+++ b/flow.h
@@ -522,11 +522,15 @@ private:
 
 class Blend {
 public:
-       Blend();
-       void exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int width, int height, float alpha);
+       Blend(bool split_ycbcr_output);
+
+       // output2_tex is only used if split_ycbcr_output was true.
+       void exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int width, int height, float alpha);
 
 private:
+       bool split_ycbcr_output;
        PersistentFBOSet<1> fbos;
+       PersistentFBOSet<2> fbos_split;
        GLuint blend_vs_obj;
        GLuint blend_fs_obj;
        GLuint blend_program;
@@ -537,12 +541,12 @@ private:
 
 class Interpolate {
 public:
-       Interpolate(int width, int height, const OperatingPoint &op);
+       Interpolate(int width, int height, const OperatingPoint &op, bool split_ycbcr_output);
 
-       // Returns a texture that must be released with release_texture()
-       // after use. image_tex must be a two-layer RGBA8 texture with mipmaps
-       // (unless flow_level == 0).
-       GLuint exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha);
+       // Returns a texture (or two, if split_ycbcr_output is true) that must
+       // be released with release_texture() after use. image_tex must be a
+       // two-layer RGBA8 texture with mipmaps (unless flow_level == 0).
+       std::pair<GLuint, GLuint> exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha);
 
        void release_texture(GLuint tex) {
                pool.release_texture(tex);
@@ -553,6 +557,7 @@ private:
        GLuint vertex_vbo, vao;
        TexturePool pool;
        const OperatingPoint op;
+       const bool split_ycbcr_output;
 
        Splat splat;
        HoleFill hole_fill;
diff --git a/flow_main.cpp b/flow_main.cpp
index d8cf4773f59e6c07d2b5b6a031b2b5103d7e7ff3..a961a60a9aa805df282ddd7b4a4e6fd4d428c1ea 100644 (file)
--- a/flow_main.cpp
+++ b/flow_main.cpp
@@ -379,7 +379,7 @@ void interpolate_image(int argc, char **argv, int optind)
        }
        DISComputeFlow compute_flow(width1, height1, op);
        GrayscaleConversion gray;
-       Interpolate interpolate(width1, height1, op);
+       Interpolate interpolate(width1, height1, op, /*split_ycbcr_output=*/false);
 
        GLuint tex_gray;
        glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex_gray);
@@ -391,7 +391,7 @@ void interpolate_image(int argc, char **argv, int optind)
                in_warmup = true;
                for (int i = 0; i < 10; ++i) {
                        GLuint bidirectional_flow_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
-                       GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, 0.5f);
+                       GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, 0.5f).first;
                        compute_flow.release_texture(bidirectional_flow_tex);
                        interpolate.release_texture(interpolated_tex);
                }
@@ -405,7 +405,7 @@ void interpolate_image(int argc, char **argv, int optind)
                snprintf(ppm_filename, sizeof(ppm_filename), "interp%04d.ppm", frameno);
 
                float alpha = frameno / 60.0f;
-               GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, alpha);
+               GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, alpha).first;
 
                schedule_read<RGBAType>(interpolated_tex, width1, height1, filename0, filename1, "", ppm_filename);
                interpolate.release_texture(interpolated_tex);
diff --git a/video_stream.cpp b/video_stream.cpp
index 8d485ad02b20e79ab3c2a2ef578c7395835d2e9b..130757f0d70047015fd92f398b81d35ef79e3822 100644 (file)
--- a/video_stream.cpp
+++ b/video_stream.cpp
@@ -98,7 +98,7 @@ struct VectorDestinationManager {
 };
 static_assert(std::is_standard_layout<VectorDestinationManager>::value, "");
 
-vector<uint8_t> encode_jpeg(const uint8_t *pixel_data, unsigned width, unsigned height)
+vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, unsigned width, unsigned height)
 {
        VectorDestinationManager dest;
 
@@ -127,30 +127,25 @@ vector<uint8_t> encode_jpeg(const uint8_t *pixel_data, unsigned width, unsigned
        cinfo.CCIR601_sampling = true;  // Seems to be mostly ignored by libjpeg, though.
        jpeg_start_compress(&cinfo, true);
 
-       // TODO: Subsample and deinterleave on the GPU.
-
-       unique_ptr<uint8_t[]> ydata(new uint8_t[width * 8]);
+       // TODO: Subsample on the GPU.
        unique_ptr<uint8_t[]> cbdata(new uint8_t[(width/2) * 8]);
        unique_ptr<uint8_t[]> crdata(new uint8_t[(width/2) * 8]);
        JSAMPROW yptr[8], cbptr[8], crptr[8];
        JSAMPARRAY data[3] = { yptr, cbptr, crptr };
        for (unsigned yy = 0; yy < 8; ++yy) {
-               yptr[yy] = ydata.get() + yy * width;
                cbptr[yy] = cbdata.get() + yy * (width / 2);
                crptr[yy] = crdata.get() + yy * (width / 2);
        }
        for (unsigned y = 0; y < height; y += 8) {
-               uint8_t *yptr = ydata.get();
                uint8_t *cbptr = cbdata.get();
                uint8_t *crptr = crdata.get();
                for (unsigned yy = 0; yy < 8; ++yy) {
-                       const uint8_t *sptr = &pixel_data[(height - y - yy - 1) * width * 4];
+                       yptr[yy] = const_cast<JSAMPROW>(&y_data[(height - y - yy - 1) * width]);
+                       const uint8_t *sptr = &cbcr_data[(height - y - yy - 1) * width * 2];
                        for (unsigned x = 0; x < width; x += 2) {
-                               *yptr++ = sptr[0];
-                               *yptr++ = sptr[4];
-                               *cbptr++ = (sptr[1] + sptr[5]) / 2;
-                               *crptr++ = (sptr[2] + sptr[6]) / 2;
-                               sptr += 8;
+                               *cbptr++ = (sptr[0] + sptr[2]) / 2;
+                               *crptr++ = (sptr[1] + sptr[3]) / 2;
+                               sptr += 4;
                        }
                }
 
@@ -247,7 +242,7 @@ VideoStream::VideoStream()
        check_error();
 
        compute_flow.reset(new DISComputeFlow(width, height, operating_point3));
-       interpolate.reset(new Interpolate(width, height, operating_point3));
+       interpolate.reset(new Interpolate(width, height, operating_point3, /*split_ycbcr_output=*/true));
        check_error();
 }
 
@@ -351,7 +346,7 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
        // Compute the interpolated frame.
        qf.flow_tex = compute_flow->exec(resources.gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
        check_error();
-       qf.output_tex = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
+       tie(qf.output_tex, qf.output2_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
        check_error();
 
        // We could have released qf.flow_tex here, but to make sure we don't cause a stall
@@ -362,7 +357,9 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
        glBindBuffer(GL_PIXEL_PACK_BUFFER, resources.pbo);
        check_error();
-       glGetTextureImage(qf.output_tex, 0, GL_RGBA, GL_UNSIGNED_BYTE, 1280 * 720 * 4, nullptr);
+       glGetTextureImage(qf.output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
+       check_error();
+       glGetTextureImage(qf.output2_tex, 0, GL_RG, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
        check_error();
        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
 
@@ -411,9 +408,13 @@ void VideoStream::encode_thread_func()
                } else if (qf.type == QueuedFrame::INTERPOLATED) {
                        glClientWaitSync(qf.fence.get(), /*flags=*/0, GL_TIMEOUT_IGNORED);
 
-                       vector<uint8_t> jpeg = encode_jpeg((const uint8_t *)qf.resources.pbo_contents, 1280, 720);
+                       vector<uint8_t> jpeg = encode_jpeg(
+                               (const uint8_t *)qf.resources.pbo_contents,
+                               (const uint8_t *)qf.resources.pbo_contents + 1280 * 720,
+                               1280, 720);
                        compute_flow->release_texture(qf.flow_tex);
                        interpolate->release_texture(qf.output_tex);
+                       interpolate->release_texture(qf.output2_tex);
 
                        AVPacket pkt;
                        av_init_packet(&pkt);
diff --git a/video_stream.h b/video_stream.h
index 925cace4c62f5465c7bfb4d21bd0e64aca17a3c3..41d9fc8f7c3279b56958d18207de6ecab6e31395 100644 (file)
--- a/video_stream.h
+++ b/video_stream.h
@@ -66,7 +66,7 @@ private:
                float alpha;
                InterpolatedFrameResources resources;
                RefCountedGLsync fence;  // Set when the interpolated image is read back to the CPU.
-               GLuint flow_tex, output_tex;  // Released in the receiving thread; not really used for anything else.
+               GLuint flow_tex, output_tex, output2_tex;  // Released in the receiving thread; not really used for anything else.
        };
        std::deque<QueuedFrame> frame_queue;  // Under <queue_lock>.
        std::mutex queue_lock;