Do deinterleaving on the GPU (subsampling still remains).

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Sun, 16 Sep 2018 15:35:45 +0000 (17:35 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Sun, 16 Sep 2018 15:36:13 +0000 (17:36 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 16 Sep 2018 15:35:45 +0000 (17:35 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 16 Sep 2018 15:36:13 +0000 (17:36 +0200)
diff --git a/blend.frag b/blend.frag

index e88da50b929ce23516586d97a4960577ebafa824..eb3fc80753fa0ddec596b8c2fc1a686cd7973b61 100644 (file)
--- a/blend.frag
+++ b/blend.frag
@@ -1,7 +1,13 @@
  #version 450 core
  
  in vec3 tc;
+
+#ifdef SPLIT_YCBCR_OUTPUT
+out float Y;
+out vec2 CbCr;
+#else
  out vec4 rgba;
+#endif
  
  uniform sampler2DArray image_tex;
  uniform sampler2D flow_tex;
@@ -24,13 +30,20 @@ void main()
         // Same for d1.
         float d1 = (1.0f - alpha) * length(size * (texture(flow_tex, vec2(tc.xy + (1.0f - alpha) * flow)).xy - flow));
  
+       vec4 result;
         if (max(d0, d1) < 3.0f) {  // Arbitrary constant, not all that tuned. The UW paper says 1.0 is fine for ground truth.
                 // Both are visible, so blend.
-               rgba = I_0 + alpha * (I_1 - I_0);
+               result = I_0 + alpha * (I_1 - I_0);
         } else if (d0 < d1) {
-               rgba = I_0;
+               result = I_0;
         } else {
-               rgba = I_1;
+               result = I_1;
         }
  
+#ifdef SPLIT_YCBCR_OUTPUT
+       Y = result.r;
+       CbCr = result.gb;
+#else
+       rgba = result;
+#endif
  }
diff --git a/flow.cpp b/flow.cpp

index 9f816e6f30171ac1c005594fed142b974e6bd0c1..71b0b23c6f85033f10cdbe93fb12798e2649778e 100644 (file)
--- a/flow.cpp
+++ b/flow.cpp
@@ -915,10 +915,19 @@ void HoleBlend::exec(GLuint flow_tex, GLuint depth_rb, GLuint temp_tex[3], int w
         glDisable(GL_DEPTH_TEST);
  }
  
-Blend::Blend()
+Blend::Blend(bool split_ycbcr_output)
+       : split_ycbcr_output(split_ycbcr_output)
  {
+       string frag_shader = read_file("blend.frag");
+       if (split_ycbcr_output) {
+               // Insert after the first #version line.
+               size_t offset = frag_shader.find('\n');
+               assert(offset != string::npos);
+               frag_shader = frag_shader.substr(0, offset + 1) + "#define SPLIT_YCBCR_OUTPUT 1\n" + frag_shader.substr(offset + 1);
+       }
+
         blend_vs_obj = compile_shader(read_file("vs.vert"), GL_VERTEX_SHADER);
-       blend_fs_obj = compile_shader(read_file("blend.frag"), GL_FRAGMENT_SHADER);
+       blend_fs_obj = compile_shader(frag_shader, GL_FRAGMENT_SHADER);
         blend_program = link_program(blend_vs_obj, blend_fs_obj);
  
         uniform_image_tex = glGetUniformLocation(blend_program, "image_tex");
@@ -927,7 +936,7 @@ Blend::Blend()
         uniform_flow_consistency_tolerance = glGetUniformLocation(blend_program, "flow_consistency_tolerance");
  }
  
-void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int level_width, int level_height, float alpha)
+void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int level_width, int level_height, float alpha)
  {
         glUseProgram(blend_program);
         bind_sampler(blend_program, uniform_image_tex, 0, image_tex, linear_sampler);
@@ -935,13 +944,23 @@ void Blend::exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int level
         glProgramUniform1f(blend_program, uniform_alpha, alpha);
  
         glViewport(0, 0, level_width, level_height);
-       fbos.render_to(output_tex);
+       if (split_ycbcr_output) {
+               fbos_split.render_to(output_tex, output2_tex);
+       } else {
+               fbos.render_to(output_tex);
+       }
         glDisable(GL_BLEND);  // A bit ironic, perhaps.
         glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
  }
  
-Interpolate::Interpolate(int width, int height, const OperatingPoint &op)
-       : width(width), height(height), flow_level(op.finest_level), op(op), splat(op) {
+Interpolate::Interpolate(int width, int height, const OperatingPoint &op, bool split_ycbcr_output)
+       : width(width),
+         height(height),
+         flow_level(op.finest_level),
+         op(op),
+         split_ycbcr_output(split_ycbcr_output),
+         splat(op),
+         blend(split_ycbcr_output) {
         // Set up the vertex data that will be shared between all passes.
         float vertices[] = {
                 0.0f, 1.0f,
@@ -961,7 +980,7 @@ Interpolate::Interpolate(int width, int height, const OperatingPoint &op)
         glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
  }
  
-GLuint Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
+pair<GLuint, GLuint> Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha)
  {
         GPUTimers timers;
  
@@ -1003,10 +1022,20 @@ GLuint Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional
         pool.release_texture(temp_tex[2]);
         pool.release_renderbuffer(depth_rb);
  
-       GLuint output_tex = pool.get_texture(GL_RGBA8, width, height);
-       {
-               ScopedTimer timer("Blend", &total_timer);
-               blend.exec(image_tex, flow_tex, output_tex, width, height, alpha);
+       GLuint output_tex, output2_tex = 0;
+       if (split_ycbcr_output) {
+               output_tex = pool.get_texture(GL_R8, width, height);
+               output2_tex = pool.get_texture(GL_RG8, width, height);
+               {
+                       ScopedTimer timer("Blend", &total_timer);
+                       blend.exec(image_tex, flow_tex, output_tex, output2_tex, width, height, alpha);
+               }
+       } else {
+               output_tex = pool.get_texture(GL_RGBA8, width, height);
+               {
+                       ScopedTimer timer("Blend", &total_timer);
+                       blend.exec(image_tex, flow_tex, output_tex, 0, width, height, alpha);
+               }
         }
         pool.release_texture(flow_tex);
         total_timer.end();
@@ -1014,7 +1043,7 @@ GLuint Interpolate::exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional
                 timers.print();
         }
  
-       return output_tex;
+       return make_pair(output_tex, output2_tex);
  }
  
  GLuint TexturePool::get_texture(GLenum format, GLuint width, GLuint height, GLuint num_layers)
diff --git a/flow.h b/flow.h

index 31111f5bd42b17f1aa20453ac24694e744d9af79..9536a3808034868810f4ba3143d782b3315b11a4 100644 (file)
--- a/flow.h
+++ b/flow.h
@@ -522,11 +522,15 @@ private:
  
  class Blend {
  public:
-       Blend();
-       void exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, int width, int height, float alpha);
+       Blend(bool split_ycbcr_output);
+
+       // output2_tex is only used if split_ycbcr_output was true.
+       void exec(GLuint image_tex, GLuint flow_tex, GLuint output_tex, GLuint output2_tex, int width, int height, float alpha);
  
  private:
+       bool split_ycbcr_output;
         PersistentFBOSet<1> fbos;
+       PersistentFBOSet<2> fbos_split;
         GLuint blend_vs_obj;
         GLuint blend_fs_obj;
         GLuint blend_program;
@@ -537,12 +541,12 @@ private:
  
  class Interpolate {
  public:
-       Interpolate(int width, int height, const OperatingPoint &op);
+       Interpolate(int width, int height, const OperatingPoint &op, bool split_ycbcr_output);
  
-       // Returns a texture that must be released with release_texture()
-       // after use. image_tex must be a two-layer RGBA8 texture with mipmaps
-       // (unless flow_level == 0).
-       GLuint exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha);
+       // Returns two textures; the second is 0 unless split_ycbcr_output is true.
+       // Each nonzero texture must be released with release_texture() after use.
+       // image_tex must be a two-layer RGBA8 texture with mipmaps (unless flow_level == 0).
+       std::pair<GLuint, GLuint> exec(GLuint image_tex, GLuint gray_tex, GLuint bidirectional_flow_tex, GLuint width, GLuint height, float alpha);
  
         void release_texture(GLuint tex) {
                 pool.release_texture(tex);
@@ -553,6 +557,7 @@ private:
         GLuint vertex_vbo, vao;
         TexturePool pool;
         const OperatingPoint op;
+       const bool split_ycbcr_output;
  
         Splat splat;
         HoleFill hole_fill;
diff --git a/flow_main.cpp b/flow_main.cpp

index d8cf4773f59e6c07d2b5b6a031b2b5103d7e7ff3..a961a60a9aa805df282ddd7b4a4e6fd4d428c1ea 100644 (file)
--- a/flow_main.cpp
+++ b/flow_main.cpp
@@ -379,7 +379,7 @@ void interpolate_image(int argc, char **argv, int optind)
         }
         DISComputeFlow compute_flow(width1, height1, op);
         GrayscaleConversion gray;
-       Interpolate interpolate(width1, height1, op);
+       Interpolate interpolate(width1, height1, op, /*split_ycbcr_output=*/false);
  
         GLuint tex_gray;
         glCreateTextures(GL_TEXTURE_2D_ARRAY, 1, &tex_gray);
@@ -391,7 +391,7 @@ void interpolate_image(int argc, char **argv, int optind)
                 in_warmup = true;
                 for (int i = 0; i < 10; ++i) {
                         GLuint bidirectional_flow_tex = compute_flow.exec(tex_gray, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
-                       GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, 0.5f);
+                       GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, 0.5f).first;
                         compute_flow.release_texture(bidirectional_flow_tex);
                         interpolate.release_texture(interpolated_tex);
                 }
@@ -405,7 +405,7 @@ void interpolate_image(int argc, char **argv, int optind)
                 snprintf(ppm_filename, sizeof(ppm_filename), "interp%04d.ppm", frameno);
  
                 float alpha = frameno / 60.0f;
-               GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, alpha);
+               GLuint interpolated_tex = interpolate.exec(image_tex, tex_gray, bidirectional_flow_tex, width1, height1, alpha).first;
  
                 schedule_read<RGBAType>(interpolated_tex, width1, height1, filename0, filename1, "", ppm_filename);
                 interpolate.release_texture(interpolated_tex);
diff --git a/video_stream.cpp b/video_stream.cpp

index 8d485ad02b20e79ab3c2a2ef578c7395835d2e9b..130757f0d70047015fd92f398b81d35ef79e3822 100644 (file)
--- a/video_stream.cpp
+++ b/video_stream.cpp
@@ -98,7 +98,7 @@ struct VectorDestinationManager {
  };
  static_assert(std::is_standard_layout<VectorDestinationManager>::value, "");
  
-vector<uint8_t> encode_jpeg(const uint8_t *pixel_data, unsigned width, unsigned height)
+vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, unsigned width, unsigned height)
  {
         VectorDestinationManager dest;
  
@@ -127,30 +127,25 @@ vector<uint8_t> encode_jpeg(const uint8_t *pixel_data, unsigned width, unsigned
         cinfo.CCIR601_sampling = true;  // Seems to be mostly ignored by libjpeg, though.
         jpeg_start_compress(&cinfo, true);
  
-       // TODO: Subsample and deinterleave on the GPU.
-
-       unique_ptr<uint8_t[]> ydata(new uint8_t[width * 8]);
+       // TODO: Subsample on the GPU.
         unique_ptr<uint8_t[]> cbdata(new uint8_t[(width/2) * 8]);
         unique_ptr<uint8_t[]> crdata(new uint8_t[(width/2) * 8]);
         JSAMPROW yptr[8], cbptr[8], crptr[8];
         JSAMPARRAY data[3] = { yptr, cbptr, crptr };
         for (unsigned yy = 0; yy < 8; ++yy) {
-               yptr[yy] = ydata.get() + yy * width;
                 cbptr[yy] = cbdata.get() + yy * (width / 2);
                 crptr[yy] = crdata.get() + yy * (width / 2);
         }
         for (unsigned y = 0; y < height; y += 8) {
-               uint8_t *yptr = ydata.get();
                 uint8_t *cbptr = cbdata.get();
                 uint8_t *crptr = crdata.get();
                 for (unsigned yy = 0; yy < 8; ++yy) {
-                       const uint8_t *sptr = &pixel_data[(height - y - yy - 1) * width * 4];
+                       yptr[yy] = const_cast<JSAMPROW>(&y_data[(height - y - yy - 1) * width]);
+                       const uint8_t *sptr = &cbcr_data[(height - y - yy - 1) * width * 2];
                         for (unsigned x = 0; x < width; x += 2) {
-                               *yptr++ = sptr[0];
-                               *yptr++ = sptr[4];
-                               *cbptr++ = (sptr[1] + sptr[5]) / 2;
-                               *crptr++ = (sptr[2] + sptr[6]) / 2;
-                               sptr += 8;
+                               *cbptr++ = (sptr[0] + sptr[2]) / 2;
+                               *crptr++ = (sptr[1] + sptr[3]) / 2;
+                               sptr += 4;
                         }
                 }
  
@@ -247,7 +242,7 @@ VideoStream::VideoStream()
         check_error();
  
         compute_flow.reset(new DISComputeFlow(width, height, operating_point3));
-       interpolate.reset(new Interpolate(width, height, operating_point3));
+       interpolate.reset(new Interpolate(width, height, operating_point3, /*split_ycbcr_output=*/true));
         check_error();
  }
  
@@ -351,7 +346,7 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
         // Compute the interpolated frame.
         qf.flow_tex = compute_flow->exec(resources.gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
         check_error();
-       qf.output_tex = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
+       tie(qf.output_tex, qf.output2_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
         check_error();
  
         // We could have released qf.flow_tex here, but to make sure we don't cause a stall
@@ -362,7 +357,9 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
         glPixelStorei(GL_PACK_ROW_LENGTH, 0);
         glBindBuffer(GL_PIXEL_PACK_BUFFER, resources.pbo);
         check_error();
-       glGetTextureImage(qf.output_tex, 0, GL_RGBA, GL_UNSIGNED_BYTE, 1280 * 720 * 4, nullptr);
+       glGetTextureImage(qf.output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
+       check_error();
+       glGetTextureImage(qf.output2_tex, 0, GL_RG, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
         check_error();
         glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
  
@@ -411,9 +408,13 @@ void VideoStream::encode_thread_func()
                 } else if (qf.type == QueuedFrame::INTERPOLATED) {
                         glClientWaitSync(qf.fence.get(), /*flags=*/0, GL_TIMEOUT_IGNORED);
  
-                       vector<uint8_t> jpeg = encode_jpeg((const uint8_t *)qf.resources.pbo_contents, 1280, 720);
+                       vector<uint8_t> jpeg = encode_jpeg(
+                               (const uint8_t *)qf.resources.pbo_contents,
+                               (const uint8_t *)qf.resources.pbo_contents + 1280 * 720,
+                               1280, 720);
                         compute_flow->release_texture(qf.flow_tex);
                         interpolate->release_texture(qf.output_tex);
+                       interpolate->release_texture(qf.output2_tex);
  
                         AVPacket pkt;
                         av_init_packet(&pkt);
diff --git a/video_stream.h b/video_stream.h

index 925cace4c62f5465c7bfb4d21bd0e64aca17a3c3..41d9fc8f7c3279b56958d18207de6ecab6e31395 100644 (file)
--- a/video_stream.h
+++ b/video_stream.h
@@ -66,7 +66,7 @@ private:
                 float alpha;
                 InterpolatedFrameResources resources;
                 RefCountedGLsync fence;  // Set when the interpolated image is read back to the CPU.
-               GLuint flow_tex, output_tex;  // Released in the receiving thread; not really used for anything else.
+               GLuint flow_tex, output_tex, output2_tex;  // Released in the receiving thread; not really used for anything else.
         };
         std::deque<QueuedFrame> frame_queue;  // Under <queue_lock>.
         std::mutex queue_lock;
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Sun, 16 Sep 2018 15:35:45 +0000 (17:35 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Sun, 16 Sep 2018 15:36:13 +0000 (17:36 +0200)
blend.frag		patch \| blob \| history
flow.cpp		patch \| blob \| history
flow.h		patch \| blob \| history
flow_main.cpp		patch \| blob \| history
video_stream.cpp		patch \| blob \| history
video_stream.h		patch \| blob \| history