Subsample chroma on the GPU instead of the CPU.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Sun, 16 Sep 2018 16:23:05 +0000 (18:23 +0200)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Sun, 16 Sep 2018 16:34:16 +0000 (18:34 +0200)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 16 Sep 2018 16:23:05 +0000 (18:23 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 16 Sep 2018 16:34:16 +0000 (18:34 +0200)
diff --git a/Makefile b/Makefile

index e7980465e2257f4864a9febdee19f344616dc8db..a0075c505be6fce88e82293ebff415b0c4451d76 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,7 @@ OBJS += $(OBJS_WITH_MOC:.o=.moc.o)
  # Flow objects
  OBJS += flow.o gpu_timers.o
  
-OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o
+OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o chroma_subsampler.o
  
  %.o: %.cpp
         $(CXX) -MMD -MP $(CPPFLAGS) $(CXXFLAGS) -o $@ -c $<
diff --git a/chroma_subsample.frag b/chroma_subsample.frag

new file mode 100644 (file)

index 0000000..9a4155f
--- /dev/null
+++ b/chroma_subsample.frag
@@ -0,0 +1,10 @@
+#version 450 core
+in vec2 tc0, tc1;
+uniform sampler2D cbcr_tex;
+out float Cb, Cr;
+void main() {  // Averages two taps of the packed CbCr texture and writes Cb and Cr to separate outputs.
+       vec2 result = 0.5 * (texture(cbcr_tex, tc0).rg + texture(cbcr_tex, tc1).rg);  // Mean of the taps at tc0 and tc1.
+       Cb = result.r;  // Red channel of the packed texture.
+       Cr = result.g;  // Green channel of the packed texture.
+}
+
diff --git a/chroma_subsample.vert b/chroma_subsample.vert

new file mode 100644 (file)

index 0000000..81e1004
--- /dev/null
+++ b/chroma_subsample.vert
@@ -0,0 +1,21 @@
+#version 450 core
+
+layout(location=0) in vec2 position;
+out vec2 tc0, tc1;
+uniform vec2 chroma_offset_0;
+uniform vec2 chroma_offset_1;
+
+void main()  // Maps position from [0,1] to clip space and emits the two tap coordinates.
+{
+       // The result of glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0) is:
+       //
+       //   2.000  0.000  0.000 -1.000
+       //   0.000  2.000  0.000 -1.000
+       //   0.000  0.000 -2.000 -1.000
+       //   0.000  0.000  0.000  1.000
+       gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0);  // The matrix above applied to (x, y, 0, 1).
+       vec2 flipped_tc = position;  // NOTE: despite the name, no flip is applied; position is used as-is.
+       tc0 = flipped_tc + chroma_offset_0;  // First tap.
+       tc1 = flipped_tc + chroma_offset_1;  // Second tap.
+}
+
diff --git a/chroma_subsampler.cpp b/chroma_subsampler.cpp

new file mode 100644 (file)

index 0000000..28bd0a3
--- /dev/null
+++ b/chroma_subsampler.cpp
@@ -0,0 +1,120 @@
+#include "chroma_subsampler.h"
+
+#include <string>
+#include <movit/util.h>
+
+#define BUFFER_OFFSET(i) ((char *)nullptr + (i))
+
+using namespace std;
+
+string read_file(const string &filename);
+GLuint compile_shader(const string &shader_src, GLenum type);
+GLuint link_program(GLuint vs_obj, GLuint fs_obj);
+void bind_sampler(GLuint program, GLint location, GLuint texture_unit, GLuint tex, GLuint sampler);
+
+extern GLuint linear_sampler;
+
+ChromaSubsampler::ChromaSubsampler()  // Builds the subsampling program and the VAO/VBO it is drawn with.
+{
+       // Set up stuff for 4:2:2 conversion.
+       //
+       // Note: Due to the horizontally co-sited chroma/luma samples in H.264
+       // (chroma position is left for horizontal),
+       // we need to be a bit careful in our subsampling. A diagram will make
+       // this clearer, showing some luma and chroma samples:
+       //
+       //     a   b   c   d
+       //   +---+---+---+---+
+       //   |   |   |   |   |
+       //   | Y | Y | Y | Y |
+       //   |   |   |   |   |
+       //   +---+---+---+---+
+       //
+       // +-------+-------+
+       // |       |       |
+       // |   C   |   C   |
+       // |       |       |
+       // +-------+-------+
+       //
+       // Clearly, the rightmost chroma sample here needs to be equivalent to
+       // b/4 + c/2 + d/4. (We could also implement more sophisticated filters,
+       // of course, but as long as the upsampling is not going to be equally
+       // sophisticated, it's probably not worth it.) If we sample once with
+       // no mipmapping, we get just c, ie., no actual filtering in the
+       // horizontal direction. (For the vertical direction, we can just
+       // sample in the middle to get the right filtering.) One could imagine
+       // we could use mipmapping (assuming we can create mipmaps cheaply),
+       // but then, what we'd get is this:
+       //
+       //    (a+b)/2 (c+d)/2
+       //   +-------+-------+
+       //   |       |       |
+       //   |   Y   |   Y   |
+       //   |       |       |
+       //   +-------+-------+
+       //
+       // +-------+-------+
+       // |       |       |
+       // |   C   |   C   |
+       // |       |       |
+       // +-------+-------+
+       //
+       // which ends up sampling equally from a and b, which clearly isn't right. Instead,
+       // we need to do two (non-mipmapped) chroma samples, both hitting exactly in-between
+       // source pixels.
+       //
+       // Sampling in-between b and c gives us the sample (b+c)/2, and similarly for c and d.
+       // Taking the average of these gives us (b+c)/4 + (c+d)/4 = b/4 + c/2 + d/4, which is
+       // exactly what we want.
+       //
+       // See also http://www.poynton.com/PDFs/Merging_RGB_and_422.pdf, pages 6–7.
+
+       cbcr_vs_obj = compile_shader(read_file("chroma_subsample.vert"), GL_VERTEX_SHADER);
+       cbcr_fs_obj = compile_shader(read_file("chroma_subsample.frag"), GL_FRAGMENT_SHADER);
+       cbcr_program = link_program(cbcr_vs_obj, cbcr_fs_obj);
+
+       // Set up the VAO containing all the required position data.
+       glCreateVertexArrays(1, &vao);
+       glBindVertexArray(vao);
+
+       float vertices[] = {  // One oversized triangle; the vertex shader's 2*p-1 mapping makes it cover all of [-1,1] x [-1,1].
+               0.0f, 2.0f,
+               0.0f, 0.0f,
+               2.0f, 0.0f
+       };
+       glCreateBuffers(1, &vbo);
+        glNamedBufferData(vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
+       glBindBuffer(GL_ARRAY_BUFFER, vbo);  // Must be bound so that glVertexAttribPointer() below sources from it.
+
+        GLint position_attrib = 0;  // Hard-coded in every vertex shader.
+       glEnableVertexArrayAttrib(vao, position_attrib);
+       glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));  // Two tightly packed floats per vertex, from offset 0.
+
+       uniform_cbcr_tex = glGetUniformLocation(cbcr_program, "cbcr_tex");  // Locations are looked up once here and reused in subsample_chroma().
+       uniform_chroma_offset_0 = glGetUniformLocation(cbcr_program, "chroma_offset_0");
+       uniform_chroma_offset_1 = glGetUniformLocation(cbcr_program, "chroma_offset_1");
+}
+
+ChromaSubsampler::~ChromaSubsampler()  // Deletes the GL program, VBO and VAO created by the constructor.
+{
+       glDeleteProgram(cbcr_program);  // NOTE(review): cbcr_vs_obj/cbcr_fs_obj are not deleted here; confirm they are released elsewhere.
+       check_error();
+       glDeleteBuffers(1, &vbo);
+       check_error();
+       glDeleteVertexArrays(1, &vao);
+       check_error();
+}
+
+void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex)  // Draws cbcr_tex at half width into cb_tex and cr_tex.
+{
+       glUseProgram(cbcr_program);
+       bind_sampler(cbcr_program, uniform_cbcr_tex, 0, cbcr_tex, linear_sampler);  // Texture unit 0, using the global linear_sampler.
+       glProgramUniform2f(cbcr_program, uniform_chroma_offset_0, -1.0f / width, 0.0f);  // First tap: one source texel to the left (normalized coordinates).
+       glProgramUniform2f(cbcr_program, uniform_chroma_offset_1, -0.0f / width, 0.0f);  // Second tap: evaluates to (negative) zero, i.e. no offset.
+
+       glViewport(0, 0, width/2, height);  // Output is half as wide as the input, same height (2x1 subsampling).
+       fbos.render_to(cb_tex, cr_tex);  // Presumably binds an FBO targeting both textures; verify against PersistentFBOSet in flow.h.
+
+       glBindVertexArray(vao);
+       glDrawArrays(GL_TRIANGLES, 0, 3);  // The single triangle set up in the constructor.
+}
diff --git a/chroma_subsampler.h b/chroma_subsampler.h

new file mode 100644 (file)

index 0000000..84351c9
--- /dev/null
+++ b/chroma_subsampler.h
@@ -0,0 +1,29 @@
+#ifndef _CHROMA_SUBSAMPLER_H
+#define _CHROMA_SUBSAMPLER_H 1
+
+#include <epoxy/gl.h>
+
+#include "flow.h"
+
+class ChromaSubsampler {
+public:
+       ChromaSubsampler();
+       ~ChromaSubsampler();
+
+       // Subsamples chroma (packed Cb and Cr) 2x1 to yield chroma suitable for
+       // planar 4:2:2. Chroma positioning is left (H.264 convention).
+       // width and height are the dimensions (in pixels) of the input texture.
+       void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex);
+
+private:
+       PersistentFBOSet<2> fbos;
+
+       GLuint vao;
+       GLuint vbo;  // Holds position data.
+
+       GLuint cbcr_vs_obj, cbcr_fs_obj, cbcr_program;
+       GLuint uniform_cbcr_tex;
+       GLuint uniform_chroma_offset_0, uniform_chroma_offset_1;
+};
+
+#endif  // !defined(_CHROMA_SUBSAMPLER_H)
diff --git a/video_stream.cpp b/video_stream.cpp

index 9b0e3a8bed006c6e2deb5ee14f786380e87950f3..27f0b2bd069e5da57639a0b955150f7b433c6862 100644 (file)
--- a/video_stream.cpp
+++ b/video_stream.cpp
@@ -8,6 +8,7 @@ extern "C" {
  #include <jpeglib.h>
  #include <unistd.h>
  
+#include "chroma_subsampler.h"
  #include "context.h"
  #include "flow.h"
  #include "httpd.h"
@@ -98,7 +99,7 @@ struct VectorDestinationManager {
  };
  static_assert(std::is_standard_layout<VectorDestinationManager>::value, "");
  
-vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, unsigned width, unsigned height)
+vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const uint8_t *cr_data, unsigned width, unsigned height)
  {
         VectorDestinationManager dest;
  
@@ -127,26 +128,13 @@ vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, uns
         cinfo.CCIR601_sampling = true;  // Seems to be mostly ignored by libjpeg, though.
         jpeg_start_compress(&cinfo, true);
  
-       // TODO: Subsample on the GPU.
-       unique_ptr<uint8_t[]> cbdata(new uint8_t[(width/2) * 8]);
-       unique_ptr<uint8_t[]> crdata(new uint8_t[(width/2) * 8]);
         JSAMPROW yptr[8], cbptr[8], crptr[8];
         JSAMPARRAY data[3] = { yptr, cbptr, crptr };
-       for (unsigned yy = 0; yy < 8; ++yy) {
-               cbptr[yy] = cbdata.get() + yy * (width / 2);
-               crptr[yy] = crdata.get() + yy * (width / 2);
-       }
         for (unsigned y = 0; y < height; y += 8) {
-               uint8_t *cbptr = cbdata.get();
-               uint8_t *crptr = crdata.get();
                 for (unsigned yy = 0; yy < 8; ++yy) {
                         yptr[yy] = const_cast<JSAMPROW>(&y_data[(height - y - yy - 1) * width]);
-                       const uint8_t *sptr = &cbcr_data[(height - y - yy - 1) * width * 2];
-                       for (unsigned x = 0; x < width; x += 2) {
-                               *cbptr++ = (sptr[0] + sptr[2]) / 2;
-                               *crptr++ = (sptr[1] + sptr[3]) / 2;
-                               sptr += 4;
-                       }
+                       cbptr[yy] = const_cast<JSAMPROW>(&cb_data[(height - y - yy - 1) * width/2]);
+                       crptr[yy] = const_cast<JSAMPROW>(&cr_data[(height - y - yy - 1) * width/2]);
                 }
  
                 jpeg_write_raw_data(&cinfo, data, /*num_lines=*/8);
@@ -198,9 +186,11 @@ VideoStream::VideoStream()
         ycbcr_convert_chain->finalize();
         check_error();
  
-       GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots];
+       GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots];
         glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex);
         glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, gray_tex);
+       glCreateTextures(GL_TEXTURE_2D, 10, cb_tex);
+       glCreateTextures(GL_TEXTURE_2D, 10, cr_tex);
         check_error();
         constexpr size_t width = 1280, height = 720;  // FIXME: adjustable width, height
         int levels = find_num_levels(width, height);
@@ -209,10 +199,16 @@ VideoStream::VideoStream()
                 check_error();
                 glTextureStorage3D(gray_tex[i], levels, GL_R8, width, height, 2);
                 check_error();
+               glTextureStorage2D(cb_tex[i], 1, GL_R8, width / 2, height);
+               check_error();
+               glTextureStorage2D(cr_tex[i], 1, GL_R8, width / 2, height);
+               check_error();
  
                 InterpolatedFrameResources resource;
                 resource.input_tex = input_tex[i];
                 resource.gray_tex = gray_tex[i];
+               resource.cb_tex = cb_tex[i];
+               resource.cr_tex = cr_tex[i];
                 glCreateFramebuffers(2, resource.input_fbos);
                 check_error();
  
@@ -243,6 +239,7 @@ VideoStream::VideoStream()
  
         compute_flow.reset(new DISComputeFlow(width, height, operating_point2));
         interpolate.reset(new Interpolate(width, height, operating_point2, /*split_ycbcr_output=*/true));
+       chroma_subsampler.reset(new ChromaSubsampler);
         check_error();
  }
  
@@ -346,9 +343,12 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
         // Compute the interpolated frame.
         qf.flow_tex = compute_flow->exec(resources.gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
         check_error();
-       tie(qf.output_tex, qf.output2_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
+       tie(qf.output_tex, qf.cbcr_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
         check_error();
  
+       // Subsample and split Cb/Cr.
+       chroma_subsampler->subsample_chroma(qf.cbcr_tex, 1280, 720, resources.cb_tex, resources.cr_tex);
+
         // We could have released qf.flow_tex here, but to make sure we don't cause a stall
         // when trying to reuse it for the next frame, we can just as well hold on to it
         // and release it only when the readback is done.
@@ -359,7 +359,9 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
         check_error();
         glGetTextureImage(qf.output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
         check_error();
-       glGetTextureImage(qf.output2_tex, 0, GL_RG, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
+       glGetTextureImage(resources.cb_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
+       check_error();
+       glGetTextureImage(resources.cr_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3 - 640 * 720, BUFFER_OFFSET(1280 * 720 + 640 * 720));
         check_error();
         glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
  
@@ -411,10 +413,11 @@ void VideoStream::encode_thread_func()
                         vector<uint8_t> jpeg = encode_jpeg(
                                 (const uint8_t *)qf.resources.pbo_contents,
                                 (const uint8_t *)qf.resources.pbo_contents + 1280 * 720,
+                               (const uint8_t *)qf.resources.pbo_contents + 1280 * 720 + 640 * 720,
                                 1280, 720);
                         compute_flow->release_texture(qf.flow_tex);
                         interpolate->release_texture(qf.output_tex);
-                       interpolate->release_texture(qf.output2_tex);
+                       interpolate->release_texture(qf.cbcr_tex);
  
                         AVPacket pkt;
                         av_init_packet(&pkt);
diff --git a/video_stream.h b/video_stream.h

index 41d9fc8f7c3279b56958d18207de6ecab6e31395..2d1e8f12ba90e866a6196881dd570e142f6de388 100644 (file)
--- a/video_stream.h
+++ b/video_stream.h
@@ -19,6 +19,7 @@ extern "C" {
  
  #include "ref_counted_gl_sync.h"
  
+class ChromaSubsampler;
  class DISComputeFlow;
  class Interpolate;
  class Mux;
@@ -48,6 +49,7 @@ private:
         struct InterpolatedFrameResources {
                 GLuint input_tex;  // Layered (contains both input frames).
                 GLuint gray_tex;  // Same.
+               GLuint cb_tex, cr_tex;
                 GLuint input_fbos[2];  // For rendering to the two layers of input_tex.
                 GLuint pbo;  // For reading the data back.
                 void *pbo_contents;  // Persistently mapped.
@@ -66,7 +68,7 @@ private:
                 float alpha;
                 InterpolatedFrameResources resources;
                 RefCountedGLsync fence;  // Set when the interpolated image is read back to the CPU.
-               GLuint flow_tex, output_tex, output2_tex;  // Released in the receiving thread; not really used for anything else.
+               GLuint flow_tex, output_tex, cbcr_tex;  // Released in the receiving thread; not really used for anything else.
         };
         std::deque<QueuedFrame> frame_queue;  // Under <queue_lock>.
         std::mutex queue_lock;
@@ -88,6 +90,7 @@ private:
         // Frame interpolation.
         std::unique_ptr<DISComputeFlow> compute_flow;
         std::unique_ptr<Interpolate> interpolate;
+       std::unique_ptr<ChromaSubsampler> chroma_subsampler;
  };
  
  #endif  // !defined(_VIDEO_STREAM_H)
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Sun, 16 Sep 2018 16:23:05 +0000 (18:23 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Sun, 16 Sep 2018 16:34:16 +0000 (18:34 +0200)
Makefile		patch \| blob \| history
chroma_subsample.frag	[new file with mode: 0644]	patch \| blob
chroma_subsample.vert	[new file with mode: 0644]	patch \| blob
chroma_subsampler.cpp	[new file with mode: 0644]	patch \| blob
chroma_subsampler.h	[new file with mode: 0644]	patch \| blob
video_stream.cpp		patch \| blob \| history
video_stream.h		patch \| blob \| history