From 2eca06acb1668ccf543ed375e9ac59790672bad3 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 16 Sep 2018 18:23:05 +0200 Subject: [PATCH] Subsample chroma on the GPU instead of the CPU. Faster, and also gets the subsampling right. The shaders come from Nageru, but the support code is heavily tweaked to be more like flow.h. --- Makefile | 2 +- chroma_subsample.frag | 10 ++++ chroma_subsample.vert | 21 ++++++++ chroma_subsampler.cpp | 120 ++++++++++++++++++++++++++++++++++++++++++ chroma_subsampler.h | 29 ++++++++++ video_stream.cpp | 43 ++++++++------- video_stream.h | 5 +- 7 files changed, 208 insertions(+), 22 deletions(-) create mode 100644 chroma_subsample.frag create mode 100644 chroma_subsample.vert create mode 100644 chroma_subsampler.cpp create mode 100644 chroma_subsampler.h diff --git a/Makefile b/Makefile index e798046..a0075c5 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ OBJS += $(OBJS_WITH_MOC:.o=.moc.o) # Flow objects OBJS += flow.o gpu_timers.o -OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o +OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o chroma_subsampler.o %.o: %.cpp $(CXX) -MMD -MP $(CPPFLAGS) $(CXXFLAGS) -o $@ -c $< diff --git a/chroma_subsample.frag b/chroma_subsample.frag new file mode 100644 index 0000000..9a4155f --- /dev/null +++ b/chroma_subsample.frag @@ -0,0 +1,10 @@ +#version 450 core +in vec2 tc0, tc1; +uniform sampler2D cbcr_tex; +out float Cb, Cr; +void main() { + vec2 result = 0.5 * (texture(cbcr_tex, tc0).rg + texture(cbcr_tex, tc1).rg); + Cb = result.r; + Cr = result.g; +} + diff --git a/chroma_subsample.vert b/chroma_subsample.vert new file mode 100644 index 0000000..81e1004 --- /dev/null +++ b/chroma_subsample.vert @@ -0,0 +1,21 @@ +#version 450 core + +layout(location=0) in vec2 position; +out vec2 tc0, tc1; +uniform vec2 chroma_offset_0; +uniform vec2 chroma_offset_1; + +void main() +{ + // The result of glOrtho(0.0, 
1.0, 0.0, 1.0, 0.0, 1.0) is: + // + // 2.000 0.000 0.000 -1.000 + // 0.000 2.000 0.000 -1.000 + // 0.000 0.000 -2.000 -1.000 + // 0.000 0.000 0.000 1.000 + gl_Position = vec4(2.0 * position.x - 1.0, 2.0 * position.y - 1.0, -1.0, 1.0); + vec2 flipped_tc = position; + tc0 = flipped_tc + chroma_offset_0; + tc1 = flipped_tc + chroma_offset_1; +} + diff --git a/chroma_subsampler.cpp b/chroma_subsampler.cpp new file mode 100644 index 0000000..28bd0a3 --- /dev/null +++ b/chroma_subsampler.cpp @@ -0,0 +1,120 @@ +#include "chroma_subsampler.h" + +#include +#include + +#define BUFFER_OFFSET(i) ((char *)nullptr + (i)) + +using namespace std; + +string read_file(const string &filename); +GLuint compile_shader(const string &shader_src, GLenum type); +GLuint link_program(GLuint vs_obj, GLuint fs_obj); +void bind_sampler(GLuint program, GLint location, GLuint texture_unit, GLuint tex, GLuint sampler); + +extern GLuint linear_sampler; + +ChromaSubsampler::ChromaSubsampler() +{ + // Set up stuff for 4:2:2 conversion. + // + // Note: Due to the horizontally co-sited chroma/luma samples in H.264 + // (chroma position is left for horizontal), + // we need to be a bit careful in our subsampling. A diagram will make + // this clearer, showing some luma and chroma samples: + // + // a b c d + // +---+---+---+---+ + // | | | | | + // | Y | Y | Y | Y | + // | | | | | + // +---+---+---+---+ + // + // +-------+-------+ + // | | | + // | C | C | + // | | | + // +-------+-------+ + // + // Clearly, the rightmost chroma sample here needs to be equivalent to + // b/4 + c/2 + d/4. (We could also implement more sophisticated filters, + // of course, but as long as the upsampling is not going to be equally + // sophisticated, it's probably not worth it.) If we sample once with + // no mipmapping, we get just c, ie., no actual filtering in the + // horizontal direction. (For the vertical direction, we can just + // sample in the middle to get the right filtering.) 
One could imagine + // we could use mipmapping (assuming we can create mipmaps cheaply), + // but then, what we'd get is this: + // + // (a+b)/2 (c+d)/2 + // +-------+-------+ + // | | | + // | Y | Y | + // | | | + // +-------+-------+ + // + // +-------+-------+ + // | | | + // | C | C | + // | | | + // +-------+-------+ + // + // which ends up sampling equally from a and b, which clearly isn't right. Instead, + // we need to do two (non-mipmapped) chroma samples, both hitting exactly in-between + // source pixels. + // + // Sampling in-between b and c gives us the sample (b+c)/2, and similarly for c and d. + // Taking the average of these gives us (b+c)/4 + (c+d)/4 = b/4 + c/2 + d/4, which is + // exactly what we want. + // + // See also http://www.poynton.com/PDFs/Merging_RGB_and_422.pdf, pages 6–7. + + cbcr_vs_obj = compile_shader(read_file("chroma_subsample.vert"), GL_VERTEX_SHADER); + cbcr_fs_obj = compile_shader(read_file("chroma_subsample.frag"), GL_FRAGMENT_SHADER); + cbcr_program = link_program(cbcr_vs_obj, cbcr_fs_obj); + + // Set up the VAO containing all the required position data. + glCreateVertexArrays(1, &vao); + glBindVertexArray(vao); + + float vertices[] = { + 0.0f, 2.0f, + 0.0f, 0.0f, + 2.0f, 0.0f + }; + glCreateBuffers(1, &vbo); + glNamedBufferData(vbo, sizeof(vertices), vertices, GL_STATIC_DRAW); + glBindBuffer(GL_ARRAY_BUFFER, vbo); + + GLint position_attrib = 0; // Hard-coded in every vertex shader. 
+ glEnableVertexArrayAttrib(vao, position_attrib); + glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0)); + + uniform_cbcr_tex = glGetUniformLocation(cbcr_program, "cbcr_tex"); + uniform_chroma_offset_0 = glGetUniformLocation(cbcr_program, "chroma_offset_0"); + uniform_chroma_offset_1 = glGetUniformLocation(cbcr_program, "chroma_offset_1"); +} + +ChromaSubsampler::~ChromaSubsampler() +{ + glDeleteProgram(cbcr_program); + check_error(); + glDeleteBuffers(1, &vbo); + check_error(); + glDeleteVertexArrays(1, &vao); + check_error(); +} + +void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex) +{ + glUseProgram(cbcr_program); + bind_sampler(cbcr_program, uniform_cbcr_tex, 0, cbcr_tex, linear_sampler); + glProgramUniform2f(cbcr_program, uniform_chroma_offset_0, -1.0f / width, 0.0f); + glProgramUniform2f(cbcr_program, uniform_chroma_offset_1, -0.0f / width, 0.0f); + + glViewport(0, 0, width/2, height); + fbos.render_to(cb_tex, cr_tex); + + glBindVertexArray(vao); + glDrawArrays(GL_TRIANGLES, 0, 3); +} diff --git a/chroma_subsampler.h b/chroma_subsampler.h new file mode 100644 index 0000000..84351c9 --- /dev/null +++ b/chroma_subsampler.h @@ -0,0 +1,29 @@ +#ifndef _CHROMA_SUBSAMPLER_H +#define _CHROMA_SUBSAMPLER_H 1 + +#include + +#include "flow.h" + +class ChromaSubsampler { +public: + ChromaSubsampler(); + ~ChromaSubsampler(); + + // Subsamples chroma (packed Cb and Cr) 2x1 to yield chroma suitable for + // planar 4:2:2. Chroma positioning is left (H.264 convention). + // width and height are the dimensions (in pixels) of the input texture. + void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex); + +private: + PersistentFBOSet<2> fbos; + + GLuint vao; + GLuint vbo; // Holds position data. 
+ + GLuint cbcr_vs_obj, cbcr_fs_obj, cbcr_program; + GLuint uniform_cbcr_tex; + GLuint uniform_chroma_offset_0, uniform_chroma_offset_1; +}; + +#endif // !defined(_CHROMA_SUBSAMPLER_H) diff --git a/video_stream.cpp b/video_stream.cpp index 9b0e3a8..27f0b2b 100644 --- a/video_stream.cpp +++ b/video_stream.cpp @@ -8,6 +8,7 @@ extern "C" { #include #include +#include "chroma_subsampler.h" #include "context.h" #include "flow.h" #include "httpd.h" @@ -98,7 +99,7 @@ struct VectorDestinationManager { }; static_assert(std::is_standard_layout::value, ""); -vector encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, unsigned width, unsigned height) +vector encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const uint8_t *cr_data, unsigned width, unsigned height) { VectorDestinationManager dest; @@ -127,26 +128,13 @@ vector encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, uns cinfo.CCIR601_sampling = true; // Seems to be mostly ignored by libjpeg, though. jpeg_start_compress(&cinfo, true); - // TODO: Subsample on the GPU. 
- unique_ptr cbdata(new uint8_t[(width/2) * 8]); - unique_ptr crdata(new uint8_t[(width/2) * 8]); JSAMPROW yptr[8], cbptr[8], crptr[8]; JSAMPARRAY data[3] = { yptr, cbptr, crptr }; - for (unsigned yy = 0; yy < 8; ++yy) { - cbptr[yy] = cbdata.get() + yy * (width / 2); - crptr[yy] = crdata.get() + yy * (width / 2); - } for (unsigned y = 0; y < height; y += 8) { - uint8_t *cbptr = cbdata.get(); - uint8_t *crptr = crdata.get(); for (unsigned yy = 0; yy < 8; ++yy) { yptr[yy] = const_cast(&y_data[(height - y - yy - 1) * width]); - const uint8_t *sptr = &cbcr_data[(height - y - yy - 1) * width * 2]; - for (unsigned x = 0; x < width; x += 2) { - *cbptr++ = (sptr[0] + sptr[2]) / 2; - *crptr++ = (sptr[1] + sptr[3]) / 2; - sptr += 4; - } + cbptr[yy] = const_cast(&cb_data[(height - y - yy - 1) * width/2]); + crptr[yy] = const_cast(&cr_data[(height - y - yy - 1) * width/2]); } jpeg_write_raw_data(&cinfo, data, /*num_lines=*/8); @@ -198,9 +186,11 @@ VideoStream::VideoStream() ycbcr_convert_chain->finalize(); check_error(); - GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots]; + GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots]; glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex); glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, gray_tex); + glCreateTextures(GL_TEXTURE_2D, 10, cb_tex); + glCreateTextures(GL_TEXTURE_2D, 10, cr_tex); check_error(); constexpr size_t width = 1280, height = 720; // FIXME: adjustable width, height int levels = find_num_levels(width, height); @@ -209,10 +199,16 @@ VideoStream::VideoStream() check_error(); glTextureStorage3D(gray_tex[i], levels, GL_R8, width, height, 2); check_error(); + glTextureStorage2D(cb_tex[i], 1, GL_R8, width / 2, height); + check_error(); + glTextureStorage2D(cr_tex[i], 1, GL_R8, width / 2, height); + check_error(); InterpolatedFrameResources resource; resource.input_tex = input_tex[i]; resource.gray_tex = gray_tex[i]; + 
resource.cb_tex = cb_tex[i]; + resource.cr_tex = cr_tex[i]; glCreateFramebuffers(2, resource.input_fbos); check_error(); @@ -243,6 +239,7 @@ VideoStream::VideoStream() compute_flow.reset(new DISComputeFlow(width, height, operating_point2)); interpolate.reset(new Interpolate(width, height, operating_point2, /*split_ycbcr_output=*/true)); + chroma_subsampler.reset(new ChromaSubsampler); check_error(); } @@ -346,9 +343,12 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea // Compute the interpolated frame. qf.flow_tex = compute_flow->exec(resources.gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW); check_error(); - tie(qf.output_tex, qf.output2_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha); + tie(qf.output_tex, qf.cbcr_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha); check_error(); + // Subsample and split Cb/Cr. + chroma_subsampler->subsample_chroma(qf.cbcr_tex, 1280, 720, resources.cb_tex, resources.cr_tex); + // We could have released qf.flow_tex here, but to make sure we don't cause a stall // when trying to reuse it for the next frame, we can just as well hold on to it // and release it only when the readback is done. 
@@ -359,7 +359,9 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea check_error(); glGetTextureImage(qf.output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0)); check_error(); - glGetTextureImage(qf.output2_tex, 0, GL_RG, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720)); + glGetTextureImage(resources.cb_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720)); + check_error(); + glGetTextureImage(resources.cr_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3 - 640 * 720, BUFFER_OFFSET(1280 * 720 + 640 * 720)); check_error(); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -411,10 +413,11 @@ void VideoStream::encode_thread_func() vector jpeg = encode_jpeg( (const uint8_t *)qf.resources.pbo_contents, (const uint8_t *)qf.resources.pbo_contents + 1280 * 720, + (const uint8_t *)qf.resources.pbo_contents + 1280 * 720 + 640 * 720, 1280, 720); compute_flow->release_texture(qf.flow_tex); interpolate->release_texture(qf.output_tex); - interpolate->release_texture(qf.output2_tex); + interpolate->release_texture(qf.cbcr_tex); AVPacket pkt; av_init_packet(&pkt); diff --git a/video_stream.h b/video_stream.h index 41d9fc8..2d1e8f1 100644 --- a/video_stream.h +++ b/video_stream.h @@ -19,6 +19,7 @@ extern "C" { #include "ref_counted_gl_sync.h" +class ChromaSubsampler; class DISComputeFlow; class Interpolate; class Mux; @@ -48,6 +49,7 @@ private: struct InterpolatedFrameResources { GLuint input_tex; // Layered (contains both input frames). GLuint gray_tex; // Same. + GLuint cb_tex, cr_tex; GLuint input_fbos[2]; // For rendering to the two layers of input_tex. GLuint pbo; // For reading the data back. void *pbo_contents; // Persistently mapped. @@ -66,7 +68,7 @@ private: float alpha; InterpolatedFrameResources resources; RefCountedGLsync fence; // Set when the interpolated image is read back to the CPU. 
- GLuint flow_tex, output_tex, output2_tex; // Released in the receiving thread; not really used for anything else. + GLuint flow_tex, output_tex, cbcr_tex; // Released in the receiving thread; not really used for anything else. }; std::deque frame_queue; // Under . std::mutex queue_lock; @@ -88,6 +90,7 @@ private: // Frame interpolation. std::unique_ptr compute_flow; std::unique_ptr interpolate; + std::unique_ptr chroma_subsampler; }; #endif // !defined(_VIDEO_STREAM_H) -- 2.39.2