# Flow objects
OBJS += flow.o gpu_timers.o
-OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o
+OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o chroma_subsampler.o
%.o: %.cpp
$(CXX) -MMD -MP $(CPPFLAGS) $(CXXFLAGS) -o $@ -c $<
--- /dev/null
+#version 450 core
+
+// Averages the two chroma taps placed by the matching vertex shader
+// (tc0/tc1) and splits the interleaved CbCr input into two separate
+// single-channel outputs, one per render target.
+in vec2 tc0, tc1;
+uniform sampler2D cbcr_tex;
+out float Cb, Cr;
+
+void main() {
+ vec2 avg = (texture(cbcr_tex, tc0).rg + texture(cbcr_tex, tc1).rg) * 0.5;
+ Cb = avg.x;
+ Cr = avg.y;
+}
+
--- /dev/null
+#version 450 core
+
+layout(location=0) in vec2 position;
+out vec2 tc0, tc1;
+uniform vec2 chroma_offset_0;
+uniform vec2 chroma_offset_1;
+
+void main()
+{
+ // Input positions live in [0,1]^2; map them to normalized device
+ // coordinates in [-1,1]^2. This is exactly the transform produced by
+ // glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0), whose matrix is:
+ //
+ // 2.000 0.000 0.000 -1.000
+ // 0.000 2.000 0.000 -1.000
+ // 0.000 0.000 -2.000 -1.000
+ // 0.000 0.000 0.000 1.000
+ gl_Position = vec4(position * 2.0 - vec2(1.0), -1.0, 1.0);
+
+ // Each fragment samples the chroma texture twice, displaced by the two
+ // caller-supplied offsets; the fragment shader averages the taps.
+ tc0 = position + chroma_offset_0;
+ tc1 = position + chroma_offset_1;
+}
+
--- /dev/null
+#include "chroma_subsampler.h"
+
+#include <string>
+#include <movit/util.h>
+
+#define BUFFER_OFFSET(i) ((char *)nullptr + (i))
+
+using namespace std;
+
+string read_file(const string &filename);
+GLuint compile_shader(const string &shader_src, GLenum type);
+GLuint link_program(GLuint vs_obj, GLuint fs_obj);
+void bind_sampler(GLuint program, GLint location, GLuint texture_unit, GLuint tex, GLuint sampler);
+
+extern GLuint linear_sampler;
+
+ChromaSubsampler::ChromaSubsampler()
+{
+ // Set up stuff for 4:2:2 conversion.
+ //
+ // Note: Due to the horizontally co-sited chroma/luma samples in H.264
+ // (chroma position is left for horizontal),
+ // we need to be a bit careful in our subsampling. A diagram will make
+ // this clearer, showing some luma and chroma samples:
+ //
+ // a b c d
+ // +---+---+---+---+
+ // | | | | |
+ // | Y | Y | Y | Y |
+ // | | | | |
+ // +---+---+---+---+
+ //
+ // +-------+-------+
+ // | | |
+ // | C | C |
+ // | | |
+ // +-------+-------+
+ //
+ // Clearly, the rightmost chroma sample here needs to be equivalent to
+ // b/4 + c/2 + d/4. (We could also implement more sophisticated filters,
+ // of course, but as long as the upsampling is not going to be equally
+ // sophisticated, it's probably not worth it.) If we sample once with
+ // no mipmapping, we get just c, ie., no actual filtering in the
+ // horizontal direction. (For the vertical direction, we can just
+ // sample in the middle to get the right filtering.) One could imagine
+ // we could use mipmapping (assuming we can create mipmaps cheaply),
+ // but then, what we'd get is this:
+ //
+ // (a+b)/2 (c+d)/2
+ // +-------+-------+
+ // | | |
+ // | Y | Y |
+ // | | |
+ // +-------+-------+
+ //
+ // +-------+-------+
+ // | | |
+ // | C | C |
+ // | | |
+ // +-------+-------+
+ //
+ // which ends up sampling equally from a and b, which clearly isn't right. Instead,
+ // we need to do two (non-mipmapped) chroma samples, both hitting exactly in-between
+ // source pixels.
+ //
+ // Sampling in-between b and c gives us the sample (b+c)/2, and similarly for c and d.
+ // Taking the average of these gives of (b+c)/4 + (c+d)/4 = b/4 + c/2 + d/4, which is
+ // exactly what we want.
+ //
+ // See also http://www.poynton.com/PDFs/Merging_RGB_and_422.pdf, pages 6–7.
+
+ // Compile the two shaders from disk and link them into the program we
+ // use for every subsample_chroma() call.
+ cbcr_vs_obj = compile_shader(read_file("chroma_subsample.vert"), GL_VERTEX_SHADER);
+ cbcr_fs_obj = compile_shader(read_file("chroma_subsample.frag"), GL_FRAGMENT_SHADER);
+ cbcr_program = link_program(cbcr_vs_obj, cbcr_fs_obj);
+
+ // Set up the VAO containing all the required position data.
+ glCreateVertexArrays(1, &vao);
+ glBindVertexArray(vao);
+
+ // A single oversized triangle covering the whole [0,1]x[0,1] viewport
+ // (the parts outside are clipped away), so one glDrawArrays(…, 3) fills
+ // the entire output.
+ float vertices[] = {
+ 0.0f, 2.0f,
+ 0.0f, 0.0f,
+ 2.0f, 0.0f
+ };
+ glCreateBuffers(1, &vbo);
+ glNamedBufferData(vbo, sizeof(vertices), vertices, GL_STATIC_DRAW);
+ glBindBuffer(GL_ARRAY_BUFFER, vbo);
+
+ GLint position_attrib = 0; // Hard-coded in every vertex shader.
+ glEnableVertexArrayAttrib(vao, position_attrib);
+ // Non-DSA call: records the currently bound GL_ARRAY_BUFFER (vbo) into
+ // the currently bound VAO, so the two glBind* calls above must stay
+ // before this line.
+ glVertexAttribPointer(position_attrib, 2, GL_FLOAT, GL_FALSE, 0, BUFFER_OFFSET(0));
+
+ uniform_cbcr_tex = glGetUniformLocation(cbcr_program, "cbcr_tex");
+ uniform_chroma_offset_0 = glGetUniformLocation(cbcr_program, "chroma_offset_0");
+ uniform_chroma_offset_1 = glGetUniformLocation(cbcr_program, "chroma_offset_1");
+}
+
+ChromaSubsampler::~ChromaSubsampler()
+{
+ // Release every GL object created in the constructor. The two shader
+ // objects were never flagged for deletion after linking, so without the
+ // glDeleteShader calls they would leak for the lifetime of the context.
+ glDeleteShader(cbcr_vs_obj);
+ check_error();
+ glDeleteShader(cbcr_fs_obj);
+ check_error();
+ glDeleteProgram(cbcr_program);
+ check_error();
+ glDeleteBuffers(1, &vbo);
+ check_error();
+ glDeleteVertexArrays(1, &vao);
+ check_error();
+}
+
+void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex)
+{
+ // Render one pass of the subsampling program; cb_tex and cr_tex receive
+ // the two fragment shader outputs via the persistent FBO set.
+ glUseProgram(cbcr_program);
+ bind_sampler(cbcr_program, uniform_cbcr_tex, 0, cbcr_tex, linear_sampler);
+ // The two tap offsets (in texture coordinates) realize the two
+ // in-between chroma samples described in the constructor comment;
+ // linear_sampler makes each tap average two adjacent input samples.
+ glProgramUniform2f(cbcr_program, uniform_chroma_offset_0, -1.0f / width, 0.0f);
+ glProgramUniform2f(cbcr_program, uniform_chroma_offset_1, -0.0f / width, 0.0f);
+
+ // 2x1 subsampling: output is half the input width, full height.
+ glViewport(0, 0, width/2, height);
+ fbos.render_to(cb_tex, cr_tex);
+
+ glBindVertexArray(vao);
+ glDrawArrays(GL_TRIANGLES, 0, 3);
+}
--- /dev/null
+#ifndef _CHROMA_SUBSAMPLER_H
+#define _CHROMA_SUBSAMPLER_H 1
+
+#include <epoxy/gl.h>
+
+#include "flow.h"
+
+// Converts an interleaved (packed CbCr) chroma texture into two separate,
+// horizontally subsampled single-channel textures, for planar 4:2:2 output.
+// The constructor compiles/links GL shaders and the destructor deletes GL
+// objects, so a GL context must be current for the object's whole lifetime.
+class ChromaSubsampler {
+public:
+ ChromaSubsampler();
+ ~ChromaSubsampler();
+
+ // Subsamples chroma (packed Cb and Cr) 2x1 to yield chroma suitable for
+ // planar 4:2:2. Chroma positioning is left (H.264 convention).
+ // width and height are the dimensions (in pixels) of the input texture.
+ // cb_tex and cr_tex are rendered to (not read) at (width/2) x height;
+ // callers allocate them as single-channel (GL_R8) textures.
+ void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint cb_tex, GLuint cr_tex);
+
+private:
+ PersistentFBOSet<2> fbos; // Caches FBOs keyed on the (cb_tex, cr_tex) output pair.
+
+ GLuint vao;
+ GLuint vbo; // Holds position data (one viewport-covering triangle).
+
+ GLuint cbcr_vs_obj, cbcr_fs_obj, cbcr_program; // Subsampling shaders and linked program.
+ GLuint uniform_cbcr_tex; // Sampler uniform for the packed CbCr input.
+ GLuint uniform_chroma_offset_0, uniform_chroma_offset_1; // The two sample-tap offsets.
+};
+
+#endif // !defined(_CHROMA_SUBSAMPLER_H)
#include <jpeglib.h>
#include <unistd.h>
+#include "chroma_subsampler.h"
#include "context.h"
#include "flow.h"
#include "httpd.h"
};
static_assert(std::is_standard_layout<VectorDestinationManager>::value, "");
-vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cbcr_data, unsigned width, unsigned height)
+vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const uint8_t *cr_data, unsigned width, unsigned height)
{
VectorDestinationManager dest;
cinfo.CCIR601_sampling = true; // Seems to be mostly ignored by libjpeg, though.
jpeg_start_compress(&cinfo, true);
- // TODO: Subsample on the GPU.
- unique_ptr<uint8_t[]> cbdata(new uint8_t[(width/2) * 8]);
- unique_ptr<uint8_t[]> crdata(new uint8_t[(width/2) * 8]);
JSAMPROW yptr[8], cbptr[8], crptr[8];
JSAMPARRAY data[3] = { yptr, cbptr, crptr };
- for (unsigned yy = 0; yy < 8; ++yy) {
- cbptr[yy] = cbdata.get() + yy * (width / 2);
- crptr[yy] = crdata.get() + yy * (width / 2);
- }
for (unsigned y = 0; y < height; y += 8) {
- uint8_t *cbptr = cbdata.get();
- uint8_t *crptr = crdata.get();
for (unsigned yy = 0; yy < 8; ++yy) {
yptr[yy] = const_cast<JSAMPROW>(&y_data[(height - y - yy - 1) * width]);
- const uint8_t *sptr = &cbcr_data[(height - y - yy - 1) * width * 2];
- for (unsigned x = 0; x < width; x += 2) {
- *cbptr++ = (sptr[0] + sptr[2]) / 2;
- *crptr++ = (sptr[1] + sptr[3]) / 2;
- sptr += 4;
- }
+ cbptr[yy] = const_cast<JSAMPROW>(&cb_data[(height - y - yy - 1) * width/2]);
+ crptr[yy] = const_cast<JSAMPROW>(&cr_data[(height - y - yy - 1) * width/2]);
}
jpeg_write_raw_data(&cinfo, data, /*num_lines=*/8);
ycbcr_convert_chain->finalize();
check_error();
- GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots];
+ GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots];
glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex);
glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, gray_tex);
+ glCreateTextures(GL_TEXTURE_2D, 10, cb_tex);
+ glCreateTextures(GL_TEXTURE_2D, 10, cr_tex);
check_error();
constexpr size_t width = 1280, height = 720; // FIXME: adjustable width, height
int levels = find_num_levels(width, height);
check_error();
glTextureStorage3D(gray_tex[i], levels, GL_R8, width, height, 2);
check_error();
+ glTextureStorage2D(cb_tex[i], 1, GL_R8, width / 2, height);
+ check_error();
+ glTextureStorage2D(cr_tex[i], 1, GL_R8, width / 2, height);
+ check_error();
InterpolatedFrameResources resource;
resource.input_tex = input_tex[i];
resource.gray_tex = gray_tex[i];
+ resource.cb_tex = cb_tex[i];
+ resource.cr_tex = cr_tex[i];
glCreateFramebuffers(2, resource.input_fbos);
check_error();
compute_flow.reset(new DISComputeFlow(width, height, operating_point2));
interpolate.reset(new Interpolate(width, height, operating_point2, /*split_ycbcr_output=*/true));
+ chroma_subsampler.reset(new ChromaSubsampler);
check_error();
}
// Compute the interpolated frame.
qf.flow_tex = compute_flow->exec(resources.gray_tex, DISComputeFlow::FORWARD_AND_BACKWARD, DISComputeFlow::DO_NOT_RESIZE_FLOW);
check_error();
- tie(qf.output_tex, qf.output2_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
+ tie(qf.output_tex, qf.cbcr_tex) = interpolate->exec(resources.input_tex, resources.gray_tex, qf.flow_tex, 1280, 720, alpha);
check_error();
+ // Subsample and split Cb/Cr.
+ chroma_subsampler->subsample_chroma(qf.cbcr_tex, 1280, 720, resources.cb_tex, resources.cr_tex);
+
// We could have released qf.flow_tex here, but to make sure we don't cause a stall
// when trying to reuse it for the next frame, we can just as well hold on to it
// and release it only when the readback is done.
check_error();
glGetTextureImage(qf.output_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 4, BUFFER_OFFSET(0));
check_error();
- glGetTextureImage(qf.output2_tex, 0, GL_RG, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
+ glGetTextureImage(resources.cb_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3, BUFFER_OFFSET(1280 * 720));
+ check_error();
+ glGetTextureImage(resources.cr_tex, 0, GL_RED, GL_UNSIGNED_BYTE, 1280 * 720 * 3 - 640 * 720, BUFFER_OFFSET(1280 * 720 + 640 * 720));
check_error();
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
vector<uint8_t> jpeg = encode_jpeg(
(const uint8_t *)qf.resources.pbo_contents,
(const uint8_t *)qf.resources.pbo_contents + 1280 * 720,
+ (const uint8_t *)qf.resources.pbo_contents + 1280 * 720 + 640 * 720,
1280, 720);
compute_flow->release_texture(qf.flow_tex);
interpolate->release_texture(qf.output_tex);
- interpolate->release_texture(qf.output2_tex);
+ interpolate->release_texture(qf.cbcr_tex);
AVPacket pkt;
av_init_packet(&pkt);
#include "ref_counted_gl_sync.h"
+class ChromaSubsampler;
class DISComputeFlow;
class Interpolate;
class Mux;
struct InterpolatedFrameResources {
GLuint input_tex; // Layered (contains both input frames).
GLuint gray_tex; // Same.
+ GLuint cb_tex, cr_tex;
GLuint input_fbos[2]; // For rendering to the two layers of input_tex.
GLuint pbo; // For reading the data back.
void *pbo_contents; // Persistently mapped.
float alpha;
InterpolatedFrameResources resources;
RefCountedGLsync fence; // Set when the interpolated image is read back to the CPU.
- GLuint flow_tex, output_tex, output2_tex; // Released in the receiving thread; not really used for anything else.
+ GLuint flow_tex, output_tex, cbcr_tex; // Released in the receiving thread; not really used for anything else.
};
std::deque<QueuedFrame> frame_queue; // Under <queue_lock>.
std::mutex queue_lock;
// Frame interpolation.
std::unique_ptr<DISComputeFlow> compute_flow;
std::unique_ptr<Interpolate> interpolate;
+ std::unique_ptr<ChromaSubsampler> chroma_subsampler;
};
#endif // !defined(_VIDEO_STREAM_H)