]> git.sesse.net Git - nageru/commitdiff
Support 4:2:2 v210 (10-bit) output for DeckLink.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 15 Mar 2017 22:07:24 +0000 (23:07 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Wed, 15 Mar 2017 22:11:49 +0000 (23:11 +0100)
This again requires compute shaders; my GTX 950 needs a bit under 0.1 ms
to convert a 720p frame from the 16-bit planar representation. It replaces the
flag for 10-bit x264.

v210 is, as far as I understand, pretty much the native format for the DeckLink
cards, but I believe the conversion happens in hardware, so there shouldn't
be any significant speed gains to be had.

chroma_subsampler.cpp
chroma_subsampler.h
decklink_output.cpp
decklink_output.h
flags.cpp
flags.h
mixer.cpp

index a9e535592634c53d9931e964fffecb6f9b85cf88..cf46883c5505621416d72303a5e377276f89dfa5 100644 (file)
@@ -1,4 +1,5 @@
 #include "chroma_subsampler.h"
+#include "v210_converter.h"
 
 #include <vector>
 
@@ -170,6 +171,78 @@ ChromaSubsampler::ChromaSubsampler(ResourcePool *resource_pool)
        };
        vbo = generate_vbo(2, GL_FLOAT, sizeof(vertices), vertices);
        check_error();
+
+       // v210 compute shader.
+       if (v210Converter::has_hardware_support()) {
+               string v210_shader_src = R"(#version 150
+#extension GL_ARB_compute_shader : enable
+#extension GL_ARB_shader_image_load_store : enable
+layout(local_size_x=2, local_size_y=16) in;
+layout(r16) uniform restrict readonly image2D in_y;
+uniform sampler2D in_cbcr;  // Of type RG16.
+layout(rgb10_a2) uniform restrict writeonly image2D outbuf;
+uniform float inv_width, inv_height;
+
+void main()
+{
+       int xb = int(gl_GlobalInvocationID.x);  // X block number.
+       int y = int(gl_GlobalInvocationID.y);  // Y (actual line).
+       float yf = (gl_GlobalInvocationID.y + 0.5f) * inv_height;  // Y float coordinate.
+
+       // Load and scale CbCr values, sampling in-between the texels to get
+       // to (left/4 + center/2 + right/4).
+       vec2 pix_cbcr[3];
+       for (int i = 0; i < 3; ++i) {
+               vec2 a = texture(in_cbcr, vec2((xb * 6 + i * 2) * inv_width, yf)).xy;
+               vec2 b = texture(in_cbcr, vec2((xb * 6 + i * 2 + 1) * inv_width, yf)).xy;
+               pix_cbcr[i] = (a + b) * (0.5 * 65535.0 / 1023.0);
+       }
+
+       // Load and scale the Y values. Note that we use integer coordinates here,
+       // so we don't need to offset by 0.5.
+       float pix_y[6];
+       for (int i = 0; i < 6; ++i) {
+               pix_y[i] = imageLoad(in_y, ivec2(xb * 6 + i, y)).x * (65535.0 / 1023.0);
+       }
+
+       imageStore(outbuf, ivec2(xb * 4 + 0, y), vec4(pix_cbcr[0].x, pix_y[0],      pix_cbcr[0].y, 1.0));
+       imageStore(outbuf, ivec2(xb * 4 + 1, y), vec4(pix_y[1],      pix_cbcr[1].x, pix_y[2],      1.0));
+       imageStore(outbuf, ivec2(xb * 4 + 2, y), vec4(pix_cbcr[1].y, pix_y[3],      pix_cbcr[2].x, 1.0));
+       imageStore(outbuf, ivec2(xb * 4 + 3, y), vec4(pix_y[4],      pix_cbcr[2].y, pix_y[5],      1.0));
+}
+)";
+               GLuint shader_num = movit::compile_shader(v210_shader_src, GL_COMPUTE_SHADER);
+               check_error();
+               v210_program_num = glCreateProgram();
+               check_error();
+               glAttachShader(v210_program_num, shader_num);
+               check_error();
+               glLinkProgram(v210_program_num);
+               check_error();
+
+               GLint success;
+               glGetProgramiv(v210_program_num, GL_LINK_STATUS, &success);
+               check_error();
+               if (success == GL_FALSE) {
+                       GLchar error_log[1024] = {0};
+                       glGetProgramInfoLog(v210_program_num, 1024, NULL, error_log);
+                       fprintf(stderr, "Error linking program: %s\n", error_log);
+                       exit(1);
+               }
+
+               v210_in_y_pos = glGetUniformLocation(v210_program_num, "in_y");
+               check_error();
+               v210_in_cbcr_pos = glGetUniformLocation(v210_program_num, "in_cbcr");
+               check_error();
+               v210_outbuf_pos = glGetUniformLocation(v210_program_num, "outbuf");
+               check_error();
+               v210_inv_width_pos = glGetUniformLocation(v210_program_num, "inv_width");
+               check_error();
+               v210_inv_height_pos = glGetUniformLocation(v210_program_num, "inv_height");
+               check_error();
+       } else {
+               v210_program_num = 0;
+       }
 }
 
 ChromaSubsampler::~ChromaSubsampler()
@@ -180,6 +253,10 @@ ChromaSubsampler::~ChromaSubsampler()
        check_error();
        glDeleteBuffers(1, &vbo);
        check_error();
+       if (v210_program_num != 0) {
+               glDeleteProgram(v210_program_num);
+               check_error();
+       }
 }
 
 void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex)
@@ -334,3 +411,60 @@ void ChromaSubsampler::create_uyvy(GLuint y_tex, GLuint cbcr_tex, unsigned width
        resource_pool->release_fbo(fbo);
        glDeleteVertexArrays(1, &vao);
 }
+
+void ChromaSubsampler::create_v210(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex)
+{
+       assert(v210_program_num != 0);
+
+       glUseProgram(v210_program_num);
+       check_error();
+
+       glUniform1i(v210_in_y_pos, 0);
+       check_error();
+       glUniform1i(v210_in_cbcr_pos, 1);
+       check_error();
+       glUniform1i(v210_outbuf_pos, 2);
+       check_error();
+       glUniform1f(v210_inv_width_pos, 1.0 / width);
+       check_error();
+       glUniform1f(v210_inv_height_pos, 1.0 / height);
+       check_error();
+
+       glActiveTexture(GL_TEXTURE0);
+       check_error();
+       glBindTexture(GL_TEXTURE_2D, y_tex);  // We don't actually need to bind it, but we need to set the state.
+       check_error();
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+       check_error();
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+       check_error();
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+       check_error();
+       glBindImageTexture(0, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16);  // This is the real bind.
+       check_error();
+
+       glActiveTexture(GL_TEXTURE1);
+       check_error();
+       glBindTexture(GL_TEXTURE_2D, cbcr_tex);
+       check_error();
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+       check_error();
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+       check_error();
+       glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+       check_error();
+
+       glBindImageTexture(2, dst_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2);
+       check_error();
+
+       // Actually run the shader. We use workgroups of size 2x16 threadst , and each thread
+       // processes 6x1 input pixels, so round up to number of 12x16 pixel blocks.
+       glDispatchCompute((width + 11) / 12, (height + 15) / 16, 1);
+
+       glBindTexture(GL_TEXTURE_2D, 0);
+       check_error();
+       glActiveTexture(GL_TEXTURE0);
+       check_error();
+       glUseProgram(0);
+       check_error();
+}
index 1bed433cc5863cefc919da6980a921b3b2407f55..d4c1c1ed22c26c4535e476741b137145f48bcf38 100644 (file)
@@ -27,6 +27,14 @@ public:
        // width and height are the dimensions (in pixels) of the input textures.
        void create_uyvy(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex);
 
+       // Subsamples and interleaves luma and chroma to give 10-bit 4:2:2
+       // packed Y'CbCr (v210); see v210_converter.h for more information on
+       // the format. Luma and chroma are assumed to be 10-bit data packed
+       // into 16-bit textures. Chroma positioning is left (H.264 convention).
+       // width and height are the dimensions (in pixels) of the input textures.
+       // Requires compute shaders; check v210Converter::has_hardware_support().
+       void create_v210(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex);
+
 private:
        movit::ResourcePool *resource_pool;
 
@@ -39,6 +47,10 @@ private:
        GLuint uyvy_program_num;  // Owned by <resource_pool>.
        GLuint uyvy_y_texture_sampler_uniform, uyvy_cbcr_texture_sampler_uniform;
        GLuint uyvy_position_attribute_index, uyvy_texcoord_attribute_index;
+
+       GLuint v210_program_num;  // Compute shader, so owned by ourselves. Can be 0.
+       GLuint v210_in_y_pos, v210_in_cbcr_pos, v210_outbuf_pos;
+       GLuint v210_inv_width_pos, v210_inv_height_pos;
 };
 
 #endif  // !defined(_CHROMA_SUBSAMPLER_H)
index 3b00d28bbeee3afea7a13277e0f3b70db3c7b475..3ce692bb6146c8413d61a4a3dcc7e1b40dfe2e87 100644 (file)
@@ -12,6 +12,7 @@
 #include "print_latency.h"
 #include "resource_pool.h"
 #include "timebase.h"
+#include "v210_converter.h"
 
 using namespace movit;
 using namespace std;
@@ -96,7 +97,8 @@ void DeckLinkOutput::start_output(uint32_t mode, int64_t base_pts)
 
        BMDDisplayModeSupport support;
        IDeckLinkDisplayMode *display_mode;
-       if (output->DoesSupportVideoMode(mode, bmdFormat8BitYUV, bmdVideoOutputFlagDefault,
+       BMDPixelFormat pixel_format = global_flags.ten_bit_output ? bmdFormat10BitYUV : bmdFormat8BitYUV;
+       if (output->DoesSupportVideoMode(mode, pixel_format, bmdVideoOutputFlagDefault,
                                         &support, &display_mode) != S_OK) {
                fprintf(stderr, "Couldn't ask for format support\n");
                exit(1);
@@ -198,7 +200,11 @@ void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoeffici
        }
 
        unique_ptr<Frame> frame = move(get_frame());
-       chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
+       if (global_flags.ten_bit_output) {
+               chroma_subsampler->create_v210(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
+       } else {
+               chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex);
+       }
 
        // Download the UYVY texture to the PBO.
        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
@@ -207,10 +213,17 @@ void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoeffici
        glBindBuffer(GL_PIXEL_PACK_BUFFER, frame->pbo);
        check_error();
 
-       glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
-       check_error();
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(0));
-       check_error();
+       if (global_flags.ten_bit_output) {
+               glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
+               check_error();
+               glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, BUFFER_OFFSET(0));
+               check_error();
+       } else {
+               glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
+               check_error();
+               glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(0));
+               check_error();
+       }
 
        glBindTexture(GL_TEXTURE_2D, 0);
        check_error();
@@ -406,17 +419,31 @@ unique_ptr<DeckLinkOutput::Frame> DeckLinkOutput::get_frame()
 
        unique_ptr<Frame> frame(new Frame);
 
-       frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGBA8, width / 2, height);
+       size_t stride;
+       if (global_flags.ten_bit_output) {
+               stride = v210Converter::get_v210_stride(width);
+               GLint v210_width = stride / sizeof(uint32_t);
+               frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGB10_A2, v210_width, height);
+
+               // We need valid texture state, or NVIDIA won't allow us to write to the texture.
+               glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex);
+               check_error();
+               glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+               check_error();
+       } else {
+               stride = width * 2;
+               frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGBA8, width / 2, height);
+       }
 
        glGenBuffers(1, &frame->pbo);
        check_error();
        glBindBuffer(GL_PIXEL_PACK_BUFFER, frame->pbo);
        check_error();
-       glBufferStorage(GL_PIXEL_PACK_BUFFER, width * height * 2, NULL, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       glBufferStorage(GL_PIXEL_PACK_BUFFER, stride * height, NULL, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
-       frame->uyvy_ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, width * height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+       frame->uyvy_ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, stride * height, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
        check_error();
-       frame->uyvy_ptr_local.reset(new uint8_t[width * height * 2]);
+       frame->uyvy_ptr_local.reset(new uint8_t[stride * height]);
        frame->resource_pool = resource_pool;
 
        return frame;
@@ -444,7 +471,11 @@ void DeckLinkOutput::present_thread_func()
                check_error();
                frame->fence.reset();
 
-               memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, width * height * 2);
+               if (global_flags.ten_bit_output) {
+                       memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, v210Converter::get_v210_stride(width) * height);
+               } else {
+                       memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, width * height * 2);
+               }
 
                // Release any input frames we needed to render this frame.
                frame->input_frames.clear();
@@ -526,12 +557,20 @@ long DeckLinkOutput::Frame::GetHeight()
 
 long DeckLinkOutput::Frame::GetRowBytes()
 {
-       return global_flags.width * 2;
+       if (global_flags.ten_bit_output) {
+               return v210Converter::get_v210_stride(global_flags.width);
+       } else {
+               return global_flags.width * 2;
+       }
 }
 
 BMDPixelFormat DeckLinkOutput::Frame::GetPixelFormat()
 {
-       return bmdFormat8BitYUV;
+       if (global_flags.ten_bit_output) {
+               return bmdFormat10BitYUV;
+       } else {
+               return bmdFormat8BitYUV;
+       }
 }
 
 BMDFrameFlags DeckLinkOutput::Frame::GetFlags()
index 5581c392860f67c89576e788ee59976b919d875a..7c0a17fad2d483c88a5232113f1bc1601bbb3bfb 100644 (file)
@@ -102,7 +102,7 @@ private:
                movit::ResourcePool *resource_pool;
 
                // These members are persistently allocated, and reused when the frame object is.
-               GLuint uyvy_tex;  // Owned by <resource_pool>.
+               GLuint uyvy_tex;  // Owned by <resource_pool>. Can also hold v210 data.
                GLuint pbo;
                uint8_t *uyvy_ptr;  // Persistent mapping into the PBO.
 
index e6c306be52d23b38614aa8361aac6331404c5cca..ee575c57863984621386dcbcee11781857afa222 100644 (file)
--- a/flags.cpp
+++ b/flags.cpp
@@ -27,7 +27,6 @@ enum LongOption {
        OPTION_X264_BITRATE,
        OPTION_X264_VBV_BUFSIZE,
        OPTION_X264_VBV_MAX_BITRATE,
-       OPTION_X264_10_BIT,
        OPTION_X264_PARAM,
        OPTION_HTTP_MUX,
        OPTION_HTTP_COARSE_TIMEBASE,
@@ -56,6 +55,7 @@ enum LongOption {
        OPTION_TIMECODE_STREAM,
        OPTION_TIMECODE_STDOUT,
        OPTION_10_BIT_INPUT,
+       OPTION_10_BIT_OUTPUT,
 };
 
 void usage()
@@ -90,7 +90,6 @@ void usage()
        fprintf(stderr, "                                  default: same as --x264-bitrate, that is, one-second VBV)\n");
        fprintf(stderr, "      --x264-vbv-max-bitrate      x264 local max bitrate (in kilobit/sec per --vbv-bufsize,\n");
        fprintf(stderr, "                                  0 = no limit, default: same as --x264-bitrate, i.e., CBR)\n");
-       fprintf(stderr, "      --x264-10-bit               enable 10-bit x264 encoding\n");
        fprintf(stderr, "      --x264-param=NAME[,VALUE]   set any x264 parameter, for fine tuning\n");
        fprintf(stderr, "      --http-mux=NAME             mux to use for HTTP streams (default " DEFAULT_STREAM_MUX_NAME ")\n");
        fprintf(stderr, "      --http-audio-codec=NAME     audio codec to use for HTTP streams\n");
@@ -130,6 +129,8 @@ void usage()
        fprintf(stderr, "      --timecode-stream           show timestamp and timecode in stream\n");
        fprintf(stderr, "      --timecode-stdout           show timestamp and timecode on standard output\n");
        fprintf(stderr, "      --10-bit-input              use 10-bit video input (requires compute shaders)\n");
+       fprintf(stderr, "      --10-bit-output             use 10-bit video output (requires compute shaders,\n");
+       fprintf(stderr, "                                    implies --record-x264-video)\n");
 }
 
 void parse_flags(int argc, char * const argv[])
@@ -158,7 +159,6 @@ void parse_flags(int argc, char * const argv[])
                { "x264-bitrate", required_argument, 0, OPTION_X264_BITRATE },
                { "x264-vbv-bufsize", required_argument, 0, OPTION_X264_VBV_BUFSIZE },
                { "x264-vbv-max-bitrate", required_argument, 0, OPTION_X264_VBV_MAX_BITRATE },
-               { "x264-10-bit", no_argument, 0, OPTION_X264_10_BIT },
                { "x264-param", required_argument, 0, OPTION_X264_PARAM },
                { "http-mux", required_argument, 0, OPTION_HTTP_MUX },
                { "http-coarse-timebase", no_argument, 0, OPTION_HTTP_COARSE_TIMEBASE },
@@ -187,6 +187,7 @@ void parse_flags(int argc, char * const argv[])
                { "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM },
                { "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT },
                { "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT },
+               { "10-bit-output", no_argument, 0, OPTION_10_BIT_OUTPUT },
                { 0, 0, 0, 0 }
        };
        vector<string> theme_dirs;
@@ -290,9 +291,6 @@ void parse_flags(int argc, char * const argv[])
                case OPTION_X264_VBV_BUFSIZE:
                        global_flags.x264_vbv_buffer_size = atoi(optarg);
                        break;
-               case OPTION_X264_10_BIT:
-                       global_flags.x264_bit_depth = 10;
-                       break;
                case OPTION_X264_VBV_MAX_BITRATE:
                        global_flags.x264_vbv_max_bitrate = atoi(optarg);
                        break;
@@ -374,6 +372,12 @@ void parse_flags(int argc, char * const argv[])
                case OPTION_10_BIT_INPUT:
                        global_flags.ten_bit_input = true;
                        break;
+               case OPTION_10_BIT_OUTPUT:
+                       global_flags.ten_bit_output = true;
+                       global_flags.x264_video_to_disk = true;
+                       global_flags.x264_video_to_http = true;
+                       global_flags.x264_bit_depth = 10;
+                       break;
                case OPTION_HELP:
                        usage();
                        exit(0);
diff --git a/flags.h b/flags.h
index b840c15de32084bc674aeee4d08e9bbe44e0cf3a..6ca9794359904acbb73712440dc3b740a326f170 100644 (file)
--- a/flags.h
+++ b/flags.h
@@ -35,7 +35,6 @@ struct Flags {
        int x264_bitrate = DEFAULT_X264_OUTPUT_BIT_RATE;  // In kilobit/sec.
        int x264_vbv_max_bitrate = -1;  // In kilobits. 0 = no limit, -1 = same as <x264_bitrate> (CBR).
        int x264_vbv_buffer_size = -1;  // In kilobits. 0 = one-frame VBV, -1 = same as <x264_bitrate> (one-second VBV).
-       int x264_bit_depth = 8;
        std::vector<std::string> x264_extra_param;  // In “key[,value]” format.
        bool enable_alsa_output = true;
        std::map<int, int> default_stream_mapping;
@@ -53,6 +52,8 @@ struct Flags {
        bool display_timecode_in_stream = false;
        bool display_timecode_on_stdout = false;
        bool ten_bit_input = false;
+       bool ten_bit_output = false;  // Implies x264_video_to_disk == true, x264_video_to_http == true and x264_bit_depth == 10.
+       int x264_bit_depth = 8;  // Not user-settable.
 };
 extern Flags global_flags;
 
index 8e5b25996722d46a2a66805b6c112e107136401e..25fa3e40d70723e12e4fa5a54b69628a0f49ed7c 100644 (file)
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -305,6 +305,13 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
                v210_converter->precompile_shader(3840);
                v210_converter->precompile_shader(4096);
        }
+       if (global_flags.ten_bit_output) {
+               if (!v210Converter::has_hardware_support()) {
+                       fprintf(stderr, "ERROR: --ten-bit-output requires support for OpenGL compute shaders\n");
+                       fprintf(stderr, "       (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n");
+                       exit(1);
+               }
+       }
 
        timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height));
        display_timecode_in_stream = global_flags.display_timecode_in_stream;