From: Steinar H. Gunderson Date: Wed, 15 Mar 2017 22:07:24 +0000 (+0100) Subject: Support 4:2:2 v210 (10-bit) output for DeckLink. X-Git-Tag: 1.5.0~9 X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=aa472f3f2fcf37701198deb330d3169636151060 Support 4:2:2 v210 (10-bit) output for DeckLink. This again requires compute shaders; my GTX 950 needs a bit under 0.1 ms to convert a 720p frame from the 16-bit planar representation. It replaces the flag for 10-bit x264. v210 is, as far as I understand, pretty much the native format for the DeckLink cards, but I believe the conversion happens in hardware, so there shouldn't be any significant speed gains to be had. --- diff --git a/chroma_subsampler.cpp b/chroma_subsampler.cpp index a9e5355..cf46883 100644 --- a/chroma_subsampler.cpp +++ b/chroma_subsampler.cpp @@ -1,4 +1,5 @@ #include "chroma_subsampler.h" +#include "v210_converter.h" #include @@ -170,6 +171,78 @@ ChromaSubsampler::ChromaSubsampler(ResourcePool *resource_pool) }; vbo = generate_vbo(2, GL_FLOAT, sizeof(vertices), vertices); check_error(); + + // v210 compute shader. + if (v210Converter::has_hardware_support()) { + string v210_shader_src = R"(#version 150 +#extension GL_ARB_compute_shader : enable +#extension GL_ARB_shader_image_load_store : enable +layout(local_size_x=2, local_size_y=16) in; +layout(r16) uniform restrict readonly image2D in_y; +uniform sampler2D in_cbcr; // Of type RG16. +layout(rgb10_a2) uniform restrict writeonly image2D outbuf; +uniform float inv_width, inv_height; + +void main() +{ + int xb = int(gl_GlobalInvocationID.x); // X block number. + int y = int(gl_GlobalInvocationID.y); // Y (actual line). + float yf = (gl_GlobalInvocationID.y + 0.5f) * inv_height; // Y float coordinate. + + // Load and scale CbCr values, sampling in-between the texels to get + // to (left/4 + center/2 + right/4). 
+ vec2 pix_cbcr[3]; + for (int i = 0; i < 3; ++i) { + vec2 a = texture(in_cbcr, vec2((xb * 6 + i * 2) * inv_width, yf)).xy; + vec2 b = texture(in_cbcr, vec2((xb * 6 + i * 2 + 1) * inv_width, yf)).xy; + pix_cbcr[i] = (a + b) * (0.5 * 65535.0 / 1023.0); + } + + // Load and scale the Y values. Note that we use integer coordinates here, + // so we don't need to offset by 0.5. + float pix_y[6]; + for (int i = 0; i < 6; ++i) { + pix_y[i] = imageLoad(in_y, ivec2(xb * 6 + i, y)).x * (65535.0 / 1023.0); + } + + imageStore(outbuf, ivec2(xb * 4 + 0, y), vec4(pix_cbcr[0].x, pix_y[0], pix_cbcr[0].y, 1.0)); + imageStore(outbuf, ivec2(xb * 4 + 1, y), vec4(pix_y[1], pix_cbcr[1].x, pix_y[2], 1.0)); + imageStore(outbuf, ivec2(xb * 4 + 2, y), vec4(pix_cbcr[1].y, pix_y[3], pix_cbcr[2].x, 1.0)); + imageStore(outbuf, ivec2(xb * 4 + 3, y), vec4(pix_y[4], pix_cbcr[2].y, pix_y[5], 1.0)); +} +)"; + GLuint shader_num = movit::compile_shader(v210_shader_src, GL_COMPUTE_SHADER); + check_error(); + v210_program_num = glCreateProgram(); + check_error(); + glAttachShader(v210_program_num, shader_num); + check_error(); + glLinkProgram(v210_program_num); + check_error(); + + GLint success; + glGetProgramiv(v210_program_num, GL_LINK_STATUS, &success); + check_error(); + if (success == GL_FALSE) { + GLchar error_log[1024] = {0}; + glGetProgramInfoLog(v210_program_num, 1024, NULL, error_log); + fprintf(stderr, "Error linking program: %s\n", error_log); + exit(1); + } + + v210_in_y_pos = glGetUniformLocation(v210_program_num, "in_y"); + check_error(); + v210_in_cbcr_pos = glGetUniformLocation(v210_program_num, "in_cbcr"); + check_error(); + v210_outbuf_pos = glGetUniformLocation(v210_program_num, "outbuf"); + check_error(); + v210_inv_width_pos = glGetUniformLocation(v210_program_num, "inv_width"); + check_error(); + v210_inv_height_pos = glGetUniformLocation(v210_program_num, "inv_height"); + check_error(); + } else { + v210_program_num = 0; + } } ChromaSubsampler::~ChromaSubsampler() @@ -180,6 
+253,10 @@ ChromaSubsampler::~ChromaSubsampler() check_error(); glDeleteBuffers(1, &vbo); check_error(); + if (v210_program_num != 0) { + glDeleteProgram(v210_program_num); + check_error(); + } } void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex) @@ -334,3 +411,60 @@ void ChromaSubsampler::create_uyvy(GLuint y_tex, GLuint cbcr_tex, unsigned width resource_pool->release_fbo(fbo); glDeleteVertexArrays(1, &vao); } + +void ChromaSubsampler::create_v210(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex) +{ + assert(v210_program_num != 0); + + glUseProgram(v210_program_num); + check_error(); + + glUniform1i(v210_in_y_pos, 0); + check_error(); + glUniform1i(v210_in_cbcr_pos, 1); + check_error(); + glUniform1i(v210_outbuf_pos, 2); + check_error(); + glUniform1f(v210_inv_width_pos, 1.0 / width); + check_error(); + glUniform1f(v210_inv_height_pos, 1.0 / height); + check_error(); + + glActiveTexture(GL_TEXTURE0); + check_error(); + glBindTexture(GL_TEXTURE_2D, y_tex); // We don't actually need to bind it, but we need to set the state. + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + glBindImageTexture(0, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R16); // This is the real bind. 
+ check_error(); + + glActiveTexture(GL_TEXTURE1); + check_error(); + glBindTexture(GL_TEXTURE_2D, cbcr_tex); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + + glBindImageTexture(2, dst_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2); + check_error(); + + // Actually run the shader. We use workgroups of size 2x16 threads, and each thread + // processes 6x1 input pixels, so round up to number of 12x16 pixel blocks. + glDispatchCompute((width + 11) / 12, (height + 15) / 16, 1); + + glBindTexture(GL_TEXTURE_2D, 0); + check_error(); + glActiveTexture(GL_TEXTURE0); + check_error(); + glUseProgram(0); + check_error(); +} diff --git a/chroma_subsampler.h b/chroma_subsampler.h index 1bed433..d4c1c1e 100644 --- a/chroma_subsampler.h +++ b/chroma_subsampler.h @@ -27,6 +27,14 @@ public: // width and height are the dimensions (in pixels) of the input textures. void create_uyvy(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex); + // Subsamples and interleaves luma and chroma to give 10-bit 4:2:2 + // packed Y'CbCr (v210); see v210_converter.h for more information on + // the format. Luma and chroma are assumed to be 10-bit data packed + // into 16-bit textures. Chroma positioning is left (H.264 convention). + // width and height are the dimensions (in pixels) of the input textures. + // Requires compute shaders; check v210Converter::has_hardware_support(). + void create_v210(GLuint y_tex, GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex); + private: movit::ResourcePool *resource_pool; @@ -39,6 +47,10 @@ private: GLuint uyvy_program_num; // Owned by . 
GLuint uyvy_y_texture_sampler_uniform, uyvy_cbcr_texture_sampler_uniform; GLuint uyvy_position_attribute_index, uyvy_texcoord_attribute_index; + + GLuint v210_program_num; // Compute shader, so owned by ourselves. Can be 0. + GLuint v210_in_y_pos, v210_in_cbcr_pos, v210_outbuf_pos; + GLuint v210_inv_width_pos, v210_inv_height_pos; }; #endif // !defined(_CHROMA_SUBSAMPLER_H) diff --git a/decklink_output.cpp b/decklink_output.cpp index 3b00d28..3ce692b 100644 --- a/decklink_output.cpp +++ b/decklink_output.cpp @@ -12,6 +12,7 @@ #include "print_latency.h" #include "resource_pool.h" #include "timebase.h" +#include "v210_converter.h" using namespace movit; using namespace std; @@ -96,7 +97,8 @@ void DeckLinkOutput::start_output(uint32_t mode, int64_t base_pts) BMDDisplayModeSupport support; IDeckLinkDisplayMode *display_mode; - if (output->DoesSupportVideoMode(mode, bmdFormat8BitYUV, bmdVideoOutputFlagDefault, + BMDPixelFormat pixel_format = global_flags.ten_bit_output ? bmdFormat10BitYUV : bmdFormat8BitYUV; + if (output->DoesSupportVideoMode(mode, pixel_format, bmdVideoOutputFlagDefault, &support, &display_mode) != S_OK) { fprintf(stderr, "Couldn't ask for format support\n"); exit(1); @@ -198,7 +200,11 @@ void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoeffici } unique_ptr frame = move(get_frame()); - chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex); + if (global_flags.ten_bit_output) { + chroma_subsampler->create_v210(y_tex, cbcr_tex, width, height, frame->uyvy_tex); + } else { + chroma_subsampler->create_uyvy(y_tex, cbcr_tex, width, height, frame->uyvy_tex); + } // Download the UYVY texture to the PBO. 
glPixelStorei(GL_PACK_ROW_LENGTH, 0); @@ -207,10 +213,17 @@ void DeckLinkOutput::send_frame(GLuint y_tex, GLuint cbcr_tex, YCbCrLumaCoeffici glBindBuffer(GL_PIXEL_PACK_BUFFER, frame->pbo); check_error(); - glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex); - check_error(); - glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(0)); - check_error(); + if (global_flags.ten_bit_output) { + glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex); + check_error(); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, BUFFER_OFFSET(0)); + check_error(); + } else { + glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex); + check_error(); + glGetTexImage(GL_TEXTURE_2D, 0, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV, BUFFER_OFFSET(0)); + check_error(); + } glBindTexture(GL_TEXTURE_2D, 0); check_error(); @@ -406,17 +419,31 @@ unique_ptr DeckLinkOutput::get_frame() unique_ptr frame(new Frame); - frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGBA8, width / 2, height); + size_t stride; + if (global_flags.ten_bit_output) { + stride = v210Converter::get_v210_stride(width); + GLint v210_width = stride / sizeof(uint32_t); + frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGB10_A2, v210_width, height); + + // We need valid texture state, or NVIDIA won't allow us to write to the texture. 
+ glBindTexture(GL_TEXTURE_2D, frame->uyvy_tex); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + } else { + stride = width * 2; + frame->uyvy_tex = resource_pool->create_2d_texture(GL_RGBA8, width / 2, height); + } glGenBuffers(1, &frame->pbo); check_error(); glBindBuffer(GL_PIXEL_PACK_BUFFER, frame->pbo); check_error(); - glBufferStorage(GL_PIXEL_PACK_BUFFER, width * height * 2, NULL, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glBufferStorage(GL_PIXEL_PACK_BUFFER, stride * height, NULL, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); - frame->uyvy_ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, width * height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + frame->uyvy_ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, stride * height, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); check_error(); - frame->uyvy_ptr_local.reset(new uint8_t[width * height * 2]); + frame->uyvy_ptr_local.reset(new uint8_t[stride * height]); frame->resource_pool = resource_pool; return frame; @@ -444,7 +471,11 @@ void DeckLinkOutput::present_thread_func() check_error(); frame->fence.reset(); - memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, width * height * 2); + if (global_flags.ten_bit_output) { + memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, v210Converter::get_v210_stride(width) * height); + } else { + memcpy(frame->uyvy_ptr_local.get(), frame->uyvy_ptr, width * height * 2); + } // Release any input frames we needed to render this frame. 
frame->input_frames.clear(); @@ -526,12 +557,20 @@ long DeckLinkOutput::Frame::GetHeight() long DeckLinkOutput::Frame::GetRowBytes() { - return global_flags.width * 2; + if (global_flags.ten_bit_output) { + return v210Converter::get_v210_stride(global_flags.width); + } else { + return global_flags.width * 2; + } } BMDPixelFormat DeckLinkOutput::Frame::GetPixelFormat() { - return bmdFormat8BitYUV; + if (global_flags.ten_bit_output) { + return bmdFormat10BitYUV; + } else { + return bmdFormat8BitYUV; + } } BMDFrameFlags DeckLinkOutput::Frame::GetFlags() diff --git a/decklink_output.h b/decklink_output.h index 5581c39..7c0a17f 100644 --- a/decklink_output.h +++ b/decklink_output.h @@ -102,7 +102,7 @@ private: movit::ResourcePool *resource_pool; // These members are persistently allocated, and reused when the frame object is. - GLuint uyvy_tex; // Owned by . + GLuint uyvy_tex; // Owned by . Can also hold v210 data. GLuint pbo; uint8_t *uyvy_ptr; // Persistent mapping into the PBO. diff --git a/flags.cpp b/flags.cpp index e6c306b..ee575c5 100644 --- a/flags.cpp +++ b/flags.cpp @@ -27,7 +27,6 @@ enum LongOption { OPTION_X264_BITRATE, OPTION_X264_VBV_BUFSIZE, OPTION_X264_VBV_MAX_BITRATE, - OPTION_X264_10_BIT, OPTION_X264_PARAM, OPTION_HTTP_MUX, OPTION_HTTP_COARSE_TIMEBASE, @@ -56,6 +55,7 @@ enum LongOption { OPTION_TIMECODE_STREAM, OPTION_TIMECODE_STDOUT, OPTION_10_BIT_INPUT, + OPTION_10_BIT_OUTPUT, }; void usage() @@ -90,7 +90,6 @@ void usage() fprintf(stderr, " default: same as --x264-bitrate, that is, one-second VBV)\n"); fprintf(stderr, " --x264-vbv-max-bitrate x264 local max bitrate (in kilobit/sec per --vbv-bufsize,\n"); fprintf(stderr, " 0 = no limit, default: same as --x264-bitrate, i.e., CBR)\n"); - fprintf(stderr, " --x264-10-bit enable 10-bit x264 encoding\n"); fprintf(stderr, " --x264-param=NAME[,VALUE] set any x264 parameter, for fine tuning\n"); fprintf(stderr, " --http-mux=NAME mux to use for HTTP streams (default " DEFAULT_STREAM_MUX_NAME ")\n"); 
fprintf(stderr, " --http-audio-codec=NAME audio codec to use for HTTP streams\n"); @@ -130,6 +129,8 @@ void usage() fprintf(stderr, " --timecode-stream show timestamp and timecode in stream\n"); fprintf(stderr, " --timecode-stdout show timestamp and timecode on standard output\n"); fprintf(stderr, " --10-bit-input use 10-bit video input (requires compute shaders)\n"); + fprintf(stderr, " --10-bit-output use 10-bit video output (requires compute shaders,\n"); + fprintf(stderr, " implies --record-x264-video)\n"); } void parse_flags(int argc, char * const argv[]) @@ -158,7 +159,6 @@ void parse_flags(int argc, char * const argv[]) { "x264-bitrate", required_argument, 0, OPTION_X264_BITRATE }, { "x264-vbv-bufsize", required_argument, 0, OPTION_X264_VBV_BUFSIZE }, { "x264-vbv-max-bitrate", required_argument, 0, OPTION_X264_VBV_MAX_BITRATE }, - { "x264-10-bit", no_argument, 0, OPTION_X264_10_BIT }, { "x264-param", required_argument, 0, OPTION_X264_PARAM }, { "http-mux", required_argument, 0, OPTION_HTTP_MUX }, { "http-coarse-timebase", no_argument, 0, OPTION_HTTP_COARSE_TIMEBASE }, @@ -187,6 +187,7 @@ void parse_flags(int argc, char * const argv[]) { "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM }, { "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT }, { "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT }, + { "10-bit-output", no_argument, 0, OPTION_10_BIT_OUTPUT }, { 0, 0, 0, 0 } }; vector theme_dirs; @@ -290,9 +291,6 @@ void parse_flags(int argc, char * const argv[]) case OPTION_X264_VBV_BUFSIZE: global_flags.x264_vbv_buffer_size = atoi(optarg); break; - case OPTION_X264_10_BIT: - global_flags.x264_bit_depth = 10; - break; case OPTION_X264_VBV_MAX_BITRATE: global_flags.x264_vbv_max_bitrate = atoi(optarg); break; @@ -374,6 +372,12 @@ void parse_flags(int argc, char * const argv[]) case OPTION_10_BIT_INPUT: global_flags.ten_bit_input = true; break; + case OPTION_10_BIT_OUTPUT: + global_flags.ten_bit_output = true; + global_flags.x264_video_to_disk = 
true; + global_flags.x264_video_to_http = true; + global_flags.x264_bit_depth = 10; + break; case OPTION_HELP: usage(); exit(0); diff --git a/flags.h b/flags.h index b840c15..6ca9794 100644 --- a/flags.h +++ b/flags.h @@ -35,7 +35,6 @@ struct Flags { int x264_bitrate = DEFAULT_X264_OUTPUT_BIT_RATE; // In kilobit/sec. int x264_vbv_max_bitrate = -1; // In kilobits. 0 = no limit, -1 = same as (CBR). int x264_vbv_buffer_size = -1; // In kilobits. 0 = one-frame VBV, -1 = same as (one-second VBV). - int x264_bit_depth = 8; std::vector x264_extra_param; // In “key[,value]” format. bool enable_alsa_output = true; std::map default_stream_mapping; @@ -53,6 +52,8 @@ struct Flags { bool display_timecode_in_stream = false; bool display_timecode_on_stdout = false; bool ten_bit_input = false; + bool ten_bit_output = false; // Implies x264_video_to_disk == true, x264_video_to_http == true and x264_bit_depth == 10. + int x264_bit_depth = 8; // Not user-settable. }; extern Flags global_flags; diff --git a/mixer.cpp b/mixer.cpp index 8e5b259..25fa3e4 100644 --- a/mixer.cpp +++ b/mixer.cpp @@ -305,6 +305,13 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards) v210_converter->precompile_shader(3840); v210_converter->precompile_shader(4096); } + if (global_flags.ten_bit_output) { + if (!v210Converter::has_hardware_support()) { + fprintf(stderr, "ERROR: --10-bit-output requires support for OpenGL compute shaders\n"); + fprintf(stderr, " (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n"); + exit(1); + } + } timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height)); display_timecode_in_stream = global_flags.display_timecode_in_stream;