From 7297850cec443cb0c02f82d7301a30583744627d Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Mon, 13 Mar 2017 23:55:11 +0100 Subject: [PATCH] Support 10-bit x264 output. Requires a 10-bit-compiled x264. Probably breaks DeckLink output for now. --- mixer.cpp | 20 ++++++++++++++------ quicksync_encoder.cpp | 23 ++++++++++++++++------- theme.cpp | 10 ++++++---- x264_encoder.cpp | 32 +++++++++++++++++++++++--------- 4 files changed, 59 insertions(+), 26 deletions(-) diff --git a/mixer.cpp b/mixer.cpp index 898e7c0..8e5b259 100644 --- a/mixer.cpp +++ b/mixer.cpp @@ -213,7 +213,7 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards) ycbcr_format.luma_coefficients = YCBCR_REC_601; } ycbcr_format.full_range = false; - ycbcr_format.num_levels = 256; + ycbcr_format.num_levels = 1 << global_flags.x264_bit_depth; ycbcr_format.cb_x_position = 0.0f; ycbcr_format.cr_x_position = 0.0f; ycbcr_format.cb_y_position = 0.5f; @@ -222,7 +222,8 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards) // Display chain; shows the live output produced by the main chain (or rather, a copy of it). display_chain.reset(new EffectChain(global_flags.width, global_flags.height, resource_pool.get())); check_error(); - display_input = new YCbCrInput(inout_format, ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR); + GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE; + display_input = new YCbCrInput(inout_format, ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR, type); display_chain->add_input(display_input); display_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); display_chain->set_dither_bits(0); // Don't bother. @@ -1027,7 +1028,7 @@ void Mixer::render_one_frame(int64_t duration) output_ycbcr_format.chroma_subsampling_y = 1; output_ycbcr_format.luma_coefficients = ycbcr_output_coefficients; output_ycbcr_format.full_range = false; - output_ycbcr_format.num_levels = 256; + output_ycbcr_format.num_levels = 1 << global_flags.x264_bit_depth; chain->change_ycbcr_output_format(output_ycbcr_format); const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE); // Corresponds to the delay in ResamplingQueue. @@ -1042,8 +1043,16 @@ void Mixer::render_one_frame(int64_t duration) // for display as well, but if they're used for zero-copy Quick Sync encoding // (the default case), they're just views into VA-API memory and must be // unmapped during encoding, so we can't use them for display, unfortunately. - GLuint cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height); - GLuint y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height); + GLuint cbcr_full_tex, cbcr_copy_tex, y_copy_tex; + if (global_flags.x264_bit_depth > 8) { + cbcr_full_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width, global_flags.height); + y_copy_tex = resource_pool->create_2d_texture(GL_R16, global_flags.width, global_flags.height); + cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width / 2, global_flags.height / 2); + } else { + cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height); + y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height); + cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2); + } GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex); check_error(); chain->render_to_fbo(fbo, global_flags.width, global_flags.height); @@ -1055,7 +1064,6 @@ void Mixer::render_one_frame(int64_t duration) resource_pool->release_fbo(fbo); - GLuint cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2); chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex); if (output_card_index != -1) { cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, ycbcr_output_coefficients, theme_main_chain.input_frames, pts_int, duration); diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp index bd6b4c2..635a95a 100644 --- a/quicksync_encoder.cpp +++ b/quicksync_encoder.cpp @@ -994,17 +994,25 @@ int QuickSyncEncoderImpl::setup_encode() gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1); gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1); } else { - gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, frame_width, frame_height); - gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, frame_width / 2, frame_height / 2); + size_t bytes_per_pixel; + if (global_flags.x264_bit_depth > 8) { + bytes_per_pixel = 2; + gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R16, frame_width, frame_height); + gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG16, frame_width / 2, frame_height / 2); + } else { + bytes_per_pixel = 1; + gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, frame_width, frame_height); + gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, frame_width / 2, frame_height / 2); + } // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API // buffers, due to potentially differing pitch. glGenBuffers(1, &gl_surfaces[i].pbo); glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo); - glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT); - uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2 * bytes_per_pixel, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT); + uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2 * bytes_per_pixel, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); gl_surfaces[i].y_offset = 0; - gl_surfaces[i].cbcr_offset = frame_width * frame_height; + gl_surfaces[i].cbcr_offset = frame_width * frame_height * bytes_per_pixel; gl_surfaces[i].y_ptr = ptr + gl_surfaces[i].y_offset; gl_surfaces[i].cbcr_ptr = ptr + gl_surfaces[i].cbcr_offset; glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -1695,6 +1703,7 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame() assert(!is_shutdown); if (!use_zerocopy) { + GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE; GLSurface *surf; { unique_lock lock(storage_task_queue_mutex); @@ -1710,12 +1719,12 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame() glBindTexture(GL_TEXTURE_2D, surf->y_tex); check_error(); - glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->y_offset)); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, type, BUFFER_OFFSET(surf->y_offset)); check_error(); glBindTexture(GL_TEXTURE_2D, surf->cbcr_tex); check_error(); - glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->cbcr_offset)); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, type, BUFFER_OFFSET(surf->cbcr_offset)); check_error(); glBindTexture(GL_TEXTURE_2D, 0); diff --git a/theme.cpp b/theme.cpp index 7bb1877..e5002bc 100644 --- a/theme.cpp +++ b/theme.cpp @@ -274,11 +274,13 @@ int EffectChain_finalize(lua_State* L) } output_ycbcr_format.full_range = false; - output_ycbcr_format.num_levels = 256; + output_ycbcr_format.num_levels = 1 << global_flags.x264_bit_depth; - chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR); - chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED); // Add a copy where we'll only be using the Y component. - chain->set_dither_bits(8); + GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE; + + chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR, type); + chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type); // Add a copy where we'll only be using the Y component. + chain->set_dither_bits(global_flags.x264_bit_depth > 8 ? 16 : 8); chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT); } else { chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); diff --git a/x264_encoder.cpp b/x264_encoder.cpp index c8f7b81..0c1ecbc 100644 --- a/x264_encoder.cpp +++ b/x264_encoder.cpp @@ -48,9 +48,10 @@ X264Encoder::X264Encoder(AVOutputFormat *oformat) : wants_global_headers(oformat->flags & AVFMT_GLOBALHEADER), dyn(load_x264_for_bit_depth(global_flags.x264_bit_depth)) { - frame_pool.reset(new uint8_t[global_flags.width * global_flags.height * 2 * X264_QUEUE_LENGTH]); + size_t bytes_per_pixel = global_flags.x264_bit_depth > 8 ? 2 : 1; + frame_pool.reset(new uint8_t[global_flags.width * global_flags.height * 2 * bytes_per_pixel * X264_QUEUE_LENGTH]); for (unsigned i = 0; i < X264_QUEUE_LENGTH; ++i) { - free_frames.push(frame_pool.get() + i * (global_flags.width * global_flags.height * 2)); + free_frames.push(frame_pool.get() + i * (global_flags.width * global_flags.height * 2 * bytes_per_pixel)); } encoder_thread = thread(&X264Encoder::encoder_thread_func, this); } @@ -86,7 +87,8 @@ void X264Encoder::add_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients free_frames.pop(); } - memcpy(qf.data, data, global_flags.width * global_flags.height * 2); + size_t bytes_per_pixel = global_flags.x264_bit_depth > 8 ? 2 : 1; + memcpy(qf.data, data, global_flags.width * global_flags.height * 2 * bytes_per_pixel); { lock_guard lock(mu); @@ -103,6 +105,9 @@ void X264Encoder::init_x264() param.i_width = global_flags.width; param.i_height = global_flags.height; param.i_csp = X264_CSP_NV12; + if (global_flags.x264_bit_depth > 8) { + param.i_csp |= X264_CSP_HIGH_DEPTH; + } param.b_vfr_input = 1; param.i_timebase_num = 1; param.i_timebase_den = TIMEBASE; @@ -263,12 +268,21 @@ void X264Encoder::encode_frame(X264Encoder::QueuedFrame qf) dyn.x264_picture_init(&pic); pic.i_pts = qf.pts; - pic.img.i_csp = X264_CSP_NV12; - pic.img.i_plane = 2; - pic.img.plane[0] = qf.data; - pic.img.i_stride[0] = global_flags.width; - pic.img.plane[1] = qf.data + global_flags.width * global_flags.height; - pic.img.i_stride[1] = global_flags.width / 2 * sizeof(uint16_t); + if (global_flags.x264_bit_depth > 8) { + pic.img.i_csp = X264_CSP_NV12 | X264_CSP_HIGH_DEPTH; + pic.img.i_plane = 2; + pic.img.plane[0] = qf.data; + pic.img.i_stride[0] = global_flags.width * sizeof(uint16_t); + pic.img.plane[1] = qf.data + global_flags.width * global_flags.height * sizeof(uint16_t); + pic.img.i_stride[1] = global_flags.width / 2 * sizeof(uint32_t); + } else { + pic.img.i_csp = X264_CSP_NV12; + pic.img.i_plane = 2; + pic.img.plane[0] = qf.data; + pic.img.i_stride[0] = global_flags.width; + pic.img.plane[1] = qf.data + global_flags.width * global_flags.height; + pic.img.i_stride[1] = global_flags.width / 2 * sizeof(uint16_t); + } pic.opaque = reinterpret_cast(intptr_t(qf.duration)); input_pic = &pic; -- 2.39.2