From: Steinar H. Gunderson Date: Wed, 8 Mar 2017 00:11:35 +0000 (+0100) Subject: Display a copy of the Y'CbCr images instead of an RGB565 copy. X-Git-Tag: 1.5.0~16 X-Git-Url: https://git.sesse.net/?p=nageru;a=commitdiff_plain;h=336009fd7baf47b4ad71adf8d7ead8a526045788 Display a copy of the Y'CbCr images instead of an RGB565 copy. This is higher-quality (the 16-bit artifacts were getting rather annoying), more true to what's actually being output, _and_ higher-performance (well, at least lower memory bandwidth; I haven't benchmarked in practice), since we can use multi-output to make extra copies on-the-fly when writing instead of doing it explicitly. Sample calculation for a 1280x720 image; let's say it is one megapixel for ease of calculation: GL_RGB565: 2 MB written (565 texture), 2 MB read during display = 4 MB used. Y'CbCr: 1.0 + 0.5 MB written (Y' texture plus half-res dual-channel CbCr texture), same amount read during display = 3 MB used. We could have reused the full-resolution CbCr texture, saving the 0.5 MB write, but that would make the readback 3 MB instead of 1.5 MB, so it's a net loss. Ideally, we'd avoid the copies altogether, cutting the writes away and getting to 1.5 MB, but interactions with VA-API zerocopy seemingly made that impossible. 
--- diff --git a/chroma_subsampler.cpp b/chroma_subsampler.cpp index 63be1a9..a9e5355 100644 --- a/chroma_subsampler.cpp +++ b/chroma_subsampler.cpp @@ -94,9 +94,10 @@ ChromaSubsampler::ChromaSubsampler(ResourcePool *resource_pool) "#version 130 \n" "in vec2 tc0, tc1; \n" "uniform sampler2D cbcr_tex; \n" - "out vec4 FragColor; \n" + "out vec4 FragColor, FragColor2; \n" "void main() { \n" " FragColor = 0.5 * (texture(cbcr_tex, tc0) + texture(cbcr_tex, tc1)); \n" + " FragColor2 = FragColor; \n" "} \n"; cbcr_program_num = resource_pool->compile_glsl_program(cbcr_vert_shader, cbcr_frag_shader, frag_shader_outputs); check_error(); @@ -181,7 +182,7 @@ ChromaSubsampler::~ChromaSubsampler() check_error(); } -void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex) +void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex) { GLuint vao; glGenVertexArrays(1, &vao); @@ -191,7 +192,12 @@ void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigne check_error(); // Extract Cb/Cr. - GLuint fbo = resource_pool->create_fbo(dst_tex); + GLuint fbo; + if (dst2_tex <= 0) { + fbo = resource_pool->create_fbo(dst_tex); + } else { + fbo = resource_pool->create_fbo(dst_tex, dst2_tex); + } glBindFramebuffer(GL_FRAMEBUFFER, fbo); glViewport(0, 0, width/2, height/2); check_error(); diff --git a/chroma_subsampler.h b/chroma_subsampler.h index 811f901..1bed433 100644 --- a/chroma_subsampler.h +++ b/chroma_subsampler.h @@ -17,7 +17,10 @@ public: // Subsamples chroma (packed Cb and Cr) 2x2 to yield chroma suitable for // NV12 (semiplanar 4:2:0). Chroma positioning is left/center (H.264 convention). // width and height are the dimensions (in pixels) of the input texture. 
- void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex); + // + // You can get two equal copies if you'd like; just set dst2_tex to a texture + // number and it will receive an exact copy of what goes into dst_tex. + void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex = 0); // Subsamples and interleaves luma and chroma to give 4:2:2 packed Y'CbCr (UYVY). // Chroma positioning is left (H.264 convention). diff --git a/mixer.cpp b/mixer.cpp index bf7dfd7..e286370 100644 --- a/mixer.cpp +++ b/mixer.cpp @@ -203,10 +203,26 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards) inout_format.color_space = COLORSPACE_sRGB; inout_format.gamma_curve = GAMMA_sRGB; - // Display chain; shows the live output produced by the main chain (its RGBA version). + // Matches the 4:2:0 format created by the main chain. + YCbCrFormat ycbcr_format; + ycbcr_format.chroma_subsampling_x = 2; + ycbcr_format.chroma_subsampling_y = 2; + if (global_flags.ycbcr_rec709_coefficients) { + ycbcr_format.luma_coefficients = YCBCR_REC_709; + } else { + ycbcr_format.luma_coefficients = YCBCR_REC_601; + } + ycbcr_format.full_range = false; + ycbcr_format.num_levels = 256; + ycbcr_format.cb_x_position = 0.0f; + ycbcr_format.cr_x_position = 0.0f; + ycbcr_format.cb_y_position = 0.5f; + ycbcr_format.cr_y_position = 0.5f; + + // Display chain; shows the live output produced by the main chain (or rather, a copy of it). display_chain.reset(new EffectChain(global_flags.width, global_flags.height, resource_pool.get())); check_error(); - display_input = new FlatInput(inout_format, FORMAT_RGB, GL_UNSIGNED_BYTE, global_flags.width, global_flags.height); // FIXME: GL_UNSIGNED_BYTE is really wrong. 
+ display_input = new YCbCrInput(inout_format, ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR); display_chain->add_input(display_input); display_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); display_chain->set_dither_bits(0); // Don't bother. @@ -1001,10 +1017,16 @@ void Mixer::render_one_frame(int64_t duration) bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, theme_main_chain.input_frames, &y_tex, &cbcr_tex); assert(got_frame); - // Render main chain. + // Render main chain. We take an extra copy of the created outputs, + // so that we can display it back to the screen later (it's less memory + // bandwidth than writing and reading back an RGBA texture, even at 16-bit). + // Ideally, we'd like to avoid taking copies and just use the main textures + // for display as well, but if they're used for zero-copy Quick Sync encoding + // (the default case), they're just views into VA-API memory and must be + // unmapped during encoding, so we can't use them for display, unfortunately. GLuint cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height); - GLuint rgba_tex = resource_pool->create_2d_texture(GL_RGB565, global_flags.width, global_flags.height); // Saves texture bandwidth, although dithering gets messed up. 
- GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, rgba_tex); + GLuint y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height); + GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex); check_error(); chain->render_to_fbo(fbo, global_flags.width, global_flags.height); @@ -1015,31 +1037,38 @@ void Mixer::render_one_frame(int64_t duration) resource_pool->release_fbo(fbo); - chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex); + GLuint cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2); + chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex); if (output_card_index != -1) { cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, theme_main_chain.input_frames, pts_int, duration); } resource_pool->release_2d_texture(cbcr_full_tex); - // Set the right state for rgba_tex. + // Set the right state for the Y' and CbCr copies. glBindFramebuffer(GL_FRAMEBUFFER, 0); - glBindTexture(GL_TEXTURE_2D, rgba_tex); + glBindTexture(GL_TEXTURE_2D, y_copy_tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + + glBindTexture(GL_TEXTURE_2D, cbcr_copy_tex); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); RefCountedGLsync fence = video_encoder->end_frame(); - // The live frame just shows the RGBA texture we just rendered. - // It owns rgba_tex now. + // The live frame pieces the Y'CbCr texture copies back into RGB and displays them. + // It owns y_copy_tex and cbcr_copy_tex now. 
DisplayFrame live_frame; live_frame.chain = display_chain.get(); - live_frame.setup_chain = [this, rgba_tex]{ - display_input->set_texture_num(rgba_tex); + live_frame.setup_chain = [this, y_copy_tex, cbcr_copy_tex]{ + display_input->set_texture_num(0, y_copy_tex); + display_input->set_texture_num(1, cbcr_copy_tex); }; live_frame.ready_fence = fence; live_frame.input_frames = {}; - live_frame.temp_textures = { rgba_tex }; + live_frame.temp_textures = { y_copy_tex, cbcr_copy_tex }; output_channel[OUTPUT_LIVE].output_frame(live_frame); // Set up preview and any additional channels. diff --git a/mixer.h b/mixer.h index 8f5342a..7837b35 100644 --- a/mixer.h +++ b/mixer.h @@ -47,8 +47,8 @@ class v210Converter; namespace movit { class Effect; class EffectChain; -class FlatInput; class ResourcePool; +class YCbCrInput; } // namespace movit // For any card that's not the master (where we pick out the frames as they @@ -383,7 +383,7 @@ private: std::atomic display_timecode_on_stdout{false}; // Effects part of . Owned by . - movit::FlatInput *display_input; + movit::YCbCrInput *display_input; int64_t pts_int = 0; // In TIMEBASE units. unsigned frame_num = 0; diff --git a/theme.cpp b/theme.cpp index 14560c2..c5f335e 100644 --- a/theme.cpp +++ b/theme.cpp @@ -274,10 +274,12 @@ int EffectChain_finalize(lua_State* L) output_ycbcr_format.num_levels = 256; chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR); + chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED); // Add a copy where we'll only be using the Y component. chain->set_dither_bits(8); chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT); + } else { + chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); } - chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); chain->finalize(); return 0;