This is higher-quality (the 16-bit artifacts were getting rather
annoying), more true to what's actually being output, _and_ higher-performance
(well, at least lower memory bandwidth; I haven't benchmarked in practice),
since we can use multi-output to make extra copies on-the-fly when writing
instead of doing it explicitly. Sample calculation for a 1280x720 image; let's
say it is one megapixel for ease of calculation:
GL_565: 2 MB written (565 texture), 2 MB read during display = 4 MB used
Y'CbCr: 1.0 + 0.5 MB written (Y' texture plus half-res dual-channel CbCr texture),
same amount read during display = 3 MB used
We could have reused the full-resolution CbCr texture, saving the 0.5 MB
write, but that would make the readback 3 MB instead of 1.5 MB, so it's a net loss.
Ideally, we'd avoid the copies altogether, cutting the writes away
and getting to 1.5 MB, but interactions with VA-API zerocopy seemingly
made that impossible.
"#version 130 \n"
"in vec2 tc0, tc1; \n"
"uniform sampler2D cbcr_tex; \n"
"#version 130 \n"
"in vec2 tc0, tc1; \n"
"uniform sampler2D cbcr_tex; \n"
- "out vec4 FragColor; \n"
+ "out vec4 FragColor, FragColor2; \n"
"void main() { \n"
" FragColor = 0.5 * (texture(cbcr_tex, tc0) + texture(cbcr_tex, tc1)); \n"
"void main() { \n"
" FragColor = 0.5 * (texture(cbcr_tex, tc0) + texture(cbcr_tex, tc1)); \n"
+ " FragColor2 = FragColor; \n"
"} \n";
cbcr_program_num = resource_pool->compile_glsl_program(cbcr_vert_shader, cbcr_frag_shader, frag_shader_outputs);
check_error();
"} \n";
cbcr_program_num = resource_pool->compile_glsl_program(cbcr_vert_shader, cbcr_frag_shader, frag_shader_outputs);
check_error();
-void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex)
+void ChromaSubsampler::subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex)
{
GLuint vao;
glGenVertexArrays(1, &vao);
{
GLuint vao;
glGenVertexArrays(1, &vao);
check_error();
// Extract Cb/Cr.
check_error();
// Extract Cb/Cr.
- GLuint fbo = resource_pool->create_fbo(dst_tex);
+ GLuint fbo;
+ if (dst2_tex <= 0) {
+ fbo = resource_pool->create_fbo(dst_tex);
+ } else {
+ fbo = resource_pool->create_fbo(dst_tex, dst2_tex);
+ }
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
glViewport(0, 0, width/2, height/2);
check_error();
glBindFramebuffer(GL_FRAMEBUFFER, fbo);
glViewport(0, 0, width/2, height/2);
check_error();
// Subsamples chroma (packed Cb and Cr) 2x2 to yield chroma suitable for
// NV12 (semiplanar 4:2:0). Chroma positioning is left/center (H.264 convention).
// width and height are the dimensions (in pixels) of the input texture.
// Subsamples chroma (packed Cb and Cr) 2x2 to yield chroma suitable for
// NV12 (semiplanar 4:2:0). Chroma positioning is left/center (H.264 convention).
// width and height are the dimensions (in pixels) of the input texture.
- void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex);
+ //
+ // You can get two equal copies if you'd like; just set dst2_tex to a texture
+ // number and it will receive an exact copy of what goes into dst_tex.
+ void subsample_chroma(GLuint cbcr_tex, unsigned width, unsigned height, GLuint dst_tex, GLuint dst2_tex = 0);
// Subsamples and interleaves luma and chroma to give 4:2:2 packed Y'CbCr (UYVY).
// Chroma positioning is left (H.264 convention).
// Subsamples and interleaves luma and chroma to give 4:2:2 packed Y'CbCr (UYVY).
// Chroma positioning is left (H.264 convention).
inout_format.color_space = COLORSPACE_sRGB;
inout_format.gamma_curve = GAMMA_sRGB;
inout_format.color_space = COLORSPACE_sRGB;
inout_format.gamma_curve = GAMMA_sRGB;
- // Display chain; shows the live output produced by the main chain (its RGBA version).
+ // Matches the 4:2:0 format created by the main chain.
+ YCbCrFormat ycbcr_format;
+ ycbcr_format.chroma_subsampling_x = 2;
+ ycbcr_format.chroma_subsampling_y = 2;
+ if (global_flags.ycbcr_rec709_coefficients) {
+ ycbcr_format.luma_coefficients = YCBCR_REC_709;
+ } else {
+ ycbcr_format.luma_coefficients = YCBCR_REC_601;
+ }
+ ycbcr_format.full_range = false;
+ ycbcr_format.num_levels = 256;
+ ycbcr_format.cb_x_position = 0.0f;
+ ycbcr_format.cr_x_position = 0.0f;
+ ycbcr_format.cb_y_position = 0.5f;
+ ycbcr_format.cr_y_position = 0.5f;
+
+ // Display chain; shows the live output produced by the main chain (or rather, a copy of it).
display_chain.reset(new EffectChain(global_flags.width, global_flags.height, resource_pool.get()));
check_error();
display_chain.reset(new EffectChain(global_flags.width, global_flags.height, resource_pool.get()));
check_error();
- display_input = new FlatInput(inout_format, FORMAT_RGB, GL_UNSIGNED_BYTE, global_flags.width, global_flags.height); // FIXME: GL_UNSIGNED_BYTE is really wrong.
+ display_input = new YCbCrInput(inout_format, ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR);
display_chain->add_input(display_input);
display_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
display_chain->set_dither_bits(0); // Don't bother.
display_chain->add_input(display_input);
display_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
display_chain->set_dither_bits(0); // Don't bother.
bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
assert(got_frame);
bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
assert(got_frame);
+ // Render main chain. We take an extra copy of the created outputs,
+ // so that we can display it back to the screen later (it's less memory
+ // bandwidth than writing and reading back an RGBA texture, even at 16-bit).
+ // Ideally, we'd like to avoid taking copies and just use the main textures
+ // for display as well, but if they're used for zero-copy Quick Sync encoding
+ // (the default case), they're just views into VA-API memory and must be
+ // unmapped during encoding, so we can't use them for display, unfortunately.
GLuint cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height);
GLuint cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height);
- GLuint rgba_tex = resource_pool->create_2d_texture(GL_RGB565, global_flags.width, global_flags.height); // Saves texture bandwidth, although dithering gets messed up.
- GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, rgba_tex);
+ GLuint y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height);
+ GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
check_error();
chain->render_to_fbo(fbo, global_flags.width, global_flags.height);
check_error();
chain->render_to_fbo(fbo, global_flags.width, global_flags.height);
resource_pool->release_fbo(fbo);
resource_pool->release_fbo(fbo);
- chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex);
+ GLuint cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2);
+ chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
if (output_card_index != -1) {
cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, theme_main_chain.input_frames, pts_int, duration);
}
resource_pool->release_2d_texture(cbcr_full_tex);
if (output_card_index != -1) {
cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, theme_main_chain.input_frames, pts_int, duration);
}
resource_pool->release_2d_texture(cbcr_full_tex);
- // Set the right state for rgba_tex.
+ // Set the right state for the Y' and CbCr copies.
glBindFramebuffer(GL_FRAMEBUFFER, 0);
glBindFramebuffer(GL_FRAMEBUFFER, 0);
- glBindTexture(GL_TEXTURE_2D, rgba_tex);
+ glBindTexture(GL_TEXTURE_2D, y_copy_tex);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+
+ glBindTexture(GL_TEXTURE_2D, cbcr_copy_tex);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
RefCountedGLsync fence = video_encoder->end_frame();
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
RefCountedGLsync fence = video_encoder->end_frame();
- // The live frame just shows the RGBA texture we just rendered.
- // It owns rgba_tex now.
+ // The live frame pieces the Y'CbCr texture copies back into RGB and displays them.
+ // It owns y_copy_tex and cbcr_copy_tex now.
DisplayFrame live_frame;
live_frame.chain = display_chain.get();
DisplayFrame live_frame;
live_frame.chain = display_chain.get();
- live_frame.setup_chain = [this, rgba_tex]{
- display_input->set_texture_num(rgba_tex);
+ live_frame.setup_chain = [this, y_copy_tex, cbcr_copy_tex]{
+ display_input->set_texture_num(0, y_copy_tex);
+ display_input->set_texture_num(1, cbcr_copy_tex);
};
live_frame.ready_fence = fence;
live_frame.input_frames = {};
};
live_frame.ready_fence = fence;
live_frame.input_frames = {};
- live_frame.temp_textures = { rgba_tex };
+ live_frame.temp_textures = { y_copy_tex, cbcr_copy_tex };
output_channel[OUTPUT_LIVE].output_frame(live_frame);
// Set up preview and any additional channels.
output_channel[OUTPUT_LIVE].output_frame(live_frame);
// Set up preview and any additional channels.
namespace movit {
class Effect;
class EffectChain;
namespace movit {
class Effect;
class EffectChain;
} // namespace movit
// For any card that's not the master (where we pick out the frames as they
} // namespace movit
// For any card that's not the master (where we pick out the frames as they
std::atomic<bool> display_timecode_on_stdout{false};
// Effects part of <display_chain>. Owned by <display_chain>.
std::atomic<bool> display_timecode_on_stdout{false};
// Effects part of <display_chain>. Owned by <display_chain>.
- movit::FlatInput *display_input;
+ movit::YCbCrInput *display_input;
int64_t pts_int = 0; // In TIMEBASE units.
unsigned frame_num = 0;
int64_t pts_int = 0; // In TIMEBASE units.
unsigned frame_num = 0;
output_ycbcr_format.num_levels = 256;
chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR);
output_ycbcr_format.num_levels = 256;
chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR);
+ chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED); // Add a copy where we'll only be using the Y component.
chain->set_dither_bits(8);
chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
chain->set_dither_bits(8);
chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
+ } else {
+ chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
- chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
chain->finalize();
return 0;
chain->finalize();
return 0;