From 471db5155f58c3bf7a98c446575cfa0c483da765 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Tue, 21 Feb 2017 18:42:26 +0100 Subject: [PATCH] Support 10-bit capture, both on bmusb and on DeckLink drivers. --- Makefile | 2 +- README | 7 +- bmusb | 2 +- decklink_capture.cpp | 35 +++++++-- decklink_capture.h | 10 +++ flags.cpp | 6 ++ flags.h | 1 + mixer.cpp | 93 ++++++++++++++++++------ mixer.h | 4 +- pbo_frame_allocator.cpp | 105 +++++++++++++++++++-------- pbo_frame_allocator.h | 3 +- theme.cpp | 18 +++-- v210_converter.cpp | 156 ++++++++++++++++++++++++++++++++++++++++ v210_converter.h | 103 ++++++++++++++++++++++++++ 14 files changed, 477 insertions(+), 68 deletions(-) create mode 100644 v210_converter.cpp create mode 100644 v210_converter.h diff --git a/Makefile b/Makefile index 970450e..9991424 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ OBJS += midi_mapper.o midi_mapping.pb.o # Mixer objects AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o -OBJS += chroma_subsampler.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS) +OBJS += chroma_subsampler.o v210_converter.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS) # Streaming and encoding objects OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o diff --git a/README b/README index d858177..f4fc9ab 100644 --- a/README +++ b/README @@ -42,7 +42,7 @@ Nageru is in beta stage. It currently needs: but also for stability. - Movit, my GPU-based video filter library (https://movit.sesse.net). 
- You will need at least version 1.3.1, but at least 1.4.0 is recommended. + You will need at least version 1.5.0 (unreleased; get it from git). - Qt 5.5 or newer for the GUI. @@ -76,6 +76,11 @@ with: libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \ protobuf-compiler libprotobuf-dev libpci-dev +Exceptions as of February 2017: + + - You will need Movit from git; stretch only has 1.4.0. + + - You will need bmusb from git; stretch only has 0.5.4. The patches/ directory contains a patch that helps zita-resampler performance. It is meant for upstream, but was not in at the time Nageru was released. diff --git a/bmusb b/bmusb index aac1510..01ddb8f 160000 --- a/bmusb +++ b/bmusb @@ -1 +1 @@ -Subproject commit aac15101d9cc85681eee3e02c960d57e32414db6 +Subproject commit 01ddb8f836114c07cff3ca040d9ed2c946b2fdbf diff --git a/decklink_capture.cpp b/decklink_capture.cpp index 21a4e79..33bb73d 100644 --- a/decklink_capture.cpp +++ b/decklink_capture.cpp @@ -20,6 +20,8 @@ #include "bmusb/bmusb.h" #include "decklink_util.h" +#include "flags.h" +#include "v210_converter.h" #define FRAME_SIZE (8 << 20) // 8 MB. 
@@ -138,6 +140,18 @@ size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t #endif // __SSE2__ +BMDPixelFormat pixel_format_to_bmd(PixelFormat pixel_format) +{ + switch (pixel_format) { + case PixelFormat_8BitYCbCr: + return bmdFormat8BitYUV; + case PixelFormat_10BitYCbCr: + return bmdFormat10BitYUV; + default: + assert(false); + } +} + } // namespace DeckLinkCapture::DeckLinkCapture(IDeckLink *card, int card_index) @@ -329,7 +343,13 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( int width = video_frame->GetWidth(); int height = video_frame->GetHeight(); const int stride = video_frame->GetRowBytes(); - assert(stride == width * 2); + const BMDPixelFormat format = video_frame->GetPixelFormat(); + assert(format == pixel_format_to_bmd(current_pixel_format)); + if (global_flags.ten_bit_input) { + assert(stride == int(v210Converter::get_v210_stride(width))); + } else { + assert(stride == width * 2); + } current_video_frame = video_frame_allocator->alloc_frame(); if (current_video_frame.data != nullptr) { @@ -362,6 +382,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( video_format.width = width; video_format.height = height; + video_format.stride = stride; } } @@ -413,7 +434,7 @@ void DeckLinkCapture::start_bm_capture() if (running) { return; } - if (input->EnableVideoInput(current_video_mode, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) { + if (input->EnableVideoInput(current_video_mode, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? 
bmdVideoInputEnableFormatDetection : 0) != S_OK) { fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", current_video_mode, card_index); exit(1); } @@ -469,11 +490,17 @@ void DeckLinkCapture::set_video_mode(uint32_t video_mode_id) } } +void DeckLinkCapture::set_pixel_format(PixelFormat pixel_format) +{ + current_pixel_format = pixel_format; + set_video_mode(current_video_mode); +} + void DeckLinkCapture::set_video_mode_no_restart(uint32_t video_mode_id) { BMDDisplayModeSupport support; IDeckLinkDisplayMode *display_mode; - if (input->DoesSupportVideoMode(video_mode_id, bmdFormat8BitYUV, /*flags=*/0, &support, &display_mode)) { + if (input->DoesSupportVideoMode(video_mode_id, pixel_format_to_bmd(current_pixel_format), /*flags=*/0, &support, &display_mode)) { fprintf(stderr, "Failed to query display mode for card %d\n", card_index); exit(1); } @@ -491,7 +518,7 @@ void DeckLinkCapture::set_video_mode_no_restart(uint32_t video_mode_id) field_dominance = display_mode->GetFieldDominance(); if (running) { - if (input->EnableVideoInput(video_mode_id, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) { + if (input->EnableVideoInput(video_mode_id, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? 
bmdVideoInputEnableFormatDetection : 0) != S_OK) { fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", video_mode_id, card_index); exit(1); } diff --git a/decklink_capture.h b/decklink_capture.h index 1bdf9ad..f940241 100644 --- a/decklink_capture.h +++ b/decklink_capture.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "DeckLinkAPIModes.h" @@ -93,6 +94,14 @@ public: void set_video_mode(uint32_t video_mode_id) override; uint32_t get_current_video_mode() const override { return current_video_mode; } + std::set get_available_pixel_formats() const override { + return std::set{ bmusb::PixelFormat_8BitYCbCr, bmusb::PixelFormat_10BitYCbCr }; + } + void set_pixel_format(bmusb::PixelFormat pixel_format) override; + bmusb::PixelFormat get_current_pixel_format() const override { + return current_pixel_format; + } + std::map get_available_video_inputs() const override { return video_inputs; } void set_video_input(uint32_t video_input_id) override; uint32_t get_current_video_input() const override { return current_video_input; } @@ -132,6 +141,7 @@ private: std::map video_modes; BMDDisplayMode current_video_mode; + bmusb::PixelFormat current_pixel_format = bmusb::PixelFormat_8BitYCbCr; std::map video_inputs; BMDVideoConnection current_video_input; diff --git a/flags.cpp b/flags.cpp index 9c91bf7..025b6d9 100644 --- a/flags.cpp +++ b/flags.cpp @@ -53,6 +53,7 @@ enum LongOption { OPTION_OUTPUT_SLOP_FRAMES, OPTION_TIMECODE_STREAM, OPTION_TIMECODE_STDOUT, + OPTION_10_BIT_INPUT, }; void usage() @@ -123,6 +124,7 @@ void usage() fprintf(stderr, " dropping the frame (default 0.5)\n"); fprintf(stderr, " --timecode-stream show timestamp and timecode in stream\n"); fprintf(stderr, " --timecode-stdout show timestamp and timecode on standard output\n"); + fprintf(stderr, " --10-bit-input use 10-bit video input (requires compute shaders)\n"); } void parse_flags(int argc, char * const argv[]) @@ -177,6 +179,7 @@ void parse_flags(int argc, char * const 
argv[]) { "output-slop-frames", required_argument, 0, OPTION_OUTPUT_SLOP_FRAMES }, { "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM }, { "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT }, + { "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT }, { 0, 0, 0, 0 } }; vector theme_dirs; @@ -354,6 +357,9 @@ void parse_flags(int argc, char * const argv[]) case OPTION_TIMECODE_STDOUT: global_flags.display_timecode_on_stdout = true; break; + case OPTION_10_BIT_INPUT: + global_flags.ten_bit_input = true; + break; case OPTION_HELP: usage(); exit(0); diff --git a/flags.h b/flags.h index 91b2221..12bc3d4 100644 --- a/flags.h +++ b/flags.h @@ -49,6 +49,7 @@ struct Flags { int max_input_queue_frames = 6; bool display_timecode_in_stream = false; bool display_timecode_on_stdout = false; + bool ten_bit_input = false; }; extern Flags global_flags; diff --git a/mixer.cpp b/mixer.cpp index 81d4b73..a5ceeaf 100644 --- a/mixer.cpp +++ b/mixer.cpp @@ -46,6 +46,7 @@ #include "resampling_queue.h" #include "timebase.h" #include "timecode_renderer.h" +#include "v210_converter.h" #include "video_encoder.h" class IDeckLink; @@ -79,29 +80,48 @@ void insert_new_frame(RefCountedFrame frame, unsigned field_num, bool interlaced void ensure_texture_resolution(PBOFrameAllocator::Userdata *userdata, unsigned field, unsigned width, unsigned height) { - if (userdata->tex_y[field] == 0 || - userdata->tex_cbcr[field] == 0 || + bool first; + if (global_flags.ten_bit_input) { + first = userdata->tex_v210[field] == 0 || userdata->tex_444[field] == 0; + } else { + first = userdata->tex_y[field] == 0 || userdata->tex_cbcr[field] == 0; + } + + if (first || width != userdata->last_width[field] || height != userdata->last_height[field]) { - size_t cbcr_width = width / 2; - // We changed resolution since last use of this texture, so we need to create // a new object. Note that this each card has its own PBOFrameAllocator, // we don't need to worry about these flip-flopping between resolutions. 
- glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]); - check_error(); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); - check_error(); - glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]); - check_error(); - glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); - check_error(); + if (global_flags.ten_bit_input) { + const size_t v210_width = v210Converter::get_minimum_v210_texture_width(width); + + glBindTexture(GL_TEXTURE_2D, userdata->tex_v210[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, v210_width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr); + check_error(); + glBindTexture(GL_TEXTURE_2D, userdata->tex_444[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr); + check_error(); + } else { + size_t cbcr_width = width / 2; + + glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr); + check_error(); + glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); + check_error(); + } userdata->last_width[field] = width; userdata->last_height[field] = height; } } -void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLintptr offset) +void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLenum type, GLintptr offset) { if (interlaced_stride) { stride *= 2; @@ -121,7 +141,7 @@ void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool check_error(); } - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, GL_UNSIGNED_BYTE, BUFFER_OFFSET(offset)); + glTexSubImage2D(GL_TEXTURE_2D, 
0, 0, 0, width, height, format, type, BUFFER_OFFSET(offset)); check_error(); glBindTexture(GL_TEXTURE_2D, 0); check_error(); @@ -252,6 +272,24 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards) chroma_subsampler.reset(new ChromaSubsampler(resource_pool.get())); + if (global_flags.ten_bit_input) { + if (!v210Converter::has_hardware_support()) { + fprintf(stderr, "ERROR: --10-bit-input requires support for OpenGL compute shaders\n"); + fprintf(stderr, " (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n"); + exit(1); + } + v210_converter.reset(new v210Converter()); + + // These are all the widths listed in the Blackmagic SDK documentation + // (section 2.7.3, “Display Modes”). + v210_converter->precompile_shader(720); + v210_converter->precompile_shader(1280); + v210_converter->precompile_shader(1920); + v210_converter->precompile_shader(2048); + v210_converter->precompile_shader(3840); + v210_converter->precompile_shader(4096); + } + timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height)); display_timecode_in_stream = global_flags.display_timecode_in_stream; display_timecode_on_stdout = global_flags.display_timecode_on_stdout; @@ -308,6 +346,7 @@ void Mixer::configure_card(unsigned card_index, CaptureInterface *capture, bool } while (!card->new_frames.empty()) card->new_frames.pop_front(); card->last_timecode = -1; + card->capture->set_pixel_format(global_flags.ten_bit_input ? PixelFormat_10BitYCbCr : PixelFormat_8BitYCbCr); card->capture->configure_card(); // NOTE: start_bm_capture() happens in thread_func(). 
@@ -450,7 +489,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, card->last_timecode = timecode; - size_t expected_length = video_format.width * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom) * 2; + size_t expected_length = video_format.stride * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom); if (video_frame.len - video_offset == 0 || video_frame.len - video_offset != expected_length) { if (video_frame.len != 0) { @@ -503,9 +542,9 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, RefCountedFrame frame(video_frame); // Upload the textures. - size_t cbcr_width = video_format.width / 2; - size_t cbcr_offset = video_offset / 2; - size_t y_offset = video_frame.size / 2 + video_offset / 2; + const size_t cbcr_width = video_format.width / 2; + const size_t cbcr_offset = video_offset / 2; + const size_t y_offset = video_frame.size / 2 + video_offset / 2; for (unsigned field = 0; field < num_fields; ++field) { // Put the actual texture upload in a lambda that is executed in the main thread. @@ -516,23 +555,31 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, // Note that this means we must hold on to the actual frame data in // until the upload command is run, but we hold on to much longer than that // (in fact, all the way until we no longer use the texture in rendering). 
- auto upload_func = [field, video_format, y_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() { + auto upload_func = [this, field, video_format, y_offset, video_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() { unsigned field_start_line; if (field == 1) { field_start_line = video_format.second_field_start; } else { field_start_line = video_format.extra_lines_top; } - size_t field_y_start = y_offset + video_format.width * field_start_line; - size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t); ensure_texture_resolution(userdata, field, video_format.width, video_format.height); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, userdata->pbo); check_error(); - upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, field_y_start); - upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, field_cbcr_start); + if (global_flags.ten_bit_input) { + size_t field_start = video_offset + video_format.stride * field_start_line; + upload_texture(userdata->tex_v210[field], video_format.stride / sizeof(uint32_t), video_format.height, video_format.stride, interlaced_stride, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, field_start); + v210_converter->convert(userdata->tex_v210[field], userdata->tex_444[field], video_format.width, video_format.height); + } else { + size_t field_y_start = y_offset + video_format.width * field_start_line; + size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t); + + // Make up our own strides, since we are interleaving. 
+ upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, GL_UNSIGNED_BYTE, field_y_start); + upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, GL_UNSIGNED_BYTE, field_cbcr_start); + } glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); check_error(); diff --git a/mixer.h b/mixer.h index 2543c24..8f5342a 100644 --- a/mixer.h +++ b/mixer.h @@ -39,9 +39,10 @@ class ALSAOutput; class ChromaSubsampler; class DeckLinkOutput; -class TimecodeRenderer; class QSurface; class QSurfaceFormat; +class TimecodeRenderer; +class v210Converter; namespace movit { class Effect; @@ -374,6 +375,7 @@ private: std::unique_ptr display_chain; std::unique_ptr chroma_subsampler; + std::unique_ptr v210_converter; std::unique_ptr video_encoder; std::unique_ptr timecode_renderer; diff --git a/pbo_frame_allocator.cpp b/pbo_frame_allocator.cpp index d18358a..5aa4705 100644 --- a/pbo_frame_allocator.cpp +++ b/pbo_frame_allocator.cpp @@ -7,6 +7,8 @@ #include #include +#include "flags.h" + using namespace std; PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits) @@ -30,15 +32,27 @@ PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint hei frame.userdata = &userdata[i]; userdata[i].pbo = pbo; frame.owner = this; - frame.interleaved = true; + + // For 8-bit Y'CbCr, we ask the driver to split Y' and Cb/Cr + // into separate textures. For 10-bit, the input format (v210) + // is complicated enough that we need to interpolate up to 4:4:4, + // which we do in a compute shader ourselves. + frame.interleaved = !global_flags.ten_bit_input; // Create textures. We don't allocate any data for the second field at this point // (just create the texture state with the samplers), since our default assumed // resolution is progressive. 
- glGenTextures(2, userdata[i].tex_y); - check_error(); - glGenTextures(2, userdata[i].tex_cbcr); - check_error(); + if (global_flags.ten_bit_input) { + glGenTextures(2, userdata[i].tex_v210); + check_error(); + glGenTextures(2, userdata[i].tex_444); + check_error(); + } else { + glGenTextures(2, userdata[i].tex_y); + check_error(); + glGenTextures(2, userdata[i].tex_cbcr); + check_error(); + } userdata[i].last_width[0] = width; userdata[i].last_height[0] = height; userdata[i].last_width[1] = 0; @@ -47,30 +61,54 @@ PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint hei userdata[i].last_has_signal = false; userdata[i].last_is_connected = false; for (unsigned field = 0; field < 2; ++field) { - glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - check_error(); - if (field == 0) { - glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL); + if (global_flags.ten_bit_input) { + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_v210[field]); check_error(); - } + // Don't care about texture parameters, we're only going to read it + // from the compute shader anyway. 
+ if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL); + check_error(); + } - glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - check_error(); - glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - check_error(); - if (field == 0) { - glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, GL_RG, GL_UNSIGNED_BYTE, NULL); + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_444[field]); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL); + check_error(); + } + } else { + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL); + check_error(); + } + + glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + if (field == 0) { + glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, 
GL_RG, GL_UNSIGNED_BYTE, NULL); + check_error(); + } } } @@ -96,10 +134,17 @@ PBOFrameAllocator::~PBOFrameAllocator() check_error(); glDeleteBuffers(1, &pbo); check_error(); - glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y); - check_error(); - glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr); - check_error(); + if (global_flags.ten_bit_input) { + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_v210); + check_error(); + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_444); + check_error(); + } else { + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y); + check_error(); + glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr); + check_error(); + } } } //static int sumsum = 0; diff --git a/pbo_frame_allocator.h b/pbo_frame_allocator.h index 024e57a..43310ae 100644 --- a/pbo_frame_allocator.h +++ b/pbo_frame_allocator.h @@ -31,7 +31,8 @@ public: GLuint pbo; // The second set is only used for the second field of interlaced inputs. - GLuint tex_y[2], tex_cbcr[2]; + GLuint tex_y[2], tex_cbcr[2]; // For 8-bit. + GLuint tex_v210[2], tex_444[2]; // For 10-bit. GLuint last_width[2], last_height[2]; bool last_interlaced, last_has_signal, last_is_connected; unsigned last_frame_rate_nom, last_frame_rate_den; diff --git a/theme.cpp b/theme.cpp index 1c0d8c9..14560c2 100644 --- a/theme.cpp +++ b/theme.cpp @@ -603,9 +603,9 @@ LiveInputWrapper::LiveInputWrapper(Theme *theme, EffectChain *chain, bool overri // Perhaps 601 was only to indicate the subsampling positions, not the // colorspace itself? Tested with a Lenovo X1 gen 3 as input. YCbCrFormat input_ycbcr_format; - input_ycbcr_format.chroma_subsampling_x = 2; + input_ycbcr_format.chroma_subsampling_x = global_flags.ten_bit_input ? 1 : 2; input_ycbcr_format.chroma_subsampling_y = 1; - input_ycbcr_format.num_levels = 256; + input_ycbcr_format.num_levels = global_flags.ten_bit_input ? 
1024 : 256; input_ycbcr_format.cb_x_position = 0.0; input_ycbcr_format.cr_x_position = 0.0; input_ycbcr_format.cb_y_position = 0.5; @@ -629,10 +629,12 @@ LiveInputWrapper::LiveInputWrapper(Theme *theme, EffectChain *chain, bool overri num_inputs = 1; } for (unsigned i = 0; i < num_inputs; ++i) { + // When using 10-bit input, we're converting to interleaved through v210Converter. + YCbCrInputSplitting splitting = global_flags.ten_bit_input ? YCBCR_INPUT_INTERLEAVED : YCBCR_INPUT_SPLIT_Y_AND_CBCR; if (override_bounce) { - inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); + inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting)); } else { - inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); + inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting)); } chain->add_input(inputs.back()); } @@ -681,8 +683,12 @@ void LiveInputWrapper::connect_signal(int signal_num) userdata = (const PBOFrameAllocator::Userdata *)frame.frame->userdata; } - inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]); - inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]); + if (global_flags.ten_bit_input) { + inputs[i]->set_texture_num(0, userdata->tex_444[frame.field_number]); + } else { + inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]); + inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]); + } inputs[i]->set_width(userdata->last_width[frame.field_number]); inputs[i]->set_height(userdata->last_height[frame.field_number]); diff --git a/v210_converter.cpp b/v210_converter.cpp new file mode 100644 index 0000000..715dd5f --- /dev/null +++ b/v210_converter.cpp @@ -0,0 +1,156 @@ +#include "v210_converter.h" + +#include 
+#include + +using namespace std; + +v210Converter::~v210Converter() +{ + for (const auto &shader : shaders) { + glDeleteProgram(shader.second.glsl_program_num); + check_error(); + } +} + +bool v210Converter::has_hardware_support() +{ + // We don't have a GLES version of this, although GLSL ES 3.1 supports + // compute shaders. Note that GLSL ES has some extra restrictions, + // like requiring that the images are allocated with glTexStorage*(), + // or that binding= is effectively mandatory. + if (!epoxy_is_desktop_gl()) { + return false; + } + if (epoxy_gl_version() >= 43) { + // Supports compute shaders natively. + return true; + } + return epoxy_has_gl_extension("GL_ARB_compute_shader") && + epoxy_has_gl_extension("GL_ARB_shader_image_load_store"); +} + +void v210Converter::precompile_shader(unsigned width) +{ + unsigned num_local_work_groups = (width + 5) / 6; + if (shaders.count(num_local_work_groups)) { + // Already exists. + return; + } + + char buf[16]; + snprintf(buf, sizeof(buf), "%u", num_local_work_groups); + string shader_src = R"(#version 150 +#extension GL_ARB_compute_shader : enable +#extension GL_ARB_shader_image_load_store : enable +layout(local_size_x = )" + string(buf) + R"() in; +layout(rgb10_a2) uniform restrict readonly image2D inbuf; +layout(rgb10_a2) uniform restrict writeonly image2D outbuf; +uniform int max_cbcr_x; +shared vec2 cbcr[gl_WorkGroupSize.x * 3u]; + +void main() +{ + int xb = int(gl_LocalInvocationID.x); // X block. + int y = int(gl_GlobalInvocationID.y); // Y (actual line). + + // Load our pixel group, containing data for six pixels. + vec3 indata[4]; + for (int i = 0; i < 4; ++i) { + indata[i] = imageLoad(inbuf, ivec2(xb * 4 + i, y)).xyz; + } + + // Decode Cb and Cr to shared memory, because neighboring blocks need it for interpolation. 
+ cbcr[xb * 3 + 0] = indata[0].xz; + cbcr[xb * 3 + 1] = vec2(indata[1].y, indata[2].x); + cbcr[xb * 3 + 2] = vec2(indata[2].z, indata[3].y); + memoryBarrierShared(); + + float pix_y[6]; + pix_y[0] = indata[0].y; + pix_y[1] = indata[1].x; + pix_y[2] = indata[1].z; + pix_y[3] = indata[2].y; + pix_y[4] = indata[3].x; + pix_y[5] = indata[3].z; + + barrier(); + + // Interpolate the missing Cb/Cr pixels, taking care not to read past the end of the screen + // for pixels that we use for interpolation. + vec2 pix_cbcr[7]; + pix_cbcr[0] = indata[0].xz; + pix_cbcr[2] = cbcr[min(xb * 3 + 1, max_cbcr_x)]; + pix_cbcr[4] = cbcr[min(xb * 3 + 2, max_cbcr_x)]; + pix_cbcr[6] = cbcr[min(xb * 3 + 3, max_cbcr_x)]; + pix_cbcr[1] = 0.5 * (pix_cbcr[0] + pix_cbcr[2]); + pix_cbcr[3] = 0.5 * (pix_cbcr[2] + pix_cbcr[4]); + pix_cbcr[5] = 0.5 * (pix_cbcr[4] + pix_cbcr[6]); + + // Write the decoded pixels to the destination texture. + for (int i = 0; i < 6; ++i) { + vec4 outdata = vec4(pix_y[i], pix_cbcr[i].x, pix_cbcr[i].y, 1.0f); + imageStore(outbuf, ivec2(xb * 6 + i, y), outdata); + } +} +)"; + + Shader shader; + + GLuint shader_num = movit::compile_shader(shader_src, GL_COMPUTE_SHADER); + check_error(); + shader.glsl_program_num = glCreateProgram(); + check_error(); + glAttachShader(shader.glsl_program_num, shader_num); + check_error(); + glLinkProgram(shader.glsl_program_num); + check_error(); + + GLint success; + glGetProgramiv(shader.glsl_program_num, GL_LINK_STATUS, &success); + check_error(); + if (success == GL_FALSE) { + GLchar error_log[1024] = {0}; + glGetProgramInfoLog(shader.glsl_program_num, 1024, NULL, error_log); + fprintf(stderr, "Error linking program: %s\n", error_log); + exit(1); + } + + shader.max_cbcr_x_pos = glGetUniformLocation(shader.glsl_program_num, "max_cbcr_x"); + check_error(); + shader.inbuf_pos = glGetUniformLocation(shader.glsl_program_num, "inbuf"); + check_error(); + shader.outbuf_pos = glGetUniformLocation(shader.glsl_program_num, "outbuf"); + check_error(); 
+ + shaders.emplace(num_local_work_groups, shader); +} + +void v210Converter::convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height) +{ + precompile_shader(width); + unsigned num_local_work_groups = (width + 5) / 6; + const Shader &shader = shaders[num_local_work_groups]; + + glUseProgram(shader.glsl_program_num); + check_error(); + glUniform1i(shader.max_cbcr_x_pos, width / 2 - 1); + check_error(); + + // Bind the textures. + glUniform1i(shader.inbuf_pos, 0); + check_error(); + glUniform1i(shader.outbuf_pos, 1); + check_error(); + glBindImageTexture(0, tex_src, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGB10_A2); + check_error(); + glBindImageTexture(1, tex_dst, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2); + check_error(); + + // Actually run the shader. + glDispatchCompute(1, height, 1); + check_error(); + + glUseProgram(0); + check_error(); +} diff --git a/v210_converter.h b/v210_converter.h new file mode 100644 index 0000000..39c456f --- /dev/null +++ b/v210_converter.h @@ -0,0 +1,103 @@ +#ifndef _V210CONVERTER_H +#define _V210CONVERTER_H 1 + +// v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values +// into a 32-bit int (leaving two unused bits at the top) with chroma being +// sub-sited with the left luma sample. Even though this 2:10:10:10-arrangement +// can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_2_10_10_10_REV format, +// the placement of the Y', Cb and Cr parts within these ints is rather +// complicated, and thus hard to get a single Y'CbCr pixel from efficiently, +// especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into +// four such ints in the following pattern (see e.g. 
the DeckLink documentation +// for reference): +// +// A B G R +// ----------------- +// X Cr0 Y0 Cb0 +// X Y2 Cb2 Y1 +// X Cb4 Y3 Cr2 +// X Y5 Cr4 Y4 +// +// This pattern repeats for as long as needed, with the additional constraint +// that stride must be divisible by 128 (or equivalently, 32 four-byte ints, +// or eight pixel groups representing 48 pixels in all). +// +// Thus, v210Converter allows you to convert from v210 to a more regular +// 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation) +// that the GPU supports natively, again in the form of GL_RGB10_A2 +// (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel +// is always 1). +// +// It does this fairly efficiently using a compute shader, which means you'll +// need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store, +// or equivalently, OpenGL 4.3 or newer) to use it. There are many possible +// strategies for doing this in a compute shader, but I ended up settling on +// a fairly simple one after some benchmarking; each work unit takes in +// a single four-int group and writes six samples, but as the interpolation +// needs the leftmost chroma samples from the work unit at the right, each line +// is put into a local work group. Cb/Cr is first decoded into shared memory +// (OpenGL guarantees at least 32 kB shared memory for the work group, which is +// enough for up to 6K video or so), and then the rest of the shuffling and +// writing happens. Each line can of course be converted entirely +// independently, so we can fire up as many such work groups as we have lines. +// +// On the Haswell GPU where I developed it (with single-channel memory), +// conversion takes about 1.4 ms for a 720p frame, so it should be possible to +// keep up multiple inputs at 720p60, although probably a faster machine is +// needed if we want to run e.g. heavy scaling filters in the same pipeline. 
+// (1.4 ms equates to about 35% of the theoretical memory bandwidth of +// 12.8 GB/sec, which is pretty good.) + +#include + +#include + +class v210Converter { +public: + ~v210Converter(); + + // Whether the current hardware and driver supports the compute shader + // necessary to do this conversion. + static bool has_hardware_support(); + + // Given an image width, returns the minimum number of 32-bit groups + // needed for each line. This can be used to size the input texture properly. + static GLuint get_minimum_v210_texture_width(unsigned width) + { + unsigned num_local_groups = (width + 5) / 6; + return 4 * num_local_groups; + } + + // Given an image width, returns the stride (in bytes) for each line. + static size_t get_v210_stride(unsigned width) + { + return (width + 47) / 48 * 128; + } + + // Since work groups need to be determined at shader compile time, + // each width needs potentially a different shader. You can call this + // function at startup to make sure a shader for the given width + // has been compiled, making sure you don't need to start an expensive + // compilation job while video is running if a new resolution comes along. + // This is not required, but generally recommended. + void precompile_shader(unsigned width); + + // Do the actual conversion. tex_src is assumed to be a GL_RGB10_A2 + // texture of at least [get_minimum_v210_texture_width(width), height]. + // tex_dst is assumed to be a GL_RGB10_A2 texture of exactly [width, height] + // (actually, other sizes will work fine, but be nonsensical). + // No textures will be allocated or deleted. + void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height); + +private: + // Key is number of local groups, ie., ceil(width / 6). + struct Shader { + GLuint glsl_program_num = -1; + + // Uniform locations. + GLuint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1; + }; + std::map shaders; +}; + +#endif // !defined(_V210CONVERTER_H) -- 2.39.2