# Mixer objects
AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o
-OBJS += chroma_subsampler.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS)
+OBJS += chroma_subsampler.o v210_converter.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS)
# Streaming and encoding objects
OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o
but also for stability.
- Movit, my GPU-based video filter library (https://movit.sesse.net).
- You will need at least version 1.3.1, but at least 1.4.0 is recommended.
+ You will need at least version 1.5.0 (unreleased; get it from git).
- Qt 5.5 or newer for the GUI.
libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \
protobuf-compiler libprotobuf-dev libpci-dev
+Exceptions as of February 2017:
+
+ - You will need Movit from git; stretch only has 1.4.0.
+
+ - You will need bmusb from git; stretch only has 0.5.4.
The patches/ directory contains a patch that helps zita-resampler performance.
It is meant for upstream, but was not in at the time Nageru was released.
-Subproject commit aac15101d9cc85681eee3e02c960d57e32414db6
+Subproject commit 01ddb8f836114c07cff3ca040d9ed2c946b2fdbf
#include "bmusb/bmusb.h"
#include "decklink_util.h"
+#include "flags.h"
+#include "v210_converter.h"
#define FRAME_SIZE (8 << 20) // 8 MB.
#endif // __SSE2__
+// Translate bmusb's generic pixel-format enum into the DeckLink SDK's
+// corresponding BMDPixelFormat. Only the two formats the capture path
+// supports (8-bit and 10-bit 4:2:2 Y'CbCr) are handled.
+BMDPixelFormat pixel_format_to_bmd(PixelFormat pixel_format)
+{
+	switch (pixel_format) {
+	case PixelFormat_8BitYCbCr:
+		return bmdFormat8BitYUV;
+	case PixelFormat_10BitYCbCr:
+		return bmdFormat10BitYUV;
+	default:
+		assert(false);
+		// Keep NDEBUG builds defined (falling off the end of a
+		// value-returning function is UB); 8-bit is the safe default.
+		return bmdFormat8BitYUV;
+	}
+}
+
} // namespace
DeckLinkCapture::DeckLinkCapture(IDeckLink *card, int card_index)
int width = video_frame->GetWidth();
int height = video_frame->GetHeight();
const int stride = video_frame->GetRowBytes();
- assert(stride == width * 2);
+ const BMDPixelFormat format = video_frame->GetPixelFormat();
+ assert(format == pixel_format_to_bmd(current_pixel_format));
+ if (global_flags.ten_bit_input) {
+ assert(stride == int(v210Converter::get_v210_stride(width)));
+ } else {
+ assert(stride == width * 2);
+ }
current_video_frame = video_frame_allocator->alloc_frame();
if (current_video_frame.data != nullptr) {
video_format.width = width;
video_format.height = height;
+ video_format.stride = stride;
}
}
if (running) {
return;
}
- if (input->EnableVideoInput(current_video_mode, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
+ if (input->EnableVideoInput(current_video_mode, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", current_video_mode, card_index);
exit(1);
}
}
}
+// Select 8- or 10-bit Y'CbCr capture. Stores the new format and then
+// re-applies the current video mode, so that the capture is re-enabled
+// with the BMD pixel format matching the new setting (see set_video_mode()).
+void DeckLinkCapture::set_pixel_format(PixelFormat pixel_format)
+{
+	current_pixel_format = pixel_format;
+	set_video_mode(current_video_mode);
+}
+
void DeckLinkCapture::set_video_mode_no_restart(uint32_t video_mode_id)
{
BMDDisplayModeSupport support;
IDeckLinkDisplayMode *display_mode;
- if (input->DoesSupportVideoMode(video_mode_id, bmdFormat8BitYUV, /*flags=*/0, &support, &display_mode)) {
+ if (input->DoesSupportVideoMode(video_mode_id, pixel_format_to_bmd(current_pixel_format), /*flags=*/0, &support, &display_mode)) {
fprintf(stderr, "Failed to query display mode for card %d\n", card_index);
exit(1);
}
field_dominance = display_mode->GetFieldDominance();
if (running) {
- if (input->EnableVideoInput(video_mode_id, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
+ if (input->EnableVideoInput(video_mode_id, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", video_mode_id, card_index);
exit(1);
}
#include <functional>
#include <map>
#include <memory>
+#include <set>
#include <string>
#include "DeckLinkAPIModes.h"
void set_video_mode(uint32_t video_mode_id) override;
uint32_t get_current_video_mode() const override { return current_video_mode; }
+ std::set<bmusb::PixelFormat> get_available_pixel_formats() const override {
+ return std::set<bmusb::PixelFormat>{ bmusb::PixelFormat_8BitYCbCr, bmusb::PixelFormat_10BitYCbCr };
+ }
+ void set_pixel_format(bmusb::PixelFormat pixel_format) override;
+ bmusb::PixelFormat get_current_pixel_format() const override {
+ return current_pixel_format;
+ }
+
std::map<uint32_t, std::string> get_available_video_inputs() const override { return video_inputs; }
void set_video_input(uint32_t video_input_id) override;
uint32_t get_current_video_input() const override { return current_video_input; }
std::map<uint32_t, bmusb::VideoMode> video_modes;
BMDDisplayMode current_video_mode;
+ bmusb::PixelFormat current_pixel_format = bmusb::PixelFormat_8BitYCbCr;
std::map<uint32_t, std::string> video_inputs;
BMDVideoConnection current_video_input;
OPTION_OUTPUT_SLOP_FRAMES,
OPTION_TIMECODE_STREAM,
OPTION_TIMECODE_STDOUT,
+ OPTION_10_BIT_INPUT,
};
void usage()
fprintf(stderr, " dropping the frame (default 0.5)\n");
fprintf(stderr, " --timecode-stream show timestamp and timecode in stream\n");
fprintf(stderr, " --timecode-stdout show timestamp and timecode on standard output\n");
+ fprintf(stderr, " --10-bit-input use 10-bit video input (requires compute shaders)\n");
}
void parse_flags(int argc, char * const argv[])
{ "output-slop-frames", required_argument, 0, OPTION_OUTPUT_SLOP_FRAMES },
{ "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM },
{ "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT },
+ { "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT },
{ 0, 0, 0, 0 }
};
vector<string> theme_dirs;
case OPTION_TIMECODE_STDOUT:
global_flags.display_timecode_on_stdout = true;
break;
+ case OPTION_10_BIT_INPUT:
+ global_flags.ten_bit_input = true;
+ break;
case OPTION_HELP:
usage();
exit(0);
int max_input_queue_frames = 6;
bool display_timecode_in_stream = false;
bool display_timecode_on_stdout = false;
+ bool ten_bit_input = false;
};
extern Flags global_flags;
#include "resampling_queue.h"
#include "timebase.h"
#include "timecode_renderer.h"
+#include "v210_converter.h"
#include "video_encoder.h"
class IDeckLink;
void ensure_texture_resolution(PBOFrameAllocator::Userdata *userdata, unsigned field, unsigned width, unsigned height)
{
- if (userdata->tex_y[field] == 0 ||
- userdata->tex_cbcr[field] == 0 ||
+ bool first;
+ if (global_flags.ten_bit_input) {
+ first = userdata->tex_v210[field] == 0 || userdata->tex_444[field] == 0;
+ } else {
+ first = userdata->tex_y[field] == 0 || userdata->tex_cbcr[field] == 0;
+ }
+
+ if (first ||
width != userdata->last_width[field] ||
height != userdata->last_height[field]) {
- size_t cbcr_width = width / 2;
-
// We changed resolution since last use of this texture, so we need to create
// a new object. Note that this each card has its own PBOFrameAllocator,
// we don't need to worry about these flip-flopping between resolutions.
- glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]);
- check_error();
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
- check_error();
- glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]);
- check_error();
- glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
- check_error();
+ if (global_flags.ten_bit_input) {
+ const size_t v210_width = v210Converter::get_minimum_v210_texture_width(width);
+
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_v210[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, v210_width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr);
+ check_error();
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_444[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr);
+ check_error();
+ } else {
+ size_t cbcr_width = width / 2;
+
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
+ check_error();
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+ check_error();
+ }
userdata->last_width[field] = width;
userdata->last_height[field] = height;
}
}
-void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLintptr offset)
+void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLenum type, GLintptr offset)
{
if (interlaced_stride) {
stride *= 2;
check_error();
}
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, GL_UNSIGNED_BYTE, BUFFER_OFFSET(offset));
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, type, BUFFER_OFFSET(offset));
check_error();
glBindTexture(GL_TEXTURE_2D, 0);
check_error();
chroma_subsampler.reset(new ChromaSubsampler(resource_pool.get()));
+	if (global_flags.ten_bit_input) {
+		// 10-bit (v210) unpacking needs a compute shader; bail out early
+		// with a clear message rather than failing mid-stream.
+		if (!v210Converter::has_hardware_support()) {
+			fprintf(stderr, "ERROR: --10-bit-input requires support for OpenGL compute shaders\n");
+			fprintf(stderr, "       (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n");
+			exit(1);
+		}
+		v210_converter.reset(new v210Converter());
+
+		// These are all the widths listed in the Blackmagic SDK documentation
+		// (section 2.7.3, “Display Modes”).
+		v210_converter->precompile_shader(720);
+		v210_converter->precompile_shader(1280);
+		v210_converter->precompile_shader(1920);
+		v210_converter->precompile_shader(2048);
+		v210_converter->precompile_shader(3840);
+		v210_converter->precompile_shader(4096);
+	}
+
timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height));
display_timecode_in_stream = global_flags.display_timecode_in_stream;
display_timecode_on_stdout = global_flags.display_timecode_on_stdout;
}
while (!card->new_frames.empty()) card->new_frames.pop_front();
card->last_timecode = -1;
+ card->capture->set_pixel_format(global_flags.ten_bit_input ? PixelFormat_10BitYCbCr : PixelFormat_8BitYCbCr);
card->capture->configure_card();
// NOTE: start_bm_capture() happens in thread_func().
card->last_timecode = timecode;
- size_t expected_length = video_format.width * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom) * 2;
+ size_t expected_length = video_format.stride * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom);
if (video_frame.len - video_offset == 0 ||
video_frame.len - video_offset != expected_length) {
if (video_frame.len != 0) {
RefCountedFrame frame(video_frame);
// Upload the textures.
- size_t cbcr_width = video_format.width / 2;
- size_t cbcr_offset = video_offset / 2;
- size_t y_offset = video_frame.size / 2 + video_offset / 2;
+ const size_t cbcr_width = video_format.width / 2;
+ const size_t cbcr_offset = video_offset / 2;
+ const size_t y_offset = video_frame.size / 2 + video_offset / 2;
for (unsigned field = 0; field < num_fields; ++field) {
// Put the actual texture upload in a lambda that is executed in the main thread.
// Note that this means we must hold on to the actual frame data in <userdata>
// until the upload command is run, but we hold on to <frame> much longer than that
// (in fact, all the way until we no longer use the texture in rendering).
- auto upload_func = [field, video_format, y_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() {
+ auto upload_func = [this, field, video_format, y_offset, video_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() {
unsigned field_start_line;
if (field == 1) {
field_start_line = video_format.second_field_start;
} else {
field_start_line = video_format.extra_lines_top;
}
- size_t field_y_start = y_offset + video_format.width * field_start_line;
- size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t);
ensure_texture_resolution(userdata, field, video_format.width, video_format.height);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, userdata->pbo);
check_error();
- upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, field_y_start);
- upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, field_cbcr_start);
+ if (global_flags.ten_bit_input) {
+ size_t field_start = video_offset + video_format.stride * field_start_line;
+ upload_texture(userdata->tex_v210[field], video_format.stride / sizeof(uint32_t), video_format.height, video_format.stride, interlaced_stride, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, field_start);
+ v210_converter->convert(userdata->tex_v210[field], userdata->tex_444[field], video_format.width, video_format.height);
+ } else {
+ size_t field_y_start = y_offset + video_format.width * field_start_line;
+ size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t);
+
+ // Make up our own strides, since we are interleaving.
+ upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, GL_UNSIGNED_BYTE, field_y_start);
+ upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, GL_UNSIGNED_BYTE, field_cbcr_start);
+ }
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
check_error();
class ALSAOutput;
class ChromaSubsampler;
class DeckLinkOutput;
-class TimecodeRenderer;
class QSurface;
class QSurfaceFormat;
+class TimecodeRenderer;
+class v210Converter;
namespace movit {
class Effect;
std::unique_ptr<movit::EffectChain> display_chain;
std::unique_ptr<ChromaSubsampler> chroma_subsampler;
+ std::unique_ptr<v210Converter> v210_converter;
std::unique_ptr<VideoEncoder> video_encoder;
std::unique_ptr<TimecodeRenderer> timecode_renderer;
#include <stdio.h>
#include <cstddef>
+#include "flags.h"
+
using namespace std;
PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
frame.userdata = &userdata[i];
userdata[i].pbo = pbo;
frame.owner = this;
- frame.interleaved = true;
+
+ // For 8-bit Y'CbCr, we ask the driver to split Y' and Cb/Cr
+ // into separate textures. For 10-bit, the input format (v210)
+ // is complicated enough that we need to interpolate up to 4:4:4,
+ // which we do in a compute shader ourselves.
+ frame.interleaved = !global_flags.ten_bit_input;
// Create textures. We don't allocate any data for the second field at this point
// (just create the texture state with the samplers), since our default assumed
// resolution is progressive.
- glGenTextures(2, userdata[i].tex_y);
- check_error();
- glGenTextures(2, userdata[i].tex_cbcr);
- check_error();
+ if (global_flags.ten_bit_input) {
+ glGenTextures(2, userdata[i].tex_v210);
+ check_error();
+ glGenTextures(2, userdata[i].tex_444);
+ check_error();
+ } else {
+ glGenTextures(2, userdata[i].tex_y);
+ check_error();
+ glGenTextures(2, userdata[i].tex_cbcr);
+ check_error();
+ }
userdata[i].last_width[0] = width;
userdata[i].last_height[0] = height;
userdata[i].last_width[1] = 0;
userdata[i].last_has_signal = false;
userdata[i].last_is_connected = false;
for (unsigned field = 0; field < 2; ++field) {
- glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
- check_error();
- if (field == 0) {
- glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
+ if (global_flags.ten_bit_input) {
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_v210[field]);
check_error();
- }
+ // Don't care about texture parameters, we're only going to read it
+ // from the compute shader anyway.
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL);
+ check_error();
+ }
- glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
- check_error();
- if (field == 0) {
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, GL_RG, GL_UNSIGNED_BYTE, NULL);
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_444[field]);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+ check_error();
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL);
+ check_error();
+ }
+ } else {
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+ check_error();
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
+ check_error();
+ }
+
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+ check_error();
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, GL_RG, GL_UNSIGNED_BYTE, NULL);
+ check_error();
+ }
}
}
check_error();
glDeleteBuffers(1, &pbo);
check_error();
- glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y);
- check_error();
- glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr);
- check_error();
+ if (global_flags.ten_bit_input) {
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_v210);
+ check_error();
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_444);
+ check_error();
+ } else {
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y);
+ check_error();
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr);
+ check_error();
+ }
}
}
//static int sumsum = 0;
GLuint pbo;
// The second set is only used for the second field of interlaced inputs.
- GLuint tex_y[2], tex_cbcr[2];
+ GLuint tex_y[2], tex_cbcr[2]; // For 8-bit.
+ GLuint tex_v210[2], tex_444[2]; // For 10-bit.
GLuint last_width[2], last_height[2];
bool last_interlaced, last_has_signal, last_is_connected;
unsigned last_frame_rate_nom, last_frame_rate_den;
// Perhaps 601 was only to indicate the subsampling positions, not the
// colorspace itself? Tested with a Lenovo X1 gen 3 as input.
YCbCrFormat input_ycbcr_format;
- input_ycbcr_format.chroma_subsampling_x = 2;
+ input_ycbcr_format.chroma_subsampling_x = global_flags.ten_bit_input ? 1 : 2;
input_ycbcr_format.chroma_subsampling_y = 1;
- input_ycbcr_format.num_levels = 256;
+ input_ycbcr_format.num_levels = global_flags.ten_bit_input ? 1024 : 256;
input_ycbcr_format.cb_x_position = 0.0;
input_ycbcr_format.cr_x_position = 0.0;
input_ycbcr_format.cb_y_position = 0.5;
num_inputs = 1;
}
for (unsigned i = 0; i < num_inputs; ++i) {
+ // When using 10-bit input, we're converting to interleaved through v210Converter.
+ YCbCrInputSplitting splitting = global_flags.ten_bit_input ? YCBCR_INPUT_INTERLEAVED : YCBCR_INPUT_SPLIT_Y_AND_CBCR;
if (override_bounce) {
- inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+ inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting));
} else {
- inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+ inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting));
}
chain->add_input(inputs.back());
}
userdata = (const PBOFrameAllocator::Userdata *)frame.frame->userdata;
}
- inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]);
- inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]);
+ if (global_flags.ten_bit_input) {
+ inputs[i]->set_texture_num(0, userdata->tex_444[frame.field_number]);
+ } else {
+ inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]);
+ inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]);
+ }
inputs[i]->set_width(userdata->last_width[frame.field_number]);
inputs[i]->set_height(userdata->last_height[frame.field_number]);
--- /dev/null
+#include "v210_converter.h"
+
+#include <epoxy/gl.h>
+#include <movit/util.h>
+
+using namespace std;
+
+v210Converter::~v210Converter()
+{
+	// Release every GLSL program we compiled; the map itself owns
+	// nothing else that needs explicit GL cleanup.
+	for (const auto &entry : shaders) {
+		glDeleteProgram(entry.second.glsl_program_num);
+		check_error();
+	}
+}
+
+bool v210Converter::has_hardware_support()
+{
+	// Desktop GL only. GLSL ES 3.1 does have compute shaders, but ES adds
+	// extra restrictions we do not meet (e.g., images must be allocated
+	// with glTexStorage*(), and binding= is effectively mandatory), so we
+	// do not provide a GLES version of this converter.
+	if (!epoxy_is_desktop_gl()) {
+		return false;
+	}
+
+	// Compute shaders are core from OpenGL 4.3; older contexts can still
+	// qualify through the two ARB extensions we rely on.
+	return epoxy_gl_version() >= 43 ||
+	       (epoxy_has_gl_extension("GL_ARB_compute_shader") &&
+	        epoxy_has_gl_extension("GL_ARB_shader_image_load_store"));
+}
+
+// Compile (and cache) the conversion shader for the given image width.
+// The local work group size must be baked in at GLSL compile time, so each
+// distinct group count gets its own program, keyed in the <shaders> map.
+void v210Converter::precompile_shader(unsigned width)
+{
+	unsigned num_local_work_groups = (width + 5) / 6;
+	if (shaders.count(num_local_work_groups)) {
+		// Already exists.
+		return;
+	}
+
+	char buf[16];
+	snprintf(buf, sizeof(buf), "%u", num_local_work_groups);
+	string shader_src = R"(#version 150
+#extension GL_ARB_compute_shader : enable
+#extension GL_ARB_shader_image_load_store : enable
+layout(local_size_x = )" + string(buf) + R"() in;
+layout(rgb10_a2) uniform restrict readonly image2D inbuf;
+layout(rgb10_a2) uniform restrict writeonly image2D outbuf;
+uniform int max_cbcr_x;
+shared vec2 cbcr[gl_WorkGroupSize.x * 3u];
+
+void main()
+{
+	int xb = int(gl_LocalInvocationID.x); // X block.
+	int y = int(gl_GlobalInvocationID.y); // Y (actual line).
+
+	// Load our pixel group, containing data for six pixels.
+	vec3 indata[4];
+	for (int i = 0; i < 4; ++i) {
+		indata[i] = imageLoad(inbuf, ivec2(xb * 4 + i, y)).xyz;
+	}
+
+	// Decode Cb and Cr to shared memory, because neighboring blocks need it for interpolation.
+	cbcr[xb * 3 + 0] = indata[0].xz;
+	cbcr[xb * 3 + 1] = vec2(indata[1].y, indata[2].x);
+	cbcr[xb * 3 + 2] = vec2(indata[2].z, indata[3].y);
+	memoryBarrierShared();
+
+	float pix_y[6];
+	pix_y[0] = indata[0].y;
+	pix_y[1] = indata[1].x;
+	pix_y[2] = indata[1].z;
+	pix_y[3] = indata[2].y;
+	pix_y[4] = indata[3].x;
+	pix_y[5] = indata[3].z;
+
+	barrier();
+
+	// Interpolate the missing Cb/Cr pixels, taking care not to read past the end of the screen
+	// for pixels that we use for interpolation.
+	vec2 pix_cbcr[7];
+	pix_cbcr[0] = indata[0].xz;
+	pix_cbcr[2] = cbcr[min(xb * 3 + 1, max_cbcr_x)];
+	pix_cbcr[4] = cbcr[min(xb * 3 + 2, max_cbcr_x)];
+	pix_cbcr[6] = cbcr[min(xb * 3 + 3, max_cbcr_x)];
+	pix_cbcr[1] = 0.5 * (pix_cbcr[0] + pix_cbcr[2]);
+	pix_cbcr[3] = 0.5 * (pix_cbcr[2] + pix_cbcr[4]);
+	pix_cbcr[5] = 0.5 * (pix_cbcr[4] + pix_cbcr[6]);
+
+	// Write the decoded pixels to the destination texture.
+	for (int i = 0; i < 6; ++i) {
+		vec4 outdata = vec4(pix_y[i], pix_cbcr[i].x, pix_cbcr[i].y, 1.0f);
+		imageStore(outbuf, ivec2(xb * 6 + i, y), outdata);
+	}
+}
+)";
+
+	Shader shader;
+
+	GLuint shader_num = movit::compile_shader(shader_src, GL_COMPUTE_SHADER);
+	check_error();
+	shader.glsl_program_num = glCreateProgram();
+	check_error();
+	glAttachShader(shader.glsl_program_num, shader_num);
+	check_error();
+	glLinkProgram(shader.glsl_program_num);
+	check_error();
+
+	GLint success;
+	glGetProgramiv(shader.glsl_program_num, GL_LINK_STATUS, &success);
+	check_error();
+	if (success == GL_FALSE) {
+		GLchar error_log[1024] = {0};
+		glGetProgramInfoLog(shader.glsl_program_num, 1024, NULL, error_log);
+		fprintf(stderr, "Error linking program: %s\n", error_log);
+		exit(1);
+	}
+
+	// The linked program holds its own copy of the compiled code, so detach
+	// and delete the shader object; otherwise we leak one GL shader object
+	// per precompiled width for the lifetime of the process.
+	glDetachShader(shader.glsl_program_num, shader_num);
+	check_error();
+	glDeleteShader(shader_num);
+	check_error();
+
+	shader.max_cbcr_x_pos = glGetUniformLocation(shader.glsl_program_num, "max_cbcr_x");
+	check_error();
+	shader.inbuf_pos = glGetUniformLocation(shader.glsl_program_num, "inbuf");
+	check_error();
+	shader.outbuf_pos = glGetUniformLocation(shader.glsl_program_num, "outbuf");
+	check_error();
+
+	shaders.emplace(num_local_work_groups, shader);
+}
+
+// Run the v210 -> 4:4:4 unpacking: one compute work group per scan line.
+// tex_src/tex_dst must match the sizing contract documented in the header.
+void v210Converter::convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height)
+{
+	// No-op if the shader for this width is already in the cache;
+	// otherwise we take the compile hit here.
+	precompile_shader(width);
+	unsigned num_local_work_groups = (width + 5) / 6;
+	const Shader &shader = shaders[num_local_work_groups];
+
+	glUseProgram(shader.glsl_program_num);
+	check_error();
+	// Index of the last valid Cb/Cr sample, for edge clamping in the shader.
+	glUniform1i(shader.max_cbcr_x_pos, width / 2 - 1);
+	check_error();
+
+	// Bind the textures.
+	glUniform1i(shader.inbuf_pos, 0);
+	check_error();
+	glUniform1i(shader.outbuf_pos, 1);
+	check_error();
+	glBindImageTexture(0, tex_src, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGB10_A2);
+	check_error();
+	glBindImageTexture(1, tex_dst, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2);
+	check_error();
+
+	// Actually run the shader.
+	glDispatchCompute(1, height, 1);
+	check_error();
+
+	// Image stores are incoherent in OpenGL; issue a barrier so that the
+	// writes are visible to subsequent texture fetches (the mixer samples
+	// tex_dst right after this call).
+	glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+	check_error();
+
+	glUseProgram(0);
+	check_error();
+}
--- /dev/null
+#ifndef _V210CONVERTER_H
+#define _V210CONVERTER_H 1
+
+// v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values
+// into a 32-bit int (leaving two unused bits at the top) with chroma being
+// co-sited with the left luma sample. Even though this 2:10:10:10-arrangement
+// can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_2_10_10_10_REV format,
+// the placement of the Y', Cb and Cr parts within these ints is rather
+// complicated, and thus hard to get a single Y'CbCr pixel from efficiently,
+// especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into
+// four such ints in the following pattern (see e.g. the DeckLink documentation
+// for reference):
+//
+// A B G R
+// -----------------
+// X Cr0 Y0 Cb0
+// X Y2 Cb2 Y1
+// X Cb4 Y3 Cr2
+// X Y5 Cr4 Y4
+//
+// This patterns repeats for as long as needed, with the additional constraint
+// that stride must be divisible by 128 (or equivalently, 32 four-byte ints,
+// or eight pixel groups representing 48 pixels in all).
+//
+// Thus, v210Converter allows you to convert from v210 to a more regular
+// 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation)
+// that the GPU supports natively, again in the form of GL_RGB10_A2
+// (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel
+// is always 1).
+//
+// It does this fairly efficiently using a compute shader, which means you'll
+// need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store,
+// or equivalently, OpenGL 4.3 or newer) to use it. There are many possible
+// strategies for doing this in a compute shader, but I ended up settling on
+// a fairly simple one after some benchmarking; each work unit takes in
+// a single four-int group and writes six samples, but as the interpolation
+// needs the leftmost chroma samples from the work unit at the right, each line
+// is put into a local work group. Cb/Cr is first decoded into shared memory
+// (OpenGL guarantees at least 32 kB shared memory for the work group, which is
+// enough for up to 6K video or so), and then the rest of the shuffling and
+// writing happens. Each line can of course be converted entirely
+// independently, so we can fire up as many such work groups as we have lines.
+//
+// On the Haswell GPU where I developed it (with single-channel memory),
+// conversion takes about 1.4 ms for a 720p frame, so it should be possible to
+// keep up multiple inputs at 720p60, although probably a faster machine is
+// needed if we want to run e.g. heavy scaling filters in the same pipeline.
+// (1.4 ms equates to about 35% of the theoretical memory bandwidth of
+// 12.8 GB/sec, which is pretty good.)
+
+#include <map>
+
+#include <epoxy/gl.h>
+
+class v210Converter {
+public:
+	~v210Converter();
+
+	// Whether the current hardware and driver supports the compute shader
+	// necessary to do this conversion.
+	static bool has_hardware_support();
+
+	// Given an image width, returns the minimum number of 32-bit groups
+	// needed for each line. This can be used to size the input texture properly.
+	static GLuint get_minimum_v210_texture_width(unsigned width)
+	{
+		unsigned num_local_groups = (width + 5) / 6;
+		return 4 * num_local_groups;
+	}
+
+	// Given an image width, returns the stride (in bytes) for each line.
+	static size_t get_v210_stride(unsigned width)
+	{
+		return (width + 47) / 48 * 128;
+	}
+
+	// Since work groups need to be determined at shader compile time,
+	// each width needs potentially a different shader. You can call this
+	// function at startup to make sure a shader for the given width
+	// has been compiled, making sure you don't need to start an expensive
+	// compilation job while video is running if a new resolution comes along.
+	// This is not required, but generally recommended.
+	void precompile_shader(unsigned width);
+
+	// Do the actual conversion. tex_src is assumed to be a GL_RGB10_A2
+	// texture of at least [get_minimum_v210_texture_width(width), height].
+	// tex_dst is assumed to be a GL_RGB10_A2 texture of exactly [width, height]
+	// (actually, other sizes will work fine, but be nonsensical).
+	// No textures will be allocated or deleted.
+	void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height);
+
+private:
+	// Key is number of local groups, ie., ceil(width / 6).
+	struct Shader {
+		// 0 is the “no program” name in OpenGL, so it is a safe sentinel.
+		GLuint glsl_program_num = 0;
+
+		// Uniform locations. These are GLint per the GL spec;
+		// glGetUniformLocation() returns -1 for “not found”.
+		GLint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1;
+	};
+	std::map<unsigned, Shader> shaders;
+};
+
+#endif // !defined(_V210CONVERTER_H)