# Mixer objects
AUDIO_MIXER_OBJS = audio_mixer.o alsa_input.o alsa_pool.o ebu_r128_proc.o stereocompressor.o resampling_queue.o flags.o correlation_measurer.o filter.o input_mapping.o state.pb.o
-OBJS += chroma_subsampler.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS)
+OBJS += chroma_subsampler.o v210_converter.o mixer.o pbo_frame_allocator.o context.o ref_counted_frame.o theme.o httpd.o flags.o image_input.o alsa_output.o disk_space_estimator.o print_latency.o timecode_renderer.o $(AUDIO_MIXER_OBJS)
# Streaming and encoding objects
OBJS += quicksync_encoder.o x264_encoder.o x264_speed_control.o video_encoder.o metacube2.o mux.o audio_encoder.o ffmpeg_raii.o
but also for stability.
- Movit, my GPU-based video filter library (https://movit.sesse.net).
- You will need at least version 1.3.1, but at least 1.4.0 is recommended.
+ You will need at least version 1.5.0 (unreleased; get it from git).
- Qt 5.5 or newer for the GUI.
libmovit-dev libegl1-mesa-dev libasound2-dev libx264-dev libbmusb-dev \
protobuf-compiler libprotobuf-dev libpci-dev
+Exceptions as of February 2017:
+
+ - You will need Movit from git; stretch only has 1.4.0.
+
+ - You will need bmusb from git; stretch only has 0.5.4.
The patches/ directory contains a patch that helps zita-resampler performance.
It is meant for upstream, but was not in at the time Nageru was released.
-Subproject commit aac15101d9cc85681eee3e02c960d57e32414db6
+Subproject commit 01ddb8f836114c07cff3ca040d9ed2c946b2fdbf
#include "bmusb/bmusb.h"
#include "decklink_util.h"
+#include "flags.h"
+#include "v210_converter.h"
#define FRAME_SIZE (8 << 20) // 8 MB.
#endif // __SSE2__
+// Translate bmusb's generic pixel-format enum into the DeckLink SDK's
+// corresponding BMDPixelFormat. Only the two formats the capture path
+// supports (8-bit and 10-bit 4:2:2 Y'CbCr) are handled.
+BMDPixelFormat pixel_format_to_bmd(PixelFormat pixel_format)
+{
+	switch (pixel_format) {
+	case PixelFormat_8BitYCbCr:
+		return bmdFormat8BitYUV;
+	case PixelFormat_10BitYCbCr:
+		return bmdFormat10BitYUV;
+	default:
+		assert(false);
+		// Keep NDEBUG builds defined (falling off the end of a
+		// value-returning function is UB); 8-bit is the safe default.
+		return bmdFormat8BitYUV;
+	}
+}
+
} // namespace
DeckLinkCapture::DeckLinkCapture(IDeckLink *card, int card_index)
int width = video_frame->GetWidth();
int height = video_frame->GetHeight();
const int stride = video_frame->GetRowBytes();
- assert(stride == width * 2);
+ const BMDPixelFormat format = video_frame->GetPixelFormat();
+ assert(format == pixel_format_to_bmd(current_pixel_format));
+ if (global_flags.ten_bit_input) {
+ assert(stride == int(v210Converter::get_v210_stride(width)));
+ } else {
+ assert(stride == width * 2);
+ }
current_video_frame = video_frame_allocator->alloc_frame();
if (current_video_frame.data != nullptr) {
video_format.width = width;
video_format.height = height;
+ video_format.stride = stride;
}
}
if (running) {
return;
}
- if (input->EnableVideoInput(current_video_mode, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
+ if (input->EnableVideoInput(current_video_mode, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", current_video_mode, card_index);
exit(1);
}
}
}
+// Select 8- or 10-bit Y'CbCr capture. Stores the new format and then
+// re-applies the current video mode, so that the capture is re-enabled
+// with the BMD pixel format matching the new setting (see set_video_mode()).
+void DeckLinkCapture::set_pixel_format(PixelFormat pixel_format)
+{
+	current_pixel_format = pixel_format;
+	set_video_mode(current_video_mode);
+}
+
void DeckLinkCapture::set_video_mode_no_restart(uint32_t video_mode_id)
{
BMDDisplayModeSupport support;
IDeckLinkDisplayMode *display_mode;
- if (input->DoesSupportVideoMode(video_mode_id, bmdFormat8BitYUV, /*flags=*/0, &support, &display_mode)) {
+ if (input->DoesSupportVideoMode(video_mode_id, pixel_format_to_bmd(current_pixel_format), /*flags=*/0, &support, &display_mode)) {
fprintf(stderr, "Failed to query display mode for card %d\n", card_index);
exit(1);
}
field_dominance = display_mode->GetFieldDominance();
if (running) {
- if (input->EnableVideoInput(video_mode_id, bmdFormat8BitYUV, supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
+ if (input->EnableVideoInput(video_mode_id, pixel_format_to_bmd(current_pixel_format), supports_autodetect ? bmdVideoInputEnableFormatDetection : 0) != S_OK) {
fprintf(stderr, "Failed to set video mode 0x%04x for card %d\n", video_mode_id, card_index);
exit(1);
}
#include <functional>
#include <map>
#include <memory>
+#include <set>
#include <string>
#include "DeckLinkAPIModes.h"
void set_video_mode(uint32_t video_mode_id) override;
uint32_t get_current_video_mode() const override { return current_video_mode; }
+ std::set<bmusb::PixelFormat> get_available_pixel_formats() const override {
+ return std::set<bmusb::PixelFormat>{ bmusb::PixelFormat_8BitYCbCr, bmusb::PixelFormat_10BitYCbCr };
+ }
+ void set_pixel_format(bmusb::PixelFormat pixel_format) override;
+ bmusb::PixelFormat get_current_pixel_format() const override {
+ return current_pixel_format;
+ }
+
std::map<uint32_t, std::string> get_available_video_inputs() const override { return video_inputs; }
void set_video_input(uint32_t video_input_id) override;
uint32_t get_current_video_input() const override { return current_video_input; }
std::map<uint32_t, bmusb::VideoMode> video_modes;
BMDDisplayMode current_video_mode;
+ bmusb::PixelFormat current_pixel_format = bmusb::PixelFormat_8BitYCbCr;
std::map<uint32_t, std::string> video_inputs;
BMDVideoConnection current_video_input;
OPTION_OUTPUT_SLOP_FRAMES,
OPTION_TIMECODE_STREAM,
OPTION_TIMECODE_STDOUT,
+ OPTION_10_BIT_INPUT,
};
void usage()
fprintf(stderr, " dropping the frame (default 0.5)\n");
fprintf(stderr, " --timecode-stream show timestamp and timecode in stream\n");
fprintf(stderr, " --timecode-stdout show timestamp and timecode on standard output\n");
+ fprintf(stderr, " --10-bit-input use 10-bit video input (requires compute shaders)\n");
}
void parse_flags(int argc, char * const argv[])
{ "output-slop-frames", required_argument, 0, OPTION_OUTPUT_SLOP_FRAMES },
{ "timecode-stream", no_argument, 0, OPTION_TIMECODE_STREAM },
{ "timecode-stdout", no_argument, 0, OPTION_TIMECODE_STDOUT },
+ { "10-bit-input", no_argument, 0, OPTION_10_BIT_INPUT },
{ 0, 0, 0, 0 }
};
vector<string> theme_dirs;
case OPTION_TIMECODE_STDOUT:
global_flags.display_timecode_on_stdout = true;
break;
+ case OPTION_10_BIT_INPUT:
+ global_flags.ten_bit_input = true;
+ break;
case OPTION_HELP:
usage();
exit(0);
int max_input_queue_frames = 6;
bool display_timecode_in_stream = false;
bool display_timecode_on_stdout = false;
+ bool ten_bit_input = false;
};
extern Flags global_flags;
#include "resampling_queue.h"
#include "timebase.h"
#include "timecode_renderer.h"
+#include "v210_converter.h"
#include "video_encoder.h"
class IDeckLink;
void ensure_texture_resolution(PBOFrameAllocator::Userdata *userdata, unsigned field, unsigned width, unsigned height)
{
- if (userdata->tex_y[field] == 0 ||
- userdata->tex_cbcr[field] == 0 ||
+ bool first;
+ if (global_flags.ten_bit_input) {
+ first = userdata->tex_v210[field] == 0 || userdata->tex_444[field] == 0;
+ } else {
+ first = userdata->tex_y[field] == 0 || userdata->tex_cbcr[field] == 0;
+ }
+
+ if (first ||
width != userdata->last_width[field] ||
height != userdata->last_height[field]) {
- size_t cbcr_width = width / 2;
-
// We changed resolution since last use of this texture, so we need to create
// a new object. Note that this each card has its own PBOFrameAllocator,
// we don't need to worry about these flip-flopping between resolutions.
- glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]);
- check_error();
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
- check_error();
- glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]);
- check_error();
- glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
- check_error();
+ if (global_flags.ten_bit_input) {
+ const size_t v210_width = v210Converter::get_minimum_v210_texture_width(width);
+
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_v210[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, v210_width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr);
+ check_error();
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_444[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, nullptr);
+ check_error();
+ } else {
+ size_t cbcr_width = width / 2;
+
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_cbcr[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, cbcr_width, height, 0, GL_RG, GL_UNSIGNED_BYTE, nullptr);
+ check_error();
+ glBindTexture(GL_TEXTURE_2D, userdata->tex_y[field]);
+ check_error();
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+ check_error();
+ }
userdata->last_width[field] = width;
userdata->last_height[field] = height;
}
}
-void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLintptr offset)
+void upload_texture(GLuint tex, GLuint width, GLuint height, GLuint stride, bool interlaced_stride, GLenum format, GLenum type, GLintptr offset)
{
if (interlaced_stride) {
stride *= 2;
check_error();
}
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, GL_UNSIGNED_BYTE, BUFFER_OFFSET(offset));
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, type, BUFFER_OFFSET(offset));
check_error();
glBindTexture(GL_TEXTURE_2D, 0);
check_error();
chroma_subsampler.reset(new ChromaSubsampler(resource_pool.get()));
+	if (global_flags.ten_bit_input) {
+		// 10-bit (v210) unpacking needs a compute shader; bail out early
+		// with a clear message rather than failing mid-stream.
+		if (!v210Converter::has_hardware_support()) {
+			fprintf(stderr, "ERROR: --10-bit-input requires support for OpenGL compute shaders\n");
+			fprintf(stderr, "       (OpenGL 4.3, or GL_ARB_compute_shader + GL_ARB_shader_image_load_store).\n");
+			exit(1);
+		}
+		v210_converter.reset(new v210Converter());
+
+		// These are all the widths listed in the Blackmagic SDK documentation
+		// (section 2.7.3, “Display Modes”).
+		v210_converter->precompile_shader(720);
+		v210_converter->precompile_shader(1280);
+		v210_converter->precompile_shader(1920);
+		v210_converter->precompile_shader(2048);
+		v210_converter->precompile_shader(3840);
+		v210_converter->precompile_shader(4096);
+	}
+
timecode_renderer.reset(new TimecodeRenderer(resource_pool.get(), global_flags.width, global_flags.height));
display_timecode_in_stream = global_flags.display_timecode_in_stream;
display_timecode_on_stdout = global_flags.display_timecode_on_stdout;
}
while (!card->new_frames.empty()) card->new_frames.pop_front();
card->last_timecode = -1;
+ card->capture->set_pixel_format(global_flags.ten_bit_input ? PixelFormat_10BitYCbCr : PixelFormat_8BitYCbCr);
card->capture->configure_card();
// NOTE: start_bm_capture() happens in thread_func().
card->last_timecode = timecode;
- size_t expected_length = video_format.width * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom) * 2;
+ size_t expected_length = video_format.stride * (video_format.height + video_format.extra_lines_top + video_format.extra_lines_bottom);
if (video_frame.len - video_offset == 0 ||
video_frame.len - video_offset != expected_length) {
if (video_frame.len != 0) {
RefCountedFrame frame(video_frame);
// Upload the textures.
- size_t cbcr_width = video_format.width / 2;
- size_t cbcr_offset = video_offset / 2;
- size_t y_offset = video_frame.size / 2 + video_offset / 2;
+ const size_t cbcr_width = video_format.width / 2;
+ const size_t cbcr_offset = video_offset / 2;
+ const size_t y_offset = video_frame.size / 2 + video_offset / 2;
for (unsigned field = 0; field < num_fields; ++field) {
// Put the actual texture upload in a lambda that is executed in the main thread.
// Note that this means we must hold on to the actual frame data in <userdata>
// until the upload command is run, but we hold on to <frame> much longer than that
// (in fact, all the way until we no longer use the texture in rendering).
- auto upload_func = [field, video_format, y_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() {
+ auto upload_func = [this, field, video_format, y_offset, video_offset, cbcr_offset, cbcr_width, interlaced_stride, userdata]() {
unsigned field_start_line;
if (field == 1) {
field_start_line = video_format.second_field_start;
} else {
field_start_line = video_format.extra_lines_top;
}
- size_t field_y_start = y_offset + video_format.width * field_start_line;
- size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t);
ensure_texture_resolution(userdata, field, video_format.width, video_format.height);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, userdata->pbo);
check_error();
- upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, field_y_start);
- upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, field_cbcr_start);
+ if (global_flags.ten_bit_input) {
+ size_t field_start = video_offset + video_format.stride * field_start_line;
+ upload_texture(userdata->tex_v210[field], video_format.stride / sizeof(uint32_t), video_format.height, video_format.stride, interlaced_stride, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, field_start);
+ v210_converter->convert(userdata->tex_v210[field], userdata->tex_444[field], video_format.width, video_format.height);
+ } else {
+ size_t field_y_start = y_offset + video_format.width * field_start_line;
+ size_t field_cbcr_start = cbcr_offset + cbcr_width * field_start_line * sizeof(uint16_t);
+
+ // Make up our own strides, since we are interleaving.
+ upload_texture(userdata->tex_y[field], video_format.width, video_format.height, video_format.width, interlaced_stride, GL_RED, GL_UNSIGNED_BYTE, field_y_start);
+ upload_texture(userdata->tex_cbcr[field], cbcr_width, video_format.height, cbcr_width * sizeof(uint16_t), interlaced_stride, GL_RG, GL_UNSIGNED_BYTE, field_cbcr_start);
+ }
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
check_error();
class ALSAOutput;
class ChromaSubsampler;
class DeckLinkOutput;
-class TimecodeRenderer;
class QSurface;
class QSurfaceFormat;
+class TimecodeRenderer;
+class v210Converter;
namespace movit {
class Effect;
std::unique_ptr<movit::EffectChain> display_chain;
std::unique_ptr<ChromaSubsampler> chroma_subsampler;
+ std::unique_ptr<v210Converter> v210_converter;
std::unique_ptr<VideoEncoder> video_encoder;
std::unique_ptr<TimecodeRenderer> timecode_renderer;
#include <stdio.h>
#include <cstddef>
+#include "flags.h"
+
using namespace std;
PBOFrameAllocator::PBOFrameAllocator(size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
frame.userdata = &userdata[i];
userdata[i].pbo = pbo;
frame.owner = this;
- frame.interleaved = true;
+
+ // For 8-bit Y'CbCr, we ask the driver to split Y' and Cb/Cr
+ // into separate textures. For 10-bit, the input format (v210)
+ // is complicated enough that we need to interpolate up to 4:4:4,
+ // which we do in a compute shader ourselves.
+ frame.interleaved = !global_flags.ten_bit_input;
// Create textures. We don't allocate any data for the second field at this point
// (just create the texture state with the samplers), since our default assumed
// resolution is progressive.
- glGenTextures(2, userdata[i].tex_y);
- check_error();
- glGenTextures(2, userdata[i].tex_cbcr);
- check_error();
+ if (global_flags.ten_bit_input) {
+ glGenTextures(2, userdata[i].tex_v210);
+ check_error();
+ glGenTextures(2, userdata[i].tex_444);
+ check_error();
+ } else {
+ glGenTextures(2, userdata[i].tex_y);
+ check_error();
+ glGenTextures(2, userdata[i].tex_cbcr);
+ check_error();
+ }
userdata[i].last_width[0] = width;
userdata[i].last_height[0] = height;
userdata[i].last_width[1] = 0;
userdata[i].last_has_signal = false;
userdata[i].last_is_connected = false;
for (unsigned field = 0; field < 2; ++field) {
- glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
- check_error();
- if (field == 0) {
- glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
+ if (global_flags.ten_bit_input) {
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_v210[field]);
check_error();
- }
+ // Don't care about texture parameters, we're only going to read it
+ // from the compute shader anyway.
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL);
+ check_error();
+ }
- glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
- check_error();
- if (field == 0) {
- glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, GL_RG, GL_UNSIGNED_BYTE, NULL);
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_444[field]);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+ check_error();
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB10_A2, width, height, 0, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, NULL);
+ check_error();
+ }
+ } else {
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_y[field]);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+ check_error();
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, width, height, 0, GL_RED, GL_UNSIGNED_BYTE, NULL);
+ check_error();
+ }
+
+ glBindTexture(GL_TEXTURE_2D, userdata[i].tex_cbcr[field]);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+ check_error();
+ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+ check_error();
+ if (field == 0) {
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_RG8, width / 2, height, 0, GL_RG, GL_UNSIGNED_BYTE, NULL);
+ check_error();
+ }
}
}
check_error();
glDeleteBuffers(1, &pbo);
check_error();
- glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y);
- check_error();
- glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr);
- check_error();
+ if (global_flags.ten_bit_input) {
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_v210);
+ check_error();
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_444);
+ check_error();
+ } else {
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_y);
+ check_error();
+ glDeleteTextures(2, ((Userdata *)frame.userdata)->tex_cbcr);
+ check_error();
+ }
}
}
//static int sumsum = 0;
GLuint pbo;
// The second set is only used for the second field of interlaced inputs.
- GLuint tex_y[2], tex_cbcr[2];
+ GLuint tex_y[2], tex_cbcr[2]; // For 8-bit.
+ GLuint tex_v210[2], tex_444[2]; // For 10-bit.
GLuint last_width[2], last_height[2];
bool last_interlaced, last_has_signal, last_is_connected;
unsigned last_frame_rate_nom, last_frame_rate_den;
// Perhaps 601 was only to indicate the subsampling positions, not the
// colorspace itself? Tested with a Lenovo X1 gen 3 as input.
YCbCrFormat input_ycbcr_format;
- input_ycbcr_format.chroma_subsampling_x = 2;
+ input_ycbcr_format.chroma_subsampling_x = global_flags.ten_bit_input ? 1 : 2;
input_ycbcr_format.chroma_subsampling_y = 1;
- input_ycbcr_format.num_levels = 256;
+ input_ycbcr_format.num_levels = global_flags.ten_bit_input ? 1024 : 256;
input_ycbcr_format.cb_x_position = 0.0;
input_ycbcr_format.cr_x_position = 0.0;
input_ycbcr_format.cb_y_position = 0.5;
num_inputs = 1;
}
for (unsigned i = 0; i < num_inputs; ++i) {
+ // When using 10-bit input, we're converting to interleaved through v210Converter.
+ YCbCrInputSplitting splitting = global_flags.ten_bit_input ? YCBCR_INPUT_INTERLEAVED : YCBCR_INPUT_SPLIT_Y_AND_CBCR;
if (override_bounce) {
- inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+ inputs.push_back(new NonBouncingYCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting));
} else {
- inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+ inputs.push_back(new YCbCrInput(inout_format, input_ycbcr_format, global_flags.width, global_flags.height, splitting));
}
chain->add_input(inputs.back());
}
userdata = (const PBOFrameAllocator::Userdata *)frame.frame->userdata;
}
- inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]);
- inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]);
+ if (global_flags.ten_bit_input) {
+ inputs[i]->set_texture_num(0, userdata->tex_444[frame.field_number]);
+ } else {
+ inputs[i]->set_texture_num(0, userdata->tex_y[frame.field_number]);
+ inputs[i]->set_texture_num(1, userdata->tex_cbcr[frame.field_number]);
+ }
inputs[i]->set_width(userdata->last_width[frame.field_number]);
inputs[i]->set_height(userdata->last_height[frame.field_number]);
--- /dev/null
+#include "v210_converter.h"
+
+#include <epoxy/gl.h>
+#include <movit/util.h>
+
+using namespace std;
+
+v210Converter::~v210Converter()
+{
+	// Release every GLSL program we compiled; the map itself owns
+	// nothing else that needs explicit GL cleanup.
+	for (const auto &entry : shaders) {
+		glDeleteProgram(entry.second.glsl_program_num);
+		check_error();
+	}
+}
+
+bool v210Converter::has_hardware_support()
+{
+	// Desktop GL only. GLSL ES 3.1 does have compute shaders, but ES adds
+	// extra restrictions we do not meet (e.g., images must be allocated
+	// with glTexStorage*(), and binding= is effectively mandatory), so we
+	// do not provide a GLES version of this converter.
+	if (!epoxy_is_desktop_gl()) {
+		return false;
+	}
+
+	// Compute shaders are core from OpenGL 4.3; older contexts can still
+	// qualify through the two ARB extensions we rely on.
+	return epoxy_gl_version() >= 43 ||
+	       (epoxy_has_gl_extension("GL_ARB_compute_shader") &&
+	        epoxy_has_gl_extension("GL_ARB_shader_image_load_store"));
+}
+
+// Compile (and cache) the conversion shader for the given image width.
+// The local work group size must be baked in at GLSL compile time, so each
+// distinct group count gets its own program, keyed in the <shaders> map.
+void v210Converter::precompile_shader(unsigned width)
+{
+	unsigned num_local_work_groups = (width + 5) / 6;
+	if (shaders.count(num_local_work_groups)) {
+		// Already exists.
+		return;
+	}
+
+	char buf[16];
+	snprintf(buf, sizeof(buf), "%u", num_local_work_groups);
+	string shader_src = R"(#version 150
+#extension GL_ARB_compute_shader : enable
+#extension GL_ARB_shader_image_load_store : enable
+layout(local_size_x = )" + string(buf) + R"() in;
+layout(rgb10_a2) uniform restrict readonly image2D inbuf;
+layout(rgb10_a2) uniform restrict writeonly image2D outbuf;
+uniform int max_cbcr_x;
+shared vec2 cbcr[gl_WorkGroupSize.x * 3u];
+
+void main()
+{
+	int xb = int(gl_LocalInvocationID.x); // X block.
+	int y = int(gl_GlobalInvocationID.y); // Y (actual line).
+
+	// Load our pixel group, containing data for six pixels.
+	vec3 indata[4];
+	for (int i = 0; i < 4; ++i) {
+		indata[i] = imageLoad(inbuf, ivec2(xb * 4 + i, y)).xyz;
+	}
+
+	// Decode Cb and Cr to shared memory, because neighboring blocks need it for interpolation.
+	cbcr[xb * 3 + 0] = indata[0].xz;
+	cbcr[xb * 3 + 1] = vec2(indata[1].y, indata[2].x);
+	cbcr[xb * 3 + 2] = vec2(indata[2].z, indata[3].y);
+	memoryBarrierShared();
+
+	float pix_y[6];
+	pix_y[0] = indata[0].y;
+	pix_y[1] = indata[1].x;
+	pix_y[2] = indata[1].z;
+	pix_y[3] = indata[2].y;
+	pix_y[4] = indata[3].x;
+	pix_y[5] = indata[3].z;
+
+	barrier();
+
+	// Interpolate the missing Cb/Cr pixels, taking care not to read past the end of the screen
+	// for pixels that we use for interpolation.
+	vec2 pix_cbcr[7];
+	pix_cbcr[0] = indata[0].xz;
+	pix_cbcr[2] = cbcr[min(xb * 3 + 1, max_cbcr_x)];
+	pix_cbcr[4] = cbcr[min(xb * 3 + 2, max_cbcr_x)];
+	pix_cbcr[6] = cbcr[min(xb * 3 + 3, max_cbcr_x)];
+	pix_cbcr[1] = 0.5 * (pix_cbcr[0] + pix_cbcr[2]);
+	pix_cbcr[3] = 0.5 * (pix_cbcr[2] + pix_cbcr[4]);
+	pix_cbcr[5] = 0.5 * (pix_cbcr[4] + pix_cbcr[6]);
+
+	// Write the decoded pixels to the destination texture.
+	for (int i = 0; i < 6; ++i) {
+		vec4 outdata = vec4(pix_y[i], pix_cbcr[i].x, pix_cbcr[i].y, 1.0f);
+		imageStore(outbuf, ivec2(xb * 6 + i, y), outdata);
+	}
+}
+)";
+
+	Shader shader;
+
+	GLuint shader_num = movit::compile_shader(shader_src, GL_COMPUTE_SHADER);
+	check_error();
+	shader.glsl_program_num = glCreateProgram();
+	check_error();
+	glAttachShader(shader.glsl_program_num, shader_num);
+	check_error();
+	glLinkProgram(shader.glsl_program_num);
+	check_error();
+
+	GLint success;
+	glGetProgramiv(shader.glsl_program_num, GL_LINK_STATUS, &success);
+	check_error();
+	if (success == GL_FALSE) {
+		GLchar error_log[1024] = {0};
+		glGetProgramInfoLog(shader.glsl_program_num, 1024, NULL, error_log);
+		fprintf(stderr, "Error linking program: %s\n", error_log);
+		exit(1);
+	}
+
+	// The linked program holds its own copy of the compiled code, so detach
+	// and delete the shader object; otherwise we leak one GL shader object
+	// per precompiled width for the lifetime of the process.
+	glDetachShader(shader.glsl_program_num, shader_num);
+	check_error();
+	glDeleteShader(shader_num);
+	check_error();
+
+	shader.max_cbcr_x_pos = glGetUniformLocation(shader.glsl_program_num, "max_cbcr_x");
+	check_error();
+	shader.inbuf_pos = glGetUniformLocation(shader.glsl_program_num, "inbuf");
+	check_error();
+	shader.outbuf_pos = glGetUniformLocation(shader.glsl_program_num, "outbuf");
+	check_error();
+
+	shaders.emplace(num_local_work_groups, shader);
+}
+
+// Run the v210 -> 4:4:4 unpacking: one compute work group per scan line.
+// tex_src/tex_dst must match the sizing contract documented in the header.
+void v210Converter::convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height)
+{
+	// No-op if the shader for this width is already in the cache;
+	// otherwise we take the compile hit here.
+	precompile_shader(width);
+	unsigned num_local_work_groups = (width + 5) / 6;
+	const Shader &shader = shaders[num_local_work_groups];
+
+	glUseProgram(shader.glsl_program_num);
+	check_error();
+	// Index of the last valid Cb/Cr sample, for edge clamping in the shader.
+	glUniform1i(shader.max_cbcr_x_pos, width / 2 - 1);
+	check_error();
+
+	// Bind the textures.
+	glUniform1i(shader.inbuf_pos, 0);
+	check_error();
+	glUniform1i(shader.outbuf_pos, 1);
+	check_error();
+	glBindImageTexture(0, tex_src, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGB10_A2);
+	check_error();
+	glBindImageTexture(1, tex_dst, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGB10_A2);
+	check_error();
+
+	// Actually run the shader.
+	glDispatchCompute(1, height, 1);
+	check_error();
+
+	// Image stores are incoherent in OpenGL; issue a barrier so that the
+	// writes are visible to subsequent texture fetches (the mixer samples
+	// tex_dst right after this call).
+	glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+	check_error();
+
+	glUseProgram(0);
+	check_error();
+}
--- /dev/null
+#ifndef _V210CONVERTER_H
+#define _V210CONVERTER_H 1
+
+// v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values
+// into a 32-bit int (leaving two unused bits at the top) with chroma being
+// co-sited with the left luma sample. Even though this 2:10:10:10-arrangement
+// can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_2_10_10_10_REV format,
+// the placement of the Y', Cb and Cr parts within these ints is rather
+// complicated, and thus hard to get a single Y'CbCr pixel from efficiently,
+// especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into
+// four such ints in the following pattern (see e.g. the DeckLink documentation
+// for reference):
+//
+// A B G R
+// -----------------
+// X Cr0 Y0 Cb0
+// X Y2 Cb2 Y1
+// X Cb4 Y3 Cr2
+// X Y5 Cr4 Y4
+//
+// This patterns repeats for as long as needed, with the additional constraint
+// that stride must be divisible by 128 (or equivalently, 32 four-byte ints,
+// or eight pixel groups representing 48 pixels in all).
+//
+// Thus, v210Converter allows you to convert from v210 to a more regular
+// 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation)
+// that the GPU supports natively, again in the form of GL_RGB10_A2
+// (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel
+// is always 1).
+//
+// It does this fairly efficiently using a compute shader, which means you'll
+// need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store,
+// or equivalently, OpenGL 4.3 or newer) to use it. There are many possible
+// strategies for doing this in a compute shader, but I ended up settling on
+// a fairly simple one after some benchmarking; each work unit takes in
+// a single four-int group and writes six samples, but as the interpolation
+// needs the leftmost chroma samples from the work unit at the right, each line
+// is put into a local work group. Cb/Cr is first decoded into shared memory
+// (OpenGL guarantees at least 32 kB shared memory for the work group, which is
+// enough for up to 6K video or so), and then the rest of the shuffling and
+// writing happens. Each line can of course be converted entirely
+// independently, so we can fire up as many such work groups as we have lines.
+//
+// On the Haswell GPU where I developed it (with single-channel memory),
+// conversion takes about 1.4 ms for a 720p frame, so it should be possible to
+// keep up multiple inputs at 720p60, although probably a faster machine is
+// needed if we want to run e.g. heavy scaling filters in the same pipeline.
+// (1.4 ms equates to about 35% of the theoretical memory bandwidth of
+// 12.8 GB/sec, which is pretty good.)
+
+#include <map>
+
+#include <epoxy/gl.h>
+
+class v210Converter {
+public:
+	~v210Converter();
+
+	// Whether the current hardware and driver supports the compute shader
+	// necessary to do this conversion.
+	static bool has_hardware_support();
+
+	// Given an image width, returns the minimum number of 32-bit groups
+	// needed for each line. This can be used to size the input texture properly.
+	static GLuint get_minimum_v210_texture_width(unsigned width)
+	{
+		unsigned num_local_groups = (width + 5) / 6;
+		return 4 * num_local_groups;
+	}
+
+	// Given an image width, returns the stride (in bytes) for each line.
+	static size_t get_v210_stride(unsigned width)
+	{
+		return (width + 47) / 48 * 128;
+	}
+
+	// Since work groups need to be determined at shader compile time,
+	// each width needs potentially a different shader. You can call this
+	// function at startup to make sure a shader for the given width
+	// has been compiled, making sure you don't need to start an expensive
+	// compilation job while video is running if a new resolution comes along.
+	// This is not required, but generally recommended.
+	void precompile_shader(unsigned width);
+
+	// Do the actual conversion. tex_src is assumed to be a GL_RGB10_A2
+	// texture of at least [get_minimum_v210_texture_width(width), height].
+	// tex_dst is assumed to be a GL_RGB10_A2 texture of exactly [width, height]
+	// (actually, other sizes will work fine, but be nonsensical).
+	// No textures will be allocated or deleted.
+	void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height);
+
+private:
+	// Key is number of local groups, ie., ceil(width / 6).
+	struct Shader {
+		// 0 is the “no program” name in OpenGL, so it is a safe sentinel.
+		GLuint glsl_program_num = 0;
+
+		// Uniform locations. These are GLint per the GL spec;
+		// glGetUniformLocation() returns -1 for “not found”.
+		GLint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1;
+	};
+	std::map<unsigned, Shader> shaders;
+};
+
+#endif // !defined(_V210CONVERTER_H)