1 #ifndef _V210CONVERTER_H
2 #define _V210CONVERTER_H 1
4 // v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values
5 // into a 32-bit int (leaving two unused bits at the top) with chroma being
6 // sub-sited with the left luma sample. Even though this 2:10:10:10-arrangement
7 // can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_INT_2_10_10_10_REV format,
8 // the placement of the Y', Cb and Cr parts within these ints is rather
9 // complicated, and thus hard to get a single Y'CbCr pixel from efficiently,
10 // especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into
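11 // four such ints in the following pattern (see e.g. the DeckLink documentation
// for reference), listing the three 10-bit fields of each int from the least
// significant bits upwards, with each chroma sample numbered after the luma
// sample it is sited with:
//
//   int 0: Cb0, Y0,  Cr0
//   int 1: Y1,  Cb2, Y2
//   int 2: Cr2, Y3,  Cb4
//   int 3: Y4,  Cr4, Y5
//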
21 // This pattern repeats for as long as needed, with the additional constraint
22 // that stride must be divisible by 128 (or equivalently, 32 four-byte ints,
23 // or eight pixel groups representing 48 pixels in all).
25 // Thus, v210Converter allows you to convert from v210 to a more regular
26 // 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation)
27 // that the GPU supports natively, again in the form of GL_RGB10_A2
28 // (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel
// carries no image data).
31 // It does this fairly efficiently using a compute shader, which means you'll
32 // need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store,
33 // or equivalently, OpenGL 4.3 or newer) to use it. There are many possible
34 // strategies for doing this in a compute shader, but I ended up settling on
35 // a fairly simple one after some benchmarking; each work unit takes in
36 // a single four-int group and writes six samples, but as the interpolation
37 // needs the leftmost chroma samples from the work unit at the right, each line
38 // is put into a local work group. Cb/Cr is first decoded into shared memory
39 // (OpenGL guarantees at least 32 kB shared memory for the work group, which is
40 // enough for up to 6K video or so), and then the rest of the shuffling and
41 // writing happens. Each line can of course be converted entirely
42 // independently, so we can fire up as many such work groups as we have lines.
44 // On the Haswell GPU where I developed it (with single-channel memory),
45 // conversion takes about 1.4 ms for a 720p frame, so it should be possible to
46 // keep up with multiple inputs at 720p60, although probably a faster machine is
47 // needed if we want to run e.g. heavy scaling filters in the same pipeline.
48 // (1.4 ms equates to about 35% of the theoretical memory bandwidth of
49 // 12.8 GB/sec, which is pretty good.)
59 // Reports whether the current hardware and driver support the compute shader
60 // needed to do this conversion.
//
// Returns true if the conversion can be used, false otherwise. The file's
// header comment names the requirement as GL_ARB_compute_shader +
// GL_ARB_shader_image_load_store, or equivalently OpenGL 4.3 or newer.
// NOTE(review): presumably this must be called with an OpenGL context
// current, since it asks about the driver -- confirm against the
// implementation, which is not part of this header.
61 static bool has_hardware_support();
63 // Given an image width, returns the minimum number of 32-bit groups
64 // needed for each line. This can be used to size the input texture properly.
65 static GLuint get_minimum_v210_texture_width(unsigned width)
67 unsigned num_local_groups = (width + 5) / 6;
68 return 4 * num_local_groups;
// Given an image width in pixels, returns the stride (in bytes) for each line.
// The width is rounded up to a whole number of 48-pixel units, each of which
// occupies 128 bytes, so the result is always a multiple of 128.
static size_t get_v210_stride(unsigned width)
{
	// Count the 48-pixel units without forming "width + 47" (which can wrap
	// around in unsigned arithmetic), and do the multiplication in size_t
	// rather than in unsigned.
	const size_t num_units = static_cast<size_t>(width / 48) + (width % 48 != 0 ? 1 : 0);
	return num_units * 128;
}
77 // Since work groups need to be determined at shader compile time,
78 // each width potentially needs a different shader. You can call this
79 // function at startup to make sure a shader for the given width
80 // has been compiled, so that you don't need to start an expensive
81 // compilation job while video is running if a new resolution comes along.
82 // Calling it is not required, but generally recommended.
//
// width: the image width, in pixels, to prepare a shader for.
// NOTE(review): the comment on the shader cache in this file says it is keyed
// on the number of local groups, ceil(width / 6), so widths that round to the
// same group count presumably share one compiled shader -- confirm against
// the implementation.
83 void precompile_shader(unsigned width);
85 // Do the actual conversion from v210 to 4:4:4.
//
// tex_src: the v210 input; assumed to be a GL_RGB10_A2
86 //   texture of at least [get_minimum_v210_texture_width(width), height].
87 // tex_dst: the output; assumed to be a GL_RGB10_A2 texture of exactly [width, height]
88 //   (actually, other sizes will work fine, but be nonsensical).
// width, height: the image dimensions in pixels.
//
89 // No textures will be allocated or deleted.
// NOTE(review): if no shader has been compiled for this width yet (see
// precompile_shader), one is presumably compiled on the spot here, which the
// precompile_shader comment describes as expensive -- confirm against the
// implementation.
90 void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height);
93 // Key is number of local groups, i.e., ceil(width / 6).
// (This describes the key of the "shaders" map declared at the end of this
// group of members.)
//
// NOTE(review): the fields that follow look like the members of the Shader
// type used as the map's value; its enclosing declaration is not visible in
// this excerpt -- confirm.
// NOTE(review): GLuint is an unsigned type in OpenGL, so initializing with -1
// stores the largest representable value; presumably this serves as a
// "not yet assigned" marker -- confirm how the implementation tests for it.
95 GLuint glsl_program_num = -1;
// NOTE(review): judging by the names, presumably locations of the shader's
// max_cbcr_x, inbuf and outbuf inputs -- confirm against the implementation.
98 GLuint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1;
// One entry per distinct key, mapping it to a Shader value.
100 std::map<unsigned, Shader> shaders;
103 #endif // !defined(_V210CONVERTER_H)