git.sesse.net Git - nageru/blob - v210_converter.h

   1 #ifndef _V210CONVERTER_H
   2 #define _V210CONVERTER_H 1
   3
   4 // v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values
   5 // into a 32-bit int (leaving two unused bits at the top) with chroma being
   6 // sub-sited with the left luma sample. Even though this 2:10:10:10-arrangement
   7 // can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_2_10_10_10_REV format,
   8 // the placement of the Y', Cb and Cr parts within these ints is rather
   9 // complicated, and thus hard to get a single Y'CbCr pixel from efficiently,
  10 // especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into
  11 // four such ints in the following pattern (see e.g. the DeckLink documentation
  12 // for reference):
  13 //
  14 //   A  B   G   R
  15 // -----------------
  16 //   X Cr0 Y0  Cb0
  17 //   X  Y2 Cb2  Y1
  18 //   X Cb4 Y3  Cr2
  19 //   X  Y5 Cr4  Y4
  20 //
  21 // This pattern repeats for as long as needed, with the additional constraint
  22 // that stride must be divisible by 128 (or equivalently, 32 four-byte ints,
  23 // or eight pixel groups representing 48 pixels in all).
  24 //
  25 // Thus, v210Converter allows you to convert from v210 to a more regular
  26 // 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation)
  27 // that the GPU supports natively, again in the form of GL_RGB10_A2
  28 // (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel
  29 // is always 1).
  30 //
  31 // It does this fairly efficiently using a compute shader, which means you'll
  32 // need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store,
  33 // or equivalently, OpenGL 4.3 or newer) to use it. There are many possible
  34 // strategies for doing this in a compute shader, but I ended up settling
  35 // on a fairly simple one after some benchmarking; each work unit takes in
  36 // a single four-int group and writes six samples, but as the interpolation
  37 // needs the leftmost chroma samples from the work unit at the right, each line
  38 // is put into a local work group. Cb/Cr is first decoded into shared memory
  39 // (OpenGL guarantees at least 32 kB shared memory for the work group, which is
  40 // enough for up to 6K video or so), and then the rest of the shuffling and
  41 // writing happens. Each line can of course be converted entirely
  42 // independently, so we can fire up as many such work groups as we have lines.
  43 //
  44 // On the Haswell GPU where I developed it (with single-channel memory),
  45 // conversion takes about 1.4 ms for a 720p frame, so it should be possible to
  46 // keep up with multiple inputs at 720p60, although probably a faster machine is
  47 // needed if we want to run e.g. heavy scaling filters in the same pipeline.
  48 // (1.4 ms equates to about 35% of the theoretical memory bandwidth of
  49 // 12.8 GB/sec, which is pretty good.)
  50
  51 #include <map>
  52
  53 #include <epoxy/gl.h>
  54
  55 class v210Converter {
  56 public:
  57         ~v210Converter();
  58
  59         // Whether the current hardware and driver supports the compute shader
  60         // necessary to do this conversion.
  61         static bool has_hardware_support();
  62
  63         // Given an image width, returns the minimum number of 32-bit groups
  64         // needed for each line. This can be used to size the input texture properly.
  65         static GLuint get_minimum_v210_texture_width(unsigned width)
  66         {
  67                 unsigned num_local_groups = (width + 5) / 6;
  68                 return 4 * num_local_groups;
  69         }
  70
  71         // Given an image width, returns the stride (in bytes) for each line.
  72         static size_t get_v210_stride(unsigned width)
  73         {
  74                 return (width + 47) / 48 * 128;
  75         }
  76
  77         // Since work groups need to be determined at shader compile time,
  78         // each width needs potentially a different shader. You can call this
  79         // function at startup to make sure a shader for the given width
  80         // has been compiled, making sure you don't need to start an expensive
  81         // compilation job while video is running if a new resolution comes along.
  82         // This is not required, but generally recommended.
  83         void precompile_shader(unsigned width);
  84
  85         // Do the actual conversion. tex_src is assumed to be a GL_RGB10_A2
  86         // texture of at least [get_minimum_v210_texture_width(width), height].
  87         // tex_dst is assumed to be a GL_RGB10_A2 texture of exactly [width, height]
  88         // (actually, other sizes will work fine, but be nonsensical).
  89         // No textures will be allocated or deleted.
  90         void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height);
  91
  92 private:
  93         // Key is number of local groups, ie., ceil(width / 6).
  94         struct Shader {
  95                 GLuint glsl_program_num = -1;
  96
  97                 // Uniform locations.
  98                 GLuint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1;
  99         };
 100         std::map<unsigned, Shader> shaders;
 101 };
 102
 103 #endif  // !defined(_V210CONVERTER_H)