Move everything into a separate futatabi/ subdir, for the upcoming merge with Futatabi.

[nageru] / v210_converter.h
diff --git a/v210_converter.h b/v210_converter.h

deleted file mode 100644 (file)

index 39c456f..0000000
--- a/v210_converter.h
+++ /dev/null
@@ -1,103 +0,0 @@
-#ifndef _V210CONVERTER_H
-#define _V210CONVERTER_H 1
-
-// v210 is a 10-bit 4:2:2 interleaved Y'CbCr format, packing three values
-// into a 32-bit int (leaving two unused bits at the top) with chroma being
-// sub-sited with the left luma sample. Even though this 2:10:10:10-arrangement
-// can be sampled from using the GL_RGB10_A2/GL_UNSIGNED_2_10_10_10_REV format,
-// the placement of the Y', Cb and Cr parts within these ints is rather
-// complicated, and thus hard to get a single Y'CbCr pixel from efficiently,
-// especially on a GPU. Six pixels (six Y', three Cb, three Cr) are packed into
-// four such ints in the following pattern (see e.g. the DeckLink documentation
-// for reference):
-//
-//   A  B   G   R
-// -----------------
-//   X Cr0 Y0  Cb0
-//   X  Y2 Cb2  Y1
-//   X Cb4 Y3  Cr2
-//   X  Y5 Cr4  Y4
-//
-// This patterns repeats for as long as needed, with the additional constraint
-// that stride must be divisible by 128 (or equivalently, 32 four-byte ints,
-// or eight pixel groups representing 48 pixels in all).
-//
-// Thus, v210Converter allows you to convert from v210 to a more regular
-// 4:4:4 format (upsampling Cb/Cr on the way, using linear interpolation)
-// that the GPU supports natively, again in the form of GL_RGB10_A2
-// (with Y', Cb, Cr packed as R, G and B, respectively -- the “alpha” channel
-// is always 1).
-//
-// It does this fairly efficiently using a compute shader, which means you'll
-// need compute shader support (GL_ARB_compute_shader + GL_ARB_shader_image_load_store,
-// or equivalently, OpenGL 4.3 or newer) to use it. There are many possible
-// strategies for doing this in a compute shader, but I ended up settling
-// a fairly simple one after some benchmarking; each work unit takes in
-// a single four-int group and writes six samples, but as the interpolation
-// needs the leftmost chroma samples from the work unit at the right, each line
-// is put into a local work group. Cb/Cr is first decoded into shared memory
-// (OpenGL guarantees at least 32 kB shared memory for the work group, which is
-// enough for up to 6K video or so), and then the rest of the shuffling and
-// writing happens. Each line can of course be converted entirely
-// independently, so we can fire up as many such work groups as we have lines.
-//
-// On the Haswell GPU where I developed it (with single-channel memory),
-// conversion takes about 1.4 ms for a 720p frame, so it should be possible to
-// keep up multiple inputs at 720p60, although probably a faster machine is
-// needed if we want to run e.g. heavy scaling filters in the same pipeline.
-// (1.4 ms equates to about 35% of the theoretical memory bandwidth of
-// 12.8 GB/sec, which is pretty good.)
-
-#include <map>
-
-#include <epoxy/gl.h>
-
-class v210Converter {
-public:
-       ~v210Converter();
-
-       // Whether the current hardware and driver supports the compute shader
-       // necessary to do this conversion.
-       static bool has_hardware_support();
-
-       // Given an image width, returns the minimum number of 32-bit groups
-       // needed for each line. This can be used to size the input texture properly.
-       static GLuint get_minimum_v210_texture_width(unsigned width)
-       {
-               unsigned num_local_groups = (width + 5) / 6;
-               return 4 * num_local_groups;
-       }
-
-       // Given an image width, returns the stride (in bytes) for each line.
-       static size_t get_v210_stride(unsigned width)
-       {
-               return (width + 47) / 48 * 128;
-       }
-
-       // Since work groups need to be determined at shader compile time,
-       // each width needs potentially a different shader. You can call this
-       // function at startup to make sure a shader for the given width
-       // has been compiled, making sure you don't need to start an expensive
-       // compilation job while video is running if a new resolution comes along.
-       // This is not required, but generally recommended.
-       void precompile_shader(unsigned width);
-
-       // Do the actual conversion. tex_src is assumed to be a GL_RGB10_A2
-       // texture of at least [get_minimum_v210_texture_width(width), height].
-       // tex_dst is assumed to be a GL_RGB10_A2 texture of exactly [width, height]
-       // (actually, other sizes will work fine, but be nonsensical).
-       // No textures will be allocated or deleted.
-       void convert(GLuint tex_src, GLuint tex_dst, unsigned width, unsigned height);
-
-private:
-       // Key is number of local groups, ie., ceil(width / 6).
-       struct Shader {
-               GLuint glsl_program_num = -1;
-
-               // Uniform locations.
-               GLuint max_cbcr_x_pos = -1, inbuf_pos = -1, outbuf_pos = -1;
-       };
-       std::map<unsigned, Shader> shaders;
-};
-
-#endif  // !defined(_V210CONVERTER_H)