From 4179bef190d88739038233ac5d7e5ffa2ff4282f Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Thu, 16 Nov 2017 23:07:47 +0100
Subject: [PATCH] Implement a compute shader version of DeinterlaceEffect.

This is currently a loss for grayscale (probably due to the extra rgba16f
bounce), but a win of about 30% on BGRA on my Haswell. NVIDIA doesn't care
much either way. There are some performance mysteries remaining, but it's
a good start.
---
 deinterlace_effect.comp     | 236 ++++++++++++++++++++++++++++++++++++
 deinterlace_effect.cpp      | 132 ++++++++++++++++++--
 deinterlace_effect.h        |  57 +++++++++
 deinterlace_effect_test.cpp |  39 ++++--
 4 files changed, 449 insertions(+), 15 deletions(-)
 create mode 100644 deinterlace_effect.comp

diff --git a/deinterlace_effect.comp b/deinterlace_effect.comp
new file mode 100644
index 0000000..b46e5bf
--- /dev/null
+++ b/deinterlace_effect.comp
@@ -0,0 +1,236 @@
+// Implicit uniforms:
+// uniform int PREFIX(current_field_position);
+// uniform float PREFIX(inv_width);
+// uniform float PREFIX(inv_height);
+// uniform float PREFIX(current_field_vertical_offset);
+
+// Compute shader implementation of DeinterlaceEffect. See the fragment
+// shader implementation (deinterlace_effect.frag) for comments about the
+// algorithm; comments here will mainly be about issues specific to the
+// compute shader implementation.
+
+#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))
+
+// In input pixels (so each workgroup outputs 8x32 pixels). Corresponds to
+// get_compute_dimensions() in the C++ code. It seems illogical that 8x32
+// would be better than e.g. 32x8, since we reuse more data horizontally,
+// but Intel cards in particular are much happier with this for whatever
+// reason.
+#define GROUP_W 8
+#define GROUP_H 16
+
+// When sampling from the current field (for the spatial interpolation below),
+// we have a fringe of three pixels on the left and right sides, so we need to
+// load more. We also have one pixel above and below, but since our destination
+// pixel is squeezed in between them (they don't overlap), we only need one
+// extra row.
+#define GROUP_W_FRINGE (GROUP_W + 6)
+#define GROUP_H_FRINGE (GROUP_H + 1)
+
+layout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;
+
+#if (GROUP_W_FRINGE * GROUP_H_FRINGE) > (GROUP_W * (GROUP_H + 2))
+#define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)
+#else
+#define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))
+#endif
+
+shared vec4 temp[TEMP_NUM_ELEM];
+
+#if TEMP_NUM_ELEM > (GROUP_W * GROUP_H * 2)
+#error Not enough threads to load all data in two loads
+#endif
+
+// Load a WxH block of samples. We need to do this in two phases,
+// since we have more input samples than we have output samples (threads);
+// in the second phase, some threads will be idle.
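+// Worked example of the sizing above: with GROUP_W = 8 and GROUP_H = 16,
+// GROUP_W_FRINGE * GROUP_H_FRINGE = 14 * 17 = 238 and GROUP_W * (GROUP_H + 2) =
+// 8 * 18 = 144, so TEMP_NUM_ELEM is 238, while two loads per thread can fill up
+// to GROUP_W * GROUP_H * 2 = 256 elements; this is why two phases are enough
+// and the #error above never triggers for these dimensions.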
+#define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \ +{ \ + memoryBarrierShared(); \ + barrier(); \ + int thread_id = int(gl_LocalInvocationID.y) * GROUP_W + int(gl_LocalInvocationID.x); \ + { \ + int x = thread_id % (block_width); \ + int y = thread_id / (block_width); \ + temp[thread_id] = func(vec2((base_tc).x + x * PREFIX(inv_width), \ + (base_tc).y + y * PREFIX(inv_height))); \ + } \ + const int num_threads = GROUP_W * GROUP_H; \ + if (thread_id + num_threads < (block_width) * (block_height)) { \ + int x = (thread_id + num_threads) % (block_width); \ + int y = (thread_id + num_threads) / (block_width); \ + temp[thread_id + num_threads] = \ + func(vec2((base_tc).x + x * PREFIX(inv_width), \ + (base_tc).y + y * PREFIX(inv_height))); \ + } \ + memoryBarrierShared(); \ + barrier(); \ +} + +void FUNCNAME() { + // The current thread is responsible for output of two pixels, namely (x,2y) + // and (x,2y+1). One will be an unmodified one, the other one will be the + // pixel we are trying to interpolate. If TFF (current_field_position==0), + // the unmodified one is 2y+1 (remember OpenGL's bottom-left convention), + // and if BFF, the unmodified one is 2y. So we need to invert current_field_position + // to figure out which value to add. + int yi = int(gl_GlobalInvocationID.y) * 2 + (PREFIX(current_field_position) ^ 1); + + // Load in data for the current field. current_offset signals where the block + // starts vertically; see set_gl_state() in the C++ code. + vec2 base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width), + (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset)); + LOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3); + + int lx = int(gl_LocalInvocationID.x) + 3; + int ly = int(gl_LocalInvocationID.y); + + // Output the unmodified pixel. For TFF (current_field_position == 0), + // we have an extra pixel on the bottom that we're only using for interpolation + // (it's being output by another workgroup), so we have to add 1. + vec4 val = temp[(ly + (PREFIX(current_field_position) ^ 1)) * GROUP_W_FRINGE + lx]; + OUTPUT(ivec2(gl_GlobalInvocationID.x, yi), val); + + // a b c d e f g ↑ y + // x | + // h i j k l m n +--> x + + vec4 a = temp[(ly + 1) * GROUP_W_FRINGE + lx - 3]; + vec4 b = temp[(ly + 1) * GROUP_W_FRINGE + lx - 2]; + vec4 c = temp[(ly + 1) * GROUP_W_FRINGE + lx - 1]; + vec4 d = temp[(ly + 1) * GROUP_W_FRINGE + lx]; + vec4 e = temp[(ly + 1) * GROUP_W_FRINGE + lx + 1]; + vec4 f = temp[(ly + 1) * GROUP_W_FRINGE + lx + 2]; + vec4 g = temp[(ly + 1) * GROUP_W_FRINGE + lx + 3]; + + vec4 h = temp[ly * GROUP_W_FRINGE + lx - 3]; + vec4 i = temp[ly * GROUP_W_FRINGE + lx - 2]; + vec4 j = temp[ly * GROUP_W_FRINGE + lx - 1]; + vec4 k = temp[ly * GROUP_W_FRINGE + lx]; + vec4 l = temp[ly * GROUP_W_FRINGE + lx + 1]; + vec4 m = temp[ly * GROUP_W_FRINGE + lx + 2]; + vec4 n = temp[ly * GROUP_W_FRINGE + lx + 3]; + + // 0 degrees. + vec4 pred = d + k; + float score; + float best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4; + + // -45 degrees. + score = DIFF(b, k) + DIFF(c, l) + DIFF(d, m); + if (score < best_score) { + pred = c + l; + best_score = score; + } + + // -63 degrees. + score = DIFF(a, l) + DIFF(b, m) + DIFF(c, n); + if (score < best_score) { + pred = b + m; + best_score = score; + } + + // +45 degrees. + score = DIFF(d, i) + DIFF(e, j) + DIFF(f, k); + if (score < best_score) { + pred = e + j; + best_score = score; + } + + // +63 degrees. 
+ score = DIFF(e, h) + DIFF(f, i) + DIFF(g, j); + if (score < best_score) { + pred = f + i; + // best_score isn't used anymore. + } + + pred *= 0.5f; + + // Temporal prediction (p2) of this pixel based on the previous and next fields. + // + // ↑ y + // C H | + // A F K | + // D x I | + // B G L | + // E J | + // +-----> time + // + // x is obviously aligned with D and I, so we don't need texcoord + // adjustment for top/bottom field here, unlike earlier. However, we need + // to start the block one pixel below since we need E/J, thus the -1 in + // the y coordinate. + base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width), + (gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height)); + lx = int(gl_LocalInvocationID.x); +#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2); + vec4 C = temp[(ly + 2) * GROUP_W + lx]; + vec4 D = temp[(ly + 1) * GROUP_W + lx]; + vec4 E = temp[ ly * GROUP_W + lx]; + + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4); + vec4 H = temp[(ly + 2) * GROUP_W + lx]; + vec4 I = temp[(ly + 1) * GROUP_W + lx]; + vec4 J = temp[ ly * GROUP_W + lx]; +#else + // Since spatial interlacing check is not enabled, we only need D + // and I from the previous and next fields; since they are not shared + // between the neighboring pixels, they can be straight-up loads. + vec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width), + (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height)); + vec4 D = INPUT2(DI_pos); + vec4 I = INPUT4(DI_pos); +#endif + + // Load what we need from the previous field into shared memory, + // since A/B can be reused between neighboring pixels. We need one + // line above/below, but we don't need the horizontal fringe. + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1); + vec4 A = temp[(ly + 1) * GROUP_W + lx]; + vec4 B = temp[ ly * GROUP_W + lx]; + + // What we need from the current field was loaded earlier. + vec4 F = d; + vec4 G = k; + + // Next field. + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5); + vec4 K = temp[(ly + 1) * GROUP_W + lx]; + vec4 L = temp[ ly * GROUP_W + lx]; + + // Find temporal differences around this line. + vec4 tdiff0 = abs(D - I); + vec4 tdiff1 = abs(A - F) + abs(B - G); // Actually twice tdiff1. + vec4 tdiff2 = abs(K - F) + abs(L - G); // Actually twice tdiff2. + vec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2)); + +#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK + // Spatial interlacing check. 
+ // We start by temporally interpolating the current vertical line (p0–p4): + // + // C p0 H ↑ y + // p1 | + // D p2 I | + // p3 | + // E p4 J +-----> time + // + vec4 p0 = 0.5f * (C + H); + vec4 p1 = F; + vec4 p2 = 0.5f * (D + I); + vec4 p3 = G; + vec4 p4 = 0.5f * (E + J); + + vec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3)); + vec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3)); + diff = max(diff, max(min_, -max_)); +#else + vec4 p2 = 0.5f * (D + I); +#endif + + val = clamp(pred, p2 - diff, p2 + diff); + OUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), val); +} + +#undef LOAD_PIXEL_BLOCK +#undef DIFF +#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK diff --git a/deinterlace_effect.cpp b/deinterlace_effect.cpp index 8d9a96c..078983a 100644 --- a/deinterlace_effect.cpp +++ b/deinterlace_effect.cpp @@ -1,6 +1,8 @@ #include #include "deinterlace_effect.h" +#include "effect_chain.h" +#include "init.h" #include "util.h" using namespace std; @@ -12,13 +14,18 @@ DeinterlaceEffect::DeinterlaceEffect() current_field_position(TOP), num_lines(1080) { - register_int("enable_spatial_interlacing_check", (int *)&enable_spatial_interlacing_check); - register_int("current_field_position", (int *)¤t_field_position); - register_uniform_float("num_lines", &num_lines); - register_uniform_float("inv_width", &inv_width); - register_uniform_float("self_offset", &self_offset); - register_uniform_float_array("current_offset", current_offset, 2); - register_uniform_float_array("other_offset", other_offset, 3); + if (movit_compute_shaders_supported) { + compute_effect_owner.reset(new DeinterlaceComputeEffect); + compute_effect = compute_effect_owner.get(); + } else { + register_int("enable_spatial_interlacing_check", (int *)&enable_spatial_interlacing_check); + register_int("current_field_position", (int *)¤t_field_position); + register_uniform_float("num_lines", &num_lines); + register_uniform_float("inv_width", &inv_width); + register_uniform_float("self_offset", &self_offset); + register_uniform_float_array("current_offset", current_offset, 2); + register_uniform_float_array("other_offset", other_offset, 3); + } } string DeinterlaceEffect::output_fragment_shader() @@ -32,6 +39,25 @@ string DeinterlaceEffect::output_fragment_shader() return frag_shader; } +void DeinterlaceEffect::rewrite_graph(EffectChain *graph, Node *self) +{ + if (compute_effect != nullptr) { + Node *compute_node = graph->add_node(compute_effect_owner.release()); + graph->replace_receiver(self, compute_node); + graph->replace_sender(self, compute_node); + self->disabled = true; + } +} + +bool DeinterlaceEffect::set_int(const std::string &key, int value) +{ + if (compute_effect != nullptr) { + return compute_effect->set_int(key, value); + } else { + return Effect::set_int(key, value); + } +} + void DeinterlaceEffect::inform_input_size(unsigned input_num, unsigned width, unsigned height) { assert(input_num >= 0 && input_num < 5); @@ -125,4 +151,96 @@ void DeinterlaceEffect::set_gl_state(GLuint glsl_program_num, const string &pref other_offset[2] = center_offset + 1.0 / heights[0]; } +// Implementation of DeinterlaceComputeEffect. 
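+//
+// Worked example of the dispatch size: each workgroup covers GROUP_W x (2 * GROUP_H)
+// = 8x32 output pixels, so e.g. a 1920x1080 output is dispatched as (1920 + 7) / 8 = 240
+// by (1080 + 31) / 32 = 34 workgroups (see get_compute_dimensions() below), with the
+// last row of workgroups only partially used.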
+ +DeinterlaceComputeEffect::DeinterlaceComputeEffect() + : enable_spatial_interlacing_check(true), + current_field_position(TOP) +{ + register_int("enable_spatial_interlacing_check", (int *)&enable_spatial_interlacing_check); + register_int("current_field_position", (int *)¤t_field_position); + register_uniform_float("inv_width", &inv_width); + register_uniform_float("inv_height", &inv_height); + register_uniform_float("current_field_vertical_offset", ¤t_field_vertical_offset); +} + +string DeinterlaceComputeEffect::output_fragment_shader() +{ + char buf[256]; + snprintf(buf, sizeof(buf), "#define YADIF_ENABLE_SPATIAL_INTERLACING_CHECK %d\n", + enable_spatial_interlacing_check); + string frag_shader = buf; + + frag_shader += read_file("deinterlace_effect.comp"); + return frag_shader; +} + +void DeinterlaceComputeEffect::inform_input_size(unsigned input_num, unsigned width, unsigned height) +{ + assert(input_num >= 0 && input_num < 5); + widths[input_num] = width; + heights[input_num] = height; +} + +void DeinterlaceComputeEffect::get_output_size(unsigned *width, unsigned *height, + unsigned *virtual_width, unsigned *virtual_height) const +{ + assert(widths[0] == widths[1]); + assert(widths[1] == widths[2]); + assert(widths[2] == widths[3]); + assert(widths[3] == widths[4]); + assert(heights[0] == heights[1]); + assert(heights[1] == heights[2]); + assert(heights[2] == heights[3]); + assert(heights[3] == heights[4]); + *width = *virtual_width = widths[0]; + *height = *virtual_height = heights[0] * 2; +} + +void DeinterlaceComputeEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, unsigned *sampler_num) +{ + Effect::set_gl_state(glsl_program_num, prefix, sampler_num); + + inv_width = 1.0 / widths[0]; + inv_height = 1.0 / heights[0]; + + // For the compute shader, we need to load a block of pixels. Marking off the + // ones we are supposed to interpolate (looking only at one column): + // + // field_pos==0 field_pos==1 + // + // 6 x ↑ 6 . ↑ + // 6 . | 6 x | + // 5 x | 5 . | + // 5 . | 5 x | + // 4 x | 4 . | + // 4 . | 4 x | + // 3 x | y 3 o | y + // 3 o | 3 x | + // 2 x | 2 o | + // 2 o | 2 x | + // 1 x | 1 . | + // 1 . | 1 x | + // 0 x | 0 . | + // 0 . | 0 x | + // + // So if we are to compute e.g. output samples [2,4), we load input samples + // [1,3] for TFF and samples [2,4] for BFF. + if (current_field_position == 0) { + current_field_vertical_offset = -1.0 / heights[0]; + } else { + current_field_vertical_offset = 0.0 / heights[0]; + } +} + +void DeinterlaceComputeEffect::get_compute_dimensions(unsigned output_width, unsigned output_height, + unsigned *x, unsigned *y, unsigned *z) const +{ + // Each workgroup outputs 8x32 pixels (see GROUP_W and GROUP_H in the shader), + // so figure out the number of groups by simply rounding up. + *x = (output_width + 7) / 8; + *y = (output_height + 31) / 32; + *z = 1; +} + } // namespace movit diff --git a/deinterlace_effect.h b/deinterlace_effect.h index 7935322..5841f86 100644 --- a/deinterlace_effect.h +++ b/deinterlace_effect.h @@ -52,18 +52,26 @@ // parity, so all the others are implicit). #include +#include #include #include "effect.h" namespace movit { +class DeinterlaceComputeEffect; + class DeinterlaceEffect : public Effect { public: DeinterlaceEffect(); virtual std::string effect_type_id() const { return "DeinterlaceEffect"; } std::string output_fragment_shader(); + // Replaces itself with DeinterlaceComputeEffect if compute shaders are supported. + // Otherwise, does nothing. 
+	void rewrite_graph(EffectChain *graph, Node *self);
+	bool set_int(const std::string &key, int value);
+
 	void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num);
 
 	// First = before previous, second = previous, third = current,
@@ -85,6 +93,11 @@ public:
 	enum FieldPosition { TOP = 0, BOTTOM = 1 };
 
 private:
+	// If compute shaders are supported, contains the actual effect.
+	// If not, nullptr.
+	std::unique_ptr<DeinterlaceComputeEffect> compute_effect_owner;
+	DeinterlaceComputeEffect *compute_effect = nullptr;
+
 	unsigned widths[5], heights[5];
 
 	// See file-level comment for explanation of this option.
@@ -114,6 +127,50 @@ private:
 	float other_offset[3];
 };
 
+// A compute shader implementation of DeinterlaceEffect. It saves a bunch of loads
+// since it can share them between neighboring pixels (and also does not need
+// texture bounce), so it has the potential to be faster, although exactly how
+// much depends on your chain and other factors. DeinterlaceEffect will
+// automatically become a proxy for DeinterlaceComputeEffect if your system
+// supports compute shaders.
+class DeinterlaceComputeEffect : public Effect {
+public:
+	DeinterlaceComputeEffect();
+	virtual std::string effect_type_id() const { return "DeinterlaceComputeEffect"; }
+	std::string output_fragment_shader();
+
+	void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num);
+
+	virtual unsigned num_inputs() const { return 5; }
+	virtual bool changes_output_size() const { return true; }
+	virtual bool is_compute_shader() const { return true; }
+	virtual void get_compute_dimensions(unsigned output_width, unsigned output_height,
+	                                    unsigned *x, unsigned *y, unsigned *z) const;
+
+	virtual AlphaHandling alpha_handling() const { return INPUT_PREMULTIPLIED_ALPHA_KEEP_BLANK; }
+
+	virtual void inform_input_size(unsigned input_num, unsigned width, unsigned height);
+	virtual void get_output_size(unsigned *width, unsigned *height,
+	                             unsigned *virtual_width, unsigned *virtual_height) const;
+
+	enum FieldPosition { TOP = 0, BOTTOM = 1 };
+
+private:
+	unsigned widths[5], heights[5];
+
+	// See file-level comment for explanation of this option.
+	bool enable_spatial_interlacing_check;
+
+	// Which field the current input (the middle one) is.
+	FieldPosition current_field_position;
+
+	// Offset for one pixel in the horizontal and vertical direction (1/width, 1/height).
+	float inv_width, inv_height;
+
+	// Vertical offset to where the block of current-field samples starts
+	// (depends on the field parity); see set_gl_state().
+	float current_field_vertical_offset;
+};
+
 } // namespace movit
 
 #endif // !defined(_MOVIT_DEINTERLACE_EFFECT_H)

diff --git a/deinterlace_effect_test.cpp b/deinterlace_effect_test.cpp
index 0c625c5..0ac1953 100644
--- a/deinterlace_effect_test.cpp
+++ b/deinterlace_effect_test.cpp
@@ -19,7 +19,17 @@ using namespace std;
 
 namespace movit {
 
-TEST(DeinterlaceTest, ConstantColor) {
+class DeinterlaceTest : public testing::TestWithParam<std::string> {
+protected:
+	DeinterlaceTest() : disabler(GetParam() == "fragment") {}
+	bool should_skip() { return disabler.should_skip(); }
+
+private:
+	DisableComputeShadersTemporarily disabler;
+};
+
+TEST_P(DeinterlaceTest, ConstantColor) {
+	if (should_skip()) return;
 	float data[] = {
 		0.3f, 0.3f,
 		0.3f, 0.3f,
@@ -52,7 +62,8 @@
 }
 
 // Also tests that top/bottom change works like expected.
-TEST(DeinterlaceTest, VerticalInterpolation) { +TEST_P(DeinterlaceTest, VerticalInterpolation) { + if (should_skip()) return; const int width = 11; const int height = 2; float data[width * height] = { @@ -97,7 +108,8 @@ TEST(DeinterlaceTest, VerticalInterpolation) { expect_equal(expected_data_bottom, out_data, width, height * 2); } -TEST(DeinterlaceTest, DiagonalInterpolation) { +TEST_P(DeinterlaceTest, DiagonalInterpolation) { + if (should_skip()) return; const int width = 11; const int height = 3; float data[width * height] = { @@ -145,7 +157,8 @@ TEST(DeinterlaceTest, DiagonalInterpolation) { expect_equal(expected_data_top, out_data, width, height * 2); } -TEST(DeinterlaceTest, FlickerBox) { +TEST_P(DeinterlaceTest, FlickerBox) { + if (should_skip()) return; const int width = 4; const int height = 4; float white_data[width * height] = { @@ -197,6 +210,10 @@ TEST(DeinterlaceTest, FlickerBox) { } } +INSTANTIATE_TEST_CASE_P(DeinterlaceTest, + DeinterlaceTest, + testing::Values("fragment", "compute")); + #ifdef HAVE_BENCHMARK namespace { @@ -210,8 +227,11 @@ TestFormat bgra_format = { FORMAT_BGRA_PREMULTIPLIED_ALPHA, GL_BGRA, 4 }; } // namespace -void BM_DeinterlaceEffect(benchmark::State &state, TestFormat format, bool spatial_interlacing_check) +void BM_DeinterlaceEffect(benchmark::State &state, TestFormat format, bool spatial_interlacing_check, const std::string &shader_type) { + DisableComputeShadersTemporarily disabler(shader_type == "fragment"); + if (disabler.should_skip()) return; + unsigned width = state.range(0), height = state.range(1); unsigned field_height = height / 2; @@ -243,9 +263,12 @@ void BM_DeinterlaceEffect(benchmark::State &state, TestFormat format, bool spati tester.benchmark(state, out_data.get(), format.output_format, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); } -BENCHMARK_CAPTURE(BM_DeinterlaceEffect, Gray, gray_format, true)->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); -BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRA, bgra_format, true)->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); -BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRANoSpatialCheck, bgra_format, false)->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, Gray, gray_format, true, "fragment")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRA, bgra_format, true, "fragment")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRANoSpatialCheck, bgra_format, false, "fragment")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, GrayCompute, gray_format, true, "compute")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRACompute, bgra_format, true, "compute")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRANoSpatialCheckCompute, bgra_format, false, "compute")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); #endif -- 2.39.2
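
For reference, nothing changes on the caller's side: client code keeps instantiating DeinterlaceEffect, whose constructor checks movit_compute_shaders_supported and whose rewrite_graph() then swaps in DeinterlaceComputeEffect, with set_int() calls forwarded to it. A minimal sketch of such a caller, assuming the std::vector overload of EffectChain::add_effect() and a hypothetical helper name:

#include <vector>

#include "deinterlace_effect.h"
#include "effect_chain.h"

using namespace movit;

// five_fields = {before previous, previous, current, next, after next} inputs.
Effect *add_deinterlacer(EffectChain *chain, const std::vector<Effect *> &five_fields)
{
	DeinterlaceEffect *deinterlace = new DeinterlaceEffect;
	chain->add_effect(deinterlace, five_fields);

	// Forwarded to the DeinterlaceComputeEffect when the compute path is active.
	deinterlace->set_int("current_field_position", DeinterlaceEffect::TOP);
	deinterlace->set_int("enable_spatial_interlacing_check", 1);
	return deinterlace;
}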