From 4179bef190d88739038233ac5d7e5ffa2ff4282f Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Thu, 16 Nov 2017 23:07:47 +0100
Subject: [PATCH] Implement a compute shader version of DeinterlaceEffect.

This is currently a loss for grayscale (probably due to the extra rgba16f
bounce), but a win of about 30% on BGRA on my Haswell. NVIDIA doesn't care
much either way. There are some performance mysteries remaining, but it's
a good start.
---
 deinterlace_effect.comp     | 236 ++++++++++++++++++++++++++++++++++++
 deinterlace_effect.cpp      | 132 ++++++++++++++++++--
 deinterlace_effect.h        |  57 +++++++++
 deinterlace_effect_test.cpp |  39 ++++--
 4 files changed, 449 insertions(+), 15 deletions(-)
 create mode 100644 deinterlace_effect.comp

diff --git a/deinterlace_effect.comp b/deinterlace_effect.comp
new file mode 100644
index 0000000..b46e5bf
--- /dev/null
+++ b/deinterlace_effect.comp
@@ -0,0 +1,236 @@
+// Implicit uniforms:
+// uniform int PREFIX(current_field_position);
+// uniform float PREFIX(inv_width);
+// uniform float PREFIX(inv_height);
+// uniform float PREFIX(current_field_vertical_offset);
+
+// Compute shader implementation of DeinterlaceEffect. See the fragment
+// shader implementation (deinterlace_effect.frag) for comments about the
+// algorithm; comments here will mainly be about issues specific to the
+// compute shader implementation.
+
+#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))
+
+// In input pixels (so each workgroup outputs 8x32 pixels). Corresponds to
+// get_compute_dimensions() in the C++ code. It seems illogical that 8x32
+// would be better than e.g. 32x8, since we reuse more data horizontally,
+// but Intel cards in particular are much happier with this for whatever
+// reason.
+#define GROUP_W 8
+#define GROUP_H 16
+
+// When sampling from the current field (for the spatial interpolation below),
+// we have a fringe of three pixels on the left and right sides, so we need to
+// load more. We also have one pixel above and below, but since our destination
+// pixel is squeezed in between them (they don't overlap), we only need one
+// extra row.
+#define GROUP_W_FRINGE (GROUP_W + 6)
+#define GROUP_H_FRINGE (GROUP_H + 1)
+
+layout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;
+
+#if (GROUP_W_FRINGE * GROUP_H_FRINGE) > (GROUP_W * (GROUP_H + 2))
+#define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)
+#else
+#define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))
+#endif
+
+shared vec4 temp[TEMP_NUM_ELEM];
+
+#if TEMP_NUM_ELEM > (GROUP_W * GROUP_H * 2)
+#error Not enough threads to load all data in two loads
+#endif
+
+// Load a WxH block of samples. We need to do this in two phases,
+// since we have more input samples than we have output samples (threads);
+// in the second phase, some threads will be idle.
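+// Worked example of the sizing above: with GROUP_W = 8 and GROUP_H = 16,
+// GROUP_W_FRINGE * GROUP_H_FRINGE = 14 * 17 = 238 and GROUP_W * (GROUP_H + 2) =
+// 8 * 18 = 144, so TEMP_NUM_ELEM is 238, while two loads per thread can fill up
+// to GROUP_W * GROUP_H * 2 = 256 elements; this is why two phases are enough
+// and the #error above never triggers for these dimensions.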
+#define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \ +{ \ + memoryBarrierShared(); \ + barrier(); \ + int thread_id = int(gl_LocalInvocationID.y) * GROUP_W + int(gl_LocalInvocationID.x); \ + { \ + int x = thread_id % (block_width); \ + int y = thread_id / (block_width); \ + temp[thread_id] = func(vec2((base_tc).x + x * PREFIX(inv_width), \ + (base_tc).y + y * PREFIX(inv_height))); \ + } \ + const int num_threads = GROUP_W * GROUP_H; \ + if (thread_id + num_threads < (block_width) * (block_height)) { \ + int x = (thread_id + num_threads) % (block_width); \ + int y = (thread_id + num_threads) / (block_width); \ + temp[thread_id + num_threads] = \ + func(vec2((base_tc).x + x * PREFIX(inv_width), \ + (base_tc).y + y * PREFIX(inv_height))); \ + } \ + memoryBarrierShared(); \ + barrier(); \ +} + +void FUNCNAME() { + // The current thread is responsible for output of two pixels, namely (x,2y) + // and (x,2y+1). One will be an unmodified one, the other one will be the + // pixel we are trying to interpolate. If TFF (current_field_position==0), + // the unmodified one is 2y+1 (remember OpenGL's bottom-left convention), + // and if BFF, the unmodified one is 2y. So we need to invert current_field_position + // to figure out which value to add. + int yi = int(gl_GlobalInvocationID.y) * 2 + (PREFIX(current_field_position) ^ 1); + + // Load in data for the current field. current_offset signals where the block + // starts vertically; see set_gl_state() in the C++ code. + vec2 base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width), + (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset)); + LOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3); + + int lx = int(gl_LocalInvocationID.x) + 3; + int ly = int(gl_LocalInvocationID.y); + + // Output the unmodified pixel. For TFF (current_field_position == 0), + // we have an extra pixel on the bottom that we're only using for interpolation + // (it's being output by another workgroup), so we have to add 1. + vec4 val = temp[(ly + (PREFIX(current_field_position) ^ 1)) * GROUP_W_FRINGE + lx]; + OUTPUT(ivec2(gl_GlobalInvocationID.x, yi), val); + + // a b c d e f g ↑ y + // x | + // h i j k l m n +--> x + + vec4 a = temp[(ly + 1) * GROUP_W_FRINGE + lx - 3]; + vec4 b = temp[(ly + 1) * GROUP_W_FRINGE + lx - 2]; + vec4 c = temp[(ly + 1) * GROUP_W_FRINGE + lx - 1]; + vec4 d = temp[(ly + 1) * GROUP_W_FRINGE + lx]; + vec4 e = temp[(ly + 1) * GROUP_W_FRINGE + lx + 1]; + vec4 f = temp[(ly + 1) * GROUP_W_FRINGE + lx + 2]; + vec4 g = temp[(ly + 1) * GROUP_W_FRINGE + lx + 3]; + + vec4 h = temp[ly * GROUP_W_FRINGE + lx - 3]; + vec4 i = temp[ly * GROUP_W_FRINGE + lx - 2]; + vec4 j = temp[ly * GROUP_W_FRINGE + lx - 1]; + vec4 k = temp[ly * GROUP_W_FRINGE + lx]; + vec4 l = temp[ly * GROUP_W_FRINGE + lx + 1]; + vec4 m = temp[ly * GROUP_W_FRINGE + lx + 2]; + vec4 n = temp[ly * GROUP_W_FRINGE + lx + 3]; + + // 0 degrees. + vec4 pred = d + k; + float score; + float best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4; + + // -45 degrees. + score = DIFF(b, k) + DIFF(c, l) + DIFF(d, m); + if (score < best_score) { + pred = c + l; + best_score = score; + } + + // -63 degrees. + score = DIFF(a, l) + DIFF(b, m) + DIFF(c, n); + if (score < best_score) { + pred = b + m; + best_score = score; + } + + // +45 degrees. + score = DIFF(d, i) + DIFF(e, j) + DIFF(f, k); + if (score < best_score) { + pred = e + j; + best_score = score; + } + + // +63 degrees. 
+ score = DIFF(e, h) + DIFF(f, i) + DIFF(g, j); + if (score < best_score) { + pred = f + i; + // best_score isn't used anymore. + } + + pred *= 0.5f; + + // Temporal prediction (p2) of this pixel based on the previous and next fields. + // + // ↑ y + // C H | + // A F K | + // D x I | + // B G L | + // E J | + // +-----> time + // + // x is obviously aligned with D and I, so we don't need texcoord + // adjustment for top/bottom field here, unlike earlier. However, we need + // to start the block one pixel below since we need E/J, thus the -1 in + // the y coordinate. + base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width), + (gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height)); + lx = int(gl_LocalInvocationID.x); +#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2); + vec4 C = temp[(ly + 2) * GROUP_W + lx]; + vec4 D = temp[(ly + 1) * GROUP_W + lx]; + vec4 E = temp[ ly * GROUP_W + lx]; + + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4); + vec4 H = temp[(ly + 2) * GROUP_W + lx]; + vec4 I = temp[(ly + 1) * GROUP_W + lx]; + vec4 J = temp[ ly * GROUP_W + lx]; +#else + // Since spatial interlacing check is not enabled, we only need D + // and I from the previous and next fields; since they are not shared + // between the neighboring pixels, they can be straight-up loads. + vec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width), + (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height)); + vec4 D = INPUT2(DI_pos); + vec4 I = INPUT4(DI_pos); +#endif + + // Load what we need from the previous field into shared memory, + // since A/B can be reused between neighboring pixels. We need one + // line above/below, but we don't need the horizontal fringe. + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1); + vec4 A = temp[(ly + 1) * GROUP_W + lx]; + vec4 B = temp[ ly * GROUP_W + lx]; + + // What we need from the current field was loaded earlier. + vec4 F = d; + vec4 G = k; + + // Next field. + LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5); + vec4 K = temp[(ly + 1) * GROUP_W + lx]; + vec4 L = temp[ ly * GROUP_W + lx]; + + // Find temporal differences around this line. + vec4 tdiff0 = abs(D - I); + vec4 tdiff1 = abs(A - F) + abs(B - G); // Actually twice tdiff1. + vec4 tdiff2 = abs(K - F) + abs(L - G); // Actually twice tdiff2. + vec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2)); + +#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK + // Spatial interlacing check. 
+ // We start by temporally interpolating the current vertical line (p0–p4): + // + // C p0 H ↑ y + // p1 | + // D p2 I | + // p3 | + // E p4 J +-----> time + // + vec4 p0 = 0.5f * (C + H); + vec4 p1 = F; + vec4 p2 = 0.5f * (D + I); + vec4 p3 = G; + vec4 p4 = 0.5f * (E + J); + + vec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3)); + vec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3)); + diff = max(diff, max(min_, -max_)); +#else + vec4 p2 = 0.5f * (D + I); +#endif + + val = clamp(pred, p2 - diff, p2 + diff); + OUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), val); +} + +#undef LOAD_PIXEL_BLOCK +#undef DIFF +#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK diff --git a/deinterlace_effect.cpp b/deinterlace_effect.cpp index 8d9a96c..078983a 100644 --- a/deinterlace_effect.cpp +++ b/deinterlace_effect.cpp @@ -1,6 +1,8 @@ #include #include "deinterlace_effect.h" +#include "effect_chain.h" +#include "init.h" #include "util.h" using namespace std; @@ -12,13 +14,18 @@ DeinterlaceEffect::DeinterlaceEffect() current_field_position(TOP), num_lines(1080) { - register_int("enable_spatial_interlacing_check", (int *)&enable_spatial_interlacing_check); - register_int("current_field_position", (int *)¤t_field_position); - register_uniform_float("num_lines", &num_lines); - register_uniform_float("inv_width", &inv_width); - register_uniform_float("self_offset", &self_offset); - register_uniform_float_array("current_offset", current_offset, 2); - register_uniform_float_array("other_offset", other_offset, 3); + if (movit_compute_shaders_supported) { + compute_effect_owner.reset(new DeinterlaceComputeEffect); + compute_effect = compute_effect_owner.get(); + } else { + register_int("enable_spatial_interlacing_check", (int *)&enable_spatial_interlacing_check); + register_int("current_field_position", (int *)¤t_field_position); + register_uniform_float("num_lines", &num_lines); + register_uniform_float("inv_width", &inv_width); + register_uniform_float("self_offset", &self_offset); + register_uniform_float_array("current_offset", current_offset, 2); + register_uniform_float_array("other_offset", other_offset, 3); + } } string DeinterlaceEffect::output_fragment_shader() @@ -32,6 +39,25 @@ string DeinterlaceEffect::output_fragment_shader() return frag_shader; } +void DeinterlaceEffect::rewrite_graph(EffectChain *graph, Node *self) +{ + if (compute_effect != nullptr) { + Node *compute_node = graph->add_node(compute_effect_owner.release()); + graph->replace_receiver(self, compute_node); + graph->replace_sender(self, compute_node); + self->disabled = true; + } +} + +bool DeinterlaceEffect::set_int(const std::string &key, int value) +{ + if (compute_effect != nullptr) { + return compute_effect->set_int(key, value); + } else { + return Effect::set_int(key, value); + } +} + void DeinterlaceEffect::inform_input_size(unsigned input_num, unsigned width, unsigned height) { assert(input_num >= 0 && input_num < 5); @@ -125,4 +151,96 @@ void DeinterlaceEffect::set_gl_state(GLuint glsl_program_num, const string &pref other_offset[2] = center_offset + 1.0 / heights[0]; } +// Implementation of DeinterlaceComputeEffect. 
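+//
+// Worked example of the dispatch size: each workgroup covers GROUP_W x (2 * GROUP_H)
+// = 8x32 output pixels, so e.g. a 1920x1080 output is dispatched as (1920 + 7) / 8 = 240
+// by (1080 + 31) / 32 = 34 workgroups (see get_compute_dimensions() below), with the
+// last row of workgroups only partially used.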
+ +DeinterlaceComputeEffect::DeinterlaceComputeEffect() + : enable_spatial_interlacing_check(true), + current_field_position(TOP) +{ + register_int("enable_spatial_interlacing_check", (int *)&enable_spatial_interlacing_check); + register_int("current_field_position", (int *)¤t_field_position); + register_uniform_float("inv_width", &inv_width); + register_uniform_float("inv_height", &inv_height); + register_uniform_float("current_field_vertical_offset", ¤t_field_vertical_offset); +} + +string DeinterlaceComputeEffect::output_fragment_shader() +{ + char buf[256]; + snprintf(buf, sizeof(buf), "#define YADIF_ENABLE_SPATIAL_INTERLACING_CHECK %d\n", + enable_spatial_interlacing_check); + string frag_shader = buf; + + frag_shader += read_file("deinterlace_effect.comp"); + return frag_shader; +} + +void DeinterlaceComputeEffect::inform_input_size(unsigned input_num, unsigned width, unsigned height) +{ + assert(input_num >= 0 && input_num < 5); + widths[input_num] = width; + heights[input_num] = height; +} + +void DeinterlaceComputeEffect::get_output_size(unsigned *width, unsigned *height, + unsigned *virtual_width, unsigned *virtual_height) const +{ + assert(widths[0] == widths[1]); + assert(widths[1] == widths[2]); + assert(widths[2] == widths[3]); + assert(widths[3] == widths[4]); + assert(heights[0] == heights[1]); + assert(heights[1] == heights[2]); + assert(heights[2] == heights[3]); + assert(heights[3] == heights[4]); + *width = *virtual_width = widths[0]; + *height = *virtual_height = heights[0] * 2; +} + +void DeinterlaceComputeEffect::set_gl_state(GLuint glsl_program_num, const string &prefix, unsigned *sampler_num) +{ + Effect::set_gl_state(glsl_program_num, prefix, sampler_num); + + inv_width = 1.0 / widths[0]; + inv_height = 1.0 / heights[0]; + + // For the compute shader, we need to load a block of pixels. Marking off the + // ones we are supposed to interpolate (looking only at one column): + // + // field_pos==0 field_pos==1 + // + // 6 x ↑ 6 . ↑ + // 6 . | 6 x | + // 5 x | 5 . | + // 5 . | 5 x | + // 4 x | 4 . | + // 4 . | 4 x | + // 3 x | y 3 o | y + // 3 o | 3 x | + // 2 x | 2 o | + // 2 o | 2 x | + // 1 x | 1 . | + // 1 . | 1 x | + // 0 x | 0 . | + // 0 . | 0 x | + // + // So if we are to compute e.g. output samples [2,4), we load input samples + // [1,3] for TFF and samples [2,4] for BFF. + if (current_field_position == 0) { + current_field_vertical_offset = -1.0 / heights[0]; + } else { + current_field_vertical_offset = 0.0 / heights[0]; + } +} + +void DeinterlaceComputeEffect::get_compute_dimensions(unsigned output_width, unsigned output_height, + unsigned *x, unsigned *y, unsigned *z) const +{ + // Each workgroup outputs 8x32 pixels (see GROUP_W and GROUP_H in the shader), + // so figure out the number of groups by simply rounding up. + *x = (output_width + 7) / 8; + *y = (output_height + 31) / 32; + *z = 1; +} + } // namespace movit diff --git a/deinterlace_effect.h b/deinterlace_effect.h index 7935322..5841f86 100644 --- a/deinterlace_effect.h +++ b/deinterlace_effect.h @@ -52,18 +52,26 @@ // parity, so all the others are implicit). #include +#include #include #include "effect.h" namespace movit { +class DeinterlaceComputeEffect; + class DeinterlaceEffect : public Effect { public: DeinterlaceEffect(); virtual std::string effect_type_id() const { return "DeinterlaceEffect"; } std::string output_fragment_shader(); + // Replaces itself with DeinterlaceComputeEffect if compute shaders are supported. + // Otherwise, does nothing. 
+	void rewrite_graph(EffectChain *graph, Node *self);
+	bool set_int(const std::string &key, int value);
+
 	void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num);
 
 	// First = before previous, second = previous, third = current,
@@ -85,6 +93,11 @@ public:
 	enum FieldPosition { TOP = 0, BOTTOM = 1 };
 
 private:
+	// If compute shaders are supported, contains the actual effect.
+	// If not, nullptr.
+	std::unique_ptr<DeinterlaceComputeEffect> compute_effect_owner;
+	DeinterlaceComputeEffect *compute_effect = nullptr;
+
 	unsigned widths[5], heights[5];
 
 	// See file-level comment for explanation of this option.
@@ -114,6 +127,50 @@ private:
 	float other_offset[3];
 };
 
+// A compute shader implementation of DeinterlaceEffect. It saves a bunch of loads
+// since it can share them between neighboring pixels (and also does not need
+// texture bounce), so it has the potential to be faster, although exactly how
+// much depends on your chain and other factors. DeinterlaceEffect will
+// automatically become a proxy for DeinterlaceComputeEffect if your system
+// supports compute shaders.
+class DeinterlaceComputeEffect : public Effect {
+public:
+	DeinterlaceComputeEffect();
+	virtual std::string effect_type_id() const { return "DeinterlaceComputeEffect"; }
+	std::string output_fragment_shader();
+
+	void set_gl_state(GLuint glsl_program_num, const std::string &prefix, unsigned *sampler_num);
+
+	virtual unsigned num_inputs() const { return 5; }
+	virtual bool changes_output_size() const { return true; }
+	virtual bool is_compute_shader() const { return true; }
+	virtual void get_compute_dimensions(unsigned output_width, unsigned output_height,
+	                                    unsigned *x, unsigned *y, unsigned *z) const;
+
+	virtual AlphaHandling alpha_handling() const { return INPUT_PREMULTIPLIED_ALPHA_KEEP_BLANK; }
+
+	virtual void inform_input_size(unsigned input_num, unsigned width, unsigned height);
+	virtual void get_output_size(unsigned *width, unsigned *height,
+	                             unsigned *virtual_width, unsigned *virtual_height) const;
+
+	enum FieldPosition { TOP = 0, BOTTOM = 1 };
+
+private:
+	unsigned widths[5], heights[5];
+
+	// See file-level comment for explanation of this option.
+	bool enable_spatial_interlacing_check;
+
+	// Which field the current input (the middle one) is.
+	FieldPosition current_field_position;
+
+	// Offset for one pixel in the horizontal and vertical direction (1/width, 1/height).
+	float inv_width, inv_height;
+
+	// Vertical offset to where the block of current-field samples starts
+	// (depends on the field parity); see set_gl_state().
+	float current_field_vertical_offset;
+};
+
 } // namespace movit
 
 #endif // !defined(_MOVIT_DEINTERLACE_EFFECT_H)

diff --git a/deinterlace_effect_test.cpp b/deinterlace_effect_test.cpp
index 0c625c5..0ac1953 100644
--- a/deinterlace_effect_test.cpp
+++ b/deinterlace_effect_test.cpp
@@ -19,7 +19,17 @@ using namespace std;
 
 namespace movit {
 
-TEST(DeinterlaceTest, ConstantColor) {
+class DeinterlaceTest : public testing::TestWithParam<std::string> {
+protected:
+	DeinterlaceTest() : disabler(GetParam() == "fragment") {}
+	bool should_skip() { return disabler.should_skip(); }
+
+private:
+	DisableComputeShadersTemporarily disabler;
+};
+
+TEST_P(DeinterlaceTest, ConstantColor) {
+	if (should_skip()) return;
 	float data[] = {
 		0.3f, 0.3f,
 		0.3f, 0.3f,
@@ -52,7 +62,8 @@
 }
 
 // Also tests that top/bottom change works like expected.
-TEST(DeinterlaceTest, VerticalInterpolation) { +TEST_P(DeinterlaceTest, VerticalInterpolation) { + if (should_skip()) return; const int width = 11; const int height = 2; float data[width * height] = { @@ -97,7 +108,8 @@ TEST(DeinterlaceTest, VerticalInterpolation) { expect_equal(expected_data_bottom, out_data, width, height * 2); } -TEST(DeinterlaceTest, DiagonalInterpolation) { +TEST_P(DeinterlaceTest, DiagonalInterpolation) { + if (should_skip()) return; const int width = 11; const int height = 3; float data[width * height] = { @@ -145,7 +157,8 @@ TEST(DeinterlaceTest, DiagonalInterpolation) { expect_equal(expected_data_top, out_data, width, height * 2); } -TEST(DeinterlaceTest, FlickerBox) { +TEST_P(DeinterlaceTest, FlickerBox) { + if (should_skip()) return; const int width = 4; const int height = 4; float white_data[width * height] = { @@ -197,6 +210,10 @@ TEST(DeinterlaceTest, FlickerBox) { } } +INSTANTIATE_TEST_CASE_P(DeinterlaceTest, + DeinterlaceTest, + testing::Values("fragment", "compute")); + #ifdef HAVE_BENCHMARK namespace { @@ -210,8 +227,11 @@ TestFormat bgra_format = { FORMAT_BGRA_PREMULTIPLIED_ALPHA, GL_BGRA, 4 }; } // namespace -void BM_DeinterlaceEffect(benchmark::State &state, TestFormat format, bool spatial_interlacing_check) +void BM_DeinterlaceEffect(benchmark::State &state, TestFormat format, bool spatial_interlacing_check, const std::string &shader_type) { + DisableComputeShadersTemporarily disabler(shader_type == "fragment"); + if (disabler.should_skip()) return; + unsigned width = state.range(0), height = state.range(1); unsigned field_height = height / 2; @@ -243,9 +263,12 @@ void BM_DeinterlaceEffect(benchmark::State &state, TestFormat format, bool spati tester.benchmark(state, out_data.get(), format.output_format, COLORSPACE_sRGB, GAMMA_LINEAR, OUTPUT_ALPHA_FORMAT_PREMULTIPLIED); } -BENCHMARK_CAPTURE(BM_DeinterlaceEffect, Gray, gray_format, true)->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); -BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRA, bgra_format, true)->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); -BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRANoSpatialCheck, bgra_format, false)->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, Gray, gray_format, true, "fragment")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRA, bgra_format, true, "fragment")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRANoSpatialCheck, bgra_format, false, "fragment")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, GrayCompute, gray_format, true, "compute")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRACompute, bgra_format, true, "compute")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); +BENCHMARK_CAPTURE(BM_DeinterlaceEffect, BGRANoSpatialCheckCompute, bgra_format, false, "compute")->Args({720, 576})->Args({1280, 720})->Args({1920, 1080})->UseRealTime()->Unit(benchmark::kMicrosecond); #endif -- 2.39.2
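
For reference, nothing changes on the caller's side: client code keeps instantiating DeinterlaceEffect, whose constructor checks movit_compute_shaders_supported and whose rewrite_graph() then swaps in DeinterlaceComputeEffect, with set_int() calls forwarded to it. A minimal sketch of such a caller, assuming the std::vector overload of EffectChain::add_effect() and a hypothetical helper name:

#include <vector>

#include "deinterlace_effect.h"
#include "effect_chain.h"

using namespace movit;

// five_fields = {before previous, previous, current, next, after next} inputs.
Effect *add_deinterlacer(EffectChain *chain, const std::vector<Effect *> &five_fields)
{
	DeinterlaceEffect *deinterlace = new DeinterlaceEffect;
	chain->add_effect(deinterlace, five_fields);

	// Forwarded to the DeinterlaceComputeEffect when the compute path is active.
	deinterlace->set_int("current_field_position", DeinterlaceEffect::TOP);
	deinterlace->set_int("enable_spatial_interlacing_check", 1);
	return deinterlace;
}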