// Implicit uniforms:
// uniform int PREFIX(current_field_position);
// uniform float PREFIX(inv_width);
// uniform float PREFIX(inv_height);
// uniform float PREFIX(current_field_vertical_offset);

// Compute shader implementation of DeinterlaceEffect. See the fragment
// shader implementation (deinterlace_effect.frag) for comments about the
// algorithm; comments here will mainly be about issues specific to the
// compute shader implementation.

// Squared length of the difference between two samples.
#define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))

// Workgroup size in input pixels (so output will be 8x32). Corresponds to
// get_compute_dimensions() in the C++ code. It is illogical that 8x32 would
// be better than e.g. 32x8, since we reuse more data horizontally, but
// especially Intel cards are much more happy about this for whatever reason.
#define GROUP_W 8
#define GROUP_H 16

// When sampling from the current field (spatial interpolation below), we have
// a fringe of three pixels on the left and right sides, so we need to load
// more. We also have one pixel above and below, although our destination pixel
// is squeezed in the middle of them (they don't overlap), so we only need one
// extra pixel.
#define GROUP_W_FRINGE (GROUP_W + 6)
#define GROUP_H_FRINGE (GROUP_H + 1)

layout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;

// The shared scratch array is reused for every block load, so it must hold
// the larger of the two biggest block shapes: the fringed current-field block
// and the GROUP_W x (GROUP_H + 2) block.
#if (GROUP_W * (GROUP_H + 2)) >= (GROUP_W_FRINGE * GROUP_H_FRINGE)
#define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))
#else
#define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)
#endif

// Each thread loads at most two samples per block, which caps the block size.
#if TEMP_NUM_ELEM > (GROUP_W * GROUP_H * 2)
#error Not enough threads to load all data in two loads
#endif

shared vec4 temp[TEMP_NUM_ELEM];

// Load a WxH block of samples. We need to do this in two phases,
// since we have more input samples than we have output samples (threads);
// in the second phase, some threads will be idle.
// LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) fills
// temp[0 .. block_width * block_height) in row-major order, where element
// (x, y) is func() sampled at base_tc + (x * inv_width, y * inv_height).
// Every thread writes temp[thread_id]; threads whose index plus the workgroup
// size is still inside the block write a second element. The leading
// barrier pair keeps the block from being overwritten while other threads may
// still be reading the previous contents of temp; the trailing pair makes the
// new contents visible before anyone reads them.
#define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \
	{ \
		memoryBarrierShared(); \
		barrier(); \
		int thread_id = int(gl_LocalInvocationID.y) * GROUP_W + int(gl_LocalInvocationID.x); \
		{ \
			int x = thread_id % (block_width); \
			int y = thread_id / (block_width); \
			temp[thread_id] = func(vec2((base_tc).x + x * PREFIX(inv_width), \
			                            (base_tc).y + y * PREFIX(inv_height))); \
		} \
		const int num_threads = GROUP_W * GROUP_H; \
		if (thread_id + num_threads < (block_width) * (block_height)) { \
			int x = (thread_id + num_threads) % (block_width); \
			int y = (thread_id + num_threads) / (block_width); \
			temp[thread_id + num_threads] = \
				func(vec2((base_tc).x + x * PREFIX(inv_width), \
				          (base_tc).y + y * PREFIX(inv_height))); \
		} \
		memoryBarrierShared(); \
		barrier(); \
	}

// Entry point. Each invocation writes two output pixels through OUTPUT():
// one copied straight from the current field (INPUT3) and one interpolated
// from the current field and its temporal neighbors (INPUT1/2/4/5).
void FUNCNAME() {
	// The current thread is responsible for output of two pixels, namely (x,2y)
	// and (x,2y+1). One will be an unmodified one, the other one will be the
	// pixel we are trying to interpolate. If TFF (current_field_position==0),
	// the unmodified one is 2y+1 (remember OpenGL's bottom-left convention),
	// and if BFF, the unmodified one is 2y. So we need to invert current_field_position
	// to figure out which value to add.
	int yi = int(gl_GlobalInvocationID.y) * 2 + (PREFIX(current_field_position) ^ 1);

	// Load in data for the current field. current_offset signals where the block
	// starts vertically; see set_gl_state() in the C++ code.
	vec2 base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width),
	                    (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset));
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3);

	int lx = int(gl_LocalInvocationID.x) + 3;
	int ly = int(gl_LocalInvocationID.y);

	// Output the unmodified pixel. For TFF (current_field_position == 0),
	// we have an extra pixel on the bottom that we're only using for interpolation
	// (it's being output by another workgroup), so we have to add 1.
	vec4 val = temp[(ly + (PREFIX(current_field_position) ^ 1)) * GROUP_W_FRINGE + lx];
	OUTPUT(ivec2(gl_GlobalInvocationID.x, yi), val);

	// a b c d e f g     ↑ y
	//       x           |
	// h i j k l m n     +--> x

	vec4 a = temp[(ly + 1) * GROUP_W_FRINGE + lx - 3];
	vec4 b = temp[(ly + 1) * GROUP_W_FRINGE + lx - 2];
	vec4 c = temp[(ly + 1) * GROUP_W_FRINGE + lx - 1];
	vec4 d = temp[(ly + 1) * GROUP_W_FRINGE + lx];
	vec4 e = temp[(ly + 1) * GROUP_W_FRINGE + lx + 1];
	vec4 f = temp[(ly + 1) * GROUP_W_FRINGE + lx + 2];
	vec4 g = temp[(ly + 1) * GROUP_W_FRINGE + lx + 3];

	vec4 h = temp[ly * GROUP_W_FRINGE + lx - 3];
	vec4 i = temp[ly * GROUP_W_FRINGE + lx - 2];
	vec4 j = temp[ly * GROUP_W_FRINGE + lx - 1];
	vec4 k = temp[ly * GROUP_W_FRINGE + lx];
	vec4 l = temp[ly * GROUP_W_FRINGE + lx + 1];
	vec4 m = temp[ly * GROUP_W_FRINGE + lx + 2];
	vec4 n = temp[ly * GROUP_W_FRINGE + lx + 3];

	// 0 degrees. The small bias makes the straight-vertical candidate win
	// unless a diagonal is better by more than 1e-4.
	vec4 pred = d + k;
	float score;
	float best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4;

	// -45 degrees.
	score = DIFF(b, k) + DIFF(c, l) + DIFF(d, m);
	if (score < best_score) {
		pred = c + l;
		best_score = score;
	}

	// -63 degrees.
	score = DIFF(a, l) + DIFF(b, m) + DIFF(c, n);
	if (score < best_score) {
		pred = b + m;
		best_score = score;
	}

	// +45 degrees.
	score = DIFF(d, i) + DIFF(e, j) + DIFF(f, k);
	if (score < best_score) {
		pred = e + j;
		best_score = score;
	}

	// +63 degrees.
	score = DIFF(e, h) + DIFF(f, i) + DIFF(g, j);
	if (score < best_score) {
		pred = f + i;
		// best_score isn't used anymore.
	}

	// Every candidate above is a sum of two samples; turn it into their average.
	pred *= 0.5f;

	// Temporal prediction (p2) of this pixel based on the previous and next fields.
	//
	//                ↑ y
	//     C   H      |
	//   A   F   K    |
	//     D x I      |
	//   B   G   L    |
	//     E   J      |
	//                +-----> time
	//
	lx = int(gl_LocalInvocationID.x);
#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
	// x is obviously aligned with D and I, so we don't need texcoord
	// adjustment for top/bottom field here, unlike earlier. However, we need
	// to start the block one pixel below since we need E/J, thus the -1 in
	// the y coordinate.
	base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width),
	               (gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height));

	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2);
	vec4 C = temp[(ly + 2) * GROUP_W + lx];
	vec4 D = temp[(ly + 1) * GROUP_W + lx];
	vec4 E = temp[ ly      * GROUP_W + lx];

	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4);
	vec4 H = temp[(ly + 2) * GROUP_W + lx];
	vec4 I = temp[(ly + 1) * GROUP_W + lx];
	vec4 J = temp[ ly      * GROUP_W + lx];
#else
	// Since spatial interlacing check is not enabled, we only need D
	// and I from the previous and next fields; since they are not shared
	// between the neighboring pixels, they can be straight-up loads.
	vec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width),
	                   (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height));
	vec4 D = INPUT2(DI_pos);
	vec4 I = INPUT4(DI_pos);
#endif

	// Load what we need from the previous field into shared memory,
	// since A/B can be reused between neighboring pixels. We need one
	// line above/below, but we don't need the horizontal fringe.
	//
	// A/B (and K/L below) are compared sample-for-sample against F/G, which
	// come from the current-field block, so they must be fetched from the
	// same rows: same vertical base as the current-field load, including
	// current_field_vertical_offset, rather than the D/I-aligned base.
	base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width),
	               (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset));
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1);
	vec4 A = temp[(ly + 1) * GROUP_W + lx];
	vec4 B = temp[ ly      * GROUP_W + lx];

	// What we need from the current field was loaded earlier.
	vec4 F = d;
	vec4 G = k;

	// Next field.
	LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5);
	vec4 K = temp[(ly + 1) * GROUP_W + lx];
	vec4 L = temp[ ly      * GROUP_W + lx];

	// Find temporal differences around this line.
	vec4 tdiff0 = abs(D - I);
	vec4 tdiff1 = abs(A - F) + abs(B - G);  // Actually twice tdiff1.
	vec4 tdiff2 = abs(K - F) + abs(L - G);  // Actually twice tdiff2.
	vec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2));

#if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
	// Spatial interlacing check.
	// We start by temporally interpolating the current vertical line (p0–p4):
	//
	//     C p0 H      ↑ y
	//       p1        |
	//     D p2 I      |
	//       p3        |
	//     E p4 J      +-----> time
	//
	vec4 p0 = 0.5f * (C + H);
	vec4 p1 = F;
	vec4 p2 = 0.5f * (D + I);
	vec4 p3 = G;
	vec4 p4 = 0.5f * (E + J);

	vec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3));
	vec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3));
	diff = max(diff, max(min_, -max_));
#else
	vec4 p2 = 0.5f * (D + I);
#endif

	// Keep the spatial prediction within +/- diff of the temporal prediction,
	// and write it to the other line of the output pair.
	val = clamp(pred, p2 - diff, p2 + diff);
	OUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), val);
}

#undef LOAD_PIXEL_BLOCK
#undef DIFF
#undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK