git.sesse.net Git - movit/blob - deinterlace_effect.comp

   1 // Implicit uniforms:
   2 // uniform int PREFIX(current_field_position);
   3 // uniform float PREFIX(inv_width);
   4 // uniform float PREFIX(inv_height);
   5 // uniform float PREFIX(current_field_vertical_offset);
   6
   7 // Compute shader implementation of DeinterlaceEffect. See the fragment
   8 // shader implementation (deinterlace_effect.frag) for comments about the
   9 // algorithm; comments here will mainly be about issues specific to the
  10 // compute shader implementation.
  11
  12 #define DIFF(s1, s2) dot((s1) - (s2), (s1) - (s2))
  13
  14 // In input pixels (so output will be 8x32). Corresponds to get_compute_dimensions()
  15 // in the C++ code. It is illogical that 8x32 would be better than e.g. 32x8,
  16 // since we reuse more data horizontally, but especially Intel cards are much more
  17 // happy about this for whatever reason.
  18 #define GROUP_W 8
  19 #define GROUP_H 16
  20
  21 // When sampling from the current field (spatial interpolation below), we have
  22 // a fringe of three pixels on the left and right sides, so we need to load
  23 // more. We also have one pixel above and below, although our destination pixel
  24 // is squeezed in the middle of them (they don't overlap), so we only need one
  25 // extra pixel.
  26 #define GROUP_W_FRINGE (GROUP_W + 6)
  27 #define GROUP_H_FRINGE (GROUP_H + 1)
  28
  29 layout(local_size_x = GROUP_W, local_size_y = GROUP_H) in;
  30
  31 #if (GROUP_W_FRINGE * GROUP_H_FRINGE) > (GROUP_W * (GROUP_H + 2))
  32 #define TEMP_NUM_ELEM (GROUP_W_FRINGE * GROUP_H_FRINGE)
  33 #else
  34 #define TEMP_NUM_ELEM (GROUP_W * (GROUP_H + 2))
  35 #endif
  36
  37 shared vec4 temp[TEMP_NUM_ELEM];
  38
  39 #if TEMP_NUM_ELEM > (GROUP_W * GROUP_H * 2)
  40 #error Not enough threads to load all data in two loads
  41 #endif
  42
  43 // Load a WxH block of samples. We need to do this in two phases,
  44 // since we have more input samples than we have output samples (threads);
  45 // in the second phase, some threads will be idle.
  46 #define LOAD_PIXEL_BLOCK(base_tc, block_width, block_height, func) \
  47 { \
  48         memoryBarrierShared(); \
  49         barrier(); \
  50         int thread_id = int(gl_LocalInvocationID.y) * GROUP_W + int(gl_LocalInvocationID.x); \
  51         { \
  52                 int x = thread_id % (block_width); \
  53                 int y = thread_id / (block_width); \
  54                 temp[thread_id] = func(vec2((base_tc).x + x * PREFIX(inv_width), \
  55                                             (base_tc).y + y * PREFIX(inv_height))); \
  56         } \
  57         const int num_threads = GROUP_W * GROUP_H; \
  58         if (thread_id + num_threads < (block_width) * (block_height)) { \
  59                 int x = (thread_id + num_threads) % (block_width); \
  60                 int y = (thread_id + num_threads) / (block_width); \
  61                 temp[thread_id + num_threads] = \
  62                         func(vec2((base_tc).x + x * PREFIX(inv_width), \
  63                                   (base_tc).y + y * PREFIX(inv_height))); \
  64         } \
  65         memoryBarrierShared(); \
  66         barrier(); \
  67 }
  68
  69 void FUNCNAME() {
  70         // The current thread is responsible for output of two pixels, namely (x,2y)
  71         // and (x,2y+1). One will be an unmodified one, the other one will be the
  72         // pixel we are trying to interpolate. If TFF (current_field_position==0),
  73         // the unmodified one is 2y+1 (remember OpenGL's bottom-left convention),
  74         // and if BFF, the unmodified one is 2y. So we need to invert current_field_position
  75         // to figure out which value to add.
  76         int yi = int(gl_GlobalInvocationID.y) * 2 + (PREFIX(current_field_position) ^ 1);
  77
  78         // Load in data for the current field. current_offset signals where the block
  79         // starts vertically; see set_gl_state() in the C++ code.
  80         vec2 base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + (0.5f - 3.0f)) * PREFIX(inv_width),
  81                             (gl_WorkGroupID.y * uint(GROUP_H) + 0.5f) * PREFIX(inv_height) + PREFIX(current_field_vertical_offset));
  82         LOAD_PIXEL_BLOCK(base_tc, GROUP_W_FRINGE, GROUP_H_FRINGE, INPUT3);
  83
  84         int lx = int(gl_LocalInvocationID.x) + 3;
  85         int ly = int(gl_LocalInvocationID.y);
  86
  87         // Output the unmodified pixel. For TFF (current_field_position == 0),
  88         // we have an extra pixel on the bottom that we're only using for interpolation
  89         // (it's being output by another workgroup), so we have to add 1.
  90         vec4 val = temp[(ly + (PREFIX(current_field_position) ^ 1)) * GROUP_W_FRINGE + lx];
  91         OUTPUT(ivec2(gl_GlobalInvocationID.x, yi), val);
  92
  93         // a b c d e f g     ↑ y
  94         //       x           |
  95         // h i j k l m n     +--> x
  96
  97         vec4 a = temp[(ly + 1) * GROUP_W_FRINGE + lx - 3];
  98         vec4 b = temp[(ly + 1) * GROUP_W_FRINGE + lx - 2];
  99         vec4 c = temp[(ly + 1) * GROUP_W_FRINGE + lx - 1];
 100         vec4 d = temp[(ly + 1) * GROUP_W_FRINGE + lx];
 101         vec4 e = temp[(ly + 1) * GROUP_W_FRINGE + lx + 1];
 102         vec4 f = temp[(ly + 1) * GROUP_W_FRINGE + lx + 2];
 103         vec4 g = temp[(ly + 1) * GROUP_W_FRINGE + lx + 3];
 104
 105         vec4 h = temp[ly * GROUP_W_FRINGE + lx - 3];
 106         vec4 i = temp[ly * GROUP_W_FRINGE + lx - 2];
 107         vec4 j = temp[ly * GROUP_W_FRINGE + lx - 1];
 108         vec4 k = temp[ly * GROUP_W_FRINGE + lx];
 109         vec4 l = temp[ly * GROUP_W_FRINGE + lx + 1];
 110         vec4 m = temp[ly * GROUP_W_FRINGE + lx + 2];
 111         vec4 n = temp[ly * GROUP_W_FRINGE + lx + 3];
 112
 113         // 0 degrees.
 114         vec4 pred = d + k;
 115         float score;
 116         float best_score = DIFF(c, j) + DIFF(d, k) + DIFF(e, l) - 1e-4;
 117
 118         // -45 degrees.
 119         score = DIFF(b, k) + DIFF(c, l) + DIFF(d, m);
 120         if (score < best_score) {
 121                 pred = c + l;
 122                 best_score = score;
 123         }
 124
 125         // -63 degrees.
 126         score = DIFF(a, l) + DIFF(b, m) + DIFF(c, n);
 127         if (score < best_score) {
 128                 pred = b + m;
 129                 best_score = score;
 130         }
 131
 132         // +45 degrees.
 133         score = DIFF(d, i) + DIFF(e, j) + DIFF(f, k);
 134         if (score < best_score) {
 135                 pred = e + j;
 136                 best_score = score;
 137         }
 138
 139         // +63 degrees.
 140         score = DIFF(e, h) + DIFF(f, i) + DIFF(g, j);
 141         if (score < best_score) {
 142                 pred = f + i;
 143                 // best_score isn't used anymore.
 144         }
 145
 146         pred *= 0.5f;
 147
 148         // Temporal prediction (p2) of this pixel based on the previous and next fields.
 149         //
 150         //                ↑ y
 151         //     C   H      |
 152         //   A   F   K    |
 153         //     D x I      |
 154         //   B   G   L    |
 155         //     E   J      |
 156         //                +-----> time
 157         //
 158         // x is obviously aligned with D and I, so we don't need texcoord
 159         // adjustment for top/bottom field here, unlike earlier. However, we need
 160         // to start the block one pixel below since we need E/J, thus the -1 in
 161         // the y coordinate.
 162         base_tc = vec2((gl_WorkGroupID.x * uint(GROUP_W) + 0.5f) * PREFIX(inv_width),
 163                        (gl_WorkGroupID.y * uint(GROUP_H) + (0.5f - 1.0f)) * PREFIX(inv_height));
 164         lx = int(gl_LocalInvocationID.x);
 165 #if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
 166         LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT2);
 167         vec4 C = temp[(ly + 2) * GROUP_W + lx];
 168         vec4 D = temp[(ly + 1) * GROUP_W + lx];
 169         vec4 E = temp[ ly      * GROUP_W + lx];
 170
 171         LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 2, INPUT4);
 172         vec4 H = temp[(ly + 2) * GROUP_W + lx];
 173         vec4 I = temp[(ly + 1) * GROUP_W + lx];
 174         vec4 J = temp[ ly      * GROUP_W + lx];
 175 #else
 176         // Since spatial interlacing check is not enabled, we only need D
 177         // and I from the previous and next fields; since they are not shared
 178         // between the neighboring pixels, they can be straight-up loads.
 179         vec2 DI_pos = vec2((gl_GlobalInvocationID.x + 0.5f) * PREFIX(inv_width),
 180                            (gl_GlobalInvocationID.y + 0.5f) * PREFIX(inv_height));
 181         vec4 D = INPUT2(DI_pos);
 182         vec4 I = INPUT4(DI_pos);
 183 #endif
 184
 185         // Load what we need from the previous field into shared memory,
 186         // since A/B can be reused between neighboring pixels. We need one
 187         // line above/below, but we don't need the horizontal fringe.
 188         LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT1);
 189         vec4 A = temp[(ly + 1) * GROUP_W + lx];
 190         vec4 B = temp[ ly      * GROUP_W + lx];
 191
 192         // What we need from the current field was loaded earlier.
 193         vec4 F = d;
 194         vec4 G = k;
 195
 196         // Next field.
 197         LOAD_PIXEL_BLOCK(base_tc, GROUP_W, GROUP_H + 1, INPUT5);
 198         vec4 K = temp[(ly + 1) * GROUP_W + lx];
 199         vec4 L = temp[ ly      * GROUP_W + lx];
 200
 201         // Find temporal differences around this line.
 202         vec4 tdiff0 = abs(D - I);
 203         vec4 tdiff1 = abs(A - F) + abs(B - G);  // Actually twice tdiff1.
 204         vec4 tdiff2 = abs(K - F) + abs(L - G);  // Actually twice tdiff2.
 205         vec4 diff = max(tdiff0, 0.5f * max(tdiff1, tdiff2));
 206
 207 #if YADIF_ENABLE_SPATIAL_INTERLACING_CHECK
 208         // Spatial interlacing check.
 209         // We start by temporally interpolating the current vertical line (p0–p4):
 210         //
 211         //     C p0 H      ↑ y
 212         //       p1        |
 213         //     D p2 I      |
 214         //       p3        |
 215         //     E p4 J      +-----> time
 216         //
 217         vec4 p0 = 0.5f * (C + H);
 218         vec4 p1 = F;
 219         vec4 p2 = 0.5f * (D + I);
 220         vec4 p3 = G;
 221         vec4 p4 = 0.5f * (E + J);
 222
 223         vec4 max_ = max(max(p2 - p3, p2 - p1), min(p0 - p1, p4 - p3));
 224         vec4 min_ = min(min(p2 - p3, p2 - p1), max(p0 - p1, p4 - p3));
 225         diff = max(diff, max(min_, -max_));
 226 #else
 227         vec4 p2 = 0.5f * (D + I);
 228 #endif
 229
 230         val = clamp(pred, p2 - diff, p2 + diff);
 231         OUTPUT(ivec2(gl_GlobalInvocationID.x, yi ^ 1), val);
 232 }
 233
 234 #undef LOAD_PIXEL_BLOCK
 235 #undef DIFF
 236 #undef YADIF_ENABLE_SPATIAL_INTERLACING_CHECK