From: Steinar H. Gunderson
Date: Fri, 3 Aug 2018 18:53:36 +0000 (+0200)
Subject: Pack the gradients and image together into a single 32-bit texture; seems to help...
X-Git-Tag: 1.8.0~76^2~140
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=d55856fef7b51604230ab3480f2930155d830ca8;p=nageru

Pack the gradients and image together into a single 32-bit texture; seems to help ~1.5 ms for flow on NVIDIA.

Review note: the packed texture is an integer (GL_R32UI) texture, so it is
bound with nearest_sampler instead of linear_sampler; an integer texture is
incomplete under linear filtering. The bit masks in the unpacking code use
unsigned literals, and the bit layout is documented next to the pack and
unpack functions.
---

diff --git a/flow.cpp b/flow.cpp
index 726e74f..3f3ff90 100644
--- a/flow.cpp
+++ b/flow.cpp
@@ -439,7 +439,7 @@ private:
 	GLuint motion_search_program;
 
 	GLuint uniform_inv_image_size, uniform_inv_prev_level_size;
-	GLuint uniform_image0_tex, uniform_image1_tex, uniform_grad0_tex, uniform_flow_tex;
+	GLuint uniform_image1_tex, uniform_grad0_tex, uniform_flow_tex;
 };
 
 MotionSearch::MotionSearch()
@@ -450,7 +450,6 @@ MotionSearch::MotionSearch()
 
 	uniform_inv_image_size = glGetUniformLocation(motion_search_program, "inv_image_size");
 	uniform_inv_prev_level_size = glGetUniformLocation(motion_search_program, "inv_prev_level_size");
-	uniform_image0_tex = glGetUniformLocation(motion_search_program, "image0_tex");
 	uniform_image1_tex = glGetUniformLocation(motion_search_program, "image1_tex");
 	uniform_grad0_tex = glGetUniformLocation(motion_search_program, "grad0_tex");
 	uniform_flow_tex = glGetUniformLocation(motion_search_program, "flow_tex");
@@ -460,9 +459,10 @@ void MotionSearch::exec(GLuint tex0_view, GLuint tex1_view, GLuint grad0_tex, GL
 {
 	glUseProgram(motion_search_program);
 
-	bind_sampler(motion_search_program, uniform_image0_tex, 0, tex0_view, nearest_sampler);
 	bind_sampler(motion_search_program, uniform_image1_tex, 1, tex1_view, linear_sampler);
-	bind_sampler(motion_search_program, uniform_grad0_tex, 2, grad0_tex, zero_border_sampler);
+	// grad0_tex is now an integer (GL_R32UI) texture; integer textures are
+	// incomplete unless filtered with NEAREST, so it must not use linear_sampler.
+	bind_sampler(motion_search_program, uniform_grad0_tex, 2, grad0_tex, nearest_sampler);
 	bind_sampler(motion_search_program, uniform_flow_tex, 3, flow_tex, linear_sampler);
 
 	glProgramUniform2f(motion_search_program, uniform_inv_image_size, 1.0f / level_width, 1.0f / level_height);
@@ -1035,7 +1035,7 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
 
 		// Create a new texture; we could be fancy and render use a multi-level
 		// texture, but meh.
-		GLuint grad0_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+		GLuint grad0_tex = pool.get_texture(GL_R32UI, level_width, level_height);
 
 		// Find the derivative.
 		{
diff --git a/motion_search.frag b/motion_search.frag
index 136d316..9ef607c 100644
--- a/motion_search.frag
+++ b/motion_search.frag
@@ -42,12 +42,39 @@ in vec2 flow_tc;
 in vec2 patch_center;
 out vec3 out_flow;
 
-uniform sampler2D flow_tex, grad0_tex, image0_tex, image1_tex;
+uniform sampler2D flow_tex, image1_tex;
+uniform usampler2D grad0_tex;  // Also contains image0.
 uniform vec2 inv_image_size, inv_prev_level_size;
 
+// Packed texel layout (see pack_gradients() in sobel.frag): bits 0..7 = pixel
+// value, bits 8..19 = x gradient, bits 20..31 = y gradient (both biased by 0.5).
+vec3 unpack_gradients(uint v)
+{
+	uint vi = v & 0xffu;
+	uint xi = (v >> 8) & 0xfffu;
+	uint yi = v >> 20;
+	vec3 r = vec3(xi * (1.0f / 4095.0f) - 0.5f, yi * (1.0f / 4095.0f) - 0.5f, vi * (1.0f / 255.0f));
+	return r;
+}
+
+// Note: The third component (z) is the actual pixel value.
+vec3 get_gradients(vec2 tc)
+{
+	vec3 grad = unpack_gradients(texture(grad0_tex, tc).x);
+
+	// Zero gradients outside the image. (We'd do this with a sampler,
+	// but we want the repeat behavior for the actual texels, in the
+	// z channel.)
+	if (any(lessThan(tc, vec2(0.0f))) || any(greaterThan(tc, vec2(1.0f)))) {
+		grad.xy = vec2(0.0f);
+	}
+
+	return grad;
+}
+
 void main()
 {
-	vec2 image_size = textureSize(image0_tex, 0);
+	vec2 image_size = textureSize(grad0_tex, 0);
 
 	// Lock the patch center to an integer, so that we never get
 	// any bilinear artifacts for the gradient. (NOTE: This assumes an
@@ -71,13 +98,13 @@ void main()
 	for (uint y = 0; y < patch_size; ++y) {
 		for (uint x = 0; x < patch_size; ++x) {
 			vec2 tc = base + uvec2(x, y) * inv_image_size;
-			vec2 grad = texture(grad0_tex, tc).xy;
+			vec3 grad = get_gradients(tc);
 			H[0][0] += grad.x * grad.x;
 			H[1][1] += grad.y * grad.y;
 			H[0][1] += grad.x * grad.y;
 
-			template_sum += texture(image0_tex, tc).x;
-			grad_sum += grad;
+			template_sum += grad.z;  // The actual template pixel value.
+			grad_sum += grad.xy;
 		}
 	}
 	H[1][0] = H[0][1];
@@ -105,10 +132,10 @@ void main()
 		for (uint y = 0; y < patch_size; ++y) {
 			for (uint x = 0; x < patch_size; ++x) {
 				vec2 tc = base + uvec2(x, y) * inv_image_size;
-				vec2 grad = texture(grad0_tex, tc).xy;
-				float t = texture(image0_tex, tc).x;
+				vec3 grad = get_gradients(tc);
+				float t = grad.z;
 				float warped = texture(image1_tex, tc + u_norm).x;
-				du += grad * (warped - t);
+				du += grad.xy * (warped - t);
 				warped_sum += warped;
 			}
 		}
diff --git a/sobel.frag b/sobel.frag
index 90c6d8a..3066300 100644
--- a/sobel.frag
+++ b/sobel.frag
@@ -1,10 +1,23 @@
 #version 450 core
 
 in vec2 tc;
-out vec2 gradients;
+out uint packed_gradients;
 
 uniform sampler2D tex;
 
+// Packed texel layout: bits 0..7 = pixel value (scaled by 255), bits 8..19 = x
+// gradient, bits 20..31 = y gradient (clamped to [-0.5, 0.5], scaled by 4095).
+uint pack_gradients(float x, float y, float v)
+{
+	x = clamp(x, -0.5f, 0.5f);
+	y = clamp(y, -0.5f, 0.5f);
+
+	uint vi = uint(round(v * 255.0f));
+	uint xi = uint(round((x + 0.5f) * 4095.0f));
+	uint yi = uint(round((y + 0.5f) * 4095.0f));
+	return vi | (xi << 8) | (yi << 20);
+}
+
 void main()
 {
 	// There are two common Sobel filters, horizontal and vertical
@@ -36,10 +49,18 @@ void main()
 	float right = textureOffset(tex, tc, ivec2( 1, 0)).x;
 	float bottom_right = textureOffset(tex, tc, ivec2( 1, -1)).x;
 
+	vec2 gradients;
 	gradients.x = (top_right + 2.0f * right + bottom_right) - (top_left + 2.0f * left + bottom_left);
 	gradients.y = (top_left + 2.0 * top + top_right) - (bottom_left + 2.0f * bottom + bottom_right);
 
 	// Normalize so that we have a normalized unit of intensity levels per pixel.
 	gradients.x *= 0.125;
 	gradients.y *= 0.125;
+
+	// Also store the actual pixel value, so that we get it “for free”
+	// when we sample the gradients in motion_search.frag later.
+	float center = texture(tex, tc).x;
+
+	// Pack everything into a single 32-bit value, using simple fixed-point.
+	packed_gradients = pack_gradients(gradients.x, gradients.y, center);
 }