From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Fri, 3 Aug 2018 18:53:36 +0000 (+0200)
Subject: Pack the gradients and image together into a single 32-bit texture; seems to help... 
X-Git-Tag: 1.8.0~76^2~140
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=d55856fef7b51604230ab3480f2930155d830ca8;p=nageru

Pack the gradients and image together into a single 32-bit texture; seems to help ~1.5 ms for flow on NVIDIA.
---

diff --git a/flow.cpp b/flow.cpp
index 726e74f..3f3ff90 100644
--- a/flow.cpp
+++ b/flow.cpp
@@ -439,7 +439,7 @@ private:
 	GLuint motion_search_program;
 
 	GLuint uniform_inv_image_size, uniform_inv_prev_level_size;
-	GLuint uniform_image0_tex, uniform_image1_tex, uniform_grad0_tex, uniform_flow_tex;
+	GLuint uniform_image1_tex, uniform_grad0_tex, uniform_flow_tex;
 };
 
 MotionSearch::MotionSearch()
@@ -450,7 +450,6 @@ MotionSearch::MotionSearch()
 
 	uniform_inv_image_size = glGetUniformLocation(motion_search_program, "inv_image_size");
 	uniform_inv_prev_level_size = glGetUniformLocation(motion_search_program, "inv_prev_level_size");
-	uniform_image0_tex = glGetUniformLocation(motion_search_program, "image0_tex");
 	uniform_image1_tex = glGetUniformLocation(motion_search_program, "image1_tex");
 	uniform_grad0_tex = glGetUniformLocation(motion_search_program, "grad0_tex");
 	uniform_flow_tex = glGetUniformLocation(motion_search_program, "flow_tex");
@@ -460,9 +459,8 @@ void MotionSearch::exec(GLuint tex0_view, GLuint tex1_view, GLuint grad0_tex, GL
 {
 	glUseProgram(motion_search_program);
 
-	bind_sampler(motion_search_program, uniform_image0_tex, 0, tex0_view, nearest_sampler);
 	bind_sampler(motion_search_program, uniform_image1_tex, 1, tex1_view, linear_sampler);
-	bind_sampler(motion_search_program, uniform_grad0_tex, 2, grad0_tex, zero_border_sampler);
+	bind_sampler(motion_search_program, uniform_grad0_tex, 2, grad0_tex, linear_sampler);
 	bind_sampler(motion_search_program, uniform_flow_tex, 3, flow_tex, linear_sampler);
 
 	glProgramUniform2f(motion_search_program, uniform_inv_image_size, 1.0f / level_width, 1.0f / level_height);
@@ -1035,7 +1033,7 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
 
 		// Create a new texture; we could be fancy and render use a multi-level
 		// texture, but meh.
-		GLuint grad0_tex = pool.get_texture(GL_RG16F, level_width, level_height);
+		GLuint grad0_tex = pool.get_texture(GL_R32UI, level_width, level_height);
 
 		// Find the derivative.
 		{
diff --git a/motion_search.frag b/motion_search.frag
index 136d316..9ef607c 100644
--- a/motion_search.frag
+++ b/motion_search.frag
@@ -42,12 +42,37 @@ in vec2 flow_tc;
 in vec2 patch_center;
 out vec3 out_flow;
 
-uniform sampler2D flow_tex, grad0_tex, image0_tex, image1_tex;
+uniform sampler2D flow_tex, image1_tex;
+uniform usampler2D grad0_tex;  // Also contains image0.
 uniform vec2 inv_image_size, inv_prev_level_size;
 
+vec3 unpack_gradients(uint v)  // Decodes a texel written by pack_gradients() in sobel.frag; returns (x gradient, y gradient, pixel value).
+{
+	uint vi = v & 0xff;  // Bits 0..7: pixel value, 8-bit fixed point.
+	uint xi = (v >> 8) & 0xfff;  // Bits 8..19: x gradient, 12-bit fixed point.
+	uint yi = v >> 20;  // Bits 20..31: y gradient, 12-bit fixed point (topmost bits, so no mask is needed).
+	vec3 r = vec3(xi * (1.0f / 4095.0f) - 0.5f, yi * (1.0f / 4095.0f) - 0.5f, vi * (1.0f / 255.0f));  // Gradients map back to [-0.5, 0.5], the pixel value to [0, 1].
+	return r;
+}
+
+// Fetches the packed texel at tc from grad0_tex and unpacks it. Note: The third component (z) is the actual pixel value.
+vec3 get_gradients(vec2 tc)
+{
+	vec3 grad = unpack_gradients(texture(grad0_tex, tc).x);  // NOTE(review): flow.cpp binds this integer texture with linear_sampler; OpenGL requires NEAREST filtering for integer formats -- confirm.
+
+	// Zero gradients outside the image. (We'd do this with a sampler,
+	// but we want the repeat behavior for the actual texels, in the
+	// z channel.)
+	if (any(lessThan(tc, vec2(0.0f))) || any(greaterThan(tc, vec2(1.0f)))) {
+		grad.xy = vec2(0.0f);  // Only the gradients are cleared; the pixel value in .z is left untouched.
+	}
+
+	return grad;
+}
+
 void main()
 {
-	vec2 image_size = textureSize(image0_tex, 0);
+	vec2 image_size = textureSize(grad0_tex, 0);
 
 	// Lock the patch center to an integer, so that we never get
 	// any bilinear artifacts for the gradient. (NOTE: This assumes an
@@ -71,13 +96,13 @@ void main()
 	for (uint y = 0; y < patch_size; ++y) {
 		for (uint x = 0; x < patch_size; ++x) {
 			vec2 tc = base + uvec2(x, y) * inv_image_size;
-			vec2 grad = texture(grad0_tex, tc).xy;
+			vec3 grad = get_gradients(tc);
 			H[0][0] += grad.x * grad.x;
 			H[1][1] += grad.y * grad.y;
 			H[0][1] += grad.x * grad.y;
 
-			template_sum += texture(image0_tex, tc).x;
-			grad_sum += grad;
+			template_sum += grad.z;  // The actual template pixel value.
+			grad_sum += grad.xy;
 		}
 	}
 	H[1][0] = H[0][1];
@@ -105,10 +130,10 @@ void main()
 		for (uint y = 0; y < patch_size; ++y) {
 			for (uint x = 0; x < patch_size; ++x) {
 				vec2 tc = base + uvec2(x, y) * inv_image_size;
-				vec2 grad = texture(grad0_tex, tc).xy;
-				float t = texture(image0_tex, tc).x;
+				vec3 grad = get_gradients(tc);
+				float t = grad.z;
 				float warped = texture(image1_tex, tc + u_norm).x;
-				du += grad * (warped - t);
+				du += grad.xy * (warped - t);
 				warped_sum += warped;
 			}
 		}
diff --git a/sobel.frag b/sobel.frag
index 90c6d8a..3066300 100644
--- a/sobel.frag
+++ b/sobel.frag
@@ -1,10 +1,21 @@
 #version 450 core
 
 in vec2 tc;
-out vec2 gradients;
+out uint packed_gradients;
 
 uniform sampler2D tex;
 
+uint pack_gradients(float x, float y, float v)  // Packs the x/y gradients and pixel value v into one 32-bit word (decoded by unpack_gradients() in motion_search.frag).
+{
+	x = clamp(x, -0.5f, 0.5f);  // Gradients outside [-0.5, 0.5] are not representable and saturate.
+	y = clamp(y, -0.5f, 0.5f);
+
+	uint vi = uint(round(v * 255.0f));  // 8 bits. v is not clamped here; assumes v is in [0, 1] -- TODO confirm (larger values would spill into the x bits).
+	uint xi = uint(round((x + 0.5f) * 4095.0f));  // 12 bits: [-0.5, 0.5] -> [0, 4095].
+	uint yi = uint(round((y + 0.5f) * 4095.0f));  // 12 bits, same mapping as xi.
+	return vi | (xi << 8) | (yi << 20);  // Layout: bits 0..7 = v, bits 8..19 = x, bits 20..31 = y.
+}
+
 void main()
 {
 	// There are two common Sobel filters, horizontal and vertical
@@ -36,10 +47,18 @@ void main()
 	float right        = textureOffset(tex, tc, ivec2( 1,  0)).x;
 	float bottom_right = textureOffset(tex, tc, ivec2( 1, -1)).x;
 
+	vec2 gradients;
 	gradients.x = (top_right + 2.0f * right + bottom_right) - (top_left + 2.0f * left + bottom_left);
 	gradients.y = (top_left + 2.0 * top + top_right) - (bottom_left + 2.0f * bottom + bottom_right);
 
 	// Normalize so that we have a normalized unit of intensity levels per pixel.
 	gradients.x *= 0.125;
 	gradients.y *= 0.125;
+
+	// Also store the actual pixel value, so that we get it "for free"
+	// when we sample the gradients in motion_search.frag later.
+	float center = texture(tex, tc).x;
+
+	// Pack everything into a single 32-bit value, using simple fixed-point.
+	packed_gradients = pack_gradients(gradients.x, gradients.y, center);
 }