-in vec2 tc, tc_left, tc_down;
-in vec2 equation_tc_assuming_left, equation_tc_assuming_right;
+in vec3 tc, tc_left, tc_down;  // vec3 because tc is passed to texture()/textureOffset() on sampler2DArray uniforms, which take (s, t, layer)
+in vec3 equation_tc_assuming_left, equation_tc_assuming_right;  // NOTE(review): presumably widened to vec3 for the same array-layer reason -- their use is not visible here, confirm
-uniform sampler2D diff_flow_tex, diffusivity_tex;
-uniform usampler2D equation_red_tex, equation_black_tex;
+uniform sampler2DArray diff_flow_tex, diffusivity_tex;  // float array textures; diff_flow_tex is sampled with the vec3 tc
+uniform usampler2DArray equation_red_tex, equation_black_tex;  // unsigned-integer array textures
// just immediately throws away half of the warp, but it helps convergence
// a _lot_ (rough testing indicates that five iterations of SOR is as good
// as ~50 iterations of Jacobi). We could probably do better by reorganizing
// just immediately throws away half of the warp, but it helps convergence
// a _lot_ (rough testing indicates that five iterations of SOR is as good
// as ~50 iterations of Jacobi). We could probably do better by reorganizing
- // the data into two-values-per-pixel, so-called “twinning buffering”,
- // but it makes for rather annoying code in the rest of the pipeline.
+ // the data into two-values-per-pixel, so-called “twinned buffering”;
+ // seemingly, this speeds up the SOR code by ~15% on Haswell, but not at all
+ // on GTX 950 (at least not at 720p). Presumably the latter is already
+ // bandwidth bound.
// Simplified version of the code below, assuming diff_flow == 0.0f everywhere.
diff_flow.x = omega * b.x * inv_A11;
diff_flow.y = omega * b.y * inv_A22;
// Simplified version of the code below, assuming diff_flow == 0.0f everywhere.
diff_flow.x = omega * b.x * inv_A11;
diff_flow.y = omega * b.y * inv_A22;
b += smooth_r * textureOffset(diff_flow_tex, tc, ivec2( 1, 0)).xy;
b += smooth_d * textureOffset(diff_flow_tex, tc, ivec2( 0, -1)).xy;
b += smooth_u * textureOffset(diff_flow_tex, tc, ivec2( 0, 1)).xy;
b += smooth_r * textureOffset(diff_flow_tex, tc, ivec2( 1, 0)).xy;
b += smooth_d * textureOffset(diff_flow_tex, tc, ivec2( 0, -1)).xy;
b += smooth_u * textureOffset(diff_flow_tex, tc, ivec2( 0, 1)).xy;
- diff_flow = texture(diff_flow_tex, tc).xy;
+
+ if (num_nonzero_phases == 1) {  // NOTE(review): presumably the case where diff_flow_tex still holds only zeros -- confirm where num_nonzero_phases is set
+ diff_flow = vec2(0.0f);  // start from zero without fetching from diff_flow_tex
+ } else {
+ diff_flow = texture(diff_flow_tex, tc).xy;  // otherwise read the .xy of diff_flow_tex at tc
+ }