layout(std430, binding = 9) buffer layoutName
{
uint dist[4 * 256];
+ uvec2 ransdist[4 * 256];  // One uvec2 per slot; the shader stores uvec2(new_dist[i] - new_val, new_val) at ransdist[base + i]. Presumably (cumulative start, frequency) -- TODO confirm against the prefix sum.
};
const uint prob_bits = 12;
// Apply corrections one by one, greedily, until we are at the exact right sum.
if (actual_sum > prob_scale) {
- float loss = -true_prob * log2(new_val / (new_val - 1));
+ float loss = true_prob * log2(new_val / float(new_val - 1));
voting_areas[i] = 0xffffffff;
memoryBarrierShared();
barrier();
// Stick the thread ID in the lower mantissa bits so we never get a tie.
- uint my_vote = (floatBitsToUint(loss) & ~0xff) | gl_LocalInvocationID.x;
+ uint my_vote = (floatBitsToUint(loss) & ~0xffu) | gl_LocalInvocationID.x;
if (new_val <= 1) {
// We can't touch this one any more, but it needs to participate in the barriers,
// so we can't break.
if (my_vote == voting_areas[vote_no]) {
--new_val;
- loss = -true_prob * log2(new_val / (new_val - 1));
+ loss = true_prob * log2(new_val / float(new_val - 1));
}
}
} else {
- float benefit = true_prob * log2(new_val / (new_val + 1));
+ float benefit = -true_prob * log2(new_val / float(new_val + 1));
voting_areas[i] = 0;
memoryBarrierShared();
for ( ; actual_sum != prob_scale; ++actual_sum, ++vote_no) {
// Stick the thread ID in the lower mantissa bits so we never get a tie.
- uint my_vote = (floatBitsToUint(benefit) & ~0xff) | gl_LocalInvocationID.x;
+ uint my_vote = (floatBitsToUint(benefit) & ~0xffu) | gl_LocalInvocationID.x;
if (new_val == 0) {
// It's meaningless to increase this, but it needs to participate in the barriers,
// so we can't break.
if (my_vote == voting_areas[vote_no]) {
++new_val;
- benefit = true_prob * log2(new_val / (new_val + 1));
+ benefit = -true_prob * log2(new_val / float(new_val + 1));
}
}
}
}
// Parallel prefix sum.
- new_dist[(i + 255) & 255] = new_val; // Move the zero symbol last.
+ new_dist[(i + 255) & 255u] = new_val; // Move the zero symbol last.
+ memoryBarrierShared();
+ barrier();
+
+ new_val = new_dist[i];
+
+ // TODO: Why do we need this next barrier? It makes no sense.
memoryBarrierShared();
barrier();
memoryBarrierShared();
barrier();
}
- dist[base + i] = new_dist[i];
+ ransdist[base + i] = uvec2(new_dist[i] - new_val, new_val);
}