#version 440
#extension GL_ARB_gpu_shader_int64 : enable

// Normalizes four 256-bin symbol histograms (one per workgroup) so each sums to
// exactly prob_scale, then precomputes per-symbol rANS encoder tables
// (x_max, reciprocal frequency, bias, shift) in the style of ryg's rans_byte.h.
// Normalization strategy after cbloom:
// http://cbloomrants.blogspot.no/2014/02/02-11-14-understanding-ans-10.html

layout(local_size_x = 256) in;

layout(std430, binding = 9) buffer layoutName
{
	uint dist[4 * 256];      // Input: raw symbol counts, 256 per workgroup.
	uint ransfreq[4 * 256];  // Output: normalized frequencies.
};

layout(std140, binding = 12) buffer distBlock  // Will become an UBO to rans.shader, thus layout std140.
{
	uvec4 ransdist[4 * 256];  // Output: (x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift).
	uint sign_biases[4];      // Output: cumulative total per workgroup (start of the sign/zero symbol).
};

const uint prob_bits = 12;
const uint prob_scale = 1 << prob_bits;
const uint RANS_BYTE_L = (1u << 23);

// FIXME: should come through a uniform.
const uint sums[4] = { 57600, 115200, 302400, 446400 };

shared uint new_dist[256];

// BUGFIX: shared variables cannot have initializers (GLSL spec §4.3.8); the old
// "= 0" was both non-conformant and dead code, since invocation 0 always writes
// shared_tmp before the first read, and explicitly re-zeroes it before the
// atomicAdd reduction below.
shared uint shared_tmp;

// Multiple voting areas for the min/max reductions, so we don't need to clear them
// for every round.
shared uint voting_areas[256];

void main()
{
	uint base = 256 * gl_WorkGroupID.x;    // Start of this workgroup's 256-entry slice.
	uint sum = sums[gl_WorkGroupID.x];     // Expected total of the raw counts.
	uint zero_correction;

	uint i = gl_LocalInvocationID.x;  // Which element we are working on.

	// Zero has no sign bit. Yes, this is trickery: halve the zero symbol's count
	// here (rounding up so a nonzero count never becomes zero), and double the
	// normalized value back again after normalization.
	if (i == 0) {
		uint old_zero_freq = dist[base];
		uint new_zero_freq = (old_zero_freq + 1) / 2;  // Rounding up ensures we get a zero frequency.
		zero_correction = old_zero_freq - new_zero_freq;
		shared_tmp = sum - zero_correction;
		dist[base] = new_zero_freq;
	}

	memoryBarrierShared();
	barrier();

	sum = shared_tmp;

	// Normalize the pdf, taking care to never make a nonzero frequency into a zero.
	// This rounding (round down iff from_scaled^2 <= down*(down+1)) is presumably
	// optimal according to cbloom.
	float true_prob = float(dist[base + i]) / sum;
	float from_scaled = true_prob * prob_scale;
	float down = floor(from_scaled);
	uint new_val = (from_scaled * from_scaled <= down * (down + 1)) ? uint(down) : uint(down) + 1;
	if (dist[base + i] > 0) {
		new_val = max(new_val, 1);
	}

	// Reset shared_tmp. We could do without this barrier if we wanted to,
	// but meh. :-)
	if (i == 0) {
		shared_tmp = 0;
	}
	memoryBarrierShared();
	barrier();

	// Parallel sum to find the total.
	atomicAdd(shared_tmp, new_val);

	memoryBarrierShared();
	barrier();

	uint actual_sum = shared_tmp;

	// Apply corrections one by one, greedily, until we are at the exact right sum.
	// NOTE(review): each round consumes one voting_areas slot, so this assumes
	// |actual_sum - prob_scale| <= 256 — TODO confirm the rounding error bound.
	if (actual_sum > prob_scale) {
		// Entropy loss from decrementing this symbol's frequency by one.
		float loss = true_prob * log2(new_val / float(new_val - 1));

		voting_areas[i] = 0xffffffff;
		memoryBarrierShared();
		barrier();

		uint vote_no = 0;

		for ( ; actual_sum != prob_scale; --actual_sum, ++vote_no) {
			memoryBarrierShared();
			barrier();

			// Stick the thread ID in the lower mantissa bits so we never get a tie.
			uint my_vote = (floatBitsToUint(loss) & ~0xffu) | gl_LocalInvocationID.x;
			if (new_val <= 1) {
				// We can't touch this one any more, but it needs to participate in the barriers,
				// so we can't break.
				my_vote = 0xffffffff;
			}

			// Find out which thread has the one with the smallest loss.
			// (Positive floats compare like uints just fine.)
			// BUGFIX: only the atomic itself may update the slot. The old code also
			// stored atomicMin()'s return value — the *pre-op* contents — back with a
			// plain write, racing against the other invocations' atomics and
			// potentially clobbering the true minimum before the barrier.
			atomicMin(voting_areas[vote_no], my_vote);

			memoryBarrierShared();
			barrier();

			if (my_vote == voting_areas[vote_no]) {
				--new_val;
				loss = true_prob * log2(new_val / float(new_val - 1));
			}
		}
	} else {
		// Entropy gain from incrementing this symbol's frequency by one.
		float benefit = -true_prob * log2(new_val / float(new_val + 1));

		voting_areas[i] = 0;
		memoryBarrierShared();
		barrier();

		uint vote_no = 0;

		for ( ; actual_sum != prob_scale; ++actual_sum, ++vote_no) {
			// Stick the thread ID in the lower mantissa bits so we never get a tie.
			uint my_vote = (floatBitsToUint(benefit) & ~0xffu) | gl_LocalInvocationID.x;
			if (new_val == 0) {
				// It's meaningless to increase this, but it needs to participate in the barriers,
				// so we can't break.
				my_vote = 0;
			}

			// Find out which thread has the one with the most benefit.
			// (Positive floats compare like uints just fine.)
			// BUGFIX: plain write-back of atomicMax()'s return value removed; same
			// data race as in the atomicMin branch above.
			atomicMax(voting_areas[vote_no], my_vote);

			memoryBarrierShared();
			barrier();

			if (my_vote == voting_areas[vote_no]) {
				++new_val;
				benefit = -true_prob * log2(new_val / float(new_val + 1));
			}
		}
	}

	if (i == 0) {
		// Undo what we did above (the zero symbol's frequency was halved).
		new_val *= 2;
	}

	// Parallel prefix sum (Blelloch-style up-sweep then down-sweep over shared memory).
	new_dist[(i + 255) & 255u] = new_val;  // Move the zero symbol last.
	memoryBarrierShared();
	barrier();

	new_val = new_dist[i];

	// TODO: Why do we need this next barrier? It makes no sense.
	memoryBarrierShared();
	barrier();

	for (uint layer = 2; layer <= 256; layer *= 2) {
		if ((i & (layer - 1)) == layer - 1) {
			new_dist[i] += new_dist[i - (layer / 2)];
		}
		memoryBarrierShared();
		barrier();
	}
	for (uint layer = 128; layer >= 2; layer /= 2) {
		if ((i & (layer - 1)) == layer - 1 && i != 255) {
			new_dist[i + (layer / 2)] += new_dist[i];
		}
		memoryBarrierShared();
		barrier();
	}

	// After the sweeps, new_dist[i] holds the inclusive prefix sum; subtracting
	// our own frequency gives the cumulative start for this symbol.
	uint start = new_dist[i] - new_val;
	uint freq = new_val;

	// rANS encoder table, ryg-style, with prob_bits + 1 effective scale bits
	// (the doubled zero frequency raises the total above prob_scale).
	uint x_max = ((RANS_BYTE_L >> (prob_bits + 1)) << 8) * freq;
	uint cmpl_freq = ((1 << (prob_bits + 1)) - freq);
	uint rcp_freq, rcp_shift, bias;
	if (freq < 2) {
		// freq == 0 or 1: division by multiplication degenerates; encode via bias alone.
		rcp_freq = ~0u;
		rcp_shift = 0;
		bias = start + (1 << (prob_bits + 1)) - 1;
	} else {
		// shift = ceil(log2(freq)); rcp_freq = ceil(2^(shift+31) / freq), needs 64-bit.
		uint shift = 0;
		while (freq > (1u << shift)) {
			shift++;
		}

		rcp_freq = uint(((uint64_t(1) << (shift + 31)) + freq-1) / freq);
		rcp_shift = shift - 1;
		bias = start;
	}

	ransfreq[base + i] = freq;
	ransdist[base + i] = uvec4(x_max, rcp_freq, bias, (cmpl_freq << 16) | rcp_shift);

	if (i == 255) {
		// Total cumulative frequency = start of the sign/zero symbol moved last.
		sign_biases[gl_WorkGroupID.x] = new_dist[i];
	}
}