From: Steinar H. Gunderson <sgunderson@bigfoot.com>
Date: Mon, 16 Oct 2017 19:27:50 +0000 (+0200)
Subject: Make the encoder 100% GPU. Not working yet, though.
X-Git-Url: https://git.sesse.net/?p=narabu;a=commitdiff_plain;h=5e1d27014149311318e97b8e04a6e05ec858e57c

Make the encoder 100% GPU. Not working yet, though.
---

diff --git a/coded.dat b/coded.dat
index 469306f..bb889f2 100644
Binary files a/coded.dat and b/coded.dat differ
diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp
index a4e7716..6b1df48 100644
--- a/narabu-encoder.cpp
+++ b/narabu-encoder.cpp
@@ -43,6 +43,11 @@ unsigned char pix_y[WIDTH * HEIGHT];
 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
 
+struct RansDistSSBO {  // CPU-side view of the SSBO bound at binding 9; mirrors the std430 block declared in rans.shader and tally.shader.
+	unsigned dist[4 * 256];
+	std::pair<unsigned, unsigned> ransdist[4 * 256];  // Read as uvec2 by rans.shader: first = x (used as rANS start), second = y (used as freq). NOTE(review): assumes std::pair is laid out as two packed 32-bit words -- confirm.
+};
+
 using namespace std;
 using namespace std::chrono;
 
@@ -72,182 +77,6 @@ void readpix(unsigned char *ptr, const char *filename)
 	fclose(fp);
 }
 
-struct SymbolStats
-{
-    uint32_t freqs[NUM_SYMS];
-    uint32_t cum_freqs[NUM_SYMS + 1];
-
-    void clear();
-    void calc_cum_freqs();
-    void normalize_freqs(uint32_t target_total);
-};
-
-void SymbolStats::clear()
-{
-    for (int i=0; i < NUM_SYMS; i++)
-        freqs[i] = 0;
-}
-
-void SymbolStats::calc_cum_freqs()
-{
-    cum_freqs[0] = 0;
-    for (int i=0; i < NUM_SYMS; i++)
-        cum_freqs[i+1] = cum_freqs[i] + freqs[i];
-}
-
-void SymbolStats::normalize_freqs(uint32_t target_total)
-{
-    uint64_t real_freq[NUM_SYMS + 1];  // hack
-
-    assert(target_total >= NUM_SYMS);
-
-    calc_cum_freqs();
-    uint32_t cur_total = cum_freqs[NUM_SYMS];
-
-    if (cur_total == 0) return;
-
-    double ideal_cost = 0.0;
-    for (int i = 1; i <= NUM_SYMS; i++)
-    {
-      real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
-      if (real_freq[i] > 0)
-        ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
-    }
-
-    OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
-
-    // calculate updated freqs and make sure we didn't screw anything up
-    assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
-    for (int i=0; i < NUM_SYMS; i++) {
-        if (freqs[i] == 0)
-            assert(cum_freqs[i+1] == cum_freqs[i]);
-        else
-            assert(cum_freqs[i+1] > cum_freqs[i]);
-
-        // calc updated freq
-        freqs[i] = cum_freqs[i+1] - cum_freqs[i];
-    }
-
-    double calc_cost = 0.0;
-    for (int i = 1; i <= NUM_SYMS; i++)
-    {
-      uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
-      if (real_freq[i] > 0)
-        calc_cost -= real_freq[i] * log2(freq / double(target_total));
-    }
-
-    static double total_loss = 0.0;
-    total_loss += calc_cost - ideal_cost;
-    static double total_loss_with_dp = 0.0;
-	double optimal_cost = 0.0;
-    //total_loss_with_dp += optimal_cost - ideal_cost;
-    printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
-		ideal_cost, optimal_cost,
-		 calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
-}
-
-SymbolStats stats[128];
-
-const int luma_mapping[64] = {
-	0, 0, 1, 1, 2, 2, 3, 3,
-	0, 0, 1, 2, 2, 2, 3, 3,
-	1, 1, 2, 2, 2, 3, 3, 3,
-	1, 1, 2, 2, 2, 3, 3, 3,
-	1, 2, 2, 2, 2, 3, 3, 3,
-	2, 2, 2, 2, 3, 3, 3, 3,
-	2, 2, 3, 3, 3, 3, 3, 3,
-	3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-int pick_stats_for(int x, int y)
-{
-	return luma_mapping[y * 8 + x];
-}
-
-class RansEncoder {
-public:
-	RansEncoder()
-	{
-		out_buf.reset(new uint8_t[out_max_size]);
-		clear();
-	}
-
-	void init_prob(SymbolStats &s)
-	{
-		for (int i = 0; i < NUM_SYMS; i++) {
-			//printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-			RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-		}
-		sign_bias = s.cum_freqs[NUM_SYMS];
-	}
-
-	void clear()
-	{
-		out_end = out_buf.get() + out_max_size;
-		ptr = out_end; // *end* of output buffer
-		RansEncInit(&rans);
-	}
-
-	uint32_t save_block(FILE *codedfp)  // Returns number of bytes.
-	{
-		RansEncFlush(&rans, &ptr);
-		//printf("post-flush = %08x\n", rans);
-
-		uint32_t num_rans_bytes = out_end - ptr;
-		if (num_rans_bytes == last_block.size() &&
-		    memcmp(last_block.data(), ptr, last_block.size()) == 0) {
-			write_varint(0, codedfp);
-			clear();
-			return 1;
-		} else {
-			last_block = string((const char *)ptr, num_rans_bytes);
-		}
-
-		write_varint(num_rans_bytes, codedfp);
-		//fwrite(&num_rans_bytes, 1, 4, codedfp);
-		fwrite(ptr, 1, num_rans_bytes, codedfp);
-
-		//printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
-
-
-		clear();
-
-		//printf("Saving block: %d rANS bytes\n", num_rans_bytes);
-		return num_rans_bytes;
-		//return num_rans_bytes;
-	}
-
-	void encode_coeff(short signed_k)
-	{
-		//printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
-		unsigned short k = abs(signed_k);
-		if (k >= ESCAPE_LIMIT) {
-			// Put the coefficient as a 1/(2^12) symbol _before_
-			// the 255 coefficient, since the decoder will read the
-			// 255 coefficient first.
-			RansEncPut(&rans, &ptr, k, 1, prob_bits);
-			k = ESCAPE_LIMIT;
-		}
-		RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
-		if (signed_k < 0) {
-			rans += sign_bias;
-		}
-	}
-
-private:
-	static constexpr size_t out_max_size = 32 << 20; // 32 MB.
-	static constexpr size_t max_num_sign = 1048576;  // Way too big. And actually bytes.
-
-	unique_ptr<uint8_t[]> out_buf;
-	uint8_t *out_end;
-	uint8_t *ptr;
-	RansState rans;
-	RansEncSymbol esyms[NUM_SYMS];
-	uint32_t sign_bias;
-
-	std::string last_block;
-};
-
 // Should be done on the GPU, of course, but irrelevant for the demonstration.
 void convert_ycbcr()
 {
@@ -316,7 +145,7 @@ int main(int argc, char **argv)
 		readpix(rgb, "color.pnm");
 	convert_ycbcr();
 
-	// Compile the shader.
+	// Compile the DCT shader.
 	string shader_src = ::read_file("encoder.shader");
 	GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
 	GLuint glsl_program_num = glCreateProgram();
@@ -347,15 +176,57 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
-	glUseProgram(glsl_program_num);
+	// Compile the rANS shader.
+	shader_src = ::read_file("rans.shader");
+	shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+	GLuint glsl_rans_program_num = glCreateProgram();
+	glAttachShader(glsl_rans_program_num, shader_num);
+	glLinkProgram(glsl_rans_program_num);
+
+	glGetProgramiv(glsl_rans_program_num, GL_LINK_STATUS, &success);
+	if (success == GL_FALSE) {
+		GLchar error_log[1024] = {0};
+		glGetProgramInfoLog(glsl_rans_program_num, 1024, nullptr, error_log);
+		fprintf(stderr, "Error linking program: %s\n", error_log);
+		exit(1);
+	}
+	check_error();
 
 	// An SSBO for the rANS distributions.
 	GLuint ssbo;
 	glGenBuffers(1, &ssbo);
 	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-	glBufferData(GL_SHADER_STORAGE_BUFFER, 256 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
+	glNamedBufferStorage(ssbo, 256 * 16 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	check_error();
+
+	// SSBOs for the rANS output (data and offsets).
+	GLuint output_ssbo;
+	glGenBuffers(1, &output_ssbo);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo);
+	glNamedBufferStorage(output_ssbo, 45 * 64 * 1024, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	check_error();
+
+	GLuint output_offset_ssbo;
+	glGenBuffers(1, &output_offset_ssbo);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_offset_ssbo);
+	glNamedBufferStorage(output_offset_ssbo, 45 * 64 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	check_error();
+
+	// Bind SSBOs.
+	glUseProgram(glsl_program_num);
 	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
 
+	glUseProgram(glsl_tally_program_num);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
+	glUseProgram(glsl_rans_program_num);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo);
+
+	glUseProgram(glsl_program_num);
+	check_error();
+
 	// Upload luma.
 	GLuint y_tex;
 	glGenTextures(1, &y_tex);
@@ -392,158 +263,73 @@ int main(int argc, char **argv)
 		check_error();
 	}
 
+	glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
+	glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
+	glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
+	glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
+	glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
+	glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
+	check_error();
+
+	// Bind uniforms.
+	glUseProgram(glsl_program_num);
 	GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
 	GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
 	GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
 	GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
 	GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
 	GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
-
 	glUniform1i(dc_ac7_tex_uniform, 0);
 	glUniform1i(ac1_ac6_tex_uniform, 1);
 	glUniform1i(ac2_ac5_tex_uniform, 2);
 	glUniform1i(ac3_tex_uniform, 3);
 	glUniform1i(ac4_tex_uniform, 4);
 	glUniform1i(image_tex_uniform, 5);
-	glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-	glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-	glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-	glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-	glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-	glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
-	check_error();
 
-	glUseProgram(glsl_tally_program_num);
-	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+	glUseProgram(glsl_rans_program_num);
+	dc_ac7_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "dc_ac7_tex");
+	ac1_ac6_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac1_ac6_tex");
+	ac2_ac5_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac2_ac5_tex");
+	ac3_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac3_tex");
+	ac4_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac4_tex");
+	image_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "image_tex");
+	glUniform1i(dc_ac7_tex_uniform, 0);
+	glUniform1i(ac1_ac6_tex_uniform, 1);
+	glUniform1i(ac2_ac5_tex_uniform, 2);
+	glUniform1i(ac3_tex_uniform, 3);
+	glUniform1i(ac4_tex_uniform, 4);
 
 	steady_clock::time_point start = steady_clock::now();
-	unsigned num_iterations = 1000;
+	unsigned num_iterations = 100;
 	for (unsigned i = 0; i < num_iterations; ++i) {
-		glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 4 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+		glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 16 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
 		glUseProgram(glsl_program_num);
 		glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
 		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
 		glUseProgram(glsl_tally_program_num);
 		glDispatchCompute(4, 1, 1);
+		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+	
+		glUseProgram(glsl_rans_program_num);
+		glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
 	}
 	check_error();
 	glFinish();
-	steady_clock::time_point now = steady_clock::now();
-
-	// CPU part starts here -- will be GPU later.
-	// We only do luma for now.
-
-	int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
-
-	glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
-	uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
-	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
-	uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
 	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
-	uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
-	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac3_tex);
-	int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
-	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac4_tex);
-	int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
-	check_error();
-
-	for (unsigned y = 0; y < HEIGHT; ++y) {
-		for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
-			coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
-			coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
-			coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
-			coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
-			coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
-			coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
-			coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
-			coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
-		}
-	}
+	steady_clock::time_point now = steady_clock::now();
 
 #if 0
-	for (unsigned y = 0; y < HEIGHT; ++y) {
-		for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
-			printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
-				coeff_y[y * WIDTH + xb*8 + 0],
-				coeff_y[y * WIDTH + xb*8 + 1],
-				coeff_y[y * WIDTH + xb*8 + 2],
-				coeff_y[y * WIDTH + xb*8 + 3],
-				coeff_y[y * WIDTH + xb*8 + 4],
-				coeff_y[y * WIDTH + xb*8 + 5],
-				coeff_y[y * WIDTH + xb*8 + 6],
-				coeff_y[y * WIDTH + xb*8 + 7]);
-			printf("%4d %4d %4d %4d %4d %4d %4d %4d || ",
-				pix_y[y * WIDTH + xb*8 + 0],
-				pix_y[y * WIDTH + xb*8 + 1],
-				pix_y[y * WIDTH + xb*8 + 2],
-				pix_y[y * WIDTH + xb*8 + 3],
-				pix_y[y * WIDTH + xb*8 + 4],
-				pix_y[y * WIDTH + xb*8 + 5],
-				pix_y[y * WIDTH + xb*8 + 6],
-				pix_y[y * WIDTH + xb*8 + 7]);
-		}
-		printf("\n");
-	}
-#endif
-
-	// DC coefficient pred from the right to left (within each slice)
-	for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
-		int prev_k = 128;
-
-		for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
-			unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
-			unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
-			int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
-
-			coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
-
-			prev_k = k;
-		}
-	}
+	printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
+		tot_bytes - extra_bits / 8,
+		extra_bits,
+		extra_bits / 8,
+		tot_bytes);
 
-	// For each coefficient, make some tables.
-	size_t extra_bits = 0;
-	for (unsigned i = 0; i < 64; ++i) {
-		stats[i].clear();
-	}
-	for (unsigned y = 0; y < 8; ++y) {
-		for (unsigned x = 0; x < 8; ++x) {
-			SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-
-			// Luma
-			for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
-				for (unsigned xb = 0; xb < WIDTH; xb += 8) {
-					unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
-					if (k >= ESCAPE_LIMIT) {
-						k = ESCAPE_LIMIT;
-						extra_bits += 12;  // escape this one
-					}
-					++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
-				}
-			}
-		}
-	}
+	printf("\n");
+#endif
 
-	for (unsigned i = 0; i < 64; ++i) {
-		stats[i].freqs[NUM_SYMS - 1] /= 2;  // zero, has no sign bits (yes, this is trickery)
-		stats[i].normalize_freqs(prob_scale);
-		stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
-		stats[i].freqs[NUM_SYMS - 1] *= 2;
-	}
+	printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
 
 	FILE *codedfp = fopen("coded.dat", "wb");
 	if (codedfp == nullptr) {
@@ -551,66 +337,52 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
+	// Write out the distributions.
+	const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, 256 * 16 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 	for (unsigned r = 0; r < 2; ++r) {  // Hack to write fake chroma tables.
 		// TODO: rather gamma-k or something
-		for (unsigned i = 0; i < 64; ++i) {
-			if (stats[i].cum_freqs[NUM_SYMS] == 0) {
-				continue;
-			}
+		for (unsigned i = 0; i < 4; ++i) {
 			printf("writing table %d\n", i);
 			for (unsigned j = 0; j < NUM_SYMS; ++j) {
-				write_varint(stats[i].freqs[j], codedfp);
+				printf("%d,%d: %d\n", i, j, rans_dist->ransdist[i * 256 + j].first);
+				write_varint(rans_dist->ransdist[i * 256 + j].first, codedfp);
 			}
 		}
 	}
 
-	RansEncoder rans_encoder;
+	// Write out the actual data.
+	// TODO: Do the deduplication.
+
+	const uint32_t *offsets = (const uint32_t *)glMapNamedBufferRange(output_offset_ssbo, 0, 45 * 64 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+#if 0
+	for (int i = 0; i < 45*64; ++i) {
+		printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]);
+	}
+#endif
 
-	size_t tot_bytes = 0;
+	const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 
-	// Luma
 	for (unsigned y = 0; y < 8; ++y) {
 		for (unsigned x = 0; x < 8; ++x) {
-			SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-			rans_encoder.init_prob(s_luma);
-
-			// Luma
-			std::vector<int> lens;
-
-			rans_encoder.clear();
-			size_t num_bytes = 0;
-			for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
-				unsigned yb = block_idx / WIDTH_BLOCKS;
-				unsigned xb = block_idx % WIDTH_BLOCKS;
-
-				int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
-				rans_encoder.encode_coeff(k);
-
-				if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
-					int l = rans_encoder.save_block(codedfp);
-					num_bytes += l;
-					lens.push_back(l);
+			for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) {
+				const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024;
+				const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x];
+				uint32_t num_rans_bytes = out_end - ptr;
+#if 0
+				if (num_rans_bytes == last_block.size() &&
+				    memcmp(last_block.data(), ptr, last_block.size()) == 0) {
+					write_varint(0, codedfp);
+					clear();
+					return 1;
+				} else {
+					last_block = string((const char *)ptr, num_rans_bytes);
 				}
+#endif
+
+				write_varint(num_rans_bytes, codedfp);
+				fwrite(ptr, 1, num_rans_bytes, codedfp);
 			}
-			tot_bytes += num_bytes;
-			printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
 		}
 	}
-
-	printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
-		tot_bytes - extra_bits / 8,
-		extra_bits,
-		extra_bits / 8,
-		tot_bytes);
-
-	printf("\n");
-	printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
-
-#if 1
-	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-	const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
-	for (int i = 0; i < 1024; ++i) {
-		printf("%d,%d: %u\n", i / 256, i % 256, dist[i]);
-	}
-#endif
+	fclose(codedfp);
 }
diff --git a/rans.shader b/rans.shader
new file mode 100644
index 0000000..a7c6f8c
--- /dev/null
+++ b/rans.shader
@@ -0,0 +1,180 @@
+#version 440
+#extension GL_NV_gpu_shader5 : enable
+
+layout(local_size_x = 1) in;
+
+const uint prob_bits = 13;  // Note!
+const uint prob_scale = 1 << prob_bits;
+const uint RANS_BYTE_L = (1u << 23);
+const uint BLOCKS_PER_STREAM = 320;
+const uint STREAM_BUF_SIZE = 1024;  // 1 kB per stream ought to be enough for everyone :-)
+const uint NUM_SYMS = 256;
+const uint ESCAPE_LIMIT = NUM_SYMS - 1;
+
+#define MAPPING(s0, s1, s2, s3, s4, s5, s6, s7) ((s0) | (s1 << 2) | (s2 << 4) | (s3 << 6) | (s4 << 8) | (s5 << 10) | (s6 << 12) | (s7 << 14))
+
+const uint luma_mapping[8] = {  // One packed word per coefficient row; MAPPING puts column c's distribution index (0..3) in bits 2c..2c+1.
+	MAPPING(0, 0, 1, 1, 2, 2, 3, 3),
+	MAPPING(0, 0, 1, 2, 2, 2, 3, 3),
+	MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
+	MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
+	MAPPING(1, 2, 2, 2, 2, 3, 3, 3),
+	MAPPING(2, 2, 2, 2, 3, 3, 3, 3),
+	MAPPING(2, 2, 3, 3, 3, 3, 3, 3),
+	MAPPING(3, 3, 3, 3, 3, 3, 3, 3),
+};
+
+layout(std430, binding = 9) buffer layoutName  // Symbol distributions; same block layout as the binding-9 buffer in tally.shader.
+{
+	uint dist[4 * 256];
+	uvec2 ransdist[4 * 256];  // Indexed by lut_base + symbol; x is passed to RansEncPut as start, y as freq.
+};
+
+layout(std430, binding = 10) buffer outputBuf  // Encoded bytes; each stream owns a STREAM_BUF_SIZE slice that is filled from its end downwards.
+{
+	uint8_t rans_output[];
+};
+
+layout(std430, binding = 11) buffer outputBuf2  // Per-stream index into rans_output at which that stream's data begins.
+{
+	uint rans_start_offset[];
+};
+
+struct RansEncoder {  // State for one output stream.
+	uint stream_num;   // const; index into rans_start_offset
+	uint lut_base;     // const; first ransdist entry of this stream's distribution
+	uint rans_offset;  // next write position in rans_output (moves downwards)
+	uint rans;  // current rANS state word
+};
+
+layout(r16ui) uniform restrict readonly uimage2D dc_ac7_tex;
+layout(r16ui) uniform restrict readonly uimage2D ac1_ac6_tex;
+layout(r16ui) uniform restrict readonly uimage2D ac2_ac5_tex;
+layout(r8i) uniform restrict readonly iimage2D ac3_tex;
+layout(r8i) uniform restrict readonly iimage2D ac4_tex;
+
+void RansEncInit(uint streamgroup_num, uint coeff_row, uint coeff_col, uint dist_num, out RansEncoder enc)  // Sets up <enc> for one (stream group, coefficient row, coefficient column) stream using distribution <dist_num>.
+{
+	enc.stream_num = streamgroup_num * 64 + coeff_row * 8 + coeff_col;  // 64 streams (8 rows x 8 columns) per stream group.
+	enc.lut_base = dist_num * 256;  // Each distribution occupies 256 consecutive ransdist entries.
+	enc.rans_offset = enc.stream_num * STREAM_BUF_SIZE + STREAM_BUF_SIZE;  // Starts one past the end of this stream's slice; bytes are written backwards.
+	enc.rans = RANS_BYTE_L;
+}
+
+void RansEncRenorm(inout uint rans, inout uint rans_offset, uint freq, uint prob_bits)  // Moves low bytes of <rans> into rans_output until <rans> is below the limit for <freq>. The <prob_bits> parameter shadows the file-level constant.
+{
+	uint x_max = ((RANS_BYTE_L >> prob_bits) << 8) * freq; // this turns into a shift.
+	if (rans >= x_max) {
+		do {
+			rans_output[--rans_offset] = uint8_t(rans & 0xff);  // Pre-decrement: output grows towards lower offsets.
+			rans >>= 8;
+		} while (rans >= x_max);
+	}
+}
+
+void RansEncPut(inout uint rans, inout uint rans_offset, uint start, uint freq, uint prob_bits)  // Encodes one symbol given its <start> and <freq> at a scale of 1 << prob_bits.
+{
+	RansEncRenorm(rans, rans_offset, freq, prob_bits);  // Renormalize before updating the state.
+	rans = ((rans / freq) << prob_bits) + (rans % freq) + start;
+}
+
+void RansEncFlush(uint rans, inout uint rans_offset)  // Writes the final 32-bit state into the four bytes just below the current write position.
+{
+	rans_offset -= 4;
+	rans_output[rans_offset + 0] = uint8_t(rans >> 0);  // Least significant byte at the lowest offset.
+	rans_output[rans_offset + 1] = uint8_t(rans >> 8);
+	rans_output[rans_offset + 2] = uint8_t(rans >> 16);
+	rans_output[rans_offset + 3] = uint8_t(rans >> 24);
+}
+
+void encode_coeff(uint coeff, uint bits, inout RansEncoder enc)
+{
+	// Sign-extend the low <bits> bits of <coeff> to recover the signed coefficient.
+	// FIXME: not needed for the bits == 8 case!
+	int signed_k = int(coeff << (32 - bits)) >> (32 - bits);
+	uint k = abs(signed_k);
+
+	if (k >= ESCAPE_LIMIT) {
+		// Escape: put the raw magnitude as a freq-1 symbol first, then code ESCAPE_LIMIT itself below.
+		RansEncPut(enc.rans, enc.rans_offset, k, 1, prob_bits);
+		k = ESCAPE_LIMIT;
+	}
+
+	// '+' binds tighter than '&', so the mask must be parenthesized or lut_base gets masked away. k == 0 wraps to symbol NUM_SYMS - 1.
+	uvec2 sym = ransdist[enc.lut_base + ((k - 1) & (NUM_SYMS - 1))];
+	RansEncPut(enc.rans, enc.rans_offset, sym.x, sym.y, prob_bits);
+	// TODO: fix some bias stuff here (the sign of signed_k is not encoded yet).
+}
+
+void encode_end(inout RansEncoder enc)  // Finishes the stream and records where its data starts.
+{
+	RansEncFlush(enc.rans, enc.rans_offset);
+	rans_start_offset[enc.stream_num] = enc.rans_offset;  // Absolute index into rans_output; the host uses it to find the first byte of this stream.
+}
+
+void encode_9_7(uint streamgroup_num, uint coeff_row, layout(r16ui) restrict readonly uimage2D tex, uint col1, uint col2, uint dist1, uint dist2)  // Encodes two streams from one r16ui texture: the low 9 bits of each texel go to <col1>, the remaining upper bits to <col2>.
+{
+	RansEncoder enc1, enc2;
+	RansEncInit(streamgroup_num, coeff_row, col1, dist1, enc1);
+	RansEncInit(streamgroup_num, coeff_row, col2, dist2, enc2);
+
+	for (uint subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {  // Walks the stream group's blocks from last to first.
+		// TODO: Use SSBOs instead of a texture?
+		uint x = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) % 160;  // NOTE(review): 160 is presumably the number of blocks per row -- confirm against WIDTH.
+		uint y = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) / 160;
+		uint f = imageLoad(tex, ivec2(x, y * 8 + coeff_row)).x;
+
+		encode_coeff(f & 0x1ffu, 9, enc1);
+		encode_coeff(f >> 9, 7, enc2);
+	}
+
+	encode_end(enc1);
+	encode_end(enc2);
+}
+
+void encode_8(uint streamgroup_num, uint coeff_row, layout(r8i) restrict readonly iimage2D tex, uint col, uint dist)  // Encodes one stream from an r8i texture that holds a single 8-bit coefficient per texel.
+{
+	RansEncoder enc;
+	RansEncInit(streamgroup_num, coeff_row, col, dist, enc);
+
+	for (uint subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {  // Walks the stream group's blocks from last to first.
+		// TODO: Use SSBOs instead of a texture?
+		uint x = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) % 160;  // NOTE(review): 160 is presumably the number of blocks per row -- confirm against WIDTH.
+		uint y = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) / 160;
+		int f = imageLoad(tex, ivec2(x, y * 8 + coeff_row)).x;
+
+		encode_coeff(f, 8, enc);
+	}
+
+	encode_end(enc);
+}
+
+void main()  // Each work group (a single invocation, local_size_x = 1) encodes the streams of one coefficient texture for one stream group and coefficient row.
+{
+	uint streamgroup_num = gl_WorkGroupID.x;
+	uint coeff_row = gl_WorkGroupID.y;    // 0..7
+	uint coeff_colset = gl_WorkGroupID.z;   // 0 = dc+ac7, 1 = ac1+ac6, 2 = ac2+ac5, 3 = ac3, 4 = ac4
+	uint m = luma_mapping[coeff_row];  // Packed 2-bit distribution indices for this row; column c sits at bit offset 2 * c.
+
+	// TODO: DC coeff pred
+
+	if (coeff_colset == 0) {
+		uint dist_dc = bitfieldExtract(m, 0, 2);
+		uint dist_ac7 = bitfieldExtract(m, 14, 2);
+		encode_9_7(streamgroup_num, coeff_row, dc_ac7_tex, 0, 7, dist_dc, dist_ac7);
+	} else if (coeff_colset == 1) {
+		uint dist_ac1 = bitfieldExtract(m, 2, 2);
+		uint dist_ac6 = bitfieldExtract(m, 12, 2);
+		encode_9_7(streamgroup_num, coeff_row, ac1_ac6_tex, 1, 6, dist_ac1, dist_ac6);
+	} else if (coeff_colset == 2) {
+		uint dist_ac2 = bitfieldExtract(m, 4, 2);
+		uint dist_ac5 = bitfieldExtract(m, 10, 2);
+		encode_9_7(streamgroup_num, coeff_row, ac2_ac5_tex, 2, 5, dist_ac2, dist_ac5);
+	} else if (coeff_colset == 3) {
+		uint dist_ac3 = bitfieldExtract(m, 6, 2);
+		encode_8(streamgroup_num, coeff_row, ac3_tex, 3, dist_ac3);
+	} else {
+		uint dist_ac4 = bitfieldExtract(m, 8, 2);
+		encode_8(streamgroup_num, coeff_row, ac4_tex, 4, dist_ac4);
+	}
+}
diff --git a/tally.shader b/tally.shader
index a97a825..1623a1d 100644
--- a/tally.shader
+++ b/tally.shader
@@ -7,6 +7,7 @@ layout(local_size_x = 256) in;
 layout(std430, binding = 9) buffer layoutName
 {
 	uint dist[4 * 256];
+	uvec2 ransdist[4 * 256];
 };
 
 const uint prob_bits = 12;
@@ -158,5 +159,5 @@ void main()
 		memoryBarrierShared();
 		barrier();
 	}
-	dist[base + i] = new_dist[i];
+	ransdist[base + i] = uvec2(new_val, new_dist[i]);
 }