#include <movit/util.h>
-#include "ryg_rans/rans_byte.h"
-#include "ryg_rans/renormalize.h"
#include "util.h"
#define WIDTH 1280
#define NUM_SYMS 256
#define ESCAPE_LIMIT (NUM_SYMS - 1)
#define BLOCKS_PER_STREAM 320
+#define STREAM_BUF_SIZE 1024 // In bytes.
static constexpr uint32_t prob_bits = 12;
static constexpr uint32_t prob_scale = 1 << prob_bits;
unsigned char pix_cb[(WIDTH/2) * HEIGHT];
unsigned char pix_cr[(WIDTH/2) * HEIGHT];
+struct RansCountSSBO {
+ unsigned dist[4 * 256];
+ unsigned ransfreq[4 * 256];
+};
+
+struct RansDistUBO {
+ struct {
+ uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;
+ } ransdist[4 * 256];
+ struct {
+ uint32_t val;
+ uint32_t padding[3]; // std140 layout.
+ } sign_biases[4];
+};
+
using namespace std;
using namespace std::chrono;
fclose(fp);
}
-struct SymbolStats
-{
- uint32_t freqs[NUM_SYMS];
- uint32_t cum_freqs[NUM_SYMS + 1];
-
- void clear();
- void calc_cum_freqs();
- void normalize_freqs(uint32_t target_total);
-};
-
-void SymbolStats::clear()
-{
- for (int i=0; i < NUM_SYMS; i++)
- freqs[i] = 0;
-}
-
-void SymbolStats::calc_cum_freqs()
-{
- cum_freqs[0] = 0;
- for (int i=0; i < NUM_SYMS; i++)
- cum_freqs[i+1] = cum_freqs[i] + freqs[i];
-}
-
-void SymbolStats::normalize_freqs(uint32_t target_total)
-{
- uint64_t real_freq[NUM_SYMS + 1]; // hack
-
- assert(target_total >= NUM_SYMS);
-
- calc_cum_freqs();
- uint32_t cur_total = cum_freqs[NUM_SYMS];
-
- if (cur_total == 0) return;
-
- double ideal_cost = 0.0;
- for (int i = 1; i <= NUM_SYMS; i++)
- {
- real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
- if (real_freq[i] > 0)
- ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
- }
-
- OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
-
- // calculate updated freqs and make sure we didn't screw anything up
- assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
- for (int i=0; i < NUM_SYMS; i++) {
- if (freqs[i] == 0)
- assert(cum_freqs[i+1] == cum_freqs[i]);
- else
- assert(cum_freqs[i+1] > cum_freqs[i]);
-
- // calc updated freq
- freqs[i] = cum_freqs[i+1] - cum_freqs[i];
- }
-
- double calc_cost = 0.0;
- for (int i = 1; i <= NUM_SYMS; i++)
- {
- uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
- if (real_freq[i] > 0)
- calc_cost -= real_freq[i] * log2(freq / double(target_total));
- }
-
- static double total_loss = 0.0;
- total_loss += calc_cost - ideal_cost;
- static double total_loss_with_dp = 0.0;
- double optimal_cost = 0.0;
- //total_loss_with_dp += optimal_cost - ideal_cost;
- printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
- ideal_cost, optimal_cost,
- calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
-}
-
-SymbolStats stats[128];
-
-const int luma_mapping[64] = {
- 0, 0, 1, 1, 2, 2, 3, 3,
- 0, 0, 1, 2, 2, 2, 3, 3,
- 1, 1, 2, 2, 2, 3, 3, 3,
- 1, 1, 2, 2, 2, 3, 3, 3,
- 1, 2, 2, 2, 2, 3, 3, 3,
- 2, 2, 2, 2, 3, 3, 3, 3,
- 2, 2, 3, 3, 3, 3, 3, 3,
- 3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-int pick_stats_for(int x, int y)
-{
- return luma_mapping[y * 8 + x];
-}
-
-class RansEncoder {
-public:
- RansEncoder()
- {
- out_buf.reset(new uint8_t[out_max_size]);
- clear();
- }
-
- void init_prob(SymbolStats &s)
- {
- for (int i = 0; i < NUM_SYMS; i++) {
- //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
- RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
- }
- sign_bias = s.cum_freqs[NUM_SYMS];
- }
-
- void clear()
- {
- out_end = out_buf.get() + out_max_size;
- ptr = out_end; // *end* of output buffer
- RansEncInit(&rans);
- }
-
- uint32_t save_block(FILE *codedfp) // Returns number of bytes.
- {
- RansEncFlush(&rans, &ptr);
- //printf("post-flush = %08x\n", rans);
-
- uint32_t num_rans_bytes = out_end - ptr;
- if (num_rans_bytes == last_block.size() &&
- memcmp(last_block.data(), ptr, last_block.size()) == 0) {
- write_varint(0, codedfp);
- clear();
- return 1;
- } else {
- last_block = string((const char *)ptr, num_rans_bytes);
- }
-
- write_varint(num_rans_bytes, codedfp);
- //fwrite(&num_rans_bytes, 1, 4, codedfp);
- fwrite(ptr, 1, num_rans_bytes, codedfp);
-
- //printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
-
-
- clear();
-
- //printf("Saving block: %d rANS bytes\n", num_rans_bytes);
- return num_rans_bytes;
- //return num_rans_bytes;
- }
-
- void encode_coeff(short signed_k)
- {
- //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
- unsigned short k = abs(signed_k);
- if (k >= ESCAPE_LIMIT) {
- // Put the coefficient as a 1/(2^12) symbol _before_
- // the 255 coefficient, since the decoder will read the
- // 255 coefficient first.
- RansEncPut(&rans, &ptr, k, 1, prob_bits);
- k = ESCAPE_LIMIT;
- }
- RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
- if (signed_k < 0) {
- rans += sign_bias;
- }
- }
-
-private:
- static constexpr size_t out_max_size = 32 << 20; // 32 MB.
- static constexpr size_t max_num_sign = 1048576; // Way too big. And actually bytes.
-
- unique_ptr<uint8_t[]> out_buf;
- uint8_t *out_end;
- uint8_t *ptr;
- RansState rans;
- RansEncSymbol esyms[NUM_SYMS];
- uint32_t sign_bias;
-
- std::string last_block;
-};
-
// Should be done on the GPU, of course, but irrelevant for the demonstration.
void convert_ycbcr()
{
readpix(rgb, "color.pnm");
convert_ycbcr();
- // Compile the shader.
+ // Compile the DCT shader.
string shader_src = ::read_file("encoder.shader");
GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
GLuint glsl_program_num = glCreateProgram();
exit(1);
}
+ // Compile the tally shader.
+ shader_src = ::read_file("tally.shader");
+ shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+ GLuint glsl_tally_program_num = glCreateProgram();
+ glAttachShader(glsl_tally_program_num, shader_num);
+ glLinkProgram(glsl_tally_program_num);
+
+ glGetProgramiv(glsl_tally_program_num, GL_LINK_STATUS, &success);
+ if (success == GL_FALSE) {
+ GLchar error_log[1024] = {0};
+ glGetProgramInfoLog(glsl_tally_program_num, 1024, nullptr, error_log);
+ fprintf(stderr, "Error linking program: %s\n", error_log);
+ exit(1);
+ }
+
+ // Compile the rANS shader.
+ shader_src = ::read_file("rans.shader");
+ shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+ GLuint glsl_rans_program_num = glCreateProgram();
+ glAttachShader(glsl_rans_program_num, shader_num);
+ glLinkProgram(glsl_rans_program_num);
+
+ glGetProgramiv(glsl_rans_program_num, GL_LINK_STATUS, &success);
+ if (success == GL_FALSE) {
+ GLchar error_log[1024] = {0};
+ glGetProgramInfoLog(glsl_rans_program_num, 1024, nullptr, error_log);
+ fprintf(stderr, "Error linking program: %s\n", error_log);
+ exit(1);
+ }
+ check_error();
+
+ // An SSBO for the raw rANS counts.
+ GLuint ssbo;
+ glGenBuffers(1, &ssbo);
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
+ glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ // UBO for the rANS distributions (copied from an SSBO).
+ GLuint dist_ssbo;
+ glGenBuffers(1, &dist_ssbo);
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
+ glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ GLuint dist_ubo;
+ glGenBuffers(1, &dist_ubo);
+ glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
+ glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ // SSBOs for the rANS output (data and offsets).
+ GLuint output_ssbo;
+ glGenBuffers(1, &output_ssbo);
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo);
+ glNamedBufferStorage(output_ssbo, HEIGHT_BLOCKS * WIDTH_BLOCKS * STREAM_BUF_SIZE, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ GLuint bytes_written_ssbo;
+ glGenBuffers(1, &bytes_written_ssbo);
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, bytes_written_ssbo);
+ glNamedBufferStorage(bytes_written_ssbo, HEIGHT_BLOCKS * WIDTH_BLOCKS * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ // Bind SSBOs.
glUseProgram(glsl_program_num);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
+ glUseProgram(glsl_tally_program_num);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
+
+ glUseProgram(glsl_rans_program_num);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, bytes_written_ssbo);
+ glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
+
+ glUseProgram(glsl_program_num);
+ check_error();
// Upload luma.
GLuint y_tex;
check_error();
}
+ glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+ glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+ glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+ glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
+ glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
+ glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
+ check_error();
+
+ // Bind uniforms.
+ glUseProgram(glsl_program_num);
GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
-
glUniform1i(dc_ac7_tex_uniform, 0);
glUniform1i(ac1_ac6_tex_uniform, 1);
glUniform1i(ac2_ac5_tex_uniform, 2);
glUniform1i(ac3_tex_uniform, 3);
glUniform1i(ac4_tex_uniform, 4);
glUniform1i(image_tex_uniform, 5);
- glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
- glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
- glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
- glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
- glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
- glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
- check_error();
+
+ glUseProgram(glsl_rans_program_num);
+ dc_ac7_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "dc_ac7_tex");
+ ac1_ac6_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac1_ac6_tex");
+ ac2_ac5_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac2_ac5_tex");
+ ac3_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac3_tex");
+ ac4_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac4_tex");
+ image_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "image_tex");
+ glUniform1i(dc_ac7_tex_uniform, 0);
+ glUniform1i(ac1_ac6_tex_uniform, 1);
+ glUniform1i(ac2_ac5_tex_uniform, 2);
+ glUniform1i(ac3_tex_uniform, 3);
+ glUniform1i(ac4_tex_uniform, 4);
steady_clock::time_point start = steady_clock::now();
- unsigned num_iterations = 1000;
+ unsigned num_iterations = 100;
for (unsigned i = 0; i < num_iterations; ++i) {
- glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1);
+ glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+ glUseProgram(glsl_program_num);
+ glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+ glUseProgram(glsl_tally_program_num);
+ glDispatchCompute(4, 1, 1);
+ glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+ glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
+ glMemoryBarrier(GL_UNIFORM_BARRIER_BIT);
+
+ glUseProgram(glsl_rans_program_num);
+ glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
}
check_error();
glFinish();
- steady_clock::time_point now = steady_clock::now();
-
- // CPU part starts here -- will be GPU later.
- // We only do luma for now.
-
- int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
-
- glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
- uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
- check_error();
-
- glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
- uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
- check_error();
-
- glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
- uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
- check_error();
-
- glBindTexture(GL_TEXTURE_2D, ac3_tex);
- int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
- check_error();
-
- glBindTexture(GL_TEXTURE_2D, ac4_tex);
- int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
check_error();
+ steady_clock::time_point now = steady_clock::now();
- for (unsigned y = 0; y < HEIGHT; ++y) {
- for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
- coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
- coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
- coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
- coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
- coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
- coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
- coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
- coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
- }
- }
+#if 0
+ printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
+ tot_bytes - extra_bits / 8,
+ extra_bits,
+ extra_bits / 8,
+ tot_bytes);
-#if 1
- for (unsigned y = 0; y < HEIGHT; ++y) {
- for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
- printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
- coeff_y[y * WIDTH + xb*8 + 0],
- coeff_y[y * WIDTH + xb*8 + 1],
- coeff_y[y * WIDTH + xb*8 + 2],
- coeff_y[y * WIDTH + xb*8 + 3],
- coeff_y[y * WIDTH + xb*8 + 4],
- coeff_y[y * WIDTH + xb*8 + 5],
- coeff_y[y * WIDTH + xb*8 + 6],
- coeff_y[y * WIDTH + xb*8 + 7]);
- printf("%4d %4d %4d %4d %4d %4d %4d %4d || ",
- pix_y[y * WIDTH + xb*8 + 0],
- pix_y[y * WIDTH + xb*8 + 1],
- pix_y[y * WIDTH + xb*8 + 2],
- pix_y[y * WIDTH + xb*8 + 3],
- pix_y[y * WIDTH + xb*8 + 4],
- pix_y[y * WIDTH + xb*8 + 5],
- pix_y[y * WIDTH + xb*8 + 6],
- pix_y[y * WIDTH + xb*8 + 7]);
- }
- printf("\n");
- }
+ printf("\n");
#endif
- // DC coefficient pred from the right to left (within each slice)
- for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
- int prev_k = 128;
-
- for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
- unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
- unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
- int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
-
- coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
-
- prev_k = k;
- }
- }
-
- // For each coefficient, make some tables.
- size_t extra_bits = 0;
- for (unsigned i = 0; i < 64; ++i) {
- stats[i].clear();
- }
- for (unsigned y = 0; y < 8; ++y) {
- for (unsigned x = 0; x < 8; ++x) {
- SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-
- // Luma
- for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
- for (unsigned xb = 0; xb < WIDTH; xb += 8) {
- unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
- if (k >= ESCAPE_LIMIT) {
- k = ESCAPE_LIMIT;
- extra_bits += 12; // escape this one
- }
- ++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
- }
- }
- }
- }
-
- for (unsigned i = 0; i < 64; ++i) {
- stats[i].freqs[NUM_SYMS - 1] /= 2; // zero, has no sign bits (yes, this is trickery)
- stats[i].normalize_freqs(prob_scale);
- stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
- stats[i].freqs[NUM_SYMS - 1] *= 2;
- }
+ printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
FILE *codedfp = fopen("coded.dat", "wb");
if (codedfp == nullptr) {
exit(1);
}
+ // Write out the distributions.
+ const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
// TODO: rather gamma-k or something
- for (unsigned i = 0; i < 64; ++i) {
- if (stats[i].cum_freqs[NUM_SYMS] == 0) {
- continue;
- }
+ for (unsigned i = 0; i < 4; ++i) {
printf("writing table %d\n", i);
for (unsigned j = 0; j < NUM_SYMS; ++j) {
- write_varint(stats[i].freqs[j], codedfp);
+ printf("%d,%d: freq=%d x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n",
+ i, j, rans_count->ransfreq[i * 256 + j],
+ rans_dist->ransdist[i * 256 + j].x_max,
+ rans_dist->ransdist[i * 256 + j].rcp_freq,
+ rans_dist->ransdist[i * 256 + j].bias,
+ rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
+ rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
+ write_varint(rans_count->ransfreq[i * 256 + j], codedfp);
}
}
}
- RansEncoder rans_encoder;
+ // Write out the actual data.
- size_t tot_bytes = 0;
+ const uint32_t *bytes_written = (const uint32_t *)glMapNamedBufferRange(bytes_written_ssbo, 0, HEIGHT_BLOCKS * WIDTH_BLOCKS * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+#if 0
+ for (int i = 0; i < HEIGHT_BLOCKS*64; ++i) {
+ printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]);
+ }
+#endif
- // Luma
+ const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, HEIGHT_BLOCKS * WIDTH_BLOCKS * STREAM_BUF_SIZE, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+
+ string last_block;
for (unsigned y = 0; y < 8; ++y) {
for (unsigned x = 0; x < 8; ++x) {
- SymbolStats &s_luma = stats[pick_stats_for(x, y)];
- rans_encoder.init_prob(s_luma);
-
- // Luma
- std::vector<int> lens;
-
- rans_encoder.clear();
- size_t num_bytes = 0;
- for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
- unsigned yb = block_idx / WIDTH_BLOCKS;
- unsigned xb = block_idx % WIDTH_BLOCKS;
-
- int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
- rans_encoder.encode_coeff(k);
-
- if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
- int l = rans_encoder.save_block(codedfp);
- num_bytes += l;
- lens.push_back(l);
+ for (unsigned int stream_idx = 0; stream_idx < HEIGHT_BLOCKS; ++stream_idx) {
+ const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * STREAM_BUF_SIZE;
+ uint32_t num_rans_bytes = bytes_written[stream_idx * 64 + y * 8 + x];
+ const uint8_t *ptr = out_end - num_rans_bytes;
+ assert(num_rans_bytes <= STREAM_BUF_SIZE);
+
+ if (num_rans_bytes == last_block.size() &&
+ memcmp(last_block.data(), ptr, last_block.size()) == 0) {
+ write_varint(0, codedfp);
+ } else {
+ last_block = string((const char *)ptr, num_rans_bytes);
+ write_varint(num_rans_bytes, codedfp);
+ fwrite(ptr, 1, num_rans_bytes, codedfp);
}
}
- tot_bytes += num_bytes;
- printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
}
}
-
- printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
- tot_bytes - extra_bits / 8,
- extra_bits,
- extra_bits / 8,
- tot_bytes);
-
- printf("\n");
- printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
-
+ fclose(codedfp);
}