unsigned char pix_cb[(WIDTH/2) * HEIGHT];
unsigned char pix_cr[(WIDTH/2) * HEIGHT];
-struct RansDistSSBO {
+struct RansCountSSBO {  // Raw per-symbol counts; this buffer is mapped and read back on the CPU.
unsigned dist[4 * 256];
- std::pair<unsigned, unsigned> ransdist[4 * 256];
+ unsigned ransfreq[4 * 256];  // Indexed as table * 256 + symbol; these values are what get written to the coded file.
+};
+
+struct RansDistUBO {  // Per-symbol rANS parameters; also used as the layout of the SSBO this UBO is copied from.
+ struct {
+ uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;  // Last field packs rcp_shift in the low 16 bits and cmpl_freq in the high 16 bits.
+ } ransdist[4 * 256];
+ struct {
+ uint32_t val;
+ uint32_t padding[3]; // Pads each entry to 16 bytes, the array element stride required by the std140 layout.
+ } sign_biases[4];
};
using namespace std;
}
check_error();
- // An SSBO for the rANS distributions.
+ // An SSBO for the raw rANS counts.
GLuint ssbo;
glGenBuffers(1, &ssbo);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
- glNamedBufferStorage(ssbo, 256 * 16 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ // SSBO holding the rANS distributions in RansDistUBO layout; its contents are copied into the UBO created below.
+ GLuint dist_ssbo;
+ glGenBuffers(1, &dist_ssbo);
+ glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
+ glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ check_error();
+
+ GLuint dist_ubo;  // Uniform-buffer copy of dist_ssbo, attached to uniform binding point 13 below.
+ glGenBuffers(1, &dist_ubo);
+ glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
+ glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);  // NOTE(review): dist_ubo is never mapped in the code visible here, so the map flags may be unnecessary -- confirm.
check_error();
// SSBOs for the rANS output (data and offsets).
glUseProgram(glsl_tally_program_num);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+ glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
glUseProgram(glsl_rans_program_num);
- glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo);
+ glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
glUseProgram(glsl_program_num);
check_error();
steady_clock::time_point start = steady_clock::now();
unsigned num_iterations = 100;
for (unsigned i = 0; i < num_iterations; ++i) {
- glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 16 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+ glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
glUseProgram(glsl_program_num);
glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
glDispatchCompute(4, 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+ // glCopyNamedBufferSubData is a buffer-update command, not a shader access, so
+ // GL_SHADER_STORAGE_BARRIER_BIT alone does not order it after shader writes.
+ // dist_ssbo is presumably filled by the dispatch above; make those writes
+ // visible to the copy with GL_BUFFER_UPDATE_BARRIER_BIT.
+ glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+ glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
+ glMemoryBarrier(GL_UNIFORM_BARRIER_BIT);
+
glUseProgram(glsl_rans_program_num);
glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
}
}
// Write out the distributions.
- const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, 256 * 16 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);  // Mapped from the SSBO (not the UBO); both use the RansDistUBO layout.
for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
// TODO: rather gamma-k or something
for (unsigned i = 0; i < 4; ++i) {
printf("writing table %d\n", i);
for (unsigned j = 0; j < NUM_SYMS; ++j) {
- printf("%d,%d: start=%d freq=%d\n", i, j, rans_dist->ransdist[i * 256 + j].first, rans_dist->ransdist[i * 256 + j].second);
- write_varint(rans_dist->ransdist[i * 256 + j].second, codedfp);
+ // All arguments are unsigned, so print them with %u; %d would show values >= 2^31 as negative.
+ printf("%u,%u: freq=%u x_max=%u, rcp_freq=%08x, bias=%u, rcp_shift=%u, cmpl_freq=%u\n",
+ i, j, rans_count->ransfreq[i * 256 + j],
+ rans_dist->ransdist[i * 256 + j].x_max,
+ rans_dist->ransdist[i * 256 + j].rcp_freq,
+ rans_dist->ransdist[i * 256 + j].bias,
+ rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
+ rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
+ write_varint(rans_count->ransfreq[i * 256 + j], codedfp);  // Only the raw frequency is stored in the file; the derived fields are debug output only.
}
}
}
const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+ string last_block;  // Payload of the most recently written block, kept to detect exact repeats.
for (unsigned y = 0; y < 8; ++y) {
for (unsigned x = 0; x < 8; ++x) {
for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) {
const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024;
const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x];
uint32_t num_rans_bytes = out_end - ptr;
-#if 0
+ assert(num_rans_bytes <= 1024);  // Each block has a 1024-byte slot ending at out_end; a larger value (including wrap-around when ptr is past out_end) means a bad offset.
+
if (num_rans_bytes == last_block.size() &&
memcmp(last_block.data(), ptr, last_block.size()) == 0) {
write_varint(0, codedfp);
- clear();
- return 1;
} else {
last_block = string((const char *)ptr, num_rans_bytes);
+ write_varint(num_rans_bytes, codedfp);  // NOTE(review): a length of 0 doubles as the "same as previous block" marker written above; presumably blocks are never empty -- confirm.
+ fwrite(ptr, 1, num_rans_bytes, codedfp);  // Payload follows its varint length.
}
-#endif
-
- write_varint(num_rans_bytes, codedfp);
- fwrite(ptr, 1, num_rans_bytes, codedfp);
}
}
}