From: Steinar H. Gunderson
Date: Mon, 16 Oct 2017 19:27:50 +0000 (+0200)
Subject: Make the encoder 100% GPU. Not working yet, though.
X-Git-Url: https://git.sesse.net/?p=narabu;a=commitdiff_plain;h=5e1d27014149311318e97b8e04a6e05ec858e57c

Make the encoder 100% GPU. Not working yet, though.
---

diff --git a/coded.dat b/coded.dat
index 469306f..bb889f2 100644
Binary files a/coded.dat and b/coded.dat differ
diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp
index a4e7716..6b1df48 100644
--- a/narabu-encoder.cpp
+++ b/narabu-encoder.cpp
@@ -43,6 +43,11 @@ unsigned char pix_y[WIDTH * HEIGHT];
 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
 
+struct RansDistSSBO {
+	unsigned dist[4 * 256];
+	std::pair<uint32_t, uint32_t> ransdist[4 * 256];  // Mirrors the shaders' uvec2 ransdist[].
+};
+
 using namespace std;
 using namespace std::chrono;
 
@@ -72,182 +77,6 @@ void readpix(unsigned char *ptr, const char *filename)
 	fclose(fp);
 }
 
-struct SymbolStats
-{
-	uint32_t freqs[NUM_SYMS];
-	uint32_t cum_freqs[NUM_SYMS + 1];
-
-	void clear();
-	void calc_cum_freqs();
-	void normalize_freqs(uint32_t target_total);
-};
-
-void SymbolStats::clear()
-{
-	for (int i=0; i < NUM_SYMS; i++)
-		freqs[i] = 0;
-}
-
-void SymbolStats::calc_cum_freqs()
-{
-	cum_freqs[0] = 0;
-	for (int i=0; i < NUM_SYMS; i++)
-		cum_freqs[i+1] = cum_freqs[i] + freqs[i];
-}
-
-void SymbolStats::normalize_freqs(uint32_t target_total)
-{
-	uint64_t real_freq[NUM_SYMS + 1]; // hack
-
-	assert(target_total >= NUM_SYMS);
-
-	calc_cum_freqs();
-	uint32_t cur_total = cum_freqs[NUM_SYMS];
-
-	if (cur_total == 0) return;
-
-	double ideal_cost = 0.0;
-	for (int i = 1; i <= NUM_SYMS; i++)
-	{
-		real_freq[i] = cum_freqs[i] - cum_freqs[i - 1];
-		if (real_freq[i] > 0)
-			ideal_cost -= real_freq[i] * log2(real_freq[i] / double(cur_total));
-	}
-
-	OptimalRenormalize(cum_freqs, NUM_SYMS, prob_scale);
-
-	// calculate updated freqs and make sure we didn't screw anything up
-	assert(cum_freqs[0] == 0 && cum_freqs[NUM_SYMS] == target_total);
-	for (int i=0; i < NUM_SYMS; i++) {
-		if (freqs[i] == 0)
-			assert(cum_freqs[i+1] == cum_freqs[i]);
-		else
-			assert(cum_freqs[i+1] > cum_freqs[i]);
-
-		// calc updated freq
-		freqs[i] = cum_freqs[i+1] - cum_freqs[i];
-	}
-
-	double calc_cost = 0.0;
-	for (int i = 1; i <= NUM_SYMS; i++)
-	{
-		uint64_t freq = cum_freqs[i] - cum_freqs[i - 1];
-		if (real_freq[i] > 0)
-			calc_cost -= real_freq[i] * log2(freq / double(target_total));
-	}
-
-	static double total_loss = 0.0;
-	total_loss += calc_cost - ideal_cost;
-	static double total_loss_with_dp = 0.0;
-	double optimal_cost = 0.0;
-	//total_loss_with_dp += optimal_cost - ideal_cost;
-	printf("ideal cost = %.0f bits, DP cost = %.0f bits, calc cost = %.0f bits (loss = %.2f bytes, total loss = %.2f bytes, total loss with DP = %.2f bytes)\n",
-			ideal_cost, optimal_cost,
-			calc_cost, (calc_cost - ideal_cost) / 8.0, total_loss / 8.0, total_loss_with_dp / 8.0);
-}
-
-SymbolStats stats[128];
-
-const int luma_mapping[64] = {
-	0, 0, 1, 1, 2, 2, 3, 3,
-	0, 0, 1, 2, 2, 2, 3, 3,
-	1, 1, 2, 2, 2, 3, 3, 3,
-	1, 1, 2, 2, 2, 3, 3, 3,
-	1, 2, 2, 2, 2, 3, 3, 3,
-	2, 2, 2, 2, 3, 3, 3, 3,
-	2, 2, 3, 3, 3, 3, 3, 3,
-	3, 3, 3, 3, 3, 3, 3, 3,
-};
-
-int pick_stats_for(int x, int y)
-{
-	return luma_mapping[y * 8 + x];
-}
-
-class RansEncoder {
-public:
-	RansEncoder()
-	{
-		out_buf.reset(new uint8_t[out_max_size]);
-		clear();
-	}
-
-	void init_prob(SymbolStats &s)
-	{
-		for (int i = 0; i < NUM_SYMS; i++) {
-			//printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-			RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-		}
-		sign_bias = s.cum_freqs[NUM_SYMS];
-	}
-
-	void clear()
-	{
-		out_end = out_buf.get() + out_max_size;
-		ptr = out_end; // *end* of output buffer
-		RansEncInit(&rans);
-	}
-
-	uint32_t save_block(FILE *codedfp) // Returns number of bytes.
-	{
-		RansEncFlush(&rans, &ptr);
-		//printf("post-flush = %08x\n", rans);
-
-		uint32_t num_rans_bytes = out_end - ptr;
-		if (num_rans_bytes == last_block.size() &&
-		    memcmp(last_block.data(), ptr, last_block.size()) == 0) {
-			write_varint(0, codedfp);
-			clear();
-			return 1;
-		} else {
-			last_block = string((const char *)ptr, num_rans_bytes);
-		}
-
-		write_varint(num_rans_bytes, codedfp);
-		//fwrite(&num_rans_bytes, 1, 4, codedfp);
-		fwrite(ptr, 1, num_rans_bytes, codedfp);
-
-		//printf("first rANS bytes: %02x %02x %02x %02x %02x %02x %02x %02x\n", ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], ptr[6], ptr[7]);
-
-
-		clear();
-
-		//printf("Saving block: %d rANS bytes\n", num_rans_bytes);
-		return num_rans_bytes;
-		//return num_rans_bytes;
-	}
-
-	void encode_coeff(short signed_k)
-	{
-		//printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
-		unsigned short k = abs(signed_k);
-		if (k >= ESCAPE_LIMIT) {
-			// Put the coefficient as a 1/(2^12) symbol _before_
-			// the 255 coefficient, since the decoder will read the
-			// 255 coefficient first.
-			RansEncPut(&rans, &ptr, k, 1, prob_bits);
-			k = ESCAPE_LIMIT;
-		}
-		RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
-		if (signed_k < 0) {
-			rans += sign_bias;
-		}
-	}
-
-private:
-	static constexpr size_t out_max_size = 32 << 20; // 32 MB.
-	static constexpr size_t max_num_sign = 1048576; // Way too big. And actually bytes.
-
-	unique_ptr<uint8_t[]> out_buf;
-	uint8_t *out_end;
-	uint8_t *ptr;
-	RansState rans;
-	RansEncSymbol esyms[NUM_SYMS];
-	uint32_t sign_bias;
-
-	std::string last_block;
-};
-
 // Should be done on the GPU, of course, but irrelevant for the demonstration.
 void convert_ycbcr()
 {
@@ -316,7 +145,7 @@ int main(int argc, char **argv)
 	readpix(rgb, "color.pnm");
 	convert_ycbcr();
 
-	// Compile the shader.
+	// Compile the DCT shader.
 	string shader_src = ::read_file("encoder.shader");
 	GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
 	GLuint glsl_program_num = glCreateProgram();
@@ -347,15 +176,57 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
-	glUseProgram(glsl_program_num);
+	// Compile the rANS shader.
+	shader_src = ::read_file("rans.shader");
+	shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
+	GLuint glsl_rans_program_num = glCreateProgram();
+	glAttachShader(glsl_rans_program_num, shader_num);
+	glLinkProgram(glsl_rans_program_num);
+
+	glGetProgramiv(glsl_rans_program_num, GL_LINK_STATUS, &success);
+	if (success == GL_FALSE) {
+		GLchar error_log[1024] = {0};
+		glGetProgramInfoLog(glsl_rans_program_num, 1024, nullptr, error_log);
+		fprintf(stderr, "Error linking program: %s\n", error_log);
+		exit(1);
+	}
+	check_error();
 
 	// An SSBO for the rANS distributions.
 	GLuint ssbo;
 	glGenBuffers(1, &ssbo);
 	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-	glBufferData(GL_SHADER_STORAGE_BUFFER, 256 * 4 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY);
+	glNamedBufferStorage(ssbo, 256 * 16 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	check_error();
+
+	// SSBOs for the rANS output (data and offsets).
+	GLuint output_ssbo;
+	glGenBuffers(1, &output_ssbo);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo);
+	glNamedBufferStorage(output_ssbo, 45 * 64 * 1024, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	check_error();
+
+	GLuint output_offset_ssbo;
+	glGenBuffers(1, &output_offset_ssbo);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_offset_ssbo);
+	glNamedBufferStorage(output_offset_ssbo, 45 * 64 * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+	check_error();
+
+	// Bind SSBOs.
+	glUseProgram(glsl_program_num);
 	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
 
+	glUseProgram(glsl_tally_program_num);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+
+	glUseProgram(glsl_rans_program_num);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
+	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, output_offset_ssbo);
+
+	glUseProgram(glsl_program_num);
+	check_error();
+
 	// Upload luma.
 	GLuint y_tex;
 	glGenTextures(1, &y_tex);
@@ -392,158 +263,73 @@ int main(int argc, char **argv)
 		check_error();
 	}
 
+	glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);  // READ_WRITE: the rANS shader imageLoad()s units 0-4.
+	glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+	glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
+	glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
+	glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
+	glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
+	check_error();
+
+	// Bind uniforms.
+	glUseProgram(glsl_program_num);
 	GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
 	GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
 	GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
 	GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
 	GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
 	GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
-
 	glUniform1i(dc_ac7_tex_uniform, 0);
 	glUniform1i(ac1_ac6_tex_uniform, 1);
 	glUniform1i(ac2_ac5_tex_uniform, 2);
 	glUniform1i(ac3_tex_uniform, 3);
 	glUniform1i(ac4_tex_uniform, 4);
 	glUniform1i(image_tex_uniform, 5);
 
-	glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-	glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-	glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16UI);
-	glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-	glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8I);
-	glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
-	check_error();
-	glUseProgram(glsl_tally_program_num);
-	glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
+	glUseProgram(glsl_rans_program_num);
+	dc_ac7_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "dc_ac7_tex");
+	ac1_ac6_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac1_ac6_tex");
+	ac2_ac5_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac2_ac5_tex");
+	ac3_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac3_tex");
+	ac4_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac4_tex");
+	image_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "image_tex");
+	glUniform1i(dc_ac7_tex_uniform, 0);
+	glUniform1i(ac1_ac6_tex_uniform, 1);
+	glUniform1i(ac2_ac5_tex_uniform, 2);
+	glUniform1i(ac3_tex_uniform, 3);
+	glUniform1i(ac4_tex_uniform, 4);
 
 	steady_clock::time_point start = steady_clock::now();
-	unsigned num_iterations = 1000;
+	unsigned num_iterations = 100;
 	for (unsigned i = 0; i < num_iterations; ++i) {
-		glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 4 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
+		glClearNamedBufferSubData(ssbo, GL_R8, 0, 256 * 16 * sizeof(uint32_t), GL_RED, GL_UNSIGNED_BYTE, nullptr);
 		glUseProgram(glsl_program_num);
 		glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
 		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
 		glUseProgram(glsl_tally_program_num);
 		glDispatchCompute(4, 1, 1);
+		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);  // rANS reads the SSBO and imageLoad()s the coefficient textures.
+
+		glUseProgram(glsl_rans_program_num);
+		glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
 	}
 	check_error();
 	glFinish();
-	steady_clock::time_point now = steady_clock::now();
-
-	// CPU part starts here -- will be GPU later.
-	// We only do luma for now.
-
-	int16_t *coeff_y = new int16_t[WIDTH * HEIGHT];
-
-	glBindTexture(GL_TEXTURE_2D, dc_ac7_tex);
-	uint16_t *dc_ac7_data = new uint16_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, dc_ac7_data);
-	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac1_ac6_tex);
-	uint16_t *ac1_ac6_data = new uint16_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac1_ac6_data);
 	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac2_ac5_tex);
-	uint16_t *ac2_ac5_data = new uint16_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, ac2_ac5_data);
-	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac3_tex);
-	int8_t *ac3_data = new int8_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac3_data);
-	check_error();
-
-	glBindTexture(GL_TEXTURE_2D, ac4_tex);
-	int8_t *ac4_data = new int8_t[(WIDTH/8) * HEIGHT];
-	glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_BYTE, ac4_data);
-	check_error();
-
-	for (unsigned y = 0; y < HEIGHT; ++y) {
-		for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
-			coeff_y[y * WIDTH + xb*8 + 0] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 23) >> 23;
-			coeff_y[y * WIDTH + xb*8 + 7] = int(dc_ac7_data[y * (WIDTH/8) + xb] << 16) >> 25;
-			coeff_y[y * WIDTH + xb*8 + 1] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 23) >> 23;
-			coeff_y[y * WIDTH + xb*8 + 6] = int(ac1_ac6_data[y * (WIDTH/8) + xb] << 16) >> 25;
-			coeff_y[y * WIDTH + xb*8 + 2] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 23) >> 23;
-			coeff_y[y * WIDTH + xb*8 + 5] = int(ac2_ac5_data[y * (WIDTH/8) + xb] << 16) >> 25;
-			coeff_y[y * WIDTH + xb*8 + 3] = ac3_data[y * (WIDTH/8) + xb];
-			coeff_y[y * WIDTH + xb*8 + 4] = ac4_data[y * (WIDTH/8) + xb];
-		}
-	}
+	steady_clock::time_point now = steady_clock::now();
 
 #if 0
-	for (unsigned y = 0; y < HEIGHT; ++y) {
-		for (unsigned xb = 0; xb < WIDTH/8; ++xb) {
-			printf("%4d %4d %4d %4d %4d %4d %4d %4d | ",
-				coeff_y[y * WIDTH + xb*8 + 0],
-				coeff_y[y * WIDTH + xb*8 + 1],
-				coeff_y[y * WIDTH + xb*8 + 2],
-				coeff_y[y * WIDTH + xb*8 + 3],
-				coeff_y[y * WIDTH + xb*8 + 4],
-				coeff_y[y * WIDTH + xb*8 + 5],
-				coeff_y[y * WIDTH + xb*8 + 6],
-				coeff_y[y * WIDTH + xb*8 + 7]);
-			printf("%4d %4d %4d %4d %4d %4d %4d %4d || ",
-				pix_y[y * WIDTH + xb*8 + 0],
-				pix_y[y * WIDTH + xb*8 + 1],
-				pix_y[y * WIDTH + xb*8 + 2],
-				pix_y[y * WIDTH + xb*8 + 3],
-				pix_y[y * WIDTH + xb*8 + 4],
-				pix_y[y * WIDTH + xb*8 + 5],
-				pix_y[y * WIDTH + xb*8 + 6],
-				pix_y[y * WIDTH + xb*8 + 7]);
-		}
-		printf("\n");
-	}
-#endif
-
-	// DC coefficient pred from the right to left (within each slice)
-	for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
-		int prev_k = 128;
-
-		for (unsigned subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
-			unsigned yb = (block_idx + subblock_idx) / WIDTH_BLOCKS;
-			unsigned xb = (block_idx + subblock_idx) % WIDTH_BLOCKS;
-			int k = coeff_y[(yb * 8) * WIDTH + xb * 8];
-
-			coeff_y[(yb * 8) * WIDTH + xb * 8] = k - prev_k;
-
-			prev_k = k;
-		}
-	}
+	printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
+		tot_bytes - extra_bits / 8,
+		extra_bits,
+		extra_bits / 8,
+		tot_bytes);
 
-	// For each coefficient, make some tables.
-	size_t extra_bits = 0;
-	for (unsigned i = 0; i < 64; ++i) {
-		stats[i].clear();
-	}
-	for (unsigned y = 0; y < 8; ++y) {
-		for (unsigned x = 0; x < 8; ++x) {
-			SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-
-			// Luma
-			for (unsigned yb = 0; yb < HEIGHT; yb += 8) {
-				for (unsigned xb = 0; xb < WIDTH; xb += 8) {
-					unsigned short k = abs(coeff_y[(yb + y) * WIDTH + (xb + x)]);
-					if (k >= ESCAPE_LIMIT) {
-						k = ESCAPE_LIMIT;
-						extra_bits += 12; // escape this one
-					}
-					++s_luma.freqs[(k - 1) & (NUM_SYMS - 1)];
-				}
-			}
-		}
-	}
+	printf("\n");
+#endif
 
-	for (unsigned i = 0; i < 64; ++i) {
-		stats[i].freqs[NUM_SYMS - 1] /= 2; // zero, has no sign bits (yes, this is trickery)
-		stats[i].normalize_freqs(prob_scale);
-		stats[i].cum_freqs[NUM_SYMS] += stats[i].freqs[NUM_SYMS - 1];
-		stats[i].freqs[NUM_SYMS - 1] *= 2;
-	}
+	printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
 
 	FILE *codedfp = fopen("coded.dat", "wb");
 	if (codedfp == nullptr) {
@@ -551,66 +337,52 @@ int main(int argc, char **argv)
 		exit(1);
 	}
 
+	// Write out the distributions.
+	const RansDistSSBO *rans_dist = (const RansDistSSBO *)glMapNamedBufferRange(ssbo, 0, 256 * 16 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 	for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
 		// TODO: rather gamma-k or something
-		for (unsigned i = 0; i < 64; ++i) {
-			if (stats[i].cum_freqs[NUM_SYMS] == 0) {
-				continue;
-			}
+		for (unsigned i = 0; i < 4; ++i) {
 			printf("writing table %d\n", i);
 			for (unsigned j = 0; j < NUM_SYMS; ++j) {
-				write_varint(stats[i].freqs[j], codedfp);
+				printf("%d,%d: %d\n", i, j, rans_dist->ransdist[i * 256 + j].first);
+				write_varint(rans_dist->ransdist[i * 256 + j].first, codedfp);
 			}
 		}
 	}
 
-	RansEncoder rans_encoder;
+	// Write out the actual data.
+	// TODO: Do the deduplication.
+
+	const uint32_t *offsets = (const uint32_t *)glMapNamedBufferRange(output_offset_ssbo, 0, 45 * 64 * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+#if 0
+	for (int i = 0; i < 45*64; ++i) {
+		printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]);
+	}
+#endif
 
-	size_t tot_bytes = 0;
+	const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, 45 * 64 * 1024, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
 
-	// Luma
 	for (unsigned y = 0; y < 8; ++y) {
 		for (unsigned x = 0; x < 8; ++x) {
-			SymbolStats &s_luma = stats[pick_stats_for(x, y)];
-			rans_encoder.init_prob(s_luma);
-
-			// Luma
-			std::vector<int> lens;
-
-			rans_encoder.clear();
-			size_t num_bytes = 0;
-			for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
-				unsigned yb = block_idx / WIDTH_BLOCKS;
-				unsigned xb = block_idx % WIDTH_BLOCKS;
-
-				int k = coeff_y[(yb * 8 + y) * WIDTH + (xb * 8 + x)];
-				rans_encoder.encode_coeff(k);
-
-				if (block_idx % BLOCKS_PER_STREAM == (BLOCKS_PER_STREAM - 1) || block_idx == NUM_BLOCKS - 1) {
-					int l = rans_encoder.save_block(codedfp);
-					num_bytes += l;
-					lens.push_back(l);
+			for (unsigned int stream_idx = 0; stream_idx < 45; ++stream_idx) {
+				const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * 1024;
+				const uint8_t *ptr = data + offsets[stream_idx * 64 + y * 8 + x];
+				uint32_t num_rans_bytes = out_end - ptr;
+#if 0
+				if (num_rans_bytes == last_block.size() &&
+				    memcmp(last_block.data(), ptr, last_block.size()) == 0) {
+					write_varint(0, codedfp);
+					clear();
+					return 1;
+				} else {
+					last_block = string((const char *)ptr, num_rans_bytes);
 				}
+#endif
+
+				write_varint(num_rans_bytes, codedfp);
+				fwrite(ptr, 1, num_rans_bytes, codedfp);
 			}
-			tot_bytes += num_bytes;
-			printf("coeff %d Y': %ld bytes\n", y * 8 + x, num_bytes);
 		}
 	}
-
-	printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
-		tot_bytes - extra_bits / 8,
-		extra_bits,
-		extra_bits / 8,
-		tot_bytes);
-
-	printf("\n");
-	printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration<double>(now - start).count() / num_iterations);
-
-#if 1
-	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
-	const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
-	for (int i = 0; i < 1024; ++i) {
-		printf("%d,%d: %u\n", i / 256, i % 256, dist[i]);
-	}
-#endif
+	fclose(codedfp);
 }
diff --git a/rans.shader b/rans.shader
new file mode 100644
index 0000000..a7c6f8c
--- /dev/null
+++ b/rans.shader
@@ -0,0 +1,180 @@
+#version 440
+#extension GL_NV_gpu_shader5 : enable
+
+layout(local_size_x = 1) in;
+
+const uint prob_bits = 13; // Note!
+const uint prob_scale = 1 << prob_bits;
+const uint RANS_BYTE_L = (1u << 23);
+const uint BLOCKS_PER_STREAM = 320;
+const uint STREAM_BUF_SIZE = 1024; // 1 kB per stream ought to be enough for everyone :-)
+const uint NUM_SYMS = 256;
+const uint ESCAPE_LIMIT = NUM_SYMS - 1;
+
+#define MAPPING(s0, s1, s2, s3, s4, s5, s6, s7) ((s0) | (s1 << 2) | (s2 << 4) | (s3 << 6) | (s4 << 8) | (s5 << 10) | (s6 << 12) | (s7 << 14))
+
+const uint luma_mapping[8] = {
+	MAPPING(0, 0, 1, 1, 2, 2, 3, 3),
+	MAPPING(0, 0, 1, 2, 2, 2, 3, 3),
+	MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
+	MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
+	MAPPING(1, 2, 2, 2, 2, 3, 3, 3),
+	MAPPING(2, 2, 2, 2, 3, 3, 3, 3),
+	MAPPING(2, 2, 3, 3, 3, 3, 3, 3),
+	MAPPING(3, 3, 3, 3, 3, 3, 3, 3),
+};
+
+layout(std430, binding = 9) buffer layoutName
+{
+	uint dist[4 * 256];
+	uvec2 ransdist[4 * 256];
+};
+
+layout(std430, binding = 10) buffer outputBuf
+{
+	uint8_t rans_output[];
+};
+
+layout(std430, binding = 11) buffer outputBuf2
+{
+	uint rans_start_offset[];
+};
+
+struct RansEncoder {
+	uint stream_num; // const
+	uint lut_base; // const
+	uint rans_offset;
+	uint rans;
+};
+
+layout(r16ui) uniform restrict readonly uimage2D dc_ac7_tex;
+layout(r16ui) uniform restrict readonly uimage2D ac1_ac6_tex;
+layout(r16ui) uniform restrict readonly uimage2D ac2_ac5_tex;
+layout(r8i) uniform restrict readonly iimage2D ac3_tex;
+layout(r8i) uniform restrict readonly iimage2D ac4_tex;
+
+void RansEncInit(uint streamgroup_num, uint coeff_row, uint coeff_col, uint dist_num, out RansEncoder enc)  // Pick the stream's output slot and distribution table.
+{
+	enc.stream_num = streamgroup_num * 64 + coeff_row * 8 + coeff_col;
+	enc.lut_base = dist_num * 256;
+	enc.rans_offset = enc.stream_num * STREAM_BUF_SIZE + STREAM_BUF_SIZE; // Starts at the end.
+	enc.rans = RANS_BYTE_L;
+}
+
+void RansEncRenorm(inout uint rans, inout uint rans_offset, uint freq, uint prob_bits)  // Emit low bytes (writing backwards) until rans < x_max.
+{
+	uint x_max = ((RANS_BYTE_L >> prob_bits) << 8) * freq; // this turns into a shift.
+	if (rans >= x_max) {
+		do {
+			rans_output[--rans_offset] = uint8_t(rans & 0xff);
+			rans >>= 8;
+		} while (rans >= x_max);
+	}
+}
+
+void RansEncPut(inout uint rans, inout uint rans_offset, uint start, uint freq, uint prob_bits)  // Renormalize, then fold (start, freq) into the state.
+{
+	RansEncRenorm(rans, rans_offset, freq, prob_bits);
+	rans = ((rans / freq) << prob_bits) + (rans % freq) + start;
+}
+
+void RansEncFlush(uint rans, inout uint rans_offset)  // Store the final 32-bit state, least significant byte first.
+{
+	rans_offset -= 4;
+	rans_output[rans_offset + 0] = uint8_t(rans >> 0);
+	rans_output[rans_offset + 1] = uint8_t(rans >> 8);
+	rans_output[rans_offset + 2] = uint8_t(rans >> 16);
+	rans_output[rans_offset + 3] = uint8_t(rans >> 24);
+}
+
+void encode_coeff(uint coeff, uint bits, inout RansEncoder enc)  // coeff holds a `bits`-wide two's-complement value.
+{
+	// Sign-extend to recover the coefficient.
+	// FIXME: not needed for the bits == 8 case!
+	int signed_k = int(coeff << (32 - bits)) >> (32 - bits);
+	uint k = abs(signed_k);
+
+	if (k >= ESCAPE_LIMIT) {
+		// ... boring stuff here
+		RansEncPut(enc.rans, enc.rans_offset, k, 1, prob_bits);
+		k = ESCAPE_LIMIT;
+	}
+
+	uvec2 sym = ransdist[enc.lut_base + ((k - 1) & (NUM_SYMS - 1))];  // Mask the symbol first, then add the table base ('+' binds tighter than '&').
+	RansEncPut(enc.rans, enc.rans_offset, sym.x, sym.y, prob_bits);
+
+	// fix some bias stuff here
+}
+
+void encode_end(inout RansEncoder enc)  // Flush the state and record where this stream's bytes begin.
+{
+	RansEncFlush(enc.rans, enc.rans_offset);
+	rans_start_offset[enc.stream_num] = enc.rans_offset;
+}
+
+void encode_9_7(uint streamgroup_num, uint coeff_row, layout(r16ui) restrict readonly uimage2D tex, uint col1, uint col2, uint dist1, uint dist2)  // Two coefficients per texel: low 9 bits and high 7 bits.
+{
+	RansEncoder enc1, enc2;
+	RansEncInit(streamgroup_num, coeff_row, col1, dist1, enc1);
+	RansEncInit(streamgroup_num, coeff_row, col2, dist2, enc2);
+
+	for (uint subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
+		// TODO: Use SSBOs instead of a texture?
+		uint x = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) % 160;
+		uint y = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) / 160;
+		uint f = imageLoad(tex, ivec2(x, y * 8 + coeff_row)).x;
+
+		encode_coeff(f & 0x1ffu, 9, enc1);
+		encode_coeff(f >> 9, 7, enc2);
+	}
+
+	encode_end(enc1);
+	encode_end(enc2);
+}
+
+void encode_8(uint streamgroup_num, uint coeff_row, layout(r8i) restrict readonly iimage2D tex, uint col, uint dist)  // One 8-bit coefficient per texel.
+{
+	RansEncoder enc;
+	RansEncInit(streamgroup_num, coeff_row, col, dist, enc);
+
+	for (uint subblock_idx = BLOCKS_PER_STREAM; subblock_idx --> 0; ) {
+		// TODO: Use SSBOs instead of a texture?
+		uint x = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) % 160;
+		uint y = (streamgroup_num * BLOCKS_PER_STREAM + subblock_idx) / 160;
+		int f = imageLoad(tex, ivec2(x, y * 8 + coeff_row)).x;
+
+		encode_coeff(f, 8, enc);
+	}
+
+	encode_end(enc);
+}
+
+void main()
+{
+	uint streamgroup_num = gl_WorkGroupID.x;
+	uint coeff_row = gl_WorkGroupID.y; // 0..7
+	uint coeff_colset = gl_WorkGroupID.z; // 0 = dc+ac7, 1 = ac1+ac6, 2 = ac2+ac5, 3 = ac3, 4 = ac4
+	uint m = luma_mapping[coeff_row];
+
+	// TODO: DC coeff pred
+
+	if (coeff_colset == 0) {
+		uint dist_dc = bitfieldExtract(m, 0, 2);
+		uint dist_ac7 = bitfieldExtract(m, 14, 2);
+		encode_9_7(streamgroup_num, coeff_row, dc_ac7_tex, 0, 7, dist_dc, dist_ac7);
+	} else if (coeff_colset == 1) {
+		uint dist_ac1 = bitfieldExtract(m, 2, 2);
+		uint dist_ac6 = bitfieldExtract(m, 12, 2);
+		encode_9_7(streamgroup_num, coeff_row, ac1_ac6_tex, 1, 6, dist_ac1, dist_ac6);
+	} else if (coeff_colset == 2) {
+		uint dist_ac2 = bitfieldExtract(m, 4, 2);
+		uint dist_ac5 = bitfieldExtract(m, 10, 2);
+		encode_9_7(streamgroup_num, coeff_row, ac2_ac5_tex, 2, 5, dist_ac2, dist_ac5);
+	} else if (coeff_colset == 3) {
+		uint dist_ac3 = bitfieldExtract(m, 6, 2);
+		encode_8(streamgroup_num, coeff_row, ac3_tex, 3, dist_ac3);
+	} else {
+		uint dist_ac4 = bitfieldExtract(m, 8, 2);
+		encode_8(streamgroup_num, coeff_row, ac4_tex, 4, dist_ac4);
+	}
+}
diff --git a/tally.shader b/tally.shader
index a97a825..1623a1d 100644
--- a/tally.shader
+++ b/tally.shader
@@ -7,6 +7,7 @@ layout(local_size_x = 256) in;
 layout(std430, binding = 9) buffer layoutName
 {
 	uint dist[4 * 256];
+	uvec2 ransdist[4 * 256];
 };
 
 const uint prob_bits = 12;
@@ -158,5 +159,5 @@ void main()
 		memoryBarrierShared();
 		barrier();
 	}
-	dist[base + i] = new_dist[i];
+	ransdist[base + i] = uvec2(new_val, new_dist[i]);
 }