From: Steinar H. Gunderson Date: Tue, 3 Oct 2017 22:38:36 +0000 (+0200) Subject: Switch to 64-bit rANS, although probably due for immediate revert (just want to prese... X-Git-Url: https://git.sesse.net/?p=narabu;a=commitdiff_plain;h=3fb87c6b953be3382cd216c74ff6aa025c8eaa2a Switch to 64-bit rANS, although probably due for immediate revert (just want to preserve history). --- diff --git a/coded.dat b/coded.dat index b249f4c..dd463d3 100644 Binary files a/coded.dat and b/coded.dat differ diff --git a/decoder.shader b/decoder.shader index 3b72126..012cbe3 100644 --- a/decoder.shader +++ b/decoder.shader @@ -9,7 +9,8 @@ layout(local_size_x = 64*PARALLEL_SLICES) in; layout(r8ui) uniform restrict readonly uimage2D cum2sym_tex; layout(rg16ui) uniform restrict readonly uimage2D dsyms_tex; layout(r8) uniform restrict writeonly image2D out_tex; -layout(r16i) uniform restrict writeonly iimage2D coeff_tex; +layout(r32i) uniform restrict writeonly iimage2D coeff_tex; +layout(r32i) uniform restrict writeonly iimage2D coeff2_tex; uniform int num_blocks; const uint prob_bits = 12; @@ -71,40 +72,50 @@ layout(std430, binding = 0) buffer whatever3 }; uniform uint sign_bias_per_model[16]; -const uint RANS_BYTE_L = (1u << 23); // lower bound of our normalization interval +struct myuint64 { + uint high, low; +}; -uint get_rans_byte(uint offset) -{ - // We assume little endian. - return bitfieldExtract(data_SSBO[offset >> 2], 8 * int(offset & 3u), 8); -} +const uint RANS64_L = (1u << 31); // lower bound of our normalization interval -uint RansDecInit(inout uint offset) +myuint64 RansDecInit(inout uint offset) { - uint x; - - x = get_rans_byte(offset); - x |= get_rans_byte(offset + 1) << 8; - x |= get_rans_byte(offset + 2) << 16; - x |= get_rans_byte(offset + 3) << 24; - offset += 4; - + myuint64 x; + x.low = data_SSBO[offset++]; + x.high = data_SSBO[offset++]; return x; } -uint RansDecGet(uint r, uint scale_bits) +uint RansDecGet(myuint64 r, uint scale_bits) { - return r & ((1u << scale_bits) - 1); + return r.low & ((1u << scale_bits) - 1); } -void RansDecAdvance(inout uint rans, inout uint offset, const uint start, const uint freq, uint prob_bits) +void RansDecAdvance(inout myuint64 rans, inout uint offset, const uint start, const uint freq, uint prob_bits) { const uint mask = (1u << prob_bits) - 1; - rans = freq * (rans >> prob_bits) + (rans & mask) - start; - + const uint recovered_lowbits = (rans.low & mask) - start; + + // rans >>= prob_bits; + rans.low = (rans.low >> prob_bits) | ((rans.high & mask) << (32 - prob_bits)); + rans.high >>= prob_bits; + + // rans *= freq; + uint h1, l1, h2, l2; + umulExtended(rans.low, freq, h1, l1); + umulExtended(rans.high, freq, h2, l2); + rans.low = l1; + rans.high = l2 + h1; + + // rans += recovered_lowbits; + uint carry; + rans.low = uaddCarry(rans.low, recovered_lowbits, carry); + rans.high += carry; + // renormalize - while (rans < RANS_BYTE_L) { - rans = (rans << 8) | get_rans_byte(offset++); + if (rans.high == 0 && rans.low < RANS64_L) { + rans.high = rans.low; + rans.low = data_SSBO[offset++]; } } @@ -221,8 +232,8 @@ void main() const uint sign_bias = sign_bias_per_model[model_num]; // Initialize rANS decoder. - uint offset = streams[stream_num].src_offset; - uint rans = RansDecInit(offset); + uint offset = streams[stream_num].src_offset >> 2; + myuint64 rans = RansDecInit(offset); float q = (coeff_num == 0) ? 1.0 : (quant_matrix[coeff_num] * quant_scalefac / 128.0 / sqrt(2.0)); // FIXME: fold q *= (1.0 / 255.0); @@ -241,7 +252,7 @@ void main() bool sign = false; if (bottom_bits >= sign_bias) { bottom_bits -= sign_bias; - rans -= sign_bias; + rans.low -= sign_bias; sign = true; } int k = int(cum2sym(bottom_bits, model_num)); // Can go out-of-bounds; that will return zero. @@ -255,17 +266,19 @@ void main() if (sign) { k = -k; } +#if 0 + if (coeff_num == 0) { + //imageStore(coeff_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(k, 0,0,0)); + imageStore(coeff_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(rans.low, 0,0,0)); + imageStore(coeff2_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(rans.high, 0,0,0)); + } +#endif if (coeff_num == 0) { k += last_k; last_k = k; } -#if 0 - uint y = block_row * 16 + block_y * 8 + local_y; - uint x = block_x * 64 + subblock_idx * 8 + local_x; - imageStore(coeff_tex, ivec2(x, y), ivec4(k, 0,0,0)); -#endif temp[slice_num * 64 * 8 + subblock_idx * 64 + coeff_num] = k * q; //temp[subblock_idx * 64 + 8 * y + x] = (2 * k * w * 4) / 32; // 100% matching unquant diff --git a/narabu.cpp b/narabu.cpp index bb1209c..81b0726 100644 --- a/narabu.cpp +++ b/narabu.cpp @@ -44,19 +44,9 @@ RansDecodeTable decode_tables[NUM_TABLES]; optional read_varint(const char **ptr, const char *end) { uint32_t x = 0; - int shift = 0; - while (*ptr < end) { - int ch = **ptr; - ++(*ptr); - - x |= (ch & 0x7f) << shift; - if ((ch & 0x80) == 0) return x; - shift += 7; - if (shift >= 32) { - return nullopt; // Error: Overlong int. - } - } - return nullopt; // Error: EOF. + memcpy(&x, *ptr, 4); + *ptr += 4; + return x; } const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM); @@ -113,6 +103,7 @@ int main(int argc, char **argv) string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat"); const char *ptr = &coded[0]; + //assert((intptr_t)ptr % 4 == 0); const char *end = ptr + coded.size(); GLuint sign_bias[NUM_TABLES]; @@ -187,7 +178,21 @@ int main(int argc, char **argv) check_error(); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); check_error(); - glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr); + glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr); + check_error(); + + GLuint coeff2_tex; + glGenTextures(1, &coeff2_tex); + glBindTexture(GL_TEXTURE_2D, coeff2_tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr); check_error(); GLuint out_tex; @@ -205,6 +210,7 @@ int main(int argc, char **argv) GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex"); GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex"); GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex"); + GLint coeff2_tex_pos = glGetUniformLocation(glsl_program_num, "coeff2_tex"); GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model"); GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks"); printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos); @@ -214,12 +220,14 @@ int main(int argc, char **argv) glUniform1i(dsyms_tex_pos, 1); glUniform1i(out_tex_pos, 2); glUniform1i(coeff_tex_pos, 3); + glUniform1i(coeff2_tex_pos, 4); glUniform1uiv(sign_bias_pos, 16, sign_bias); glUniform1i(num_blocks_pos, num_blocks); glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI); glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI); glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8); - glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I); + glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I); + glBindImageTexture(4, coeff2_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I); printf("%d err=0x%x\n", __LINE__, glGetError()); // Decode all luma blocks. @@ -237,6 +245,7 @@ int main(int argc, char **argv) CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM]; stream->src_offset = ptr - coded.data(); stream->src_len = *num_rans_bytes; + //assert(stream->src_offset % 4 == 0); // TODO: check len ptr += *num_rans_bytes; @@ -353,22 +362,22 @@ int main(int argc, char **argv) } fclose(fp); - int16_t *coeff_data = new int16_t[WIDTH * HEIGHT]; +#if 0 + uint32_t *coeff_data = new uint32_t[WIDTH * HEIGHT]; glBindTexture(GL_TEXTURE_2D, coeff_tex); check_error(); - glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff_data); check_error(); - for (int k = 0; k < 4; ++k) { - for (int y = 0; y < 8; ++y) { - for (int x = 0; x < 8; ++x) { - printf("%3d ", coeff_data[y * WIDTH + x + k*8]); - } - printf("\n"); - } - printf("\n"); + uint32_t *coeff2_data = new uint32_t[WIDTH * HEIGHT]; + glBindTexture(GL_TEXTURE_2D, coeff2_tex); + check_error(); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff2_data); + check_error(); + for (int x = 0; x < 320; ++x) { + printf("%08x.%08x ", coeff2_data[x], coeff_data[x]); } printf("\n"); - +#endif check_error(); glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind diff --git a/qdc.cpp b/qdc.cpp index 094f3b6..4bb3e9e 100644 --- a/qdc.cpp +++ b/qdc.cpp @@ -5,8 +5,8 @@ #include #include -//#include "ryg_rans/rans64.h" -#include "ryg_rans/rans_byte.h" +#include "ryg_rans/rans64.h" +//#include "ryg_rans/rans_byte.h" #include "ryg_rans/renormalize.h" #include @@ -198,11 +198,7 @@ int pick_stats_for(int x, int y, bool is_chroma) void write_varint(int x, FILE *fp) { - while (x >= 128) { - putc((x & 0x7f) | 0x80, fp); - x >>= 7; - } - putc(x, fp); + fwrite(&x, sizeof(x), 1, fp); } class RansEncoder { @@ -217,7 +213,7 @@ public: { for (int i = 0; i < NUM_SYMS; i++) { //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1); - RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1); + Rans64EncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1); } sign_bias = s.cum_freqs[NUM_SYMS]; } @@ -226,12 +222,12 @@ public: { out_end = out_buf.get() + out_max_size; ptr = out_end; // *end* of output buffer - RansEncInit(&rans); + Rans64EncInit(&rans); } uint32_t save_block(FILE *codedfp) // Returns number of bytes. { - RansEncFlush(&rans, &ptr); + Rans64EncFlush(&rans, (uint32_t **)&ptr); //printf("post-flush = %08x\n", rans); uint32_t num_rans_bytes = out_end - ptr; @@ -268,16 +264,16 @@ public: void encode_coeff(short signed_k) { - //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans); + //printf("encoding coeff %d (sym %d), rans before encoding = %016lx\n", signed_k, ((abs(signed_k) - 1) & 255), rans); unsigned short k = abs(signed_k); if (k >= ESCAPE_LIMIT) { // Put the coefficient as a 1/(2^12) symbol _before_ // the 255 coefficient, since the decoder will read the // 255 coefficient first. - RansEncPut(&rans, &ptr, k, 1, prob_bits); + Rans64EncPut(&rans, (uint32_t **)&ptr, k, 1, prob_bits); k = ESCAPE_LIMIT; } - RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]); + Rans64EncPutSymbol(&rans, (uint32_t **)&ptr, &esyms[(k - 1) & (NUM_SYMS - 1)], prob_bits + 1); if (signed_k < 0) { rans += sign_bias; } @@ -290,8 +286,8 @@ private: unique_ptr out_buf; uint8_t *out_end; uint8_t *ptr; - RansState rans; - RansEncSymbol esyms[NUM_SYMS]; + Rans64State rans; + Rans64EncSymbol esyms[NUM_SYMS]; uint32_t sign_bias; uint32_t last_block = 0; // Not a valid 4-byte rANS block (?)