Switch to 64-bit rANS, although probably due for immediate revert (just want to prese...

author Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 3 Oct 2017 22:38:36 +0000 (00:38 +0200)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 3 Oct 2017 22:38:36 +0000 (00:38 +0200)
diff --git a/coded.dat b/coded.dat

index b249f4cc37271b75b0e67e977769665a3160db63..dd463d3c42443a2b8b37b965d1b4ed535bb52fe8 100644 (file)

Binary files a/coded.dat and b/coded.dat differ
diff --git a/decoder.shader b/decoder.shader

index 3b721264b4026f5a6b9cbb6e9c6a5749b15bd30e..012cbe3bbac0ec13bb40b0cf42bd9ecc4bdcadf0 100644 (file)
--- a/decoder.shader
+++ b/decoder.shader
@@ -9,7 +9,8 @@ layout(local_size_x = 64*PARALLEL_SLICES) in;
  layout(r8ui) uniform restrict readonly uimage2D cum2sym_tex;
  layout(rg16ui) uniform restrict readonly uimage2D dsyms_tex;
  layout(r8) uniform restrict writeonly image2D out_tex;
  layout(r8ui) uniform restrict readonly uimage2D cum2sym_tex;
  layout(rg16ui) uniform restrict readonly uimage2D dsyms_tex;
  layout(r8) uniform restrict writeonly image2D out_tex;
-layout(r16i) uniform restrict writeonly iimage2D coeff_tex;
+layout(r32i) uniform restrict writeonly iimage2D coeff_tex;
+layout(r32i) uniform restrict writeonly iimage2D coeff2_tex;
  uniform int num_blocks;
  
  const uint prob_bits = 12;
  uniform int num_blocks;
  
  const uint prob_bits = 12;
@@ -71,40 +72,50 @@ layout(std430, binding = 0) buffer whatever3
  };
  uniform uint sign_bias_per_model[16];
  
  };
  uniform uint sign_bias_per_model[16];
  
-const uint RANS_BYTE_L = (1u << 23);  // lower bound of our normalization interval
+struct myuint64 {
+       uint high, low;
+};
  
  
-uint get_rans_byte(uint offset)
-{
-       // We assume little endian.
-       return bitfieldExtract(data_SSBO[offset >> 2], 8 * int(offset & 3u), 8);
-}
+const uint RANS64_L = (1u << 31);  // lower bound of our normalization interval
  
  
-uint RansDecInit(inout uint offset)
+myuint64 RansDecInit(inout uint offset)
  {
  {
-       uint x;
-
-       x  = get_rans_byte(offset);
-       x |= get_rans_byte(offset + 1) << 8;
-       x |= get_rans_byte(offset + 2) << 16;
-       x |= get_rans_byte(offset + 3) << 24;
-       offset += 4;
-
+       myuint64 x;
+       x.low  = data_SSBO[offset++];
+       x.high = data_SSBO[offset++];
         return x;
  }
  
         return x;
  }
  
-uint RansDecGet(uint r, uint scale_bits)
+uint RansDecGet(myuint64 r, uint scale_bits)
  {
  {
-       return r & ((1u << scale_bits) - 1);
+       return r.low & ((1u << scale_bits) - 1);
  }
  
  }
  
-void RansDecAdvance(inout uint rans, inout uint offset, const uint start, const uint freq, uint prob_bits)
+void RansDecAdvance(inout myuint64 rans, inout uint offset, const uint start, const uint freq, uint prob_bits)
  {
         const uint mask = (1u << prob_bits) - 1;
  {
         const uint mask = (1u << prob_bits) - 1;
-       rans = freq * (rans >> prob_bits) + (rans & mask) - start;
-       
+       const uint recovered_lowbits = (rans.low & mask) - start;
+
+       // rans >>= prob_bits;
+       rans.low = (rans.low >> prob_bits) | ((rans.high & mask) << (32 - prob_bits));
+       rans.high >>= prob_bits;
+
+       // rans *= freq;
+       uint h1, l1, h2, l2;
+       umulExtended(rans.low, freq, h1, l1);
+       umulExtended(rans.high, freq, h2, l2);
+       rans.low = l1;
+       rans.high = l2 + h1;
+
+       // rans += recovered_lowbits;
+       uint carry;
+       rans.low = uaddCarry(rans.low, recovered_lowbits, carry);
+       rans.high += carry;
+
         // renormalize
         // renormalize
-       while (rans < RANS_BYTE_L) {
-               rans = (rans << 8) | get_rans_byte(offset++);
+       if (rans.high == 0 && rans.low < RANS64_L) {
+               rans.high = rans.low;
+               rans.low = data_SSBO[offset++];
         }
  }
  
         }
  }
  
@@ -221,8 +232,8 @@ void main()
         const uint sign_bias = sign_bias_per_model[model_num];
  
         // Initialize rANS decoder.
         const uint sign_bias = sign_bias_per_model[model_num];
  
         // Initialize rANS decoder.
-       uint offset = streams[stream_num].src_offset;
-       uint rans = RansDecInit(offset);
+       uint offset = streams[stream_num].src_offset >> 2;
+       myuint64 rans = RansDecInit(offset);
  
         float q = (coeff_num == 0) ? 1.0 : (quant_matrix[coeff_num] * quant_scalefac / 128.0 / sqrt(2.0));  // FIXME: fold
         q *= (1.0 / 255.0);
  
         float q = (coeff_num == 0) ? 1.0 : (quant_matrix[coeff_num] * quant_scalefac / 128.0 / sqrt(2.0));  // FIXME: fold
         q *= (1.0 / 255.0);
@@ -241,7 +252,7 @@ void main()
                         bool sign = false;
                         if (bottom_bits >= sign_bias) {
                                 bottom_bits -= sign_bias;
                         bool sign = false;
                         if (bottom_bits >= sign_bias) {
                                 bottom_bits -= sign_bias;
-                               rans -= sign_bias;
+                               rans.low -= sign_bias;
                                 sign = true;
                         }
                         int k = int(cum2sym(bottom_bits, model_num));  // Can go out-of-bounds; that will return zero.
                                 sign = true;
                         }
                         int k = int(cum2sym(bottom_bits, model_num));  // Can go out-of-bounds; that will return zero.
@@ -255,17 +266,19 @@ void main()
                         if (sign) {
                                 k = -k;
                         }
                         if (sign) {
                                 k = -k;
                         }
+#if 0
+                       if (coeff_num == 0) {
+                               //imageStore(coeff_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(k, 0,0,0));
+                               imageStore(coeff_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(rans.low, 0,0,0));
+                               imageStore(coeff2_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(rans.high, 0,0,0));
+                       }
+#endif
  
                         if (coeff_num == 0) {
                                 k += last_k;
                                 last_k = k;
                         }
  
  
                         if (coeff_num == 0) {
                                 k += last_k;
                                 last_k = k;
                         }
  
-#if 0
-                       uint y = block_row * 16 + block_y * 8 + local_y;
-                       uint x = block_x * 64 + subblock_idx * 8 + local_x;
-                       imageStore(coeff_tex, ivec2(x, y), ivec4(k, 0,0,0));
-#endif
  
                         temp[slice_num * 64 * 8 + subblock_idx * 64 + coeff_num] = k * q;
                         //temp[subblock_idx * 64 + 8 * y + x] = (2 * k * w * 4) / 32;  // 100% matching unquant
  
                         temp[slice_num * 64 * 8 + subblock_idx * 64 + coeff_num] = k * q;
                         //temp[subblock_idx * 64 + 8 * y + x] = (2 * k * w * 4) / 32;  // 100% matching unquant
diff --git a/narabu.cpp b/narabu.cpp

index bb1209c7fc3ebe7c4bf66123567736cb6bff3854..81b0726cfd4aed5ae27f956ffd75f1d035796113 100644 (file)
--- a/narabu.cpp
+++ b/narabu.cpp
@@ -44,19 +44,9 @@ RansDecodeTable decode_tables[NUM_TABLES];
  optional<uint32_t> read_varint(const char **ptr, const char *end)
  {
         uint32_t x = 0;
  optional<uint32_t> read_varint(const char **ptr, const char *end)
  {
         uint32_t x = 0;
-       int shift = 0;
-       while (*ptr < end) {
-               int ch = **ptr;
-               ++(*ptr);       
-
-               x |= (ch & 0x7f) << shift;
-               if ((ch & 0x80) == 0) return x;
-               shift += 7;
-               if (shift >= 32) {
-                       return nullopt;  // Error: Overlong int.
-               }
-       }
-       return nullopt;  // Error: EOF.
+       memcpy(&x, *ptr, 4);
+       *ptr += 4;
+       return x;
  }
  
  const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
  }
  
  const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
@@ -113,6 +103,7 @@ int main(int argc, char **argv)
  
         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
         const char *ptr = &coded[0];
  
         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
         const char *ptr = &coded[0];
+       //assert((intptr_t)ptr % 4 == 0);
         const char *end = ptr + coded.size();
         GLuint sign_bias[NUM_TABLES];
  
         const char *end = ptr + coded.size();
         GLuint sign_bias[NUM_TABLES];
  
@@ -187,7 +178,21 @@ int main(int argc, char **argv)
         check_error();
          glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
         check_error();
         check_error();
          glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
         check_error();
-        glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
+        glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
+       check_error();
+
+       GLuint coeff2_tex;
+       glGenTextures(1, &coeff2_tex);
+        glBindTexture(GL_TEXTURE_2D, coeff2_tex);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+       check_error();
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+       check_error();
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+       check_error();
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+       check_error();
+        glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
         check_error();
  
         GLuint out_tex;
         check_error();
  
         GLuint out_tex;
@@ -205,6 +210,7 @@ int main(int argc, char **argv)
         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
+       GLint coeff2_tex_pos = glGetUniformLocation(glsl_program_num, "coeff2_tex");
         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
@@ -214,12 +220,14 @@ int main(int argc, char **argv)
         glUniform1i(dsyms_tex_pos, 1);
         glUniform1i(out_tex_pos, 2);
         glUniform1i(coeff_tex_pos, 3);
         glUniform1i(dsyms_tex_pos, 1);
         glUniform1i(out_tex_pos, 2);
         glUniform1i(coeff_tex_pos, 3);
+       glUniform1i(coeff2_tex_pos, 4);
         glUniform1uiv(sign_bias_pos, 16, sign_bias);
         glUniform1i(num_blocks_pos, num_blocks);
          glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
          glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
          glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
         glUniform1uiv(sign_bias_pos, 16, sign_bias);
         glUniform1i(num_blocks_pos, num_blocks);
          glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
          glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
          glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
-        glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
+        glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
+        glBindImageTexture(4, coeff2_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
         printf("%d err=0x%x\n", __LINE__, glGetError());
  
         // Decode all luma blocks.
         printf("%d err=0x%x\n", __LINE__, glGetError());
  
         // Decode all luma blocks.
@@ -237,6 +245,7 @@ int main(int argc, char **argv)
                                 CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
                                 stream->src_offset = ptr - coded.data();
                                 stream->src_len = *num_rans_bytes;
                                 CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
                                 stream->src_offset = ptr - coded.data();
                                 stream->src_len = *num_rans_bytes;
+                               //assert(stream->src_offset % 4 == 0);
  
                                 // TODO: check len
                                 ptr += *num_rans_bytes;
  
                                 // TODO: check len
                                 ptr += *num_rans_bytes;
@@ -353,22 +362,22 @@ int main(int argc, char **argv)
         }
         fclose(fp);
  
         }
         fclose(fp);
  
-       int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
+#if 0
+       uint32_t *coeff_data = new uint32_t[WIDTH * HEIGHT];
          glBindTexture(GL_TEXTURE_2D, coeff_tex);
         check_error();
          glBindTexture(GL_TEXTURE_2D, coeff_tex);
         check_error();
-       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
+       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff_data);
         check_error();
         check_error();
-       for (int k = 0; k < 4; ++k) {
-               for (int y = 0; y < 8; ++y) {
-                       for (int x = 0; x < 8; ++x) {
-                               printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
-                       }
-                       printf("\n");
-               }
-               printf("\n");
+       uint32_t *coeff2_data = new uint32_t[WIDTH * HEIGHT];
+        glBindTexture(GL_TEXTURE_2D, coeff2_tex);
+       check_error();
+       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff2_data);
+       check_error();
+       for (int x = 0; x < 320; ++x) {
+               printf("%08x.%08x ", coeff2_data[x], coeff_data[x]);
         }
         printf("\n");
         }
         printf("\n");
-       
+#endif
         
         check_error();
         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
         
         check_error();
         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
diff --git a/qdc.cpp b/qdc.cpp

index 094f3b6b7e0e80b451b288bcce4d458cbcabde45..4bb3e9e75c3696eec5b60ed7a976fbefb212d912 100644 (file)
--- a/qdc.cpp
+++ b/qdc.cpp
@@ -5,8 +5,8 @@
  #include <assert.h>
  #include <math.h>
  
  #include <assert.h>
  #include <math.h>
  
-//#include "ryg_rans/rans64.h"
-#include "ryg_rans/rans_byte.h"
+#include "ryg_rans/rans64.h"
+//#include "ryg_rans/rans_byte.h"
  #include "ryg_rans/renormalize.h"
  
  #include <algorithm>
  #include "ryg_rans/renormalize.h"
  
  #include <algorithm>
@@ -198,11 +198,7 @@ int pick_stats_for(int x, int y, bool is_chroma)
  
  void write_varint(int x, FILE *fp)
  {
  
  void write_varint(int x, FILE *fp)
  {
-       while (x >= 128) {
-               putc((x & 0x7f) | 0x80, fp);
-               x >>= 7;
-       }
-       putc(x, fp);
+       fwrite(&x, sizeof(x), 1, fp);
  }
  
  class RansEncoder {
  }
  
  class RansEncoder {
@@ -217,7 +213,7 @@ public:
         {
                 for (int i = 0; i < NUM_SYMS; i++) {
                         //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
         {
                 for (int i = 0; i < NUM_SYMS; i++) {
                         //printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
-                       RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
+                       Rans64EncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
                 }
                 sign_bias = s.cum_freqs[NUM_SYMS];
         }
                 }
                 sign_bias = s.cum_freqs[NUM_SYMS];
         }
@@ -226,12 +222,12 @@ public:
         {
                 out_end = out_buf.get() + out_max_size;
                 ptr = out_end; // *end* of output buffer
         {
                 out_end = out_buf.get() + out_max_size;
                 ptr = out_end; // *end* of output buffer
-               RansEncInit(&rans);
+               Rans64EncInit(&rans);
         }
  
         uint32_t save_block(FILE *codedfp)  // Returns number of bytes.
         {
         }
  
         uint32_t save_block(FILE *codedfp)  // Returns number of bytes.
         {
-               RansEncFlush(&rans, &ptr);
+               Rans64EncFlush(&rans, (uint32_t **)&ptr);
                 //printf("post-flush = %08x\n", rans);
  
                 uint32_t num_rans_bytes = out_end - ptr;
                 //printf("post-flush = %08x\n", rans);
  
                 uint32_t num_rans_bytes = out_end - ptr;
@@ -268,16 +264,16 @@ public:
  
         void encode_coeff(short signed_k)
         {
  
         void encode_coeff(short signed_k)
         {
-               //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
+               //printf("encoding coeff %d (sym %d), rans before encoding = %016lx\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
                 unsigned short k = abs(signed_k);
                 if (k >= ESCAPE_LIMIT) {
                         // Put the coefficient as a 1/(2^12) symbol _before_
                         // the 255 coefficient, since the decoder will read the
                         // 255 coefficient first.
                 unsigned short k = abs(signed_k);
                 if (k >= ESCAPE_LIMIT) {
                         // Put the coefficient as a 1/(2^12) symbol _before_
                         // the 255 coefficient, since the decoder will read the
                         // 255 coefficient first.
-                       RansEncPut(&rans, &ptr, k, 1, prob_bits);
+                       Rans64EncPut(&rans, (uint32_t **)&ptr, k, 1, prob_bits);
                         k = ESCAPE_LIMIT;
                 }
                         k = ESCAPE_LIMIT;
                 }
-               RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
+               Rans64EncPutSymbol(&rans, (uint32_t **)&ptr, &esyms[(k - 1) & (NUM_SYMS - 1)], prob_bits + 1);
                 if (signed_k < 0) {
                         rans += sign_bias;
                 }
                 if (signed_k < 0) {
                         rans += sign_bias;
                 }
@@ -290,8 +286,8 @@ private:
         unique_ptr<uint8_t[]> out_buf;
         uint8_t *out_end;
         uint8_t *ptr;
         unique_ptr<uint8_t[]> out_buf;
         uint8_t *out_end;
         uint8_t *ptr;
-       RansState rans;
-       RansEncSymbol esyms[NUM_SYMS];
+       Rans64State rans;
+       Rans64EncSymbol esyms[NUM_SYMS];
         uint32_t sign_bias;
  
         uint32_t last_block = 0;  // Not a valid 4-byte rANS block (?)
         uint32_t sign_bias;
  
         uint32_t last_block = 0;  // Not a valid 4-byte rANS block (?)
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Tue, 3 Oct 2017 22:38:36 +0000 (00:38 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Tue, 3 Oct 2017 22:38:36 +0000 (00:38 +0200)
coded.dat		patch | blob | history
decoder.shader		patch | blob | history
narabu.cpp		patch | blob | history
qdc.cpp		patch | blob | history