layout(r8ui) uniform restrict readonly uimage2D cum2sym_tex;
layout(rg16ui) uniform restrict readonly uimage2D dsyms_tex;
layout(r8) uniform restrict writeonly image2D out_tex;
-layout(r32i) uniform restrict writeonly iimage2D coeff_tex;
-layout(r32i) uniform restrict writeonly iimage2D coeff2_tex;
+layout(r16i) uniform restrict writeonly iimage2D coeff_tex;
uniform int num_blocks;
const uint prob_bits = 12;
};
uniform uint sign_bias_per_model[16];
-struct myuint64 {
- uint high, low;
-};
+const uint RANS_BYTE_L = (1u << 23); // lower bound of our normalization interval
-const uint RANS64_L = (1u << 31); // lower bound of our normalization interval
// Returns the single byte found at byte offset `offset` in data_SSBO.
// Each data_SSBO element is treated as holding four bytes: offset >> 2 picks
// the element, and (offset & 3) * 8 is the bit position of the wanted byte
// inside it (lowest-addressed byte in the lowest bits).
+uint get_rans_byte(uint offset)
+{
+ // We assume little endian.
+ return bitfieldExtract(data_SSBO[offset >> 2], 8 * int(offset & 3u), 8);
+}
// Loads the initial 32-bit decoder state from four consecutive stream bytes
// starting at byte offset `offset`, least significant byte first, and
// advances `offset` (in/out) by four so it points at the next unread byte.
-myuint64 RansDecInit(inout uint offset)
+uint RansDecInit(inout uint offset)
{
- myuint64 x;
- x.low = data_SSBO[offset++];
- x.high = data_SSBO[offset++];
+ uint x;
+
+ x = get_rans_byte(offset);
+ x |= get_rans_byte(offset + 1) << 8;
+ x |= get_rans_byte(offset + 2) << 16;
+ x |= get_rans_byte(offset + 3) << 24;
+ offset += 4;
+
return x;
}
// Returns the lowest `scale_bits` bits of the decoder state `r`.
// The state itself is not modified.
-uint RansDecGet(myuint64 r, uint scale_bits)
+uint RansDecGet(uint r, uint scale_bits)
{
- return r.low & ((1u << scale_bits) - 1);
+ return r & ((1u << scale_bits) - 1);
}
// Removes one decoded symbol from the decoder state and renormalizes.
//
//   rans      - decoder state (in/out).
//   offset    - byte offset of the next unread stream byte (in/out); bumped
//               by one for every byte pulled in during renormalization.
//   start     - start of the symbol's range, subtracted from the low bits.
//   freq      - the symbol's frequency, multiplied into the high bits.
//   prob_bits - number of low state bits that held the symbol's slot.
//               NOTE(review): this parameter appears to shadow the
//               shader-level prob_bits constant; confirm that is intended.
-void RansDecAdvance(inout myuint64 rans, inout uint offset, const uint start, const uint freq, uint prob_bits)
+void RansDecAdvance(inout uint rans, inout uint offset, const uint start, const uint freq, uint prob_bits)
{
const uint mask = (1u << prob_bits) - 1;
- const uint recovered_lowbits = (rans.low & mask) - start;
-
- // rans >>= prob_bits;
- rans.low = (rans.low >> prob_bits) | ((rans.high & mask) << (32 - prob_bits));
- rans.high >>= prob_bits;
-
- // rans *= freq;
- uint h1, l1, h2, l2;
- umulExtended(rans.low, freq, h1, l1);
- umulExtended(rans.high, freq, h2, l2);
- rans.low = l1;
- rans.high = l2 + h1;
-
- // rans += recovered_lowbits;
- uint carry;
- rans.low = uaddCarry(rans.low, recovered_lowbits, carry);
- rans.high += carry;
-
+ rans = freq * (rans >> prob_bits) + (rans & mask) - start;
+
// renormalize: while the state is below RANS_BYTE_L, shift it up by eight
// bits and bring the next stream byte into the low end.
- if (rans.high == 0 && rans.low < RANS64_L) {
- rans.high = rans.low;
- rans.low = data_SSBO[offset++];
+ while (rans < RANS_BYTE_L) {
+ rans = (rans << 8) | get_rans_byte(offset++);
}
}
const uint sign_bias = sign_bias_per_model[model_num];
// Initialize rANS decoder.
- uint offset = streams[stream_num].src_offset >> 2;
- myuint64 rans = RansDecInit(offset);
+ uint offset = streams[stream_num].src_offset;
+ uint rans = RansDecInit(offset);
float q = (coeff_num == 0) ? 1.0 : (quant_matrix[coeff_num] * quant_scalefac / 128.0 / sqrt(2.0)); // FIXME: fold
q *= (1.0 / 255.0);
bool sign = false;
if (bottom_bits >= sign_bias) {
bottom_bits -= sign_bias;
- rans.low -= sign_bias;
+ rans -= sign_bias;
sign = true;
}
int k = int(cum2sym(bottom_bits, model_num)); // Can go out-of-bounds; that will return zero.
if (sign) {
k = -k;
}
-#if 0
- if (coeff_num == 0) {
- //imageStore(coeff_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(k, 0,0,0));
- imageStore(coeff_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(rans.low, 0,0,0));
- imageStore(coeff2_tex, ivec2((block_row * 40 + block_idx) * 8 + subblock_idx, 0), ivec4(rans.high, 0,0,0));
- }
-#endif
if (coeff_num == 0) {
k += last_k;
last_k = k;
}
+#if 0
+ uint y = block_row * 16 + block_y * 8 + local_y;
+ uint x = block_x * 64 + subblock_idx * 8 + local_x;
+ imageStore(coeff_tex, ivec2(x, y), ivec4(k, 0,0,0));
+#endif
temp[slice_num * 64 * 8 + subblock_idx * 64 + coeff_num] = k * q;
//temp[subblock_idx * 64 + 8 * y + x] = (2 * k * w * 4) / 32; // 100% matching unquant
// Decodes one variable-length unsigned integer from [*ptr, end).
//
// Encoding: seven payload bits per byte, least significant group first; a set
// high bit (0x80) means another byte follows. At most five bytes are read.
//
// *ptr is advanced past every byte that was consumed, also on failure.
//
// Returns the decoded value, or nullopt if
//   - the input ends before a terminating byte (high bit clear) is seen,
//   - the fifth byte still has its continuation bit set, or
//   - the payload does not fit in 32 bits.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
	uint32_t x = 0;
	for (int shift = 0; shift < 32; shift += 7) {
		if (*ptr >= end) {
			return std::nullopt;  // Error: EOF.
		}

		// Go through unsigned char so that a byte >= 0x80 is not
		// sign-extended on platforms where plain char is signed.
		const uint32_t ch = static_cast<unsigned char>(**ptr);
		++(*ptr);

		// All arithmetic is unsigned; shifting an int here would overflow
		// for the fifth byte.
		const uint32_t payload = ch & 0x7f;
		if (((payload << shift) >> shift) != payload) {
			return std::nullopt;  // Error: Value does not fit in 32 bits.
		}
		x |= payload << shift;

		if ((ch & 0x80) == 0) {
			return x;
		}
	}
	return std::nullopt;  // Error: Overlong int.
}
const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
const char *ptr = &coded[0];
- //assert((intptr_t)ptr % 4 == 0);
const char *end = ptr + coded.size();
GLuint sign_bias[NUM_TABLES];
check_error();
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
check_error();
- glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
- check_error();
-
- GLuint coeff2_tex;
- glGenTextures(1, &coeff2_tex);
- glBindTexture(GL_TEXTURE_2D, coeff2_tex);
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
- check_error();
- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
- check_error();
- glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
+ glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
check_error();
GLuint out_tex;
GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
- GLint coeff2_tex_pos = glGetUniformLocation(glsl_program_num, "coeff2_tex");
GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
glUniform1i(dsyms_tex_pos, 1);
glUniform1i(out_tex_pos, 2);
glUniform1i(coeff_tex_pos, 3);
- glUniform1i(coeff2_tex_pos, 4);
glUniform1uiv(sign_bias_pos, 16, sign_bias);
glUniform1i(num_blocks_pos, num_blocks);
glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
- glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
- glBindImageTexture(4, coeff2_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
+ glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
printf("%d err=0x%x\n", __LINE__, glGetError());
// Decode all luma blocks.
CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
stream->src_offset = ptr - coded.data();
stream->src_len = *num_rans_bytes;
- //assert(stream->src_offset % 4 == 0);
// TODO: check len
ptr += *num_rans_bytes;
}
fclose(fp);
-#if 0
- uint32_t *coeff_data = new uint32_t[WIDTH * HEIGHT];
+ int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
glBindTexture(GL_TEXTURE_2D, coeff_tex);
check_error();
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff_data);
- check_error();
- uint32_t *coeff2_data = new uint32_t[WIDTH * HEIGHT];
- glBindTexture(GL_TEXTURE_2D, coeff2_tex);
- check_error();
- glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff2_data);
+ glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
check_error();
- for (int x = 0; x < 320; ++x) {
- printf("%08x.%08x ", coeff2_data[x], coeff_data[x]);
+ for (int k = 0; k < 4; ++k) {
+ for (int y = 0; y < 8; ++y) {
+ for (int x = 0; x < 8; ++x) {
+ printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
+ }
+ printf("\n");
+ }
+ printf("\n");
}
printf("\n");
-#endif
+
check_error();
glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
#include <assert.h>
#include <math.h>
-#include "ryg_rans/rans64.h"
-//#include "ryg_rans/rans_byte.h"
+//#include "ryg_rans/rans64.h"
+#include "ryg_rans/rans_byte.h"
#include "ryg_rans/renormalize.h"
#include <algorithm>
// Writes x to fp as a variable-length integer: seven payload bits per byte,
// least significant group first, with the high bit (0x80) set on every byte
// except the last. Values below 128 take one byte.
//
// x is encoded as its unsigned 32-bit bit pattern, so a negative value
// becomes a well-formed five-byte sequence instead of a single byte with a
// stray continuation bit. Output for non-negative x is unchanged.
//
// Write errors from putc() are not reported.
void write_varint(int x, FILE *fp)
{
	uint32_t remaining = static_cast<uint32_t>(x);
	while (remaining >= 0x80) {
		putc(static_cast<int>((remaining & 0x7f) | 0x80), fp);
		remaining >>= 7;
	}
	putc(static_cast<int>(remaining), fp);
}
class RansEncoder {
{
for (int i = 0; i < NUM_SYMS; i++) {
//printf("%d: cumfreqs=%d freqs=%d prob_bits=%d\n", i, s.cum_freqs[i], s.freqs[i], prob_bits + 1);
- Rans64EncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
+ RansEncSymbolInit(&esyms[i], s.cum_freqs[i], s.freqs[i], prob_bits + 1);
}
sign_bias = s.cum_freqs[NUM_SYMS];
}
{
out_end = out_buf.get() + out_max_size;
ptr = out_end; // *end* of output buffer
- Rans64EncInit(&rans);
+ RansEncInit(&rans);
}
uint32_t save_block(FILE *codedfp) // Returns number of bytes.
{
- Rans64EncFlush(&rans, (uint32_t **)&ptr);
+ RansEncFlush(&rans, &ptr);
//printf("post-flush = %08x\n", rans);
uint32_t num_rans_bytes = out_end - ptr;
void encode_coeff(short signed_k)
{
- //printf("encoding coeff %d (sym %d), rans before encoding = %016lx\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
+ //printf("encoding coeff %d (sym %d), rans before encoding = %08x\n", signed_k, ((abs(signed_k) - 1) & 255), rans);
unsigned short k = abs(signed_k);
if (k >= ESCAPE_LIMIT) {
// Put the coefficient as a 1/(2^12) symbol _before_
// the 255 coefficient, since the decoder will read the
// 255 coefficient first.
- Rans64EncPut(&rans, (uint32_t **)&ptr, k, 1, prob_bits);
+ RansEncPut(&rans, &ptr, k, 1, prob_bits);
k = ESCAPE_LIMIT;
}
- Rans64EncPutSymbol(&rans, (uint32_t **)&ptr, &esyms[(k - 1) & (NUM_SYMS - 1)], prob_bits + 1);
+ RansEncPutSymbol(&rans, &ptr, &esyms[(k - 1) & (NUM_SYMS - 1)]);
if (signed_k < 0) {
rans += sign_bias;
}
unique_ptr<uint8_t[]> out_buf;
uint8_t *out_end;
uint8_t *ptr;
- Rans64State rans;
- Rans64EncSymbol esyms[NUM_SYMS];
+ RansState rans;
+ RansEncSymbol esyms[NUM_SYMS];
uint32_t sign_bias;
uint32_t last_block = 0; // Not a valid 4-byte rANS block (?)