git.sesse.net Git - narabu/blobdiff - narabu.cpp
More fixes of hard-coded values.
[narabu] / narabu.cpp
index 48973664e66a2e3c51f7ed191191186a596e292b..0ac4f71d75ab58794da432e64c22ea0811cdb437 100644 (file)
@@ -4,23 +4,32 @@
 #include <SDL2/SDL_error.h>
 #include <SDL2/SDL_video.h>
 #include <epoxy/gl.h>
+#include <movit/util.h>
 #include <string>
 #include <optional>
 #include <algorithm>
 #include <vector>
 #include <memory>
+#include <chrono>
 
 #include "util.h"
 
 using namespace std;
+using namespace std::chrono;
 
 #define WIDTH 1280
 #define HEIGHT 720
+#define WIDTH_BLOCKS (WIDTH/8)
+#define WIDTH_BLOCKS_CHROMA (WIDTH/16)
+#define HEIGHT_BLOCKS (HEIGHT/8)
+#define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
+#define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
 
 const unsigned prob_bits = 12;
 const unsigned prob_scale = 1 << prob_bits;
 const unsigned NUM_SYMS = 256;
-const unsigned NUM_TABLES = 16;
+const unsigned NUM_TABLES = 8;
+const unsigned BLOCKS_PER_STREAM = 320;
 
 struct RansDecSymbol {
         unsigned sym_start;
@@ -50,10 +59,12 @@ optional<uint32_t> read_varint(const char **ptr, const char *end)
        return nullopt;  // Error: EOF.
 }
 
+const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);  // NUM_BLOCKS / BLOCKS_PER_STREAM, rounded up
+
 struct CoeffStream {
-        uint src_offset, src_len, sign_offset, sign_len, extra_bits;
+        uint src_offset, src_len;  // byte offset into the coded buffer, and byte length, of one stream
 };
-CoeffStream streams[45 * 64];  // HACK
+CoeffStream streams[num_blocks * 64];  // main() indexes this as coeff_num * num_blocks + stream index, with coeff_num in [0, 64)
 
 int main(int argc, char **argv)
 {
@@ -83,7 +94,7 @@ int main(int argc, char **argv)
        glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
        printf("shared_memory_size=%u\n", size);
 
-       string shader_src = read_file("decoder-pre-sign.shader");
+       string shader_src = ::read_file("decoder.shader");
        GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
        GLuint glsl_program_num = glCreateProgram();
        glAttachShader(glsl_program_num, shader_num);
@@ -100,9 +111,10 @@ int main(int argc, char **argv)
 
        glUseProgram(glsl_program_num);
 
-       string coded = read_file(argc >= 2 ? argv[1] : "coded.dat");
+       string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
        const char *ptr = &coded[0];
        const char *end = ptr + coded.size();
+       GLuint sign_bias[NUM_TABLES];
 
 //     printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
 //             (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
@@ -118,12 +130,15 @@ int main(int argc, char **argv)
                                exit(1);
                        }
 
-                       decode_tables[table].dsyms[sym].sym_start = cum_freq;
-                       decode_tables[table].dsyms[sym].sym_freq = *freq;
+                       decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;  // entry index is sym shifted up by one, wrapping modulo NUM_SYMS
+                       decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
                        for (uint32_t i = 0; i < freq; ++i) {
-                               decode_tables[table].cum2sym[cum_freq++] = sym;
+                               if (cum_freq < prob_scale)  // writes stop once cum_freq reaches prob_scale, but cum_freq keeps counting
+                                       decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
+                               ++cum_freq;
                        }
                }
+               sign_bias[table] = cum_freq;  // value of cum_freq once all symbols of this table have been read
        }
 
        // Make cum2sym texture.
@@ -141,6 +156,7 @@ int main(int argc, char **argv)
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
+       check_error();
 
        // Make dsyms texture.
        unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
@@ -158,6 +174,21 @@ int main(int argc, char **argv)
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
+       check_error();
+
+       GLuint coeff_tex;
+       glGenTextures(1, &coeff_tex);
+        glBindTexture(GL_TEXTURE_2D, coeff_tex);
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+       check_error();
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+       check_error();
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
+       check_error();
+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
+       check_error();
+        glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
+       check_error();
 
        GLuint out_tex;
        glGenTextures(1, &out_tex);
@@ -166,59 +197,58 @@ int main(int argc, char **argv)
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
-        glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, 1280, 720, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
-        //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, 1280, 720, 0, GL_RED, GL_FLOAT, nullptr);
+        glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
+        //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
+       check_error();
 
-       //GLint src_offset_pos = glGetUniformLocation(glsl_program_num, "src_offset");
-       //GLint sign_offset_pos = glGetUniformLocation(glsl_program_num, "sign_offset");
-       //GLint extra_bits_pos = glGetUniformLocation(glsl_program_num, "extra_bits");
        GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
        GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
        GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
-       printf("%d err=0x%x pos=%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos);
+       GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
+       GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
+       GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
+       printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
 
        // Bind the textures.
        glUniform1i(cum2sym_tex_pos, 0);
        glUniform1i(dsyms_tex_pos, 1);
        glUniform1i(out_tex_pos, 2);
+       glUniform1i(coeff_tex_pos, 3);
+       glUniform1uiv(sign_bias_pos, 16, sign_bias);
+       glUniform1i(num_blocks_pos, num_blocks);
         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
+        glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
        printf("%d err=0x%x\n", __LINE__, glGetError());
 
        // Decode all luma blocks.
-       unsigned num_blocks = (HEIGHT / 16);
+       size_t last_src_offset = 0, last_src_len = 0;
        for (unsigned y = 0; y < 8; ++y) {
                 for (unsigned x = 0; x < 8; ++x) {
                        unsigned coeff_num = y * 8 + x;
 
-                       for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
+                       for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
                                optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
                                if (!num_rans_bytes) {
-                                       fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
+                                       fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", block_idx);
                                        exit(1);
                                }
 
-                               CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
-                               stream->src_offset = ptr - coded.data();
-                               stream->src_len = *num_rans_bytes;
-
-                               // TODO: check len
-                               ptr += *num_rans_bytes;
-
-                               optional<uint32_t> num_sign_bytes = read_varint(&ptr, end);
-                               if (!num_sign_bytes) {
-                                       fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
-                                       exit(1);
+                               CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
+                               if (*num_rans_bytes == 0) {
+                                       // Repeat last stream.
+                                       stream->src_offset = last_src_offset;
+                                       stream->src_len = last_src_len;
+                               } else {
+                                       stream->src_offset = ptr - coded.data();
+                                       stream->src_len = *num_rans_bytes;
+                                       last_src_offset = stream->src_offset;
+                                       last_src_len = last_src_len;
                                }
 
-                               stream->sign_offset = ptr - coded.data();
-                               stream->sign_len = *num_sign_bytes >> 3;
-                               stream->extra_bits = *num_sign_bytes & 0x7;
-
                                // TODO: check len
-                               // TODO: free bits
-                               ptr += *num_sign_bytes >> 3;
+                               ptr += *num_rans_bytes;
 
                                //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
                        }
@@ -244,13 +274,22 @@ int main(int argc, char **argv)
 
        glGenBuffers(1, &ssbo_out);
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
-       glBufferData(GL_SHADER_STORAGE_BUFFER, 16384, nullptr, GL_STREAM_DRAW);  // ??
+       glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
        glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
+       check_error();
+
+#define PARALLEL_SLICES 1
+       steady_clock::time_point start = steady_clock::now();
+       unsigned num_iterations = 1000;
+       for (unsigned i = 0; i < num_iterations; ++i) {
+               unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;
+               glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
+       }
+       check_error();
+       glFinish();
+       steady_clock::time_point now = steady_clock::now();
 
-       for (int i = 0; i < 10000; ++i)
-       glDispatchCompute(1, 45, 1);
-
-       unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 16384, GL_MAP_READ_BIT);
+       unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
        //setlocale(LC_ALL, "nb_NO.UTF-8");
 
        string phases[] = {
@@ -269,7 +308,7 @@ int main(int argc, char **argv)
        for (int i = 0; i < 10; ++i) {
                //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
                printf("%d,%s", i, phases[i].c_str());
-               for (int j = 0; j < 64; ++j) {
+               for (int j = 0; j < 512; ++j) {
                        int idx = (j * 10 + i) * 2;
                        uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
                //      printf(" %'18.0f", double(val));
@@ -281,15 +320,16 @@ int main(int argc, char **argv)
        }
        printf("\n");
 
-       unsigned char *data = new unsigned char[1280 * 720];
+       unsigned char *data = new unsigned char[WIDTH * HEIGHT];
        glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
+       check_error();
        printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
 
 #if 0
        for (int k = 0; k < 4; ++k) {
                for (int y = 0; y < 8; ++y) {
                        for (int x = 0; x < 8; ++x) {
-                               printf("%3d ", data[y * 1280 + x + k*8]);
+                               printf("%3d ", data[y * WIDTH + x + k*8]);
                        }
                        printf("\n");
                }
@@ -300,8 +340,8 @@ int main(int argc, char **argv)
        for (int k = 0; k < 4; ++k) {
                for (int y = 0; y < 8; ++y) {
                        for (int x = 0; x < 8; ++x) {
-                               //printf("%5.2f ", data[(y+8) * 1280 + x + (1272-k*8)]);
-                               printf("%3d ", data[y * 1280 + x + k*8]);
+                               //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
+                               printf("%3d ", data[y * WIDTH + x + k*8]);
                        }
                        printf("\n");
                }
@@ -311,18 +351,37 @@ int main(int argc, char **argv)
 #endif
 
        FILE *fp = fopen("narabu.pgm", "wb");
-       fprintf(fp, "P5\n1280 720\n255\n");
-       for (int y = 0; y < 720; ++y) {
-               for (int x = 0; x < 1280; ++x) {
-                       int k = lrintf(data[y * 1280 + x]);
+       fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
+       for (int y = 0; y < HEIGHT; ++y) {
+               for (int x = 0; x < WIDTH; ++x) {
+                       int k = lrintf(data[y * WIDTH + x]);
                        if (k < 0) k = 0;
                        if (k > 255) k = 255;
                        putc(k, fp);
                }
        }
        fclose(fp);
+
+       int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
+        glBindTexture(GL_TEXTURE_2D, coeff_tex);
+       check_error();
+       glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
+       check_error();
+       for (int k = 0; k < 4; ++k) {
+               for (int y = 0; y < 8; ++y) {
+                       for (int x = 0; x < 8; ++x) {
+                               printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
+                       }
+                       printf("\n");
+               }
+               printf("\n");
+       }
+       printf("\n");
+       
        
+       check_error();
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
        
        printf("foo = 0x%x\n", glGetError());
+       printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
 }