git.sesse.net Git - narabu/blobdiff - narabu.cpp
More fixes of hard-coded values.
[narabu] / narabu.cpp
index 49afa9ad9d502703281820f3ba5880d848745cf1..0ac4f71d75ab58794da432e64c22ea0811cdb437 100644 (file)
@@ -19,11 +19,17 @@ using namespace std::chrono;
 
 #define WIDTH 1280
 #define HEIGHT 720
+#define WIDTH_BLOCKS (WIDTH/8)  // Frame width in units of 8 pixels (integer division).
+#define WIDTH_BLOCKS_CHROMA (WIDTH/16)  // Frame width in units of 16 pixels; presumably for horizontally subsampled chroma -- TODO confirm.
+#define HEIGHT_BLOCKS (HEIGHT/8)  // Frame height in units of 8 pixels (integer division).
+#define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)  // Block count of a plane that is WIDTH_BLOCKS wide.
+#define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)  // Block count of a plane that is WIDTH_BLOCKS_CHROMA wide.
 
 const unsigned prob_bits = 12;
 const unsigned prob_scale = 1 << prob_bits;
 const unsigned NUM_SYMS = 256;
 const unsigned NUM_TABLES = 8;
+const unsigned BLOCKS_PER_STREAM = 320;
 
 struct RansDecSymbol {
         unsigned sym_start;
@@ -53,10 +59,12 @@ optional<uint32_t> read_varint(const char **ptr, const char *end)
        return nullopt;  // Error: EOF.
 }
 
+const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);  // NUM_BLOCKS / BLOCKS_PER_STREAM, rounded up. Despite the name, this is the number of streams per coefficient, not a block count.
+
 struct CoeffStream {
         uint src_offset, src_len;
 };
-CoeffStream streams[45 * 64];  // HACK
+CoeffStream streams[num_blocks * 64];  // Indexed as coeff_num * num_blocks + stream index, where coeff_num = y * 8 + x covers 64 values.
 
 int main(int argc, char **argv)
 {
@@ -198,6 +206,7 @@ int main(int argc, char **argv)
        GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
        GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
        GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
+       GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
        printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
 
        // Bind the textures.
@@ -206,6 +215,7 @@ int main(int argc, char **argv)
        glUniform1i(out_tex_pos, 2);
        glUniform1i(coeff_tex_pos, 3);
        glUniform1uiv(sign_bias_pos, 16, sign_bias);
+       glUniform1i(num_blocks_pos, num_blocks);
         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
@@ -213,21 +223,29 @@ int main(int argc, char **argv)
        printf("%d err=0x%x\n", __LINE__, glGetError());
 
        // Decode all luma blocks.
-       unsigned num_blocks = (HEIGHT / 16);
+       size_t last_src_offset = 0, last_src_len = 0;  // Most recent non-empty stream; reused when a stream's length is coded as 0.
        for (unsigned y = 0; y < 8; ++y) {
                 for (unsigned x = 0; x < 8; ++x) {
                        unsigned coeff_num = y * 8 + x;
 
-                       for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
+                       for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
                                optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
                                if (!num_rans_bytes) {
-                                       fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
+                                       fprintf(stderr, "Error parsing varint for block %u rANS bytes\n", block_idx);
                                        exit(1);
                                }
 
-                               CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
-                               stream->src_offset = ptr - coded.data();
-                               stream->src_len = *num_rans_bytes;
+                               CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
+                               if (*num_rans_bytes == 0) {
+                                       // Repeat last stream.
+                                       stream->src_offset = last_src_offset;
+                                       stream->src_len = last_src_len;
+                               } else {
+                                       stream->src_offset = ptr - coded.data();
+                                       stream->src_len = *num_rans_bytes;
+                                       last_src_offset = stream->src_offset;
+                                       last_src_len = stream->src_len;  // Remember the length as well, so a later zero-length entry repeats the whole stream.
+                               }
 
                                // TODO: check len
                                ptr += *num_rans_bytes;
@@ -262,8 +280,9 @@ int main(int argc, char **argv)
 
 #define PARALLEL_SLICES 1
        steady_clock::time_point start = steady_clock::now();
-       for (int i = 0; i < 1000; ++i) {
-               unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/320;
+       unsigned num_iterations = 1000;  // Number of timed dispatch rounds; also the divisor for the per-iteration time printed at the end of main.
+       for (unsigned i = 0; i < num_iterations; ++i) {
+               unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;  // NOTE(review): rounds down, whereas num_blocks rounds up; the two agree only when the block count is a multiple of BLOCKS_PER_STREAM (true for 1280x720: 14400 / 320 = 45).
                glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
        }
        check_error();
@@ -364,5 +383,5 @@ int main(int argc, char **argv)
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
        
        printf("foo = 0x%x\n", glGetError());
-       printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / 1000);
+       printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
 }