X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=narabu.cpp;h=0ac4f71d75ab58794da432e64c22ea0811cdb437;hb=daf421e32981645e551621551c6b82697ad078de;hp=49afa9ad9d502703281820f3ba5880d848745cf1;hpb=704e9573dc4a4480edbf855ecd5ab7838398843d;p=narabu diff --git a/narabu.cpp b/narabu.cpp index 49afa9a..0ac4f71 100644 --- a/narabu.cpp +++ b/narabu.cpp @@ -19,11 +19,17 @@ using namespace std::chrono; #define WIDTH 1280 #define HEIGHT 720 +#define WIDTH_BLOCKS (WIDTH/8) +#define WIDTH_BLOCKS_CHROMA (WIDTH/16) +#define HEIGHT_BLOCKS (HEIGHT/8) +#define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS) +#define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS) const unsigned prob_bits = 12; const unsigned prob_scale = 1 << prob_bits; const unsigned NUM_SYMS = 256; const unsigned NUM_TABLES = 8; +const unsigned BLOCKS_PER_STREAM = 320; struct RansDecSymbol { unsigned sym_start; @@ -53,10 +59,12 @@ optional read_varint(const char **ptr, const char *end) return nullopt; // Error: EOF. } +const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM); + struct CoeffStream { uint src_offset, src_len; }; -CoeffStream streams[45 * 64]; // HACK +CoeffStream streams[num_blocks * 64]; int main(int argc, char **argv) { @@ -198,6 +206,7 @@ int main(int argc, char **argv) GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex"); GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex"); GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model"); + GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks"); printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos); // Bind the textures. 
@@ -206,6 +215,7 @@ int main(int argc, char **argv) glUniform1i(out_tex_pos, 2); glUniform1i(coeff_tex_pos, 3); glUniform1uiv(sign_bias_pos, 16, sign_bias); + glUniform1i(num_blocks_pos, num_blocks); glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI); glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI); glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8); @@ -213,21 +223,29 @@ int main(int argc, char **argv) printf("%d err=0x%x\n", __LINE__, glGetError()); // Decode all luma blocks. - unsigned num_blocks = (HEIGHT / 16); + size_t last_src_offset = 0, last_src_len = 0; for (unsigned y = 0; y < 8; ++y) { for (unsigned x = 0; x < 8; ++x) { unsigned coeff_num = y * 8 + x; - for (unsigned yb = 0; yb < HEIGHT; yb += 16) { + for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) { optional num_rans_bytes = read_varint(&ptr, end); if (!num_rans_bytes) { - fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb); + fprintf(stderr, "Error parsing varint for block %u rANS bytes\n", block_idx); exit(1); } - CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)]; - stream->src_offset = ptr - coded.data(); - stream->src_len = *num_rans_bytes; + CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM]; + if (*num_rans_bytes == 0) { + // Repeat last stream.
+ stream->src_offset = last_src_offset; + stream->src_len = last_src_len; + } else { + stream->src_offset = ptr - coded.data(); + stream->src_len = *num_rans_bytes; + last_src_offset = stream->src_offset; + last_src_len = stream->src_len; + } // TODO: check len ptr += *num_rans_bytes; @@ -262,8 +280,9 @@ int main(int argc, char **argv) #define PARALLEL_SLICES 1 steady_clock::time_point start = steady_clock::now(); - for (int i = 0; i < 1000; ++i) { - unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/320; + unsigned num_iterations = 1000; + for (unsigned i = 0; i < num_iterations; ++i) { + unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM; glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1); } check_error(); @@ -364,5 +383,5 @@ int main(int argc, char **argv) glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind printf("foo = 0x%x\n", glGetError()); - printf("Each iteration took %.3f ms.\n", 1e3 * duration(now - start).count() / 1000); + printf("Each iteration took %.3f ms.\n", 1e3 * duration(now - start).count() / num_iterations); }