X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=narabu.cpp;h=0ac4f71d75ab58794da432e64c22ea0811cdb437;hb=daf421e32981645e551621551c6b82697ad078de;hp=48973664e66a2e3c51f7ed191191186a596e292b;hpb=28409aed1a0cbf8d2e8d9d157d08c3f6d9a3f51a;p=narabu diff --git a/narabu.cpp b/narabu.cpp index 4897366..0ac4f71 100644 --- a/narabu.cpp +++ b/narabu.cpp @@ -4,23 +4,32 @@ #include #include #include +#include #include #include #include #include #include +#include #include "util.h" using namespace std; +using namespace std::chrono; #define WIDTH 1280 #define HEIGHT 720 +#define WIDTH_BLOCKS (WIDTH/8) +#define WIDTH_BLOCKS_CHROMA (WIDTH/16) +#define HEIGHT_BLOCKS (HEIGHT/8) +#define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS) +#define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS) const unsigned prob_bits = 12; const unsigned prob_scale = 1 << prob_bits; const unsigned NUM_SYMS = 256; -const unsigned NUM_TABLES = 16; +const unsigned NUM_TABLES = 8; +const unsigned BLOCKS_PER_STREAM = 320; struct RansDecSymbol { unsigned sym_start; @@ -50,10 +59,12 @@ optional read_varint(const char **ptr, const char *end) return nullopt; // Error: EOF. 
} +const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM); + struct CoeffStream { - uint src_offset, src_len, sign_offset, sign_len, extra_bits; + uint src_offset, src_len; }; -CoeffStream streams[45 * 64]; // HACK +CoeffStream streams[num_blocks * 64]; int main(int argc, char **argv) { @@ -83,7 +94,7 @@ int main(int argc, char **argv) glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size); printf("shared_memory_size=%u\n", size); - string shader_src = read_file("decoder-pre-sign.shader"); + string shader_src = ::read_file("decoder.shader"); GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER); GLuint glsl_program_num = glCreateProgram(); glAttachShader(glsl_program_num, shader_num); @@ -100,9 +111,10 @@ int main(int argc, char **argv) glUseProgram(glsl_program_num); - string coded = read_file(argc >= 2 ? argv[1] : "coded.dat"); + string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat"); const char *ptr = &coded[0]; const char *end = ptr + coded.size(); + GLuint sign_bias[NUM_TABLES]; // printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(), // (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3], @@ -118,12 +130,15 @@ int main(int argc, char **argv) exit(1); } - decode_tables[table].dsyms[sym].sym_start = cum_freq; - decode_tables[table].dsyms[sym].sym_freq = *freq; + decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq; + decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq; for (uint32_t i = 0; i < freq; ++i) { - decode_tables[table].cum2sym[cum_freq++] = sym; + if (cum_freq < prob_scale) + decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1); + ++cum_freq; } } + sign_bias[table] = cum_freq; } // Make cum2sym texture. 
@@ -141,6 +156,7 @@ int main(int argc, char **argv) glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get()); + check_error(); // Make dsyms texture. unique_ptr[]> dsyms_data(new pair[NUM_SYMS * NUM_TABLES]); @@ -158,6 +174,21 @@ int main(int argc, char **argv) glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get()); + check_error(); + + GLuint coeff_tex; + glGenTextures(1, &coeff_tex); + glBindTexture(GL_TEXTURE_2D, coeff_tex); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); + check_error(); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); + check_error(); + glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr); + check_error(); GLuint out_tex; glGenTextures(1, &out_tex); @@ -166,59 +197,58 @@ int main(int argc, char **argv) glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT); - glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, 1280, 720, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); - //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, 1280, 720, 0, GL_RED, GL_FLOAT, nullptr); + glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr); + //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr); + check_error(); - //GLint src_offset_pos = glGetUniformLocation(glsl_program_num, 
"src_offset");
-	//GLint sign_offset_pos = glGetUniformLocation(glsl_program_num, "sign_offset");
-	//GLint extra_bits_pos = glGetUniformLocation(glsl_program_num, "extra_bits");
 	GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
 	GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
 	GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
-	printf("%d err=0x%x pos=%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos);
+	GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
+	GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
+	GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
+	printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
 	// Bind the textures.
 	glUniform1i(cum2sym_tex_pos, 0);
 	glUniform1i(dsyms_tex_pos, 1);
 	glUniform1i(out_tex_pos, 2);
+	glUniform1i(coeff_tex_pos, 3);
+	glUniform1uiv(sign_bias_pos, NUM_TABLES, sign_bias);  // sign_bias holds exactly NUM_TABLES entries; a larger count reads past the array.
+	glUniform1i(num_blocks_pos, num_blocks);
 	glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
 	glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
 	glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
+	glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
 	printf("%d err=0x%x\n", __LINE__, glGetError());
 	// Decode all luma blocks.
-	unsigned num_blocks = (HEIGHT / 16);
+	size_t last_src_offset = 0, last_src_len = 0;
 	for (unsigned y = 0; y < 8; ++y) {
 		for (unsigned x = 0; x < 8; ++x) {
 			unsigned coeff_num = y * 8 + x;
-			for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
+			for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
 				optional num_rans_bytes = read_varint(&ptr, end);
 				if (!num_rans_bytes) {
-					fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
+					fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", block_idx);
 					exit(1);
 				}
-				CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
-				stream->src_offset = ptr - coded.data();
-				stream->src_len = *num_rans_bytes;
-
-				// TODO: check len
-				ptr += *num_rans_bytes;
-
-				optional num_sign_bytes = read_varint(&ptr, end);
-				if (!num_sign_bytes) {
-					fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
-					exit(1);
+				CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
+				if (*num_rans_bytes == 0) {
+					// Repeat last stream.
+					stream->src_offset = last_src_offset;
+					stream->src_len = last_src_len;
+				} else {
+					stream->src_offset = ptr - coded.data();
+					stream->src_len = *num_rans_bytes;
+					last_src_offset = stream->src_offset;
+					last_src_len = stream->src_len;  // Remember the length too, so a zero-length marker repeats the whole stream.
 				}
-				stream->sign_offset = ptr - coded.data();
-				stream->sign_len = *num_sign_bytes >> 3;
-				stream->extra_bits = *num_sign_bytes & 0x7;
-				// TODO: check len
-				// TODO: free bits
-				ptr += *num_sign_bytes >> 3;
+				ptr += *num_rans_bytes;
 				//printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
 			}
@@ -244,13 +274,22 @@ int main(int argc, char **argv)
 	glGenBuffers(1, &ssbo_out);
 	glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
-	glBufferData(GL_SHADER_STORAGE_BUFFER, 16384, nullptr, GL_STREAM_DRAW); // ??
+	glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW); // ??
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out); + check_error(); + +#define PARALLEL_SLICES 1 + steady_clock::time_point start = steady_clock::now(); + unsigned num_iterations = 1000; + for (unsigned i = 0; i < num_iterations; ++i) { + unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM; + glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1); + } + check_error(); + glFinish(); + steady_clock::time_point now = steady_clock::now(); - for (int i = 0; i < 10000; ++i) - glDispatchCompute(1, 45, 1); - - unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 16384, GL_MAP_READ_BIT); + unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT); //setlocale(LC_ALL, "nb_NO.UTF-8"); string phases[] = { @@ -269,7 +308,7 @@ int main(int argc, char **argv) for (int i = 0; i < 10; ++i) { //printf("%d: %'18.0f [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str()); printf("%d,%s", i, phases[i].c_str()); - for (int j = 0; j < 64; ++j) { + for (int j = 0; j < 512; ++j) { int idx = (j * 10 + i) * 2; uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx]; // printf(" %'18.0f", double(val)); @@ -281,15 +320,16 @@ int main(int argc, char **argv) } printf("\n"); - unsigned char *data = new unsigned char[1280 * 720]; + unsigned char *data = new unsigned char[WIDTH * HEIGHT]; glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data); + check_error(); printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size()); #if 0 for (int k = 0; k < 4; ++k) { for (int y = 0; y < 8; ++y) { for (int x = 0; x < 8; ++x) { - printf("%3d ", data[y * 1280 + x + k*8]); + printf("%3d ", data[y * WIDTH + x + k*8]); } printf("\n"); } @@ -300,8 +340,8 @@ int main(int argc, char **argv) for (int k = 0; k < 4; ++k) { for (int y = 0; y < 8; ++y) { for (int x = 0; x < 8; ++x) { - //printf("%5.2f ", data[(y+8) * 1280 + x + (1272-k*8)]); - printf("%3d ", 
data[y * 1280 + x + k*8]); + //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]); + printf("%3d ", data[y * WIDTH + x + k*8]); } printf("\n"); } @@ -311,18 +351,37 @@ int main(int argc, char **argv) #endif FILE *fp = fopen("narabu.pgm", "wb"); - fprintf(fp, "P5\n1280 720\n255\n"); - for (int y = 0; y < 720; ++y) { - for (int x = 0; x < 1280; ++x) { - int k = lrintf(data[y * 1280 + x]); + fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT); + for (int y = 0; y < HEIGHT; ++y) { + for (int x = 0; x < WIDTH; ++x) { + int k = lrintf(data[y * WIDTH + x]); if (k < 0) k = 0; if (k > 255) k = 255; putc(k, fp); } } fclose(fp); + + int16_t *coeff_data = new int16_t[WIDTH * HEIGHT]; + glBindTexture(GL_TEXTURE_2D, coeff_tex); + check_error(); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data); + check_error(); + for (int k = 0; k < 4; ++k) { + for (int y = 0; y < 8; ++y) { + for (int x = 0; x < 8; ++x) { + printf("%3d ", coeff_data[y * WIDTH + x + k*8]); + } + printf("\n"); + } + printf("\n"); + } + printf("\n"); + + check_error(); glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind printf("foo = 0x%x\n", glGetError()); + printf("Each iteration took %.3f ms.\n", 1e3 * duration(now - start).count() / num_iterations); }