4 #include <SDL2/SDL_error.h>
5 #include <SDL2/SDL_video.h>
7 #include <movit/util.h>
18 using namespace std::chrono;
// Block-grid geometry derived from WIDTH and HEIGHT (both defined outside
// this excerpt). The divisions are plain integer divisions in the macro
// expansion, so any remainder is dropped -- presumably WIDTH is a multiple of
// 16 and HEIGHT a multiple of 8; TODO confirm against their definitions.
//
//   WIDTH_BLOCKS         WIDTH / 8
//   WIDTH_BLOCKS_CHROMA  WIDTH / 16 (half as many columns as WIDTH_BLOCKS)
//   HEIGHT_BLOCKS        HEIGHT / 8 (also used for the chroma count below)
//   NUM_BLOCKS           WIDTH_BLOCKS * HEIGHT_BLOCKS
//   NUM_BLOCKS_CHROMA    WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS
22 #define WIDTH_BLOCKS (WIDTH/8)
23 #define WIDTH_BLOCKS_CHROMA (WIDTH/16)
24 #define HEIGHT_BLOCKS (HEIGHT/8)
25 #define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
26 #define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
// Sizing constants for the rANS decode tables and the coefficient streams.

// Base-2 logarithm of prob_scale.
28 const unsigned prob_bits = 12;
// 1 << prob_bits (4096). Length of RansDecodeTable::cum2sym and the width of
// the cum2sym texture uploaded in main().
29 const unsigned prob_scale = 1 << prob_bits;
// Number of entries in RansDecodeTable::dsyms. main() wraps symbol indices
// with `& (NUM_SYMS - 1)`, so this value has to remain a power of two.
30 const unsigned NUM_SYMS = 256;
// Number of decode tables; also the length of the sign_bias array in main().
// NOTE(review): main() uploads sign_bias via glUniform1uiv with a count of 16,
// which is larger than this 8-entry array -- confirm the intended count.
31 const unsigned NUM_TABLES = 8;
// Number of blocks covered by one coded stream: the stream-parsing loop in
// main() advances block_idx by this amount, and num_blocks is NUM_BLOCKS
// divided by this value, rounded up.
// NOTE(review): the dispatch loop in main() computes its slice count with a
// truncating division by this value instead -- confirm that both agree for
// the configured WIDTH/HEIGHT.
32 const unsigned BLOCKS_PER_STREAM = 320;
34 struct RansDecSymbol {
38 struct RansDecodeTable {
39 int cum2sym[prob_scale];
40 RansDecSymbol dsyms[NUM_SYMS];
42 RansDecodeTable decode_tables[NUM_TABLES];
44 optional<uint32_t> read_varint(const char **ptr, const char *end)
52 const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
55 uint src_offset, src_len;
57 CoeffStream streams[num_blocks * 64];
59 int main(int argc, char **argv)
61 // Set up an OpenGL context using SDL.
62 if (SDL_Init(SDL_INIT_VIDEO) == -1) {
63 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
66 SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
67 SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
68 SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
69 SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
70 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
71 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
73 SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
74 SDL_WINDOWPOS_UNDEFINED,
75 SDL_WINDOWPOS_UNDEFINED,
78 SDL_GLContext context = SDL_GL_CreateContext(window);
79 assert(context != nullptr);
81 //char buf[16] = { 0 };
84 glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
85 printf("shared_memory_size=%u\n", size);
87 string shader_src = ::read_file("decoder.shader");
88 GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
89 GLuint glsl_program_num = glCreateProgram();
90 glAttachShader(glsl_program_num, shader_num);
91 glLinkProgram(glsl_program_num);
94 glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
95 if (success == GL_FALSE) {
96 GLchar error_log[1024] = {0};
97 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
98 fprintf(stderr, "Error linking program: %s\n", error_log);
102 glUseProgram(glsl_program_num);
104 string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
105 const char *ptr = &coded[0];
106 //assert((intptr_t)ptr % 4 == 0);
107 const char *end = ptr + coded.size();
108 GLuint sign_bias[NUM_TABLES];
110 // printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
111 // (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
112 // (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
114 // read the rANS tables
115 for (unsigned table = 0; table < NUM_TABLES; ++table) {
116 uint32_t cum_freq = 0;
117 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
118 optional<uint32_t> freq = read_varint(&ptr, end);
120 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
124 decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
125 decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
126 for (uint32_t i = 0; i < freq; ++i) {
127 if (cum_freq < prob_scale)
128 decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
132 sign_bias[table] = cum_freq;
135 // Make cum2sym texture.
136 unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
137 for (unsigned table = 0; table < NUM_TABLES; ++table) {
138 for (unsigned i = 0; i < prob_scale; ++i) {
139 cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
143 glGenTextures(1, &cum2sym_tex);
144 glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
145 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
146 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
147 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
148 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
149 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
152 // Make dsyms texture.
153 unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
154 for (unsigned table = 0; table < NUM_TABLES; ++table) {
155 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
156 dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
157 dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
161 glGenTextures(1, &dsyms_tex);
162 glBindTexture(GL_TEXTURE_2D, dsyms_tex);
163 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
164 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
165 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
166 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
167 glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
171 glGenTextures(1, &coeff_tex);
172 glBindTexture(GL_TEXTURE_2D, coeff_tex);
173 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
175 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
177 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
179 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
181 glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
185 glGenTextures(1, &coeff2_tex);
186 glBindTexture(GL_TEXTURE_2D, coeff2_tex);
187 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
189 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
191 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
193 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
195 glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
199 glGenTextures(1, &out_tex);
200 glBindTexture(GL_TEXTURE_2D, out_tex);
201 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
202 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
203 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
204 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
205 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
206 //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
209 GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
210 GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
211 GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
212 GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
213 GLint coeff2_tex_pos = glGetUniformLocation(glsl_program_num, "coeff2_tex");
214 GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
215 GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
216 printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
218 // Bind the textures.
219 glUniform1i(cum2sym_tex_pos, 0);
220 glUniform1i(dsyms_tex_pos, 1);
221 glUniform1i(out_tex_pos, 2);
222 glUniform1i(coeff_tex_pos, 3);
223 glUniform1i(coeff2_tex_pos, 4);
224 glUniform1uiv(sign_bias_pos, 16, sign_bias);
225 glUniform1i(num_blocks_pos, num_blocks);
226 glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
227 glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
228 glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
229 glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
230 glBindImageTexture(4, coeff2_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
231 printf("%d err=0x%x\n", __LINE__, glGetError());
233 // Decode all luma blocks.
234 for (unsigned y = 0; y < 8; ++y) {
235 for (unsigned x = 0; x < 8; ++x) {
236 unsigned coeff_num = y * 8 + x;
238 for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
239 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
240 if (!num_rans_bytes) {
241 fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", block_idx);
245 CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
246 stream->src_offset = ptr - coded.data();
247 stream->src_len = *num_rans_bytes;
248 //assert(stream->src_offset % 4 == 0);
251 ptr += *num_rans_bytes;
253 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
258 // put the coded data (as a whole) into an SSBO
259 printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
261 GLuint ssbo_stream, ssbo, ssbo_out;
263 glGenBuffers(1, &ssbo_stream);
264 glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
265 glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
266 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
267 printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
269 glGenBuffers(1, &ssbo);
270 glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
271 glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
272 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
273 printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
275 glGenBuffers(1, &ssbo_out);
276 glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
277 glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW); // ??
278 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
281 #define PARALLEL_SLICES 1
282 steady_clock::time_point start = steady_clock::now();
283 unsigned num_iterations = 1000;
284 for (unsigned i = 0; i < num_iterations; ++i) {
285 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;
286 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
290 steady_clock::time_point now = steady_clock::now();
292 unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
293 //setlocale(LC_ALL, "nb_NO.UTF-8");
299 "barrier after rANS decode",
301 "barrier after horizontal IDCT",
304 "barrier after store to texture",
305 "dummy timer for overhead measurement",
308 for (int i = 0; i < 10; ++i) {
309 //printf("%d: %'18.0f [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
310 printf("%d,%s", i, phases[i].c_str());
311 for (int j = 0; j < 512; ++j) {
312 int idx = (j * 10 + i) * 2;
313 uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
314 // printf(" %'18.0f", double(val));
315 // printf(" %'6.0f", double(val) * 1e-6);
316 printf(",%.0f", double(val) * 1e-6);
319 //printf(" [%s]\n", phases[i].c_str());
323 unsigned char *data = new unsigned char[WIDTH * HEIGHT];
324 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
326 printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
329 for (int k = 0; k < 4; ++k) {
330 for (int y = 0; y < 8; ++y) {
331 for (int x = 0; x < 8; ++x) {
332 printf("%3d ", data[y * WIDTH + x + k*8]);
340 for (int k = 0; k < 4; ++k) {
341 for (int y = 0; y < 8; ++y) {
342 for (int x = 0; x < 8; ++x) {
343 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
344 printf("%3d ", data[y * WIDTH + x + k*8]);
353 FILE *fp = fopen("narabu.pgm", "wb");
354 fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
355 for (int y = 0; y < HEIGHT; ++y) {
356 for (int x = 0; x < WIDTH; ++x) {
357 int k = lrintf(data[y * WIDTH + x]);
359 if (k > 255) k = 255;
366 uint32_t *coeff_data = new uint32_t[WIDTH * HEIGHT];
367 glBindTexture(GL_TEXTURE_2D, coeff_tex);
369 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff_data);
371 uint32_t *coeff2_data = new uint32_t[WIDTH * HEIGHT];
372 glBindTexture(GL_TEXTURE_2D, coeff2_tex);
374 glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff2_data);
376 for (int x = 0; x < 320; ++x) {
377 printf("%08x.%08x ", coeff2_data[x], coeff_data[x]);
383 glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
385 printf("foo = 0x%x\n", glGetError());
386 printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);