// narabu.cpp, from the narabu repository on git.sesse.net
// (gitweb blob bb1209c7fc3ebe7c4bf66123567736cb6bff3854)
//
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <chrono>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include <SDL2/SDL.h>
#include <SDL2/SDL_error.h>
#include <SDL2/SDL_video.h>
#include <epoxy/gl.h>
#include <movit/util.h>

#include "util.h"

using namespace std;
using namespace std::chrono;
19
// Frame geometry. The frame is processed in 8x8 blocks; the chroma block
// counts use half the luma width (WIDTH / 16 blocks per row) and the full
// luma height.
constexpr int WIDTH = 1280;
constexpr int HEIGHT = 720;
constexpr int WIDTH_BLOCKS = WIDTH / 8;
constexpr int WIDTH_BLOCKS_CHROMA = WIDTH / 16;
constexpr int HEIGHT_BLOCKS = HEIGHT / 8;
constexpr int NUM_BLOCKS = WIDTH_BLOCKS * HEIGHT_BLOCKS;
constexpr int NUM_BLOCKS_CHROMA = WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS;

// The block counts above use truncating division, so the frame must tile
// exactly into blocks for them to cover the whole picture.
static_assert(WIDTH % 16 == 0 && HEIGHT % 8 == 0,
              "frame dimensions must be a whole number of blocks");

// rANS parameters: probabilities are quantized to prob_bits bits, so the
// cumulative frequencies of a table live in [0, prob_scale].
constexpr unsigned prob_bits = 12;
constexpr unsigned prob_scale = 1u << prob_bits;
constexpr unsigned NUM_SYMS = 256;           // Symbols per rANS table.
constexpr unsigned NUM_TABLES = 8;           // Number of rANS tables stored in the file.
constexpr unsigned BLOCKS_PER_STREAM = 320;  // Blocks covered by one coded stream.
33
// Decoder-side description of one symbol in an rANS table.
struct RansDecSymbol {
        unsigned sym_start;  // Cumulative frequency at which this symbol's range begins.
        unsigned sym_freq;   // Frequency (width of the range) of this symbol.
};
38 struct RansDecodeTable {
39         int cum2sym[prob_scale];
40         RansDecSymbol dsyms[NUM_SYMS];
41 };
42 RansDecodeTable decode_tables[NUM_TABLES];
43
// Reads one unsigned varint from [*ptr, end): seven payload bits per byte,
// least significant group first, with the high bit of each byte set when
// another byte follows.
//
// *ptr is advanced past every byte consumed, including on failure.
//
// Returns the decoded value, or nullopt if the input ends before a
// terminating byte is seen or if the encoding continues past five bytes.
// Payload bits of the fifth byte that do not fit in 32 bits are discarded.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
        uint32_t x = 0;
        unsigned shift = 0;
        while (*ptr < end) {
                // Go through unsigned char so a byte >= 0x80 is never sign-extended.
                const uint32_t ch = static_cast<unsigned char>(**ptr);
                ++(*ptr);

                // The shift is done on uint32_t; on int, 0x7f << 28 would overflow.
                x |= (ch & 0x7f) << shift;
                if ((ch & 0x80) == 0) return x;
                shift += 7;
                if (shift >= 32) {
                        return std::nullopt;  // Error: Overlong int.
                }
        }
        return std::nullopt;  // Error: EOF.
}
61
62 const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
63
64 struct CoeffStream {
65         uint src_offset, src_len;
66 };
67 CoeffStream streams[num_blocks * 64];
68
69 int main(int argc, char **argv)
70 {
71         // Set up an OpenGL context using SDL.
72         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
73                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
74                 exit(1);
75         }
76         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
77         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
78         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
79         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
80         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
81         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
82
83         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
84                 SDL_WINDOWPOS_UNDEFINED,
85                 SDL_WINDOWPOS_UNDEFINED,
86                 32, 32,
87                 SDL_WINDOW_OPENGL);
88         SDL_GLContext context = SDL_GL_CreateContext(window);
89         assert(context != nullptr);
90
91         //char buf[16] = { 0 };
92
93         GLint size;
94         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
95         printf("shared_memory_size=%u\n", size);
96
97         string shader_src = ::read_file("decoder.shader");
98         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
99         GLuint glsl_program_num = glCreateProgram();
100         glAttachShader(glsl_program_num, shader_num);
101         glLinkProgram(glsl_program_num);
102
103         GLint success;
104         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
105         if (success == GL_FALSE) {
106                 GLchar error_log[1024] = {0};
107                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
108                 fprintf(stderr, "Error linking program: %s\n", error_log);
109                 exit(1);
110         }
111
112         glUseProgram(glsl_program_num);
113
114         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
115         const char *ptr = &coded[0];
116         const char *end = ptr + coded.size();
117         GLuint sign_bias[NUM_TABLES];
118
119 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
120 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
121 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
122
123         // read the rANS tables
124         for (unsigned table = 0; table < NUM_TABLES; ++table) {
125                 uint32_t cum_freq = 0;
126                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
127                         optional<uint32_t> freq = read_varint(&ptr, end);
128                         if (!freq) {
129                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
130                                 exit(1);
131                         }
132
133                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
134                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
135                         for (uint32_t i = 0; i < freq; ++i) {
136                                 if (cum_freq < prob_scale)
137                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
138                                 ++cum_freq;
139                         }
140                 }
141                 sign_bias[table] = cum_freq;
142         }
143
144         // Make cum2sym texture.
145         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
146         for (unsigned table = 0; table < NUM_TABLES; ++table) {
147                 for (unsigned i = 0; i < prob_scale; ++i) {
148                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
149                 }
150         }
151         GLuint cum2sym_tex;
152         glGenTextures(1, &cum2sym_tex);
153         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
154         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
155         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
156         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
157         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
158         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
159         check_error();
160
161         // Make dsyms texture.
162         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
163         for (unsigned table = 0; table < NUM_TABLES; ++table) {
164                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
165                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
166                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
167                 }
168         }
169         GLuint dsyms_tex;
170         glGenTextures(1, &dsyms_tex);
171         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
172         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
173         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
174         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
175         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
176         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
177         check_error();
178
179         GLuint coeff_tex;
180         glGenTextures(1, &coeff_tex);
181         glBindTexture(GL_TEXTURE_2D, coeff_tex);
182         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
183         check_error();
184         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
185         check_error();
186         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
187         check_error();
188         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
189         check_error();
190         glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
191         check_error();
192
193         GLuint out_tex;
194         glGenTextures(1, &out_tex);
195         glBindTexture(GL_TEXTURE_2D, out_tex);
196         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
197         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
198         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
199         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
200         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
201         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
202         check_error();
203
204         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
205         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
206         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
207         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
208         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
209         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
210         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
211
212         // Bind the textures.
213         glUniform1i(cum2sym_tex_pos, 0);
214         glUniform1i(dsyms_tex_pos, 1);
215         glUniform1i(out_tex_pos, 2);
216         glUniform1i(coeff_tex_pos, 3);
217         glUniform1uiv(sign_bias_pos, 16, sign_bias);
218         glUniform1i(num_blocks_pos, num_blocks);
219         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
220         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
221         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
222         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
223         printf("%d err=0x%x\n", __LINE__, glGetError());
224
225         // Decode all luma blocks.
226         for (unsigned y = 0; y < 8; ++y) {
227                 for (unsigned x = 0; x < 8; ++x) {
228                         unsigned coeff_num = y * 8 + x;
229
230                         for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
231                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
232                                 if (!num_rans_bytes) {
233                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", block_idx);
234                                         exit(1);
235                                 }
236
237                                 CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
238                                 stream->src_offset = ptr - coded.data();
239                                 stream->src_len = *num_rans_bytes;
240
241                                 // TODO: check len
242                                 ptr += *num_rans_bytes;
243
244                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
245                         }
246                 }
247         }
248
249         // put the coded data (as a whole) into an SSBO
250         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
251
252         GLuint ssbo_stream, ssbo, ssbo_out;
253
254         glGenBuffers(1, &ssbo_stream);
255         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
256         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
257         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
258         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
259
260         glGenBuffers(1, &ssbo);
261         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
262         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
263         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
264         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
265
266         glGenBuffers(1, &ssbo_out);
267         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
268         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
269         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
270         check_error();
271
272 #define PARALLEL_SLICES 1
273         steady_clock::time_point start = steady_clock::now();
274         unsigned num_iterations = 1000;
275         for (unsigned i = 0; i < num_iterations; ++i) {
276                 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;
277                 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
278         }
279         check_error();
280         glFinish();
281         steady_clock::time_point now = steady_clock::now();
282
283         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
284         //setlocale(LC_ALL, "nb_NO.UTF-8");
285
286         string phases[] = {
287                 "init",
288                 "loop overhead",
289                 "rANS decode",
290                 "barrier after rANS decode",
291                 "horizontal IDCT",
292                 "barrier after horizontal IDCT",
293                 "vertical IDCT",
294                 "store to texture",
295                 "barrier after store to texture",
296                 "dummy timer for overhead measurement",
297         };
298         printf("\n");
299         for (int i = 0; i < 10; ++i) {
300                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
301                 printf("%d,%s", i, phases[i].c_str());
302                 for (int j = 0; j < 512; ++j) {
303                         int idx = (j * 10 + i) * 2;
304                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
305                 //      printf(" %'18.0f", double(val));
306                 //      printf(" %'6.0f", double(val) * 1e-6);
307                         printf(",%.0f", double(val) * 1e-6);
308                 }
309                 printf("\n");
310                 //printf("  [%s]\n", phases[i].c_str());
311         }
312         printf("\n");
313
314         unsigned char *data = new unsigned char[WIDTH * HEIGHT];
315         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
316         check_error();
317         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
318
319 #if 0
320         for (int k = 0; k < 4; ++k) {
321                 for (int y = 0; y < 8; ++y) {
322                         for (int x = 0; x < 8; ++x) {
323                                 printf("%3d ", data[y * WIDTH + x + k*8]);
324                         }
325                         printf("\n");
326                 }
327                 printf("\n");
328         }
329         printf("\n");
330 #else
331         for (int k = 0; k < 4; ++k) {
332                 for (int y = 0; y < 8; ++y) {
333                         for (int x = 0; x < 8; ++x) {
334                                 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
335                                 printf("%3d ", data[y * WIDTH + x + k*8]);
336                         }
337                         printf("\n");
338                 }
339                 printf("\n");
340         }
341         printf("\n");
342 #endif
343
344         FILE *fp = fopen("narabu.pgm", "wb");
345         fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
346         for (int y = 0; y < HEIGHT; ++y) {
347                 for (int x = 0; x < WIDTH; ++x) {
348                         int k = lrintf(data[y * WIDTH + x]);
349                         if (k < 0) k = 0;
350                         if (k > 255) k = 255;
351                         putc(k, fp);
352                 }
353         }
354         fclose(fp);
355
356         int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
357         glBindTexture(GL_TEXTURE_2D, coeff_tex);
358         check_error();
359         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
360         check_error();
361         for (int k = 0; k < 4; ++k) {
362                 for (int y = 0; y < 8; ++y) {
363                         for (int x = 0; x < 8; ++x) {
364                                 printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
365                         }
366                         printf("\n");
367                 }
368                 printf("\n");
369         }
370         printf("\n");
371         
372         
373         check_error();
374         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
375         
376         printf("foo = 0x%x\n", glGetError());
377         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
378 }