// Source: git.sesse.net, Git repository "narabu", file narabu.cpp
// Blob: 49afa9ad9d502703281820f3ba5880d848745cf1
1 #include <stdio.h>
2 #include <assert.h>
3 #include <SDL2/SDL.h>
4 #include <SDL2/SDL_error.h>
5 #include <SDL2/SDL_video.h>
6 #include <epoxy/gl.h>
7 #include <movit/util.h>
8 #include <string>
9 #include <optional>
10 #include <algorithm>
11 #include <vector>
12 #include <memory>
13 #include <chrono>
14
15 #include "util.h"
16
17 using namespace std;
18 using namespace std::chrono;
19
// Dimensions (in pixels) of the coefficient and output textures created in main().
#define WIDTH 1280
#define HEIGHT 720

// Number of bits in the cumulative-frequency range of each rANS table.
constexpr unsigned prob_bits = 12;
// Number of slots in each table's cum2sym array (2^prob_bits).
constexpr unsigned prob_scale = 1u << prob_bits;
// Number of symbol entries stored per rANS table.
constexpr unsigned NUM_SYMS = 256;
// Number of rANS tables read from the start of the coded file.
constexpr unsigned NUM_TABLES = 8;
27
28 struct RansDecSymbol {
29         unsigned sym_start;
30         unsigned sym_freq;
31 };
32 struct RansDecodeTable {
33         int cum2sym[prob_scale];
34         RansDecSymbol dsyms[NUM_SYMS];
35 };
36 RansDecodeTable decode_tables[NUM_TABLES];
37
// Reads one variable-length unsigned integer from [*ptr, end).
//
// The encoding is little-endian base-128: each byte contributes its low seven
// bits (least significant group first), and a set high bit means that another
// byte follows. *ptr is advanced past every byte that was consumed, also when
// the read fails.
//
// Returns the decoded value, or nullopt if the input ends in the middle of a
// value, if the value needs more than five bytes, or if it does not fit in
// 32 bits.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
        uint32_t x = 0;
        unsigned shift = 0;
        while (*ptr < end) {
                // Go through unsigned char so a byte >= 0x80 never sign-extends.
                const uint32_t ch = static_cast<unsigned char>(**ptr);
                ++(*ptr);

                const uint32_t payload = ch & 0x7f;
                if (shift == 28 && payload > 0x0f) {
                        // Only four bits are left in the fifth byte; anything
                        // above that would be silently shifted out.
                        return std::nullopt;  // Error: Value does not fit in 32 bits.
                }

                // Shift as unsigned; shifting a signed int into the sign bit is
                // undefined behavior.
                x |= payload << shift;
                if ((ch & 0x80) == 0) return x;
                shift += 7;
                if (shift >= 32) {
                        return std::nullopt;  // Error: Overlong int.
                }
        }
        return std::nullopt;  // Error: EOF.
}
55
// Location of one rANS-coded stream inside the coded file.
// Both fields are fixed-width 32-bit values, since the whole array is
// uploaded verbatim into a shader storage buffer by main().
struct CoeffStream {
        uint32_t src_offset;  // Byte offset of the stream from the start of the coded data.
        uint32_t src_len;     // Length of the stream in bytes.
};

// One entry per (coefficient, block row) pair, indexed by main() as
// coeff_num * (HEIGHT / 16) + block row.
CoeffStream streams[45 * 64];  // HACK: 45 = HEIGHT / 16 block rows, 64 coefficients.
60
61 int main(int argc, char **argv)
62 {
63         // Set up an OpenGL context using SDL.
64         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
65                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
66                 exit(1);
67         }
68         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
69         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
70         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
71         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
72         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
73         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
74
75         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
76                 SDL_WINDOWPOS_UNDEFINED,
77                 SDL_WINDOWPOS_UNDEFINED,
78                 32, 32,
79                 SDL_WINDOW_OPENGL);
80         SDL_GLContext context = SDL_GL_CreateContext(window);
81         assert(context != nullptr);
82
83         //char buf[16] = { 0 };
84
85         GLint size;
86         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
87         printf("shared_memory_size=%u\n", size);
88
89         string shader_src = ::read_file("decoder.shader");
90         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
91         GLuint glsl_program_num = glCreateProgram();
92         glAttachShader(glsl_program_num, shader_num);
93         glLinkProgram(glsl_program_num);
94
95         GLint success;
96         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
97         if (success == GL_FALSE) {
98                 GLchar error_log[1024] = {0};
99                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
100                 fprintf(stderr, "Error linking program: %s\n", error_log);
101                 exit(1);
102         }
103
104         glUseProgram(glsl_program_num);
105
106         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
107         const char *ptr = &coded[0];
108         const char *end = ptr + coded.size();
109         GLuint sign_bias[NUM_TABLES];
110
111 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
112 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
113 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
114
115         // read the rANS tables
116         for (unsigned table = 0; table < NUM_TABLES; ++table) {
117                 uint32_t cum_freq = 0;
118                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
119                         optional<uint32_t> freq = read_varint(&ptr, end);
120                         if (!freq) {
121                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
122                                 exit(1);
123                         }
124
125                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
126                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
127                         for (uint32_t i = 0; i < freq; ++i) {
128                                 if (cum_freq < prob_scale)
129                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
130                                 ++cum_freq;
131                         }
132                 }
133                 sign_bias[table] = cum_freq;
134         }
135
136         // Make cum2sym texture.
137         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
138         for (unsigned table = 0; table < NUM_TABLES; ++table) {
139                 for (unsigned i = 0; i < prob_scale; ++i) {
140                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
141                 }
142         }
143         GLuint cum2sym_tex;
144         glGenTextures(1, &cum2sym_tex);
145         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
146         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
147         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
148         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
149         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
150         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
151         check_error();
152
153         // Make dsyms texture.
154         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
155         for (unsigned table = 0; table < NUM_TABLES; ++table) {
156                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
157                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
158                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
159                 }
160         }
161         GLuint dsyms_tex;
162         glGenTextures(1, &dsyms_tex);
163         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
164         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
165         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
166         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
167         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
168         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
169         check_error();
170
171         GLuint coeff_tex;
172         glGenTextures(1, &coeff_tex);
173         glBindTexture(GL_TEXTURE_2D, coeff_tex);
174         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
175         check_error();
176         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
177         check_error();
178         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
179         check_error();
180         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
181         check_error();
182         glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
183         check_error();
184
185         GLuint out_tex;
186         glGenTextures(1, &out_tex);
187         glBindTexture(GL_TEXTURE_2D, out_tex);
188         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
189         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
190         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
191         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
192         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
193         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
194         check_error();
195
196         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
197         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
198         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
199         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
200         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
201         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
202
203         // Bind the textures.
204         glUniform1i(cum2sym_tex_pos, 0);
205         glUniform1i(dsyms_tex_pos, 1);
206         glUniform1i(out_tex_pos, 2);
207         glUniform1i(coeff_tex_pos, 3);
208         glUniform1uiv(sign_bias_pos, 16, sign_bias);
209         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
210         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
211         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
212         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
213         printf("%d err=0x%x\n", __LINE__, glGetError());
214
215         // Decode all luma blocks.
216         unsigned num_blocks = (HEIGHT / 16);
217         for (unsigned y = 0; y < 8; ++y) {
218                 for (unsigned x = 0; x < 8; ++x) {
219                         unsigned coeff_num = y * 8 + x;
220
221                         for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
222                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
223                                 if (!num_rans_bytes) {
224                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
225                                         exit(1);
226                                 }
227
228                                 CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
229                                 stream->src_offset = ptr - coded.data();
230                                 stream->src_len = *num_rans_bytes;
231
232                                 // TODO: check len
233                                 ptr += *num_rans_bytes;
234
235                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
236                         }
237                 }
238         }
239
240         // put the coded data (as a whole) into an SSBO
241         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
242
243         GLuint ssbo_stream, ssbo, ssbo_out;
244
245         glGenBuffers(1, &ssbo_stream);
246         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
247         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
248         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
249         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
250
251         glGenBuffers(1, &ssbo);
252         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
253         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
254         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
255         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
256
257         glGenBuffers(1, &ssbo_out);
258         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
259         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
260         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
261         check_error();
262
263 #define PARALLEL_SLICES 1
264         steady_clock::time_point start = steady_clock::now();
265         for (int i = 0; i < 1000; ++i) {
266                 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/320;
267                 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
268         }
269         check_error();
270         glFinish();
271         steady_clock::time_point now = steady_clock::now();
272
273         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
274         //setlocale(LC_ALL, "nb_NO.UTF-8");
275
276         string phases[] = {
277                 "init",
278                 "loop overhead",
279                 "rANS decode",
280                 "barrier after rANS decode",
281                 "horizontal IDCT",
282                 "barrier after horizontal IDCT",
283                 "vertical IDCT",
284                 "store to texture",
285                 "barrier after store to texture",
286                 "dummy timer for overhead measurement",
287         };
288         printf("\n");
289         for (int i = 0; i < 10; ++i) {
290                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
291                 printf("%d,%s", i, phases[i].c_str());
292                 for (int j = 0; j < 512; ++j) {
293                         int idx = (j * 10 + i) * 2;
294                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
295                 //      printf(" %'18.0f", double(val));
296                 //      printf(" %'6.0f", double(val) * 1e-6);
297                         printf(",%.0f", double(val) * 1e-6);
298                 }
299                 printf("\n");
300                 //printf("  [%s]\n", phases[i].c_str());
301         }
302         printf("\n");
303
304         unsigned char *data = new unsigned char[WIDTH * HEIGHT];
305         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
306         check_error();
307         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
308
309 #if 0
310         for (int k = 0; k < 4; ++k) {
311                 for (int y = 0; y < 8; ++y) {
312                         for (int x = 0; x < 8; ++x) {
313                                 printf("%3d ", data[y * WIDTH + x + k*8]);
314                         }
315                         printf("\n");
316                 }
317                 printf("\n");
318         }
319         printf("\n");
320 #else
321         for (int k = 0; k < 4; ++k) {
322                 for (int y = 0; y < 8; ++y) {
323                         for (int x = 0; x < 8; ++x) {
324                                 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
325                                 printf("%3d ", data[y * WIDTH + x + k*8]);
326                         }
327                         printf("\n");
328                 }
329                 printf("\n");
330         }
331         printf("\n");
332 #endif
333
334         FILE *fp = fopen("narabu.pgm", "wb");
335         fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
336         for (int y = 0; y < HEIGHT; ++y) {
337                 for (int x = 0; x < WIDTH; ++x) {
338                         int k = lrintf(data[y * WIDTH + x]);
339                         if (k < 0) k = 0;
340                         if (k > 255) k = 255;
341                         putc(k, fp);
342                 }
343         }
344         fclose(fp);
345
346         int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
347         glBindTexture(GL_TEXTURE_2D, coeff_tex);
348         check_error();
349         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
350         check_error();
351         for (int k = 0; k < 4; ++k) {
352                 for (int y = 0; y < 8; ++y) {
353                         for (int x = 0; x < 8; ++x) {
354                                 printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
355                         }
356                         printf("\n");
357                 }
358                 printf("\n");
359         }
360         printf("\n");
361         
362         
363         check_error();
364         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
365         
366         printf("foo = 0x%x\n", glGetError());
367         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / 1000);
368 }