]> git.sesse.net Git - narabu/blob - narabu.cpp
Make blocks per stream a named constant.
[narabu] / narabu.cpp
1 #include <stdio.h>
2 #include <assert.h>
3 #include <SDL2/SDL.h>
4 #include <SDL2/SDL_error.h>
5 #include <SDL2/SDL_video.h>
6 #include <epoxy/gl.h>
7 #include <movit/util.h>
8 #include <string>
9 #include <optional>
10 #include <algorithm>
11 #include <vector>
12 #include <memory>
13 #include <chrono>
14
15 #include "util.h"
16
17 using namespace std;
18 using namespace std::chrono;
19
// Frame dimensions in pixels. main() allocates the coefficient and output
// textures at WIDTH x HEIGHT and derives the block/slice counts from them.
// Kept as macros so existing uses elsewhere keep working unchanged.
#define WIDTH 1280
#define HEIGHT 720

// Precision, in bits, of the rANS frequency tables.
constexpr unsigned prob_bits = 12;

// Total of the frequency scale (1 << prob_bits); also the number of entries
// in each table's cum2sym lookup array.
constexpr unsigned prob_scale = 1u << prob_bits;

// Number of distinct symbols per rANS table.
constexpr unsigned NUM_SYMS = 256;

// Number of independent rANS tables stored at the start of the coded file.
constexpr unsigned NUM_TABLES = 8;

// Number of 8x8 blocks handled per stream; main() divides the total block
// count by this to get the number of slices to dispatch.
constexpr unsigned BLOCKS_PER_STREAM = 320;
28
// Decoding information for a single symbol of one rANS table.
struct RansDecSymbol {
        // Cumulative frequency of all symbols read before this one,
        // i.e. the start of this symbol's slot range.
        unsigned sym_start;

        // Frequency of this symbol, as read from the coded file.
        unsigned sym_freq;
};
33 struct RansDecodeTable {
34         int cum2sym[prob_scale];
35         RansDecSymbol dsyms[NUM_SYMS];
36 };
37 RansDecodeTable decode_tables[NUM_TABLES];
38
// Reads one variable-length unsigned integer from the byte range [*ptr, end).
//
// Encoding: little-endian base 128. Each byte contributes its low seven bits,
// least significant group first; a set high bit (0x80) means another byte
// follows. At most five bytes are consumed.
//
// Parameters:
//   ptr  In/out cursor. Advanced past every byte that was consumed, also on
//        failure.
//   end  One past the last readable byte.
//
// Returns the decoded value, or nullopt if
//   - the input ends before a terminating byte is seen,
//   - the fifth byte still has its continuation bit set (overlong), or
//   - the encoded value does not fit in 32 bits.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
        uint32_t x = 0;
        for (unsigned shift = 0; shift < 32; shift += 7) {
                if (*ptr >= end) {
                        return std::nullopt;  // Error: EOF.
                }

                // Go through unsigned char so the byte is never sign-extended,
                // and do all shifting in uint32_t so that a shift by 28 cannot
                // overflow a signed int.
                const uint32_t ch = static_cast<unsigned char>(**ptr);
                ++(*ptr);

                const uint32_t payload = ch & 0x7f;
                if (((payload << shift) >> shift) != payload) {
                        return std::nullopt;  // Error: Value does not fit in 32 bits.
                }
                x |= payload << shift;

                if ((ch & 0x80) == 0) {
                        return x;
                }
        }
        return std::nullopt;  // Error: Overlong int.
}
56
// Location of one coded rANS stream inside the coded file.
//
// The whole `streams` array is uploaded byte-for-byte as a shader storage
// buffer by main(), so the fields use an explicit 32-bit type instead of the
// non-standard `uint` typedef.
// NOTE(review): the shader-side declaration is not visible here; presumably
// it is a pair of 32-bit uints per entry -- confirm against decoder.shader.
struct CoeffStream {
        uint32_t src_offset;  // Byte offset of the stream's data from the start of the coded file.
        uint32_t src_len;     // Length of the stream's data in bytes.
};
static_assert(sizeof(CoeffStream) == 2 * sizeof(uint32_t),
              "CoeffStream is uploaded raw to the GPU and must be two packed 32-bit words");

// One entry per (coefficient, block row) pair as indexed by main():
// 45 block rows times 64 coefficients.
CoeffStream streams[45 * 64];  // HACK
61
62 int main(int argc, char **argv)
63 {
64         // Set up an OpenGL context using SDL.
65         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
66                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
67                 exit(1);
68         }
69         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
70         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
71         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
72         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
73         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
74         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
75
76         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
77                 SDL_WINDOWPOS_UNDEFINED,
78                 SDL_WINDOWPOS_UNDEFINED,
79                 32, 32,
80                 SDL_WINDOW_OPENGL);
81         SDL_GLContext context = SDL_GL_CreateContext(window);
82         assert(context != nullptr);
83
84         //char buf[16] = { 0 };
85
86         GLint size;
87         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
88         printf("shared_memory_size=%u\n", size);
89
90         string shader_src = ::read_file("decoder.shader");
91         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
92         GLuint glsl_program_num = glCreateProgram();
93         glAttachShader(glsl_program_num, shader_num);
94         glLinkProgram(glsl_program_num);
95
96         GLint success;
97         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
98         if (success == GL_FALSE) {
99                 GLchar error_log[1024] = {0};
100                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
101                 fprintf(stderr, "Error linking program: %s\n", error_log);
102                 exit(1);
103         }
104
105         glUseProgram(glsl_program_num);
106
107         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
108         const char *ptr = &coded[0];
109         const char *end = ptr + coded.size();
110         GLuint sign_bias[NUM_TABLES];
111
112 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
113 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
114 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
115
116         // read the rANS tables
117         for (unsigned table = 0; table < NUM_TABLES; ++table) {
118                 uint32_t cum_freq = 0;
119                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
120                         optional<uint32_t> freq = read_varint(&ptr, end);
121                         if (!freq) {
122                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
123                                 exit(1);
124                         }
125
126                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
127                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
128                         for (uint32_t i = 0; i < freq; ++i) {
129                                 if (cum_freq < prob_scale)
130                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
131                                 ++cum_freq;
132                         }
133                 }
134                 sign_bias[table] = cum_freq;
135         }
136
137         // Make cum2sym texture.
138         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
139         for (unsigned table = 0; table < NUM_TABLES; ++table) {
140                 for (unsigned i = 0; i < prob_scale; ++i) {
141                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
142                 }
143         }
144         GLuint cum2sym_tex;
145         glGenTextures(1, &cum2sym_tex);
146         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
147         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
148         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
149         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
150         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
151         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
152         check_error();
153
154         // Make dsyms texture.
155         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
156         for (unsigned table = 0; table < NUM_TABLES; ++table) {
157                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
158                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
159                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
160                 }
161         }
162         GLuint dsyms_tex;
163         glGenTextures(1, &dsyms_tex);
164         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
165         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
166         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
167         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
168         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
169         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
170         check_error();
171
172         GLuint coeff_tex;
173         glGenTextures(1, &coeff_tex);
174         glBindTexture(GL_TEXTURE_2D, coeff_tex);
175         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
176         check_error();
177         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
178         check_error();
179         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
180         check_error();
181         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
182         check_error();
183         glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
184         check_error();
185
186         GLuint out_tex;
187         glGenTextures(1, &out_tex);
188         glBindTexture(GL_TEXTURE_2D, out_tex);
189         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
190         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
191         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
192         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
193         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
194         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
195         check_error();
196
197         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
198         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
199         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
200         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
201         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
202         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
203         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
204
205         unsigned num_blocks = (HEIGHT / 16);
206
207         // Bind the textures.
208         glUniform1i(cum2sym_tex_pos, 0);
209         glUniform1i(dsyms_tex_pos, 1);
210         glUniform1i(out_tex_pos, 2);
211         glUniform1i(coeff_tex_pos, 3);
212         glUniform1uiv(sign_bias_pos, 16, sign_bias);
213         glUniform1i(num_blocks_pos, num_blocks);
214         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
215         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
216         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
217         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
218         printf("%d err=0x%x\n", __LINE__, glGetError());
219
220         // Decode all luma blocks.
221         for (unsigned y = 0; y < 8; ++y) {
222                 for (unsigned x = 0; x < 8; ++x) {
223                         unsigned coeff_num = y * 8 + x;
224
225                         for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
226                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
227                                 if (!num_rans_bytes) {
228                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
229                                         exit(1);
230                                 }
231
232                                 CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
233                                 stream->src_offset = ptr - coded.data();
234                                 stream->src_len = *num_rans_bytes;
235
236                                 // TODO: check len
237                                 ptr += *num_rans_bytes;
238
239                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
240                         }
241                 }
242         }
243
244         // put the coded data (as a whole) into an SSBO
245         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
246
247         GLuint ssbo_stream, ssbo, ssbo_out;
248
249         glGenBuffers(1, &ssbo_stream);
250         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
251         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
252         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
253         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
254
255         glGenBuffers(1, &ssbo);
256         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
257         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
258         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
259         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
260
261         glGenBuffers(1, &ssbo_out);
262         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
263         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
264         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
265         check_error();
266
267 #define PARALLEL_SLICES 1
268         steady_clock::time_point start = steady_clock::now();
269         for (int i = 0; i < 1000; ++i) {
270                 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;
271                 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
272         }
273         check_error();
274         glFinish();
275         steady_clock::time_point now = steady_clock::now();
276
277         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
278         //setlocale(LC_ALL, "nb_NO.UTF-8");
279
280         string phases[] = {
281                 "init",
282                 "loop overhead",
283                 "rANS decode",
284                 "barrier after rANS decode",
285                 "horizontal IDCT",
286                 "barrier after horizontal IDCT",
287                 "vertical IDCT",
288                 "store to texture",
289                 "barrier after store to texture",
290                 "dummy timer for overhead measurement",
291         };
292         printf("\n");
293         for (int i = 0; i < 10; ++i) {
294                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
295                 printf("%d,%s", i, phases[i].c_str());
296                 for (int j = 0; j < 512; ++j) {
297                         int idx = (j * 10 + i) * 2;
298                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
299                 //      printf(" %'18.0f", double(val));
300                 //      printf(" %'6.0f", double(val) * 1e-6);
301                         printf(",%.0f", double(val) * 1e-6);
302                 }
303                 printf("\n");
304                 //printf("  [%s]\n", phases[i].c_str());
305         }
306         printf("\n");
307
308         unsigned char *data = new unsigned char[WIDTH * HEIGHT];
309         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
310         check_error();
311         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
312
313 #if 0
314         for (int k = 0; k < 4; ++k) {
315                 for (int y = 0; y < 8; ++y) {
316                         for (int x = 0; x < 8; ++x) {
317                                 printf("%3d ", data[y * WIDTH + x + k*8]);
318                         }
319                         printf("\n");
320                 }
321                 printf("\n");
322         }
323         printf("\n");
324 #else
325         for (int k = 0; k < 4; ++k) {
326                 for (int y = 0; y < 8; ++y) {
327                         for (int x = 0; x < 8; ++x) {
328                                 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
329                                 printf("%3d ", data[y * WIDTH + x + k*8]);
330                         }
331                         printf("\n");
332                 }
333                 printf("\n");
334         }
335         printf("\n");
336 #endif
337
338         FILE *fp = fopen("narabu.pgm", "wb");
339         fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
340         for (int y = 0; y < HEIGHT; ++y) {
341                 for (int x = 0; x < WIDTH; ++x) {
342                         int k = lrintf(data[y * WIDTH + x]);
343                         if (k < 0) k = 0;
344                         if (k > 255) k = 255;
345                         putc(k, fp);
346                 }
347         }
348         fclose(fp);
349
350         int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
351         glBindTexture(GL_TEXTURE_2D, coeff_tex);
352         check_error();
353         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
354         check_error();
355         for (int k = 0; k < 4; ++k) {
356                 for (int y = 0; y < 8; ++y) {
357                         for (int x = 0; x < 8; ++x) {
358                                 printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
359                         }
360                         printf("\n");
361                 }
362                 printf("\n");
363         }
364         printf("\n");
365         
366         
367         check_error();
368         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
369         
370         printf("foo = 0x%x\n", glGetError());
371         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / 1000);
372 }