]> git.sesse.net Git - narabu/blob - narabu.cpp
Silence some Mesa warnings.
[narabu] / narabu.cpp
1 #include <stdio.h>
2 #include <assert.h>
3 #include <SDL2/SDL.h>
4 #include <SDL2/SDL_error.h>
5 #include <SDL2/SDL_video.h>
6 #include <epoxy/gl.h>
7 #include <movit/util.h>
8 #include <string>
9 #include <optional>
10 #include <algorithm>
11 #include <vector>
12 #include <memory>
13 #include <chrono>
14
15 #include "util.h"
16
17 using namespace std;
18 using namespace std::chrono;
19
// Frame geometry in pixels, and the derived block counts.
// Blocks are 8 pixels on a side; the *_CHROMA counts use 16-pixel-wide columns.
#define WIDTH 1280
#define HEIGHT 720
#define WIDTH_BLOCKS ((WIDTH) / 8)
#define WIDTH_BLOCKS_CHROMA ((WIDTH) / 16)
#define HEIGHT_BLOCKS ((HEIGHT) / 8)
#define NUM_BLOCKS ((WIDTH_BLOCKS) * (HEIGHT_BLOCKS))
#define NUM_BLOCKS_CHROMA ((WIDTH_BLOCKS_CHROMA) * (HEIGHT_BLOCKS))
27
// Parameters shared by the table parser and the GPU upload code below.
constexpr unsigned prob_bits = 12;               // log2 of prob_scale
constexpr unsigned prob_scale = 1u << prob_bits; // entries in each cum2sym lookup (4096)
constexpr unsigned NUM_SYMS = 256;               // symbols per decode table
constexpr unsigned NUM_TABLES = 8;               // decode tables read from the input
constexpr unsigned BLOCKS_PER_STREAM = 320;      // blocks covered by one coefficient stream
33
34 struct RansDecSymbol {
35         unsigned sym_start;
36         unsigned sym_freq;
37 };
38 struct RansDecodeTable {
39         int cum2sym[prob_scale];
40         RansDecSymbol dsyms[NUM_SYMS];
41 };
42 RansDecodeTable decode_tables[NUM_TABLES];
43
// Reads one base-128 varint (7 payload bits per byte, least significant group
// first, high bit set on every byte except the last) starting at *ptr.
//
// ptr: in/out cursor; advanced past every byte consumed, also on failure.
// end: one past the last readable byte.
//
// Returns the decoded value, or nullopt if the input ends before a terminating
// byte is seen or if five bytes are read without finding one. Payload bits
// that do not fit in 32 bits (upper bits of the fifth byte) are discarded.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
	uint32_t value = 0;
	for (unsigned shift = 0; shift < 32; shift += 7) {
		if (*ptr >= end) {
			return std::nullopt;  // Error: EOF.
		}

		// Go through unsigned char so the shift below is done on an unsigned
		// value; shifting a (possibly sign-extended) int by 28 would overflow.
		const uint32_t byte = static_cast<unsigned char>(**ptr);
		++(*ptr);

		value |= (byte & 0x7f) << shift;
		if ((byte & 0x80) == 0) {
			return value;
		}
	}
	return std::nullopt;  // Error: Overlong int.
}
61
62 const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
63
64 struct CoeffStream {
65         uint src_offset, src_len;
66 };
67 CoeffStream streams[num_blocks * 64];
68
69 int main(int argc, char **argv)
70 {
71         // Set up an OpenGL context using SDL.
72         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
73                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
74                 exit(1);
75         }
76         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
77         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
78         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
79         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
80         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
81         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
82
83         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
84                 SDL_WINDOWPOS_UNDEFINED,
85                 SDL_WINDOWPOS_UNDEFINED,
86                 32, 32,
87                 SDL_WINDOW_OPENGL);
88         SDL_GLContext context = SDL_GL_CreateContext(window);
89         assert(context != nullptr);
90
91         //char buf[16] = { 0 };
92
93         GLint size;
94         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
95         printf("shared_memory_size=%u\n", size);
96
97         string shader_src = ::read_file("decoder.shader");
98         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
99         GLuint glsl_program_num = glCreateProgram();
100         glAttachShader(glsl_program_num, shader_num);
101         glLinkProgram(glsl_program_num);
102
103         GLint success;
104         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
105         if (success == GL_FALSE) {
106                 GLchar error_log[1024] = {0};
107                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
108                 fprintf(stderr, "Error linking program: %s\n", error_log);
109                 exit(1);
110         }
111
112         glUseProgram(glsl_program_num);
113
114         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
115         const char *ptr = &coded[0];
116         const char *end = ptr + coded.size();
117         GLuint sign_bias[NUM_TABLES];
118
119 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
120 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
121 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
122
123         // read the rANS tables
124         for (unsigned table = 0; table < NUM_TABLES; ++table) {
125                 uint32_t cum_freq = 0;
126                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
127                         optional<uint32_t> freq = read_varint(&ptr, end);
128                         if (!freq) {
129                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
130                                 exit(1);
131                         }
132
133                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
134                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
135                         for (uint32_t i = 0; i < freq; ++i) {
136                                 if (cum_freq < prob_scale)
137                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
138                                 ++cum_freq;
139                         }
140                 }
141                 sign_bias[table] = cum_freq;
142         }
143
144         // Make cum2sym texture.
145         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
146         for (unsigned table = 0; table < NUM_TABLES; ++table) {
147                 for (unsigned i = 0; i < prob_scale; ++i) {
148                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
149                 }
150         }
151         GLuint cum2sym_tex;
152         glGenTextures(1, &cum2sym_tex);
153         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
154         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
155         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
156         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
157         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
158         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
159         check_error();
160
161         // Make dsyms texture.
162         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
163         for (unsigned table = 0; table < NUM_TABLES; ++table) {
164                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
165                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
166                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
167                 }
168         }
169         GLuint dsyms_tex;
170         glGenTextures(1, &dsyms_tex);
171         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
172         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
173         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
174         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
175         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
176         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
177         check_error();
178
179         GLuint coeff_tex;
180         glGenTextures(1, &coeff_tex);
181         glBindTexture(GL_TEXTURE_2D, coeff_tex);
182         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
183         check_error();
184         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
185         check_error();
186         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
187         check_error();
188         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
189         check_error();
190         glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
191         check_error();
192
193         GLuint out_tex;
194         glGenTextures(1, &out_tex);
195         glBindTexture(GL_TEXTURE_2D, out_tex);
196         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
197         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
198         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
199         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
200         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
201         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
202         check_error();
203
204         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
205         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
206         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
207         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
208         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
209         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
210         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
211
212         // Bind the textures.
213         glUniform1i(cum2sym_tex_pos, 0);
214         glUniform1i(dsyms_tex_pos, 1);
215         glUniform1i(out_tex_pos, 2);
216         glUniform1i(coeff_tex_pos, 3);
217         glUniform1uiv(sign_bias_pos, 16, sign_bias);
218         glUniform1i(num_blocks_pos, num_blocks);
219         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
220         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
221         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
222         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
223         printf("%d err=0x%x\n", __LINE__, glGetError());
224
225         // Decode all luma blocks.
226         size_t last_src_offset = 0, last_src_len = 0;
227         for (unsigned y = 0; y < 8; ++y) {
228                 for (unsigned x = 0; x < 8; ++x) {
229                         unsigned coeff_num = y * 8 + x;
230
231                         for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
232                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
233                                 if (!num_rans_bytes) {
234                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", block_idx);
235                                         exit(1);
236                                 }
237
238                                 CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
239                                 if (*num_rans_bytes == 0) {
240                                         // Repeat last stream.
241                                         stream->src_offset = last_src_offset;
242                                         stream->src_len = last_src_len;
243                                 } else {
244                                         stream->src_offset = ptr - coded.data();
245                                         stream->src_len = *num_rans_bytes;
246                                         last_src_offset = stream->src_offset;
247                                         last_src_len = last_src_len;
248                                 }
249
250                                 // TODO: check len
251                                 ptr += *num_rans_bytes;
252
253                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
254                         }
255                 }
256         }
257
258         // put the coded data (as a whole) into an SSBO
259         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
260
261         GLuint ssbo_stream, ssbo, ssbo_out;
262
263         glGenBuffers(1, &ssbo_stream);
264         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
265         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
266         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
267         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
268
269         glGenBuffers(1, &ssbo);
270         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
271         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
272         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
273         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
274
275         glGenBuffers(1, &ssbo_out);
276         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
277         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
278         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
279         check_error();
280
281 #define PARALLEL_SLICES 1
282         steady_clock::time_point start = steady_clock::now();
283         unsigned num_iterations = 1000;
284         for (unsigned i = 0; i < num_iterations; ++i) {
285                 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;
286                 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
287         }
288         check_error();
289         glFinish();
290         steady_clock::time_point now = steady_clock::now();
291
292         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
293         //setlocale(LC_ALL, "nb_NO.UTF-8");
294
295         string phases[] = {
296                 "init",
297                 "loop overhead",
298                 "rANS decode",
299                 "barrier after rANS decode",
300                 "horizontal IDCT",
301                 "barrier after horizontal IDCT",
302                 "vertical IDCT",
303                 "store to texture",
304                 "barrier after store to texture",
305                 "dummy timer for overhead measurement",
306         };
307         printf("\n");
308         for (int i = 0; i < 10; ++i) {
309                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
310                 printf("%d,%s", i, phases[i].c_str());
311                 for (int j = 0; j < 512; ++j) {
312                         int idx = (j * 10 + i) * 2;
313                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
314                 //      printf(" %'18.0f", double(val));
315                 //      printf(" %'6.0f", double(val) * 1e-6);
316                         printf(",%.0f", double(val) * 1e-6);
317                 }
318                 printf("\n");
319                 //printf("  [%s]\n", phases[i].c_str());
320         }
321         printf("\n");
322
323         unsigned char *data = new unsigned char[WIDTH * HEIGHT];
324         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
325         check_error();
326         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
327
328 #if 0
329         for (int k = 0; k < 4; ++k) {
330                 for (int y = 0; y < 8; ++y) {
331                         for (int x = 0; x < 8; ++x) {
332                                 printf("%3d ", data[y * WIDTH + x + k*8]);
333                         }
334                         printf("\n");
335                 }
336                 printf("\n");
337         }
338         printf("\n");
339 #else
340         for (int k = 0; k < 4; ++k) {
341                 for (int y = 0; y < 8; ++y) {
342                         for (int x = 0; x < 8; ++x) {
343                                 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
344                                 printf("%3d ", data[y * WIDTH + x + k*8]);
345                         }
346                         printf("\n");
347                 }
348                 printf("\n");
349         }
350         printf("\n");
351 #endif
352
353         FILE *fp = fopen("narabu.pgm", "wb");
354         fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
355         for (int y = 0; y < HEIGHT; ++y) {
356                 for (int x = 0; x < WIDTH; ++x) {
357                         int k = lrintf(data[y * WIDTH + x]);
358                         if (k < 0) k = 0;
359                         if (k > 255) k = 255;
360                         putc(k, fp);
361                 }
362         }
363         fclose(fp);
364
365         int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
366         glBindTexture(GL_TEXTURE_2D, coeff_tex);
367         check_error();
368         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
369         check_error();
370         for (int k = 0; k < 4; ++k) {
371                 for (int y = 0; y < 8; ++y) {
372                         for (int x = 0; x < 8; ++x) {
373                                 printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
374                         }
375                         printf("\n");
376                 }
377                 printf("\n");
378         }
379         printf("\n");
380         
381         
382         check_error();
383         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
384         
385         printf("foo = 0x%x\n", glGetError());
386         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
387 }