]> git.sesse.net Git - narabu/blob - narabu.cpp
Make num_blocks a uniform.
[narabu] / narabu.cpp
1 #include <stdio.h>
2 #include <assert.h>
3 #include <SDL2/SDL.h>
4 #include <SDL2/SDL_error.h>
5 #include <SDL2/SDL_video.h>
6 #include <epoxy/gl.h>
7 #include <movit/util.h>
8 #include <string>
9 #include <optional>
10 #include <algorithm>
11 #include <vector>
12 #include <memory>
13 #include <chrono>
14
15 #include "util.h"
16
17 using namespace std;
18 using namespace std::chrono;
19
// Dimensions, in pixels, of the coefficient and output textures.
#define WIDTH 1280
#define HEIGHT 720

// Symbol frequencies are expressed on a scale of 2^prob_bits; prob_scale is
// also the number of entries in each table's cumulative-frequency lookup.
constexpr unsigned prob_bits = 12;
constexpr unsigned prob_scale = 1u << prob_bits;

// Number of symbols in each frequency table.
constexpr unsigned NUM_SYMS = 256;

// Number of frequency tables stored at the start of the coded file.
constexpr unsigned NUM_TABLES = 8;
27
28 struct RansDecSymbol {
29         unsigned sym_start;
30         unsigned sym_freq;
31 };
32 struct RansDecodeTable {
33         int cum2sym[prob_scale];
34         RansDecSymbol dsyms[NUM_SYMS];
35 };
36 RansDecodeTable decode_tables[NUM_TABLES];
37
// Reads one variable-length unsigned integer from [*ptr, end).
//
// The encoding is little-endian base 128: each byte contributes its low seven
// bits, least significant group first, and a set high bit means that another
// byte follows. At most five bytes are consumed; payload bits of the fifth
// byte that do not fit in 32 bits are discarded.
//
// ptr: in/out cursor; advanced past every byte that was consumed, also on
//      failure.
// end: one past the last readable byte.
//
// Returns the decoded value, or nullopt if the input ends before the final
// byte of the integer, or if the fifth byte still has its continuation bit set.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
        uint32_t value = 0;
        for (unsigned shift = 0; shift < 32; shift += 7) {
                if (*ptr >= end) {
                        return std::nullopt;  // Error: EOF.
                }

                // Go through unsigned char and do all arithmetic in uint32_t,
                // so that shifting payload bits into (or past) bit 31 is
                // well-defined instead of overflowing a signed int.
                const uint32_t byte = static_cast<unsigned char>(**ptr);
                ++(*ptr);

                value |= (byte & 0x7f) << shift;
                if ((byte & 0x80) == 0) {
                        return value;
                }
        }
        return std::nullopt;  // Error: Overlong int.
}
55
// Location of one coded rANS stream inside the coded file. The whole array
// below is uploaded verbatim into a shader storage buffer, so the fields are
// fixed-width 32-bit integers (the non-standard `uint` typedef is avoided).
struct CoeffStream {
        uint32_t src_offset;  // Byte offset of the stream from the start of the coded data.
        uint32_t src_len;     // Length of the stream in bytes.
};

// HACK: fixed size. One entry per (coefficient position, 16-pixel-high stripe)
// pair: 64 coefficient positions times 45 stripes (720 / 16), indexed as
// coeff_num * 45 + stripe.
CoeffStream streams[45 * 64];
60
61 int main(int argc, char **argv)
62 {
63         // Set up an OpenGL context using SDL.
64         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
65                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
66                 exit(1);
67         }
68         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
69         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
70         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
71         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
72         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
73         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
74
75         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
76                 SDL_WINDOWPOS_UNDEFINED,
77                 SDL_WINDOWPOS_UNDEFINED,
78                 32, 32,
79                 SDL_WINDOW_OPENGL);
80         SDL_GLContext context = SDL_GL_CreateContext(window);
81         assert(context != nullptr);
82
83         //char buf[16] = { 0 };
84
85         GLint size;
86         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
87         printf("shared_memory_size=%u\n", size);
88
89         string shader_src = ::read_file("decoder.shader");
90         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
91         GLuint glsl_program_num = glCreateProgram();
92         glAttachShader(glsl_program_num, shader_num);
93         glLinkProgram(glsl_program_num);
94
95         GLint success;
96         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
97         if (success == GL_FALSE) {
98                 GLchar error_log[1024] = {0};
99                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
100                 fprintf(stderr, "Error linking program: %s\n", error_log);
101                 exit(1);
102         }
103
104         glUseProgram(glsl_program_num);
105
106         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
107         const char *ptr = &coded[0];
108         const char *end = ptr + coded.size();
109         GLuint sign_bias[NUM_TABLES];
110
111 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
112 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
113 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
114
115         // read the rANS tables
116         for (unsigned table = 0; table < NUM_TABLES; ++table) {
117                 uint32_t cum_freq = 0;
118                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
119                         optional<uint32_t> freq = read_varint(&ptr, end);
120                         if (!freq) {
121                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
122                                 exit(1);
123                         }
124
125                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
126                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
127                         for (uint32_t i = 0; i < freq; ++i) {
128                                 if (cum_freq < prob_scale)
129                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
130                                 ++cum_freq;
131                         }
132                 }
133                 sign_bias[table] = cum_freq;
134         }
135
136         // Make cum2sym texture.
137         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
138         for (unsigned table = 0; table < NUM_TABLES; ++table) {
139                 for (unsigned i = 0; i < prob_scale; ++i) {
140                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
141                 }
142         }
143         GLuint cum2sym_tex;
144         glGenTextures(1, &cum2sym_tex);
145         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
146         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
147         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
148         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
149         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
150         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
151         check_error();
152
153         // Make dsyms texture.
154         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
155         for (unsigned table = 0; table < NUM_TABLES; ++table) {
156                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
157                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
158                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
159                 }
160         }
161         GLuint dsyms_tex;
162         glGenTextures(1, &dsyms_tex);
163         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
164         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
165         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
166         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
167         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
168         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
169         check_error();
170
171         GLuint coeff_tex;
172         glGenTextures(1, &coeff_tex);
173         glBindTexture(GL_TEXTURE_2D, coeff_tex);
174         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
175         check_error();
176         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
177         check_error();
178         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
179         check_error();
180         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
181         check_error();
182         glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
183         check_error();
184
185         GLuint out_tex;
186         glGenTextures(1, &out_tex);
187         glBindTexture(GL_TEXTURE_2D, out_tex);
188         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
189         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
190         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
191         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
192         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
193         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
194         check_error();
195
196         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
197         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
198         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
199         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
200         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
201         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
202         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
203
204         unsigned num_blocks = (HEIGHT / 16);
205
206         // Bind the textures.
207         glUniform1i(cum2sym_tex_pos, 0);
208         glUniform1i(dsyms_tex_pos, 1);
209         glUniform1i(out_tex_pos, 2);
210         glUniform1i(coeff_tex_pos, 3);
211         glUniform1uiv(sign_bias_pos, 16, sign_bias);
212         glUniform1i(num_blocks_pos, num_blocks);
213         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
214         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
215         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
216         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
217         printf("%d err=0x%x\n", __LINE__, glGetError());
218
219         // Decode all luma blocks.
220         for (unsigned y = 0; y < 8; ++y) {
221                 for (unsigned x = 0; x < 8; ++x) {
222                         unsigned coeff_num = y * 8 + x;
223
224                         for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
225                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
226                                 if (!num_rans_bytes) {
227                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
228                                         exit(1);
229                                 }
230
231                                 CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
232                                 stream->src_offset = ptr - coded.data();
233                                 stream->src_len = *num_rans_bytes;
234
235                                 // TODO: check len
236                                 ptr += *num_rans_bytes;
237
238                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
239                         }
240                 }
241         }
242
243         // put the coded data (as a whole) into an SSBO
244         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
245
246         GLuint ssbo_stream, ssbo, ssbo_out;
247
248         glGenBuffers(1, &ssbo_stream);
249         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
250         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
251         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
252         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
253
254         glGenBuffers(1, &ssbo);
255         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
256         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
257         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
258         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
259
260         glGenBuffers(1, &ssbo_out);
261         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
262         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
263         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
264         check_error();
265
266 #define PARALLEL_SLICES 1
267         steady_clock::time_point start = steady_clock::now();
268         for (int i = 0; i < 1000; ++i) {
269                 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/320;
270                 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
271         }
272         check_error();
273         glFinish();
274         steady_clock::time_point now = steady_clock::now();
275
276         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
277         //setlocale(LC_ALL, "nb_NO.UTF-8");
278
279         string phases[] = {
280                 "init",
281                 "loop overhead",
282                 "rANS decode",
283                 "barrier after rANS decode",
284                 "horizontal IDCT",
285                 "barrier after horizontal IDCT",
286                 "vertical IDCT",
287                 "store to texture",
288                 "barrier after store to texture",
289                 "dummy timer for overhead measurement",
290         };
291         printf("\n");
292         for (int i = 0; i < 10; ++i) {
293                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
294                 printf("%d,%s", i, phases[i].c_str());
295                 for (int j = 0; j < 512; ++j) {
296                         int idx = (j * 10 + i) * 2;
297                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
298                 //      printf(" %'18.0f", double(val));
299                 //      printf(" %'6.0f", double(val) * 1e-6);
300                         printf(",%.0f", double(val) * 1e-6);
301                 }
302                 printf("\n");
303                 //printf("  [%s]\n", phases[i].c_str());
304         }
305         printf("\n");
306
307         unsigned char *data = new unsigned char[WIDTH * HEIGHT];
308         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
309         check_error();
310         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
311
312 #if 0
313         for (int k = 0; k < 4; ++k) {
314                 for (int y = 0; y < 8; ++y) {
315                         for (int x = 0; x < 8; ++x) {
316                                 printf("%3d ", data[y * WIDTH + x + k*8]);
317                         }
318                         printf("\n");
319                 }
320                 printf("\n");
321         }
322         printf("\n");
323 #else
324         for (int k = 0; k < 4; ++k) {
325                 for (int y = 0; y < 8; ++y) {
326                         for (int x = 0; x < 8; ++x) {
327                                 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
328                                 printf("%3d ", data[y * WIDTH + x + k*8]);
329                         }
330                         printf("\n");
331                 }
332                 printf("\n");
333         }
334         printf("\n");
335 #endif
336
337         FILE *fp = fopen("narabu.pgm", "wb");
338         fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
339         for (int y = 0; y < HEIGHT; ++y) {
340                 for (int x = 0; x < WIDTH; ++x) {
341                         int k = lrintf(data[y * WIDTH + x]);
342                         if (k < 0) k = 0;
343                         if (k > 255) k = 255;
344                         putc(k, fp);
345                 }
346         }
347         fclose(fp);
348
349         int16_t *coeff_data = new int16_t[WIDTH * HEIGHT];
350         glBindTexture(GL_TEXTURE_2D, coeff_tex);
351         check_error();
352         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
353         check_error();
354         for (int k = 0; k < 4; ++k) {
355                 for (int y = 0; y < 8; ++y) {
356                         for (int x = 0; x < 8; ++x) {
357                                 printf("%3d ", coeff_data[y * WIDTH + x + k*8]);
358                         }
359                         printf("\n");
360                 }
361                 printf("\n");
362         }
363         printf("\n");
364         
365         
366         check_error();
367         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
368         
369         printf("foo = 0x%x\n", glGetError());
370         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / 1000);
371 }