]> git.sesse.net Git - narabu/blob - narabu.cpp
Prepare for more flexible slices.
[narabu] / narabu.cpp
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <chrono>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include <SDL2/SDL.h>
#include <SDL2/SDL_error.h>
#include <SDL2/SDL_video.h>
#include <epoxy/gl.h>
#include <movit/util.h>

#include "util.h"
16
17 using namespace std;
18 using namespace std::chrono;
19
// Dimensions, in pixels, of the frame being decoded.
#define WIDTH 1280
#define HEIGHT 720

// rANS model parameters.
//   prob_bits   - number of bits of probability resolution.
//   prob_scale  - number of cumulative-frequency slots per table (2^prob_bits).
//   NUM_SYMS    - size of the symbol alphabet of each table.
//   NUM_TABLES  - number of independent decode tables read from the input.
constexpr unsigned prob_bits = 12;
constexpr unsigned prob_scale = 1u << prob_bits;
constexpr unsigned NUM_SYMS = 256;
constexpr unsigned NUM_TABLES = 8;
27
// Decoder-side description of a single symbol in one rANS table.
struct RansDecSymbol {
	// Sum of the frequencies of all symbols read before this one
	// (i.e. where this symbol's range begins).
	unsigned sym_start;

	// Frequency of this symbol, as read from the input.
	unsigned sym_freq;
};
32 struct RansDecodeTable {
33         int cum2sym[prob_scale];
34         RansDecSymbol dsyms[NUM_SYMS];
35 };
36 RansDecodeTable decode_tables[NUM_TABLES];
37
// Reads one unsigned base-128 varint (7 payload bits per byte, least
// significant group first, high bit set on every byte except the last).
//
// ptr: in/out cursor into the input. It is advanced past every byte that
//      was consumed, also when decoding fails.
// end: one past the last readable byte.
//
// Returns the decoded value, or nullopt if the input ends before the
// terminating byte, if the encoding is longer than five bytes, or if the
// encoded value does not fit in 32 bits.
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
	uint32_t value = 0;
	for (unsigned shift = 0; shift < 32; shift += 7) {
		if (*ptr >= end) {
			return std::nullopt;  // Error: EOF.
		}

		// Go through unsigned char so that bytes >= 0x80 are not sign-extended,
		// and do all the shifting in unsigned arithmetic.
		const uint32_t byte = static_cast<unsigned char>(**ptr);
		++(*ptr);

		const uint32_t payload = byte & 0x7f;
		if (shift == 28 && payload > 0x0f) {
			// Only four bits are left in the fifth byte; anything more
			// would be silently dropped.
			return std::nullopt;  // Error: Value does not fit in 32 bits.
		}

		value |= payload << shift;
		if ((byte & 0x80) == 0) {
			return value;
		}
	}
	return std::nullopt;  // Error: Overlong int.
}
55
// Location of one rANS-coded stream inside the coded input buffer.
// The streams array is uploaded byte-for-byte into a shader storage buffer,
// so the fields use fixed-width types instead of the non-standard `uint`.
struct CoeffStream {
	uint32_t src_offset;  // Byte offset of the stream from the start of the coded data.
	uint32_t src_len;     // Length of the stream, in bytes.
};
static_assert(sizeof(CoeffStream) == 2 * sizeof(uint32_t),
              "CoeffStream must be two tightly packed 32-bit words");

// One entry per (coefficient, slice) pair: 64 coefficients times
// 45 slices (720 rows / 16 rows per slice).
CoeffStream streams[45 * 64];  // HACK
60
61 int main(int argc, char **argv)
62 {
63         // Set up an OpenGL context using SDL.
64         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
65                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
66                 exit(1);
67         }
68         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
69         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
70         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
71         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
72         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
73         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
74
75         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
76                 SDL_WINDOWPOS_UNDEFINED,
77                 SDL_WINDOWPOS_UNDEFINED,
78                 32, 32,
79                 SDL_WINDOW_OPENGL);
80         SDL_GLContext context = SDL_GL_CreateContext(window);
81         assert(context != nullptr);
82
83         //char buf[16] = { 0 };
84
85         GLint size;
86         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
87         printf("shared_memory_size=%u\n", size);
88
89         string shader_src = ::read_file("decoder.shader");
90         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
91         GLuint glsl_program_num = glCreateProgram();
92         glAttachShader(glsl_program_num, shader_num);
93         glLinkProgram(glsl_program_num);
94
95         GLint success;
96         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
97         if (success == GL_FALSE) {
98                 GLchar error_log[1024] = {0};
99                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
100                 fprintf(stderr, "Error linking program: %s\n", error_log);
101                 exit(1);
102         }
103
104         glUseProgram(glsl_program_num);
105
106         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
107         const char *ptr = &coded[0];
108         const char *end = ptr + coded.size();
109         GLuint sign_bias[NUM_TABLES];
110
111 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
112 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
113 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
114
115         // read the rANS tables
116         for (unsigned table = 0; table < NUM_TABLES; ++table) {
117                 uint32_t cum_freq = 0;
118                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
119                         optional<uint32_t> freq = read_varint(&ptr, end);
120                         if (!freq) {
121                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
122                                 exit(1);
123                         }
124
125                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
126                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
127                         for (uint32_t i = 0; i < freq; ++i) {
128                                 if (cum_freq < prob_scale)
129                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
130                                 ++cum_freq;
131                         }
132                 }
133                 sign_bias[table] = cum_freq;
134         }
135
136         // Make cum2sym texture.
137         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
138         for (unsigned table = 0; table < NUM_TABLES; ++table) {
139                 for (unsigned i = 0; i < prob_scale; ++i) {
140                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
141                 }
142         }
143         GLuint cum2sym_tex;
144         glGenTextures(1, &cum2sym_tex);
145         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
146         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
147         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
148         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
149         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
150         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
151         check_error();
152
153         // Make dsyms texture.
154         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
155         for (unsigned table = 0; table < NUM_TABLES; ++table) {
156                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
157                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
158                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
159                 }
160         }
161         GLuint dsyms_tex;
162         glGenTextures(1, &dsyms_tex);
163         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
164         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
165         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
166         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
167         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
168         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
169         check_error();
170
171         GLuint coeff_tex;
172         glGenTextures(1, &coeff_tex);
173         glBindTexture(GL_TEXTURE_2D, coeff_tex);
174         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
175         check_error();
176         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
177         check_error();
178         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
179         check_error();
180         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
181         check_error();
182         glTexImage2D(GL_TEXTURE_2D, 0, GL_R16I, 1280, 720, 0, GL_RED_INTEGER, GL_SHORT, nullptr);
183         check_error();
184
185         GLuint out_tex;
186         glGenTextures(1, &out_tex);
187         glBindTexture(GL_TEXTURE_2D, out_tex);
188         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
189         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
190         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
191         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
192         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, 1280, 720, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
193         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, 1280, 720, 0, GL_RED, GL_FLOAT, nullptr);
194         check_error();
195
196         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
197         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
198         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
199         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
200         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
201         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
202
203         // Bind the textures.
204         glUniform1i(cum2sym_tex_pos, 0);
205         glUniform1i(dsyms_tex_pos, 1);
206         glUniform1i(out_tex_pos, 2);
207         glUniform1i(coeff_tex_pos, 3);
208         glUniform1uiv(sign_bias_pos, 16, sign_bias);
209         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
210         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
211         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
212         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R16I);
213         printf("%d err=0x%x\n", __LINE__, glGetError());
214
215         // Decode all luma blocks.
216         unsigned num_blocks = (HEIGHT / 16);
217         for (unsigned y = 0; y < 8; ++y) {
218                 for (unsigned x = 0; x < 8; ++x) {
219                         unsigned coeff_num = y * 8 + x;
220
221                         for (unsigned yb = 0; yb < HEIGHT; yb += 16) {
222                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
223                                 if (!num_rans_bytes) {
224                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", yb);
225                                         exit(1);
226                                 }
227
228                                 CoeffStream *stream = &streams[coeff_num * num_blocks + (yb/16)];
229                                 stream->src_offset = ptr - coded.data();
230                                 stream->src_len = *num_rans_bytes;
231
232                                 // TODO: check len
233                                 ptr += *num_rans_bytes;
234
235                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
236                         }
237                 }
238         }
239
240         // put the coded data (as a whole) into an SSBO
241         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
242
243         GLuint ssbo_stream, ssbo, ssbo_out;
244
245         glGenBuffers(1, &ssbo_stream);
246         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
247         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
248         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
249         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
250
251         glGenBuffers(1, &ssbo);
252         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
253         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
254         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
255         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
256
257         glGenBuffers(1, &ssbo_out);
258         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
259         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
260         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
261         check_error();
262
263 #define PARALLEL_SLICES 1
264         steady_clock::time_point start = steady_clock::now();
265         for (int i = 0; i < 1000; ++i)
266                 glDispatchCompute(1, (45+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
267         check_error();
268         glFinish();
269         steady_clock::time_point now = steady_clock::now();
270
271         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
272         //setlocale(LC_ALL, "nb_NO.UTF-8");
273
274         string phases[] = {
275                 "init",
276                 "loop overhead",
277                 "rANS decode",
278                 "barrier after rANS decode",
279                 "horizontal IDCT",
280                 "barrier after horizontal IDCT",
281                 "vertical IDCT",
282                 "store to texture",
283                 "barrier after store to texture",
284                 "dummy timer for overhead measurement",
285         };
286         printf("\n");
287         for (int i = 0; i < 10; ++i) {
288                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
289                 printf("%d,%s", i, phases[i].c_str());
290                 for (int j = 0; j < 512; ++j) {
291                         int idx = (j * 10 + i) * 2;
292                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
293                 //      printf(" %'18.0f", double(val));
294                 //      printf(" %'6.0f", double(val) * 1e-6);
295                         printf(",%.0f", double(val) * 1e-6);
296                 }
297                 printf("\n");
298                 //printf("  [%s]\n", phases[i].c_str());
299         }
300         printf("\n");
301
302         unsigned char *data = new unsigned char[1280 * 720];
303         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
304         check_error();
305         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
306
307 #if 0
308         for (int k = 0; k < 4; ++k) {
309                 for (int y = 0; y < 8; ++y) {
310                         for (int x = 0; x < 8; ++x) {
311                                 printf("%3d ", data[y * 1280 + x + k*8]);
312                         }
313                         printf("\n");
314                 }
315                 printf("\n");
316         }
317         printf("\n");
318 #else
319         for (int k = 0; k < 4; ++k) {
320                 for (int y = 0; y < 8; ++y) {
321                         for (int x = 0; x < 8; ++x) {
322                                 //printf("%5.2f ", data[(y+8) * 1280 + x + (1272-k*8)]);
323                                 printf("%3d ", data[y * 1280 + x + k*8]);
324                         }
325                         printf("\n");
326                 }
327                 printf("\n");
328         }
329         printf("\n");
330 #endif
331
332         FILE *fp = fopen("narabu.pgm", "wb");
333         fprintf(fp, "P5\n1280 720\n255\n");
334         for (int y = 0; y < 720; ++y) {
335                 for (int x = 0; x < 1280; ++x) {
336                         int k = lrintf(data[y * 1280 + x]);
337                         if (k < 0) k = 0;
338                         if (k > 255) k = 255;
339                         putc(k, fp);
340                 }
341         }
342         fclose(fp);
343
344         int16_t *coeff_data = new int16_t[1280 * 720];
345         glBindTexture(GL_TEXTURE_2D, coeff_tex);
346         check_error();
347         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_SHORT, coeff_data);
348         check_error();
349         for (int k = 0; k < 4; ++k) {
350                 for (int y = 0; y < 8; ++y) {
351                         for (int x = 0; x < 8; ++x) {
352                                 printf("%3d ", coeff_data[y * 1280 + x + k*8]);
353                         }
354                         printf("\n");
355                 }
356                 printf("\n");
357         }
358         printf("\n");
359         
360         
361         check_error();
362         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
363         
364         printf("foo = 0x%x\n", glGetError());
365         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / 1000);
366 }