// Source: git.sesse.net Git - narabu/blob - narabu.cpp
// Commit: Switch to 64-bit rANS, although probably due for immediate revert (just want to prese...
// Path: [narabu] / narabu.cpp
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <SDL2/SDL.h>
#include <SDL2/SDL_error.h>
#include <SDL2/SDL_video.h>
#include <epoxy/gl.h>
#include <movit/util.h>
#include <string>
#include <optional>
#include <algorithm>
#include <utility>
#include <vector>
#include <memory>
#include <chrono>

#include "util.h"

using namespace std;
using namespace std::chrono;
19
// Frame geometry. The frame is split into blocks that are 8 pixels wide and
// 8 pixels high; the chroma variants use 16-pixel-wide columns (presumably
// horizontally subsampled chroma -- TODO confirm against the encoder).
#define WIDTH 1280
#define HEIGHT 720
#define WIDTH_BLOCKS (WIDTH/8)
#define WIDTH_BLOCKS_CHROMA (WIDTH/16)
#define HEIGHT_BLOCKS (HEIGHT/8)
#define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
#define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)

// rANS parameters.
const unsigned prob_bits = 12;                // log2 of the cum2sym table length
const unsigned prob_scale = 1u << prob_bits;  // number of cum2sym slots per table
const unsigned NUM_SYMS = 256;                // symbols per table
const unsigned NUM_TABLES = 8;                // frequency tables stored in the file
const unsigned BLOCKS_PER_STREAM = 320;       // blocks covered by one coded stream

// Invariants the rest of the file relies on without checking.
static_assert(WIDTH % 16 == 0 && HEIGHT % 8 == 0,
              "frame dimensions must be a whole number of blocks");
static_assert((NUM_SYMS & (NUM_SYMS - 1)) == 0,
              "symbol indices are wrapped with & (NUM_SYMS - 1), which needs a power of two");
static_assert(NUM_SYMS <= 256,
              "symbol indices are uploaded as 8-bit texels");
static_assert(prob_scale <= 0xFFFF,
              "cumulative frequencies are uploaded as 16-bit texels");
33
// Per-symbol decoding information for one rANS table.
struct RansDecSymbol {
	unsigned sym_start;  // Cumulative frequency of all symbols stored before this one.
	unsigned sym_freq;   // Frequency of this symbol, as read from the coded file.
};
38 struct RansDecodeTable {
39         int cum2sym[prob_scale];
40         RansDecSymbol dsyms[NUM_SYMS];
41 };
42 RansDecodeTable decode_tables[NUM_TABLES];
43
/**
 * Reads one 32-bit value from the coded stream and advances the cursor.
 *
 * Despite the name, the value is not variable-length: it is stored as four
 * raw bytes that are copied out in host byte order. memcpy is used so the
 * cursor does not need to be aligned.
 *
 * @param ptr  In/out read cursor. Advanced by four bytes on success and left
 *             untouched on failure.
 * @param end  One past the last readable byte of the buffer.
 * @return The value read, or nullopt if fewer than four bytes remain between
 *         *ptr and end (or if any of the pointers is null).
 */
std::optional<uint32_t> read_varint(const char **ptr, const char *end)
{
	if (ptr == nullptr || *ptr == nullptr || end == nullptr || *ptr > end) {
		return std::nullopt;
	}
	if (static_cast<size_t>(end - *ptr) < sizeof(uint32_t)) {
		// Truncated input; reading would run past the end of the buffer.
		return std::nullopt;
	}

	uint32_t x = 0;
	memcpy(&x, *ptr, sizeof(x));
	*ptr += sizeof(x);
	return x;
}
51
52 const unsigned num_blocks = ((NUM_BLOCKS + BLOCKS_PER_STREAM - 1) / BLOCKS_PER_STREAM);
53
54 struct CoeffStream {
55         uint src_offset, src_len;
56 };
57 CoeffStream streams[num_blocks * 64];
58
59 int main(int argc, char **argv)
60 {
61         // Set up an OpenGL context using SDL.
62         if (SDL_Init(SDL_INIT_VIDEO) == -1) {
63                 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
64                 exit(1);
65         }
66         SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
67         SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
68         SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
69         SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
70         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
71         SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
72
73         SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
74                 SDL_WINDOWPOS_UNDEFINED,
75                 SDL_WINDOWPOS_UNDEFINED,
76                 32, 32,
77                 SDL_WINDOW_OPENGL);
78         SDL_GLContext context = SDL_GL_CreateContext(window);
79         assert(context != nullptr);
80
81         //char buf[16] = { 0 };
82
83         GLint size;
84         glGetIntegerv(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE, &size);
85         printf("shared_memory_size=%u\n", size);
86
87         string shader_src = ::read_file("decoder.shader");
88         GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
89         GLuint glsl_program_num = glCreateProgram();
90         glAttachShader(glsl_program_num, shader_num);
91         glLinkProgram(glsl_program_num);
92
93         GLint success;
94         glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
95         if (success == GL_FALSE) {
96                 GLchar error_log[1024] = {0};
97                 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
98                 fprintf(stderr, "Error linking program: %s\n", error_log);
99                 exit(1);
100         }
101
102         glUseProgram(glsl_program_num);
103
104         string coded = ::read_file(argc >= 2 ? argv[1] : "coded.dat");
105         const char *ptr = &coded[0];
106         //assert((intptr_t)ptr % 4 == 0);
107         const char *end = ptr + coded.size();
108         GLuint sign_bias[NUM_TABLES];
109
110 //      printf("first few bytes offs=%zu: %d %d %d %d %d %d %d %d\n", ptr - coded.data(),
111 //              (uint8_t)ptr[0], (uint8_t)ptr[1], (uint8_t)ptr[2], (uint8_t)ptr[3],
112 //              (uint8_t)ptr[4], (uint8_t)ptr[5], (uint8_t)ptr[6], (uint8_t)ptr[7]);
113
114         // read the rANS tables
115         for (unsigned table = 0; table < NUM_TABLES; ++table) {
116                 uint32_t cum_freq = 0;
117                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
118                         optional<uint32_t> freq = read_varint(&ptr, end);
119                         if (!freq) {
120                                 fprintf(stderr, "Error parsing varint for table %d symbol %d\n", table, sym);
121                                 exit(1);
122                         }
123
124                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_start = cum_freq;
125                         decode_tables[table].dsyms[(sym + 1) & (NUM_SYMS - 1)].sym_freq = *freq;
126                         for (uint32_t i = 0; i < freq; ++i) {
127                                 if (cum_freq < prob_scale)
128                                         decode_tables[table].cum2sym[cum_freq] = (sym + 1) & (NUM_SYMS - 1);
129                                 ++cum_freq;
130                         }
131                 }
132                 sign_bias[table] = cum_freq;
133         }
134
135         // Make cum2sym texture.
136         unique_ptr<uint8_t[]> cum2sym_data(new uint8_t[prob_scale * NUM_TABLES]);
137         for (unsigned table = 0; table < NUM_TABLES; ++table) {
138                 for (unsigned i = 0; i < prob_scale; ++i) {
139                         cum2sym_data[prob_scale * table + i] = decode_tables[table].cum2sym[i];
140                 }
141         }
142         GLuint cum2sym_tex;
143         glGenTextures(1, &cum2sym_tex);
144         glBindTexture(GL_TEXTURE_2D, cum2sym_tex);
145         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
146         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
147         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
148         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
149         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, prob_scale, NUM_TABLES, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, cum2sym_data.get());
150         check_error();
151
152         // Make dsyms texture.
153         unique_ptr<pair<uint16_t, uint16_t>[]> dsyms_data(new pair<uint16_t, uint16_t>[NUM_SYMS * NUM_TABLES]);
154         for (unsigned table = 0; table < NUM_TABLES; ++table) {
155                 for (unsigned sym = 0; sym < NUM_SYMS; ++sym) {
156                         dsyms_data[NUM_SYMS * table + sym].first = decode_tables[table].dsyms[sym].sym_start;
157                         dsyms_data[NUM_SYMS * table + sym].second = decode_tables[table].dsyms[sym].sym_freq;
158                 }
159         }
160         GLuint dsyms_tex;
161         glGenTextures(1, &dsyms_tex);
162         glBindTexture(GL_TEXTURE_2D, dsyms_tex);
163         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
164         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
165         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
166         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
167         glTexImage2D(GL_TEXTURE_2D, 0, GL_RG16UI, NUM_SYMS, NUM_TABLES, 0, GL_RG_INTEGER, GL_UNSIGNED_SHORT, dsyms_data.get());
168         check_error();
169
170         GLuint coeff_tex;
171         glGenTextures(1, &coeff_tex);
172         glBindTexture(GL_TEXTURE_2D, coeff_tex);
173         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
174         check_error();
175         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
176         check_error();
177         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
178         check_error();
179         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
180         check_error();
181         glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
182         check_error();
183
184         GLuint coeff2_tex;
185         glGenTextures(1, &coeff2_tex);
186         glBindTexture(GL_TEXTURE_2D, coeff2_tex);
187         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
188         check_error();
189         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
190         check_error();
191         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
192         check_error();
193         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
194         check_error();
195         glTexImage2D(GL_TEXTURE_2D, 0, GL_R32I, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_INT, nullptr);
196         check_error();
197
198         GLuint out_tex;
199         glGenTextures(1, &out_tex);
200         glBindTexture(GL_TEXTURE_2D, out_tex);
201         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
202         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
203         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
204         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
205         glTexImage2D(GL_TEXTURE_2D, 0, GL_R8, WIDTH, HEIGHT, 0, GL_RED, GL_UNSIGNED_BYTE, nullptr);
206         //glTexImage2D(GL_TEXTURE_2D, 0, GL_R32F, WIDTH, HEIGHT, 0, GL_RED, GL_FLOAT, nullptr);
207         check_error();
208
209         GLint cum2sym_tex_pos = glGetUniformLocation(glsl_program_num, "cum2sym_tex");
210         GLint dsyms_tex_pos = glGetUniformLocation(glsl_program_num, "dsyms_tex");
211         GLint out_tex_pos = glGetUniformLocation(glsl_program_num, "out_tex");
212         GLint coeff_tex_pos = glGetUniformLocation(glsl_program_num, "coeff_tex");
213         GLint coeff2_tex_pos = glGetUniformLocation(glsl_program_num, "coeff2_tex");
214         GLint sign_bias_pos = glGetUniformLocation(glsl_program_num, "sign_bias_per_model");
215         GLint num_blocks_pos = glGetUniformLocation(glsl_program_num, "num_blocks");
216         printf("%d err=0x%x pos=%d,%d,%d,%d\n", __LINE__, glGetError(), cum2sym_tex_pos, dsyms_tex_pos, out_tex_pos, sign_bias_pos);
217
218         // Bind the textures.
219         glUniform1i(cum2sym_tex_pos, 0);
220         glUniform1i(dsyms_tex_pos, 1);
221         glUniform1i(out_tex_pos, 2);
222         glUniform1i(coeff_tex_pos, 3);
223         glUniform1i(coeff2_tex_pos, 4);
224         glUniform1uiv(sign_bias_pos, 16, sign_bias);
225         glUniform1i(num_blocks_pos, num_blocks);
226         glBindImageTexture(0, cum2sym_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
227         glBindImageTexture(1, dsyms_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RG16UI);
228         glBindImageTexture(2, out_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R8);
229         glBindImageTexture(3, coeff_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
230         glBindImageTexture(4, coeff2_tex, 0, GL_FALSE, 0, GL_WRITE_ONLY, GL_R32I);
231         printf("%d err=0x%x\n", __LINE__, glGetError());
232
233         // Decode all luma blocks.
234         for (unsigned y = 0; y < 8; ++y) {
235                 for (unsigned x = 0; x < 8; ++x) {
236                         unsigned coeff_num = y * 8 + x;
237
238                         for (unsigned block_idx = 0; block_idx < NUM_BLOCKS; block_idx += BLOCKS_PER_STREAM) {
239                                 optional<uint32_t> num_rans_bytes = read_varint(&ptr, end);
240                                 if (!num_rans_bytes) {
241                                         fprintf(stderr, "Error parsing varint for block %d rANS bytes\n", block_idx);
242                                         exit(1);
243                                 }
244
245                                 CoeffStream *stream = &streams[coeff_num * num_blocks + block_idx / BLOCKS_PER_STREAM];
246                                 stream->src_offset = ptr - coded.data();
247                                 stream->src_len = *num_rans_bytes;
248                                 //assert(stream->src_offset % 4 == 0);
249
250                                 // TODO: check len
251                                 ptr += *num_rans_bytes;
252
253                                 //printf("read %d rANS bytes, %d sign bytes\n", *num_rans_bytes, *num_sign_bytes);
254                         }
255                 }
256         }
257
258         // put the coded data (as a whole) into an SSBO
259         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
260
261         GLuint ssbo_stream, ssbo, ssbo_out;
262
263         glGenBuffers(1, &ssbo_stream);
264         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_stream);
265         glBufferData(GL_SHADER_STORAGE_BUFFER, sizeof(streams), streams, GL_STREAM_DRAW);
266         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, ssbo_stream);
267         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
268
269         glGenBuffers(1, &ssbo);
270         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
271         glBufferData(GL_SHADER_STORAGE_BUFFER, coded.size(), coded.data(), GL_STREAM_DRAW);
272         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
273         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
274
275         glGenBuffers(1, &ssbo_out);
276         glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo_out);
277         glBufferData(GL_SHADER_STORAGE_BUFFER, 65536, nullptr, GL_STREAM_DRAW);  // ??
278         glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, ssbo_out);
279         check_error();
280
281 #define PARALLEL_SLICES 1
282         steady_clock::time_point start = steady_clock::now();
283         unsigned num_iterations = 1000;
284         for (unsigned i = 0; i < num_iterations; ++i) {
285                 unsigned num_slices = (WIDTH/8)*(HEIGHT/8)/BLOCKS_PER_STREAM;
286                 glDispatchCompute(1, (num_slices+PARALLEL_SLICES-1)/PARALLEL_SLICES, 1);
287         }
288         check_error();
289         glFinish();
290         steady_clock::time_point now = steady_clock::now();
291
292         unsigned *timing = (unsigned *)glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, 65536, GL_MAP_READ_BIT);
293         //setlocale(LC_ALL, "nb_NO.UTF-8");
294
295         string phases[] = {
296                 "init",
297                 "loop overhead",
298                 "rANS decode",
299                 "barrier after rANS decode",
300                 "horizontal IDCT",
301                 "barrier after horizontal IDCT",
302                 "vertical IDCT",
303                 "store to texture",
304                 "barrier after store to texture",
305                 "dummy timer for overhead measurement",
306         };
307         printf("\n");
308         for (int i = 0; i < 10; ++i) {
309                 //printf("%d: %'18.0f  [%s]\n", i, double((uint64_t(timing[i * 2 + 1]) << 32) | timing[i * 2]), phases[i].c_str());
310                 printf("%d,%s", i, phases[i].c_str());
311                 for (int j = 0; j < 512; ++j) {
312                         int idx = (j * 10 + i) * 2;
313                         uint64_t val = (uint64_t(timing[idx + 1]) << 32) | timing[idx];
314                 //      printf(" %'18.0f", double(val));
315                 //      printf(" %'6.0f", double(val) * 1e-6);
316                         printf(",%.0f", double(val) * 1e-6);
317                 }
318                 printf("\n");
319                 //printf("  [%s]\n", phases[i].c_str());
320         }
321         printf("\n");
322
323         unsigned char *data = new unsigned char[WIDTH * HEIGHT];
324         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, data);
325         check_error();
326         printf("%d err=0x%x bufsize=%zu\n", __LINE__, glGetError(), coded.size());
327
328 #if 0
329         for (int k = 0; k < 4; ++k) {
330                 for (int y = 0; y < 8; ++y) {
331                         for (int x = 0; x < 8; ++x) {
332                                 printf("%3d ", data[y * WIDTH + x + k*8]);
333                         }
334                         printf("\n");
335                 }
336                 printf("\n");
337         }
338         printf("\n");
339 #else
340         for (int k = 0; k < 4; ++k) {
341                 for (int y = 0; y < 8; ++y) {
342                         for (int x = 0; x < 8; ++x) {
343                                 //printf("%5.2f ", data[(y+8) * WIDTH + x + (1272-k*8)]);
344                                 printf("%3d ", data[y * WIDTH + x + k*8]);
345                         }
346                         printf("\n");
347                 }
348                 printf("\n");
349         }
350         printf("\n");
351 #endif
352
353         FILE *fp = fopen("narabu.pgm", "wb");
354         fprintf(fp, "P5\n%d %d\n255\n", WIDTH, HEIGHT);
355         for (int y = 0; y < HEIGHT; ++y) {
356                 for (int x = 0; x < WIDTH; ++x) {
357                         int k = lrintf(data[y * WIDTH + x]);
358                         if (k < 0) k = 0;
359                         if (k > 255) k = 255;
360                         putc(k, fp);
361                 }
362         }
363         fclose(fp);
364
365 #if 0
366         uint32_t *coeff_data = new uint32_t[WIDTH * HEIGHT];
367         glBindTexture(GL_TEXTURE_2D, coeff_tex);
368         check_error();
369         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff_data);
370         check_error();
371         uint32_t *coeff2_data = new uint32_t[WIDTH * HEIGHT];
372         glBindTexture(GL_TEXTURE_2D, coeff2_tex);
373         check_error();
374         glGetTexImage(GL_TEXTURE_2D, 0, GL_RED_INTEGER, GL_INT, coeff2_data);
375         check_error();
376         for (int x = 0; x < 320; ++x) {
377                 printf("%08x.%08x ", coeff2_data[x], coeff_data[x]);
378         }
379         printf("\n");
380 #endif
381         
382         check_error();
383         glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind
384         
385         printf("foo = 0x%x\n", glGetError());
386         printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
387 }