8 #include <SDL2/SDL_error.h>
9 #include <SDL2/SDL_video.h>
18 #include <unordered_map>
20 #include <movit/util.h>
// Derived geometry and tuning constants for the encoder. WIDTH, HEIGHT and
// NUM_SYMS are defined outside this excerpt.
//
// Number of 8-pixel-wide block columns across the full-width plane.
26 #define WIDTH_BLOCKS (WIDTH/8)
// Number of 8-pixel-wide block columns across a half-width plane
// (WIDTH/16 == (WIDTH/2)/8; the chroma planes below are WIDTH/2 wide).
27 #define WIDTH_BLOCKS_CHROMA (WIDTH/16)
// Number of 8-pixel-high block rows.
28 #define HEIGHT_BLOCKS (HEIGHT/8)
// Total block counts for the full-width and half-width planes.
29 #define NUM_BLOCKS (WIDTH_BLOCKS * HEIGHT_BLOCKS)
30 #define NUM_BLOCKS_CHROMA (WIDTH_BLOCKS_CHROMA * HEIGHT_BLOCKS)
// One less than the symbol count.
// NOTE(review): presumably the threshold at which a value is escape-coded -- confirm against the shaders.
33 #define ESCAPE_LIMIT (NUM_SYMS - 1)
// main() dispatches NUM_BLOCKS / BLOCKS_PER_STREAM work groups (x dimension) for the rANS pass.
34 #define BLOCKS_PER_STREAM 320
// Size of each per-stream slot in the rANS output buffer allocated by main().
35 #define STREAM_BUF_SIZE 1024 // In bytes.
// Probability precision: prob_scale == 2^prob_bits == 4096.
37 static constexpr uint32_t prob_bits = 12;
38 static constexpr uint32_t prob_scale = 1 << prob_bits;
// Global pixel buffers.
// rgb: interleaved 3-bytes-per-pixel input, filled by readpix().
40 unsigned char rgb[WIDTH * HEIGHT * 3];
// pix_y: full-resolution luma plane; uploaded as the source texture in main().
41 unsigned char pix_y[WIDTH * HEIGHT];
// pix_cb / pix_cr: horizontally halved chroma planes (4:2:2).
42 unsigned char pix_cb[(WIDTH/2) * HEIGHT];
43 unsigned char pix_cr[(WIDTH/2) * HEIGHT];
// Host-side view of the buffer that holds the raw symbol counts. main()
// allocates a buffer of sizeof(RansCountSSBO), zero-fills it before each
// iteration and later maps it to read ransfreq back.
// main() indexes ransfreq as [table * 256 + symbol], with four tables.
45 struct RansCountSSBO {
46 unsigned dist[4 * 256];
47 unsigned ransfreq[4 * 256];
// NOTE(review): the lines between these declarations are not visible in this
// excerpt. main() reads the four fields below through
// RansDistUBO::ransdist[...], so they presumably belong to that structure's
// element type rather than to RansCountSSBO -- confirm against the full file.
// main() prints the low 16 bits of rcp_shift_and_cmpl_freq as "rcp_shift" and
// the high 16 bits as "cmpl_freq".
52 uint32_t x_max, rcp_freq, bias, rcp_shift_and_cmpl_freq;
// NOTE(review): which struct this padding belongs to is not visible here.
56 uint32_t padding[3]; // std140 layout.
61 using namespace std::chrono;
// Writes the integer x to fp in a variable-length byte encoding.
// NOTE(review): only one statement of the body is visible in this excerpt, so
// the loop/termination logic is not documented here. Presumably the 0x80 bit
// marks "more bytes follow" -- confirm against the full body and the decoder.
63 void write_varint(int x, FILE *fp)
// Emit the low seven bits of x with bit 7 set.
66 putc((x & 0x7f) | 0x80, fp);
// Loads raw pixel data from `filename` into `ptr`.
//
// Exactly WIDTH * HEIGHT * 3 bytes are read, taken from the END of the file;
// anything that precedes them is skipped.
// NOTE(review): presumably this skips a PNM header (main() defaults to
// "color.pnm") -- confirm.
//
//   ptr       destination buffer; must hold at least WIDTH * HEIGHT * 3 bytes.
//   filename  path of the file to open in binary mode.
72 void readpix(unsigned char *ptr, const char *filename)
74 FILE *fp = fopen(filename, "rb");
// Find the file length. `len` is declared on a line not visible in this excerpt.
80 fseek(fp, 0, SEEK_END);
// The file must be at least one full frame in size.
82 assert(len >= WIDTH * HEIGHT * 3);
// Position at the start of the trailing WIDTH * HEIGHT * 3 bytes.
83 fseek(fp, len - WIDTH * HEIGHT * 3, SEEK_SET);
// NOTE(review): the fread() result is not checked on this line, so a short
// read would go unnoticed unless handled on a line not shown here.
85 fread(ptr, 1, WIDTH * HEIGHT * 3, fp);
// Converts the global interleaved `rgb` buffer into the global planes pix_y
// (full resolution) and pix_cb / pix_cr (horizontally halved, 4:2:2).
// The enclosing function's header is not visible in this excerpt.
89 // Should be done on the GPU, of course, but irrelevant for the demonstration.
// Luma weights for R, G and B (these values match the Rec. 709 coefficients).
92 double coeff[3] = { 0.2126, 0.7152, 0.0722 }; // sum = 1.0
// Chroma scale factors. Because the weights sum to 1, the denominators equal
// 2 * (1 - coeff[2]) and 2 * (1 - coeff[0]) respectively.
93 double cb_fac = 1.0 / (coeff[0] + coeff[1] + 1.0f - coeff[2]); // 0.539
94 double cr_fac = 1.0 / (1.0f - coeff[0] + coeff[1] + coeff[2]); // 0.635
// Full-resolution, unrounded chroma planes; subsampled in the second pass below.
96 unique_ptr<float[]> temp_cb(new float[WIDTH * HEIGHT]);
97 unique_ptr<float[]> temp_cr(new float[WIDTH * HEIGHT]);
// Pass 1: per-pixel colour conversion.
98 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
99 for (unsigned xb = 0; xb < WIDTH; ++xb) {
100 int r = rgb[((yb * WIDTH) + xb) * 3 + 0];
101 int g = rgb[((yb * WIDTH) + xb) * 3 + 1];
102 int b = rgb[((yb * WIDTH) + xb) * 3 + 2];
// Weighted sum, clamped to [0, 255].
103 double y = std::min(std::max(coeff[0] * r + coeff[1] * g + coeff[2] * b, 0.0), 255.0);
// Scaled colour differences, offset so that zero difference maps to 128.
104 double cb = (b - y) * cb_fac + 128.0;
105 double cr = (r - y) * cr_fac + 128.0;
// Luma is rounded and stored immediately; chroma is kept as float (not yet
// clamped) until after filtering.
106 pix_y[(yb * WIDTH) + xb] = lrint(y);
107 temp_cb[(yb * WIDTH) + xb] = cb;
108 temp_cr[(yb * WIDTH) + xb] = cr;
// Pass 2: each output chroma sample is centred on the even input column
// 2 * xb and filtered with the taps [0.25, 0.5, 0.25].
112 // Simple 4:2:2 subsampling with left convention.
113 for (unsigned yb = 0; yb < HEIGHT; ++yb) {
114 for (unsigned xb = 0; xb < WIDTH / 2; ++xb) {
// Left neighbour (clamped to column 0 at the left edge), centre, right neighbour.
115 int c0 = yb * WIDTH + std::max(int(xb) * 2 - 1, 0);
116 int c1 = yb * WIDTH + xb * 2;
117 int c2 = yb * WIDTH + xb * 2 + 1;
119 double cb = 0.25 * temp_cb[c0] + 0.5 * temp_cb[c1] + 0.25 * temp_cb[c2];
120 double cr = 0.25 * temp_cr[c0] + 0.5 * temp_cr[c1] + 0.25 * temp_cr[c2];
// Clamp to the 8-bit range before rounding.
121 cb = std::min(std::max(cb, 0.0), 255.0);
122 cr = std::min(std::max(cr, 0.0), 255.0);
// Note: `yb * WIDTH/2` is evaluated as (yb * WIDTH) / 2.
123 pix_cb[(yb * WIDTH/2) + xb] = lrint(cb);
124 pix_cr[(yb * WIDTH/2) + xb] = lrint(cr);
// Program entry point. Visible steps, in order:
//   1. create an OpenGL 4.5 core context through SDL and load the input image;
//   2. compile and link three compute programs (encoder.shader, tally.shader,
//      rans.shader);
//   3. allocate the count / distribution / output buffers and the coefficient
//      textures, and bind them to fixed binding points and image units;
//   4. run the three-pass pipeline num_iterations times and report timing;
//   5. map the result buffers and write the tables and streams to "coded.dat".
// Several declarations and control-flow lines of this function are not visible
// in this excerpt; comments below are limited to what the shown lines do.
129 int main(int argc, char **argv)
131 // Set up an OpenGL context using SDL.
// NOTE(review): SDL documents a negative error code on failure; confirm that
// comparing against exactly -1 is sufficient.
132 if (SDL_Init(SDL_INIT_VIDEO) == -1) {
133 fprintf(stderr, "SDL_Init failed: %s\n", SDL_GetError());
// No depth or stencil buffer; double-buffered; OpenGL 4.5 core profile.
136 SDL_GL_SetAttribute(SDL_GL_DEPTH_SIZE, 0);
137 SDL_GL_SetAttribute(SDL_GL_STENCIL_SIZE, 0);
138 SDL_GL_SetAttribute(SDL_GL_DOUBLEBUFFER, 1);
139 SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE);
140 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 4);
141 SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 5);
// The remaining window-creation arguments are on lines not shown here.
143 SDL_Window *window = SDL_CreateWindow("OpenGL window for unit test",
144 SDL_WINDOWPOS_UNDEFINED,
145 SDL_WINDOWPOS_UNDEFINED,
148 SDL_GLContext context = SDL_GL_CreateContext(window);
149 assert(context != nullptr);
// Load the input image into the global rgb buffer, either from argv[1] or from
// "color.pnm". The condition selecting between the two calls is not visible.
152 readpix(rgb, argv[1]);
154 readpix(rgb, "color.pnm");
157 // Compile the DCT shader.
158 string shader_src = ::read_file("encoder.shader");
159 GLuint shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
160 GLuint glsl_program_num = glCreateProgram();
161 glAttachShader(glsl_program_num, shader_num);
162 glLinkProgram(glsl_program_num);
// Report the linker log on failure. `success` is declared on a line not shown.
165 glGetProgramiv(glsl_program_num, GL_LINK_STATUS, &success);
166 if (success == GL_FALSE) {
167 GLchar error_log[1024] = {0};
168 glGetProgramInfoLog(glsl_program_num, 1024, nullptr, error_log);
169 fprintf(stderr, "Error linking program: %s\n", error_log);
173 // Compile the tally shader.
174 shader_src = ::read_file("tally.shader");
175 shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
176 GLuint glsl_tally_program_num = glCreateProgram();
177 glAttachShader(glsl_tally_program_num, shader_num);
178 glLinkProgram(glsl_tally_program_num);
180 glGetProgramiv(glsl_tally_program_num, GL_LINK_STATUS, &success);
181 if (success == GL_FALSE) {
182 GLchar error_log[1024] = {0};
183 glGetProgramInfoLog(glsl_tally_program_num, 1024, nullptr, error_log);
184 fprintf(stderr, "Error linking program: %s\n", error_log);
188 // Compile the rANS shader.
189 shader_src = ::read_file("rans.shader");
190 shader_num = compile_shader(shader_src, GL_COMPUTE_SHADER);
191 GLuint glsl_rans_program_num = glCreateProgram();
192 glAttachShader(glsl_rans_program_num, shader_num);
193 glLinkProgram(glsl_rans_program_num);
195 glGetProgramiv(glsl_rans_program_num, GL_LINK_STATUS, &success);
196 if (success == GL_FALSE) {
197 GLchar error_log[1024] = {0};
198 glGetProgramInfoLog(glsl_rans_program_num, 1024, nullptr, error_log);
199 fprintf(stderr, "Error linking program: %s\n", error_log);
// All buffers below use immutable storage with no initial data and are flagged
// for persistent read mapping, so the CPU can read the results back later.
204 // An SSBO for the raw rANS counts.
206 glGenBuffers(1, &ssbo);
207 glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
208 glNamedBufferStorage(ssbo, sizeof(RansCountSSBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
// Two buffers of identical size: dist_ssbo (bound as shader storage) and
// dist_ubo (bound as a uniform buffer). The main loop copies the former into
// the latter each iteration.
211 // UBO for the rANS distributions (copied from an SSBO).
213 glGenBuffers(1, &dist_ssbo);
214 glBindBuffer(GL_SHADER_STORAGE_BUFFER, dist_ssbo);
215 glNamedBufferStorage(dist_ssbo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
219 glGenBuffers(1, &dist_ubo);
220 glBindBuffer(GL_UNIFORM_BUFFER, dist_ubo);
221 glNamedBufferStorage(dist_ubo, sizeof(RansDistUBO), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
// output_ssbo has HEIGHT_BLOCKS * WIDTH_BLOCKS slots of STREAM_BUF_SIZE bytes;
// bytes_written_ssbo has one uint32_t per slot.
224 // SSBOs for the rANS output (data and offsets).
226 glGenBuffers(1, &output_ssbo);
227 glBindBuffer(GL_SHADER_STORAGE_BUFFER, output_ssbo);
228 glNamedBufferStorage(output_ssbo, HEIGHT_BLOCKS * WIDTH_BLOCKS * STREAM_BUF_SIZE, nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
231 GLuint bytes_written_ssbo;
232 glGenBuffers(1, &bytes_written_ssbo);
233 glBindBuffer(GL_SHADER_STORAGE_BUFFER, bytes_written_ssbo);
234 glNamedBufferStorage(bytes_written_ssbo, HEIGHT_BLOCKS * WIDTH_BLOCKS * sizeof(uint32_t), nullptr, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
// Binding-point assignment: storage 9 = counts, 12 = distributions (SSBO),
// 10 = output data, 11 = bytes written; uniform 13 = distributions (UBO).
// Indexed buffer bindings are context state rather than per-program state, so
// the glUseProgram calls here only group the bindings by the pass that uses them.
238 glUseProgram(glsl_program_num);
239 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
241 glUseProgram(glsl_tally_program_num);
242 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo);
243 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 12, dist_ssbo);
245 glUseProgram(glsl_rans_program_num);
246 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 10, output_ssbo);
247 glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 11, bytes_written_ssbo);
248 glBindBufferBase(GL_UNIFORM_BUFFER, 13, dist_ubo);
250 glUseProgram(glsl_program_num);
// Source texture: WIDTH x HEIGHT, one unsigned 8-bit integer channel, uploaded
// from pix_y. Nearest filtering, repeat wrapping.
255 glGenTextures(1, &y_tex);
256 glBindTexture(GL_TEXTURE_2D, y_tex);
257 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
258 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
259 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
260 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
261 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8UI, WIDTH, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_BYTE, pix_y);
// Three (WIDTH / 8) x HEIGHT textures with one unsigned 16-bit channel each,
// allocated without initial data.
264 // Make destination textures.
265 GLuint dc_ac7_tex, ac1_ac6_tex, ac2_ac5_tex;
266 for (GLuint *tex : { &dc_ac7_tex, &ac1_ac6_tex, &ac2_ac5_tex }) {
267 glGenTextures(1, tex);
268 glBindTexture(GL_TEXTURE_2D, *tex);
269 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
270 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
271 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
272 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
273 glTexImage2D(GL_TEXTURE_2D, 0, GL_R16UI, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_UNSIGNED_SHORT, nullptr);
// Two more (WIDTH / 8) x HEIGHT textures, these with one signed 8-bit channel.
277 GLuint ac3_tex, ac4_tex;
278 for (GLuint *tex : { &ac3_tex, &ac4_tex }) {
279 glGenTextures(1, tex);
280 glBindTexture(GL_TEXTURE_2D, *tex);
281 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
282 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
283 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
284 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_REPEAT);
285 glTexImage2D(GL_TEXTURE_2D, 0, GL_R8I, WIDTH / 8, HEIGHT, 0, GL_RED_INTEGER, GL_BYTE, nullptr);
// Image units 0-4: coefficient textures (read/write); unit 5: source (read-only).
289 glBindImageTexture(0, dc_ac7_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
290 glBindImageTexture(1, ac1_ac6_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
291 glBindImageTexture(2, ac2_ac5_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R16UI);
292 glBindImageTexture(3, ac3_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
293 glBindImageTexture(4, ac4_tex, 0, GL_FALSE, 0, GL_READ_WRITE, GL_R8I);
294 glBindImageTexture(5, y_tex, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8UI);
// Point the encoder program's image uniforms at the units bound above.
298 glUseProgram(glsl_program_num);
299 GLint dc_ac7_tex_uniform = glGetUniformLocation(glsl_program_num, "dc_ac7_tex");
300 GLint ac1_ac6_tex_uniform = glGetUniformLocation(glsl_program_num, "ac1_ac6_tex");
301 GLint ac2_ac5_tex_uniform = glGetUniformLocation(glsl_program_num, "ac2_ac5_tex");
302 GLint ac3_tex_uniform = glGetUniformLocation(glsl_program_num, "ac3_tex");
303 GLint ac4_tex_uniform = glGetUniformLocation(glsl_program_num, "ac4_tex");
304 GLint image_tex_uniform = glGetUniformLocation(glsl_program_num, "image_tex");
305 glUniform1i(dc_ac7_tex_uniform, 0);
306 glUniform1i(ac1_ac6_tex_uniform, 1);
307 glUniform1i(ac2_ac5_tex_uniform, 2);
308 glUniform1i(ac3_tex_uniform, 3);
309 glUniform1i(ac4_tex_uniform, 4);
310 glUniform1i(image_tex_uniform, 5);
// Same unit assignment for the rANS program.
// NOTE(review): image_tex is looked up here, but no matching glUniform1i call
// for it is visible in this excerpt -- confirm whether that is intentional.
312 glUseProgram(glsl_rans_program_num);
313 dc_ac7_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "dc_ac7_tex");
314 ac1_ac6_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac1_ac6_tex");
315 ac2_ac5_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac2_ac5_tex");
316 ac3_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac3_tex");
317 ac4_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "ac4_tex");
318 image_tex_uniform = glGetUniformLocation(glsl_rans_program_num, "image_tex");
319 glUniform1i(dc_ac7_tex_uniform, 0);
320 glUniform1i(ac1_ac6_tex_uniform, 1);
321 glUniform1i(ac2_ac5_tex_uniform, 2);
322 glUniform1i(ac3_tex_uniform, 3);
323 glUniform1i(ac4_tex_uniform, 4);
// Timed section: the whole pipeline is run num_iterations times.
325 steady_clock::time_point start = steady_clock::now();
326 unsigned num_iterations = 100;
327 for (unsigned i = 0; i < num_iterations; ++i) {
// Zero the count buffer (a null data pointer makes the clear fill with zeros).
328 glClearNamedBufferSubData(ssbo, GL_R8, 0, sizeof(RansCountSSBO), GL_RED, GL_UNSIGNED_BYTE, nullptr);
// Pass 1: encoder program, WIDTH_BLOCKS / 16 by HEIGHT_BLOCKS work groups.
329 glUseProgram(glsl_program_num);
330 glDispatchCompute(WIDTH_BLOCKS / 16, HEIGHT_BLOCKS, 1);
331 glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Pass 2: tally program, four work groups.
// NOTE(review): presumably one group per table (four tables are written out
// below) -- confirm against tally.shader.
333 glUseProgram(glsl_tally_program_num);
334 glDispatchCompute(4, 1, 1);
335 glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
// Copy the distributions from the storage buffer into the uniform buffer.
337 glCopyNamedBufferSubData(dist_ssbo, dist_ubo, 0, 0, sizeof(RansDistUBO));
338 glMemoryBarrier(GL_UNIFORM_BARRIER_BIT);
// Pass 3: rANS program, (NUM_BLOCKS / BLOCKS_PER_STREAM) x 8 x 5 work groups.
340 glUseProgram(glsl_rans_program_num);
341 glDispatchCompute(NUM_BLOCKS / BLOCKS_PER_STREAM, 8, 5);
// NOTE(review): the lines between the dispatch and this timestamp are not
// visible; confirm the GPU work is waited for before the clock is read.
346 steady_clock::time_point now = steady_clock::now();
// Size statistics; tot_bytes and extra_bits are computed on lines not shown.
349 printf("%ld bytes + %ld escape bits (%ld) = %ld total bytes\n",
350 tot_bytes - extra_bits / 8,
// Average wall-clock time per iteration, in milliseconds.
358 printf("Each iteration took %.3f ms.\n", 1e3 * duration<double>(now - start).count() / num_iterations);
360 FILE *codedfp = fopen("coded.dat", "wb");
361 if (codedfp == nullptr) {
366 // Write out the distributions.
367 const RansCountSSBO *rans_count = (const RansCountSSBO *)glMapNamedBufferRange(ssbo, 0, sizeof(RansCountSSBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
368 const RansDistUBO *rans_dist = (const RansDistUBO *)glMapNamedBufferRange(dist_ssbo, 0, sizeof(RansDistUBO), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
// The same four tables are written twice (r = 0 and r = 1).
369 for (unsigned r = 0; r < 2; ++r) { // Hack to write fake chroma tables.
370 // TODO: rather gamma-k or something
371 for (unsigned i = 0; i < 4; ++i) {
372 printf("writing table %d\n", i);
// Tables are stored with a stride of 256 entries; NUM_SYMS of them are written.
373 for (unsigned j = 0; j < NUM_SYMS; ++j) {
// Debug dump of each entry. NOTE(review): unsigned values are printed with %d.
374 printf("%d,%d: freq=%d x_max=%d, rcp_freq=%08x, bias=%d, rcp_shift=%d, cmpl_freq=%d\n",
375 i, j, rans_count->ransfreq[i * 256 + j],
376 rans_dist->ransdist[i * 256 + j].x_max,
377 rans_dist->ransdist[i * 256 + j].rcp_freq,
378 rans_dist->ransdist[i * 256 + j].bias,
379 rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq & 0xffff,
380 rans_dist->ransdist[i * 256 + j].rcp_shift_and_cmpl_freq >> 16);
// Only the frequency goes into the output file.
381 write_varint(rans_count->ransfreq[i * 256 + j], codedfp);
386 // Write out the actual data.
388 const uint32_t *bytes_written = (const uint32_t *)glMapNamedBufferRange(bytes_written_ssbo, 0, HEIGHT_BLOCKS * WIDTH_BLOCKS * sizeof(uint32_t), GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
// Debug dump over HEIGHT_BLOCKS * 64 entries. `offsets` is declared on a line
// not visible in this excerpt. NOTE(review): %d is used with values that may
// be unsigned.
390 for (int i = 0; i < HEIGHT_BLOCKS*64; ++i) {
391 printf("%d,%d,%d: %u\n", i / 64, (i / 8) % 8, i % 8, 1024 * (i + 1) - offsets[i]);
395 const uint8_t *data = (const uint8_t *)glMapNamedBufferRange(output_ssbo, 0, HEIGHT_BLOCKS * WIDTH_BLOCKS * STREAM_BUF_SIZE, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
// Emit the streams ordered by (y, x) first and stream_idx last. The slot for a
// stream is stream_idx * 64 + y * 8 + x, so HEIGHT_BLOCKS * 64 slots are read.
398 for (unsigned y = 0; y < 8; ++y) {
399 for (unsigned x = 0; x < 8; ++x) {
400 for (unsigned int stream_idx = 0; stream_idx < HEIGHT_BLOCKS; ++stream_idx) {
// Each stream's bytes sit at the END of its slot: out_end is one past the
// slot's last byte and the payload is the num_rans_bytes bytes before it.
401 const uint8_t *out_end = data + (stream_idx * 64 + y * 8 + x + 1) * STREAM_BUF_SIZE;
402 uint32_t num_rans_bytes = bytes_written[stream_idx * 64 + y * 8 + x];
403 const uint8_t *ptr = out_end - num_rans_bytes;
// NOTE(review): this check runs after ptr has already been computed from
// num_rans_bytes.
404 assert(num_rans_bytes <= STREAM_BUF_SIZE);
// A stream identical to the previously written one is encoded as a single
// varint 0. `last_block` is declared on a line not visible here.
406 if (num_rans_bytes == last_block.size() &&
407 memcmp(last_block.data(), ptr, last_block.size()) == 0) {
408 write_varint(0, codedfp);
// Otherwise (the branch keyword itself is on a line not shown): remember this
// stream for the next comparison, then write its length followed by its bytes.
410 last_block = string((const char *)ptr, num_rans_bytes);
411 write_varint(num_rans_bytes, codedfp);
412 fwrite(ptr, 1, num_rans_bytes, codedfp);