#version 440
#extension GL_ARB_shader_clock : enable

// Compute shader that rANS-decodes quantized coefficients from data_SSBO,
// dequantizes them, runs a separable 8x8 inverse DCT through shared memory,
// and stores the resulting samples into out_tex.
//
// Each invocation within a 64-wide slice owns one coefficient position
// (0..63) of the 8x8 block and decodes the stream for that position.

#define PARALLEL_SLICES 1
#define ENABLE_TIMING 0

layout(local_size_x = 64*PARALLEL_SLICES) in;

layout(r8ui) uniform restrict readonly uimage2D cum2sym_tex;
layout(rg16ui) uniform restrict readonly uimage2D dsyms_tex;
layout(r8) uniform restrict writeonly image2D out_tex;
layout(r16i) uniform restrict writeonly iimage2D coeff_tex;
uniform int num_blocks;

const uint prob_bits = 12;
const uint prob_scale = 1 << prob_bits;
const uint NUM_SYMS = 256;
const uint ESCAPE_LIMIT = NUM_SYMS - 1;
const uint BLOCKS_PER_STREAM = 320;

// main() decodes eight blocks per outer-loop iteration, so each stream is
// consumed in this many groups. Used both as the loop trip count and for
// computing the output block index, so the two can never disagree.
const uint BLOCK_GROUPS_PER_STREAM = BLOCKS_PER_STREAM / 8;

// These need to be folded into quant_matrix.
const float dc_scalefac = 8.0;
const float quant_scalefac = 4.0;

const float quant_matrix[64] = {
	 8, 16, 19, 22, 26, 27, 29, 34,
	16, 16, 22, 24, 27, 29, 34, 37,
	19, 22, 26, 27, 29, 34, 34, 38,
	22, 22, 26, 27, 29, 34, 37, 40,
	22, 26, 27, 29, 32, 35, 40, 48,
	26, 27, 29, 32, 35, 40, 48, 58,
	26, 27, 29, 34, 38, 46, 56, 69,
	27, 29, 35, 38, 46, 56, 69, 83
};

// Currently unused: the only reference in main() is commented out.
const uint ff_zigzag_direct[64] = {
	 0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63
};

// Maps a coefficient position (0..63) to the model/table index (0..3) that is
// used for the cum2sym/dsyms lookups and for sign_bias_per_model.
const uint stream_mapping[64] = {
	0, 0, 1, 1, 2, 2, 3, 3,
	0, 0, 1, 2, 2, 2, 3, 3,
	1, 1, 2, 2, 2, 3, 3, 3,
	1, 1, 2, 2, 2, 3, 3, 3,
	1, 2, 2, 2, 2, 3, 3, 3,
	2, 2, 2, 2, 3, 3, 3, 3,
	2, 2, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
};

// Compressed input, addressed per byte through get_rans_byte().
layout(std430, binding = 9) buffer layoutName
{
	uint data_SSBO[];
};

// Accumulated timers: 10 timers for each of the 64 coefficient positions.
// Only written when ENABLE_TIMING is nonzero.
layout(std430, binding = 10) buffer layoutName2
{
	uvec2 timing[10 * 64];
};

struct CoeffStream {
	uint src_offset, src_len;
};
layout(std430, binding = 0) buffer whatever3
{
	CoeffStream streams[];
};
uniform uint sign_bias_per_model[16];

const uint RANS_BYTE_L = (1u << 23);  // lower bound of our normalization interval

// Returns the byte at byte offset `offset` in data_SSBO.
uint get_rans_byte(uint offset)
{
	// We assume little endian.
	return bitfieldExtract(data_SSBO[offset >> 2], 8 * int(offset & 3u), 8);
}

// Reads the initial 32-bit decoder state (four bytes, least significant
// first) starting at `offset`, and advances `offset` past them.
uint RansDecInit(inout uint offset)
{
	uint x;

	x  = get_rans_byte(offset);
	x |= get_rans_byte(offset + 1) << 8;
	x |= get_rans_byte(offset + 2) << 16;
	x |= get_rans_byte(offset + 3) << 24;
	offset += 4;

	return x;
}

// Returns the low `scale_bits` bits of the decoder state `r`.
uint RansDecGet(uint r, uint scale_bits)
{
	return r & ((1u << scale_bits) - 1);
}

// Consumes a symbol with cumulative start `start` and frequency `freq`
// (both relative to 1 << prob_bits) from the state `rans`, then pulls bytes
// from the input at `offset` until the state is back at or above RANS_BYTE_L.
void RansDecAdvance(inout uint rans, inout uint offset, const uint start, const uint freq, uint prob_bits)
{
	const uint mask = (1u << prob_bits) - 1;
	rans = freq * (rans >> prob_bits) + (rans & mask) - start;

	// renormalize
	while (rans < RANS_BYTE_L) {
		rans = (rans << 8) | get_rans_byte(offset++);
	}
}

// Looks up the symbol for the given low state bits in table row `table`.
uint cum2sym(uint bits, uint table)
{
	return imageLoad(cum2sym_tex, ivec2(bits, table)).x;
}

// Fetches the two-component dsyms entry for symbol `k` in table row `table`.
// main() passes .x as `start` and .y as `freq` to RansDecAdvance().
uvec2 get_dsym(uint k, uint table)
{
	return imageLoad(dsyms_tex, ivec2(k, table)).xy;
}

// In-place 8-point 1D inverse DCT on y0..y7.
void idct_1d(inout float y0, inout float y1, inout float y2, inout float y3, inout float y4, inout float y5, inout float y6, inout float y7)
{
	const float a1 = 0.7071067811865474;  // 1 / sqrt(2)
	const float a2 = 0.5411961001461971;  // cos(3/8 pi) * sqrt(2)
	const float a4 = 1.3065629648763766;  // cos(pi/8) * sqrt(2)
	// static const float a5 = 0.5 * (a4 - a2);
	const float a5 = 0.3826834323650897;

	// phase 2 (phase 1 is just moving around)
	const float p2_4 = y5 - y3;
	const float p2_5 = y1 + y7;
	const float p2_6 = y1 - y7;
	const float p2_7 = y5 + y3;

	// phase 3
	const float p3_2 = y2 - y6;
	const float p3_3 = y2 + y6;
	const float p3_5 = p2_5 - p2_7;
	const float p3_7 = p2_5 + p2_7;

	// phase 4
	const float p4_2 = a1 * p3_2;
	const float p4_4 = p2_4 * a2 + (p2_4 + p2_6) * a5;  // Inverted.
	const float p4_5 = a1 * p3_5;
	const float p4_6 = p2_6 * a4 - (p2_4 + p2_6) * a5;

	// phase 5
	const float p5_0 = y0 + y4;
	const float p5_1 = y0 - y4;
	const float p5_3 = p4_2 + p3_3;

	// phase 6
	const float p6_0 = p5_0 + p5_3;
	const float p6_1 = p5_1 + p4_2;
	const float p6_2 = p5_1 - p4_2;
	const float p6_3 = p5_0 - p5_3;
	const float p6_5 = p4_5 + p4_4;
	const float p6_6 = p4_5 + p4_6;
	const float p6_7 = p4_6 + p3_7;

	// phase 7
	y0 = p6_0 + p6_7;
	y1 = p6_1 + p6_6;
	y2 = p6_2 + p6_5;
	y3 = p6_3 + p4_4;
	y4 = p6_3 - p4_4;
	y5 = p6_2 - p6_5;
	y6 = p6_1 - p6_6;
	y7 = p6_0 - p6_7;
}

// Scratch area for the IDCT: eight 8x8 blocks per slice.
shared float temp[64 * 8 * PARALLEL_SLICES];

// With ENABLE_TIMING set: adds the clock ticks elapsed since `start` to the
// 64-bit counter `t` (stored as low/high words in .x/.y, with manual
// borrow/carry), then resets `start` to the current clock.
// With ENABLE_TIMING unset this is a no-op.
void pick_timer(inout uvec2 start, inout uvec2 t)
{
#if ENABLE_TIMING
	uvec2 now = clock2x32ARB();

	uvec2 delta = now - start;
	if (now.x < start.x) {
		--delta.y;  // borrow from the high word
	}

	uvec2 new_t = t + delta;
	if (new_t.x < t.x) {
		++new_t.y;  // carry into the high word
	}
	t = new_t;

	start = clock2x32ARB();
#endif
}

void main()
{
	uvec2 local_timing[10];
#if ENABLE_TIMING
	for (int timer_idx = 0; timer_idx < 10; ++timer_idx) {
		local_timing[timer_idx] = uvec2(0, 0);
	}
	uvec2 start = clock2x32ARB();
#else
	uvec2 start = uvec2(0, 0);
	local_timing[0] = start;
#endif

	const uint blocks_per_row = (imageSize(out_tex).x + 7) / 8;

	// Split the flat invocation index into (x, y) within an 8x8 block,
	// plus the slice number.
	const uint local_x = gl_LocalInvocationID.x % 8;
	const uint local_y = (gl_LocalInvocationID.x / 8) % 8;
	const uint local_z = gl_LocalInvocationID.x / 64;

	const uint slice_num = local_z;
	const uint thread_num = local_y * 8 + local_x;

	const uint block_row = gl_WorkGroupID.y * PARALLEL_SLICES + slice_num;

	//const uint coeff_num = ff_zigzag_direct[thread_num];
	const uint coeff_num = thread_num;
	const uint stream_num = coeff_num * num_blocks + block_row;
	const uint model_num = stream_mapping[coeff_num];
	const uint sign_bias = sign_bias_per_model[model_num];

	// Initialize rANS decoder.
	uint offset = streams[stream_num].src_offset;
	uint rans = RansDecInit(offset);

	// Dequantization factor for this coefficient position.
	float q = (coeff_num == 0) ? 1.0 : (quant_matrix[coeff_num] * quant_scalefac / 128.0 / sqrt(2.0));  // FIXME: fold
	q *= (1.0 / 255.0);
	//int w = (coeff_num == 0) ? 32 : int(quant_matrix[coeff_num]);
	int last_k = 128;  // Running predictor; only used for coefficient 0 below.

	pick_timer(start, local_timing[0]);

	// Counts down from BLOCK_GROUPS_PER_STREAM - 1 to 0.
	for (uint block_idx = BLOCK_GROUPS_PER_STREAM; block_idx-- > 0; ) {
		pick_timer(start, local_timing[1]);

		// rANS decode one coefficient across eight blocks (so 64x8 coefficients).
		for (uint subblock_idx = 8; subblock_idx-- > 0; ) {
			// Read a symbol.
			uint bottom_bits = RansDecGet(rans, prob_bits + 1);
			bool sign = false;
			if (bottom_bits >= sign_bias) {
				bottom_bits -= sign_bias;
				rans -= sign_bias;
				sign = true;
			}
			int k = int(cum2sym(bottom_bits, model_num));  // Can go out-of-bounds; that will return zero.
			uvec2 sym = get_dsym(k, model_num);
			RansDecAdvance(rans, offset, sym.x, sym.y, prob_bits + 1);

			// Escape symbol: the actual value follows as prob_bits raw bits.
			if (k == ESCAPE_LIMIT) {
				k = int(RansDecGet(rans, prob_bits));
				RansDecAdvance(rans, offset, k, 1, prob_bits);
			}
			if (sign) {
				k = -k;
			}

			// Coefficient 0 is stored as a difference from the previously decoded value.
			if (coeff_num == 0) {
				k += last_k;
				last_k = k;
			}

			// Disabled debug output. NOTE(review): block_x and block_y are not
			// declared at this point, so this would not compile if enabled as-is.
#if 0
			uint y = block_row * 16 + block_y * 8 + local_y;
			uint x = block_x * 64 + subblock_idx * 8 + local_x;
			imageStore(coeff_tex, ivec2(x, y), ivec4(k, 0,0,0));
#endif

			temp[slice_num * 64 * 8 + subblock_idx * 64 + coeff_num] = k * q;
			//temp[subblock_idx * 64 + 8 * y + x] = (2 * k * w * 4) / 32;  // 100% matching unquant
		}

		pick_timer(start, local_timing[2]);

		// All 64 coefficients of every block must be in temp before the IDCT reads them.
		memoryBarrierShared();
		barrier();

		pick_timer(start, local_timing[3]);

		// Horizontal IDCT, one row of eight values per invocation (so 64 rows).
		idct_1d(temp[slice_num * 64 * 8 + thread_num * 8 + 0],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 1],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 2],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 3],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 4],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 5],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 6],
		        temp[slice_num * 64 * 8 + thread_num * 8 + 7]);

		pick_timer(start, local_timing[4]);

		memoryBarrierShared();
		barrier();

		pick_timer(start, local_timing[5]);

		// Vertical IDCT, one column per invocation (so 64 columns):
		// local_y selects the block within the group, local_x the column.
		uint row_offset = local_z * 64 * 8 + local_y * 64 + local_x;
		idct_1d(temp[row_offset + 0 * 8],
		        temp[row_offset + 1 * 8],
		        temp[row_offset + 2 * 8],
		        temp[row_offset + 3 * 8],
		        temp[row_offset + 4 * 8],
		        temp[row_offset + 5 * 8],
		        temp[row_offset + 6 * 8],
		        temp[row_offset + 7 * 8]);

		pick_timer(start, local_timing[6]);

		// Locate this invocation's block in the output image and store the
		// column it just transformed.
		uint global_block_idx = (block_row * BLOCK_GROUPS_PER_STREAM + block_idx) * 8 + local_y;
		uint block_x = global_block_idx % blocks_per_row;
		uint block_y = global_block_idx / blocks_per_row;

		uint y = block_y * 8;
		uint x = block_x * 8 + local_x;
		for (uint yl = 0; yl < 8; ++yl) {
			imageStore(out_tex, ivec2(x, yl + y), vec4(temp[row_offset + yl * 8], 0.0, 0.0, 1.0));
		}

		pick_timer(start, local_timing[7]);

		// The next iteration's decode loop writes temp elements that other
		// invocations read in the vertical IDCT and store loop above, so the
		// barrier() keeps anyone from starting those writes early.
		// NOTE(review): whether the memoryBarrierShared() is also required
		// here is the original author's open question ("is this needed?").
		memoryBarrierShared();
		barrier();

		pick_timer(start, local_timing[8]);
		pick_timer(start, local_timing[9]);  // should be nearly nothing
	}

#if ENABLE_TIMING
	// Fold this invocation's timers into the global 64-bit counters,
	// propagating the carry from the low word into the high word.
	for (int timer_idx = 0; timer_idx < 10; ++timer_idx) {
		uint global_idx = thread_num * 10 + timer_idx;

		uint old_val = atomicAdd(timing[global_idx].x, local_timing[timer_idx].x);
		if (old_val + local_timing[timer_idx].x < old_val) {
			++local_timing[timer_idx].y;
		}
		atomicAdd(timing[global_idx].y, local_timing[timer_idx].y);
	}
#endif
}