From: Steinar H. Gunderson Date: Tue, 10 Oct 2017 16:04:07 +0000 (+0200) Subject: Start trying to count the rANS distributions from the encoding shader. X-Git-Url: https://git.sesse.net/?p=narabu;a=commitdiff_plain;h=fb687c35b84f1376293338d1d88a7ed99eb80421 Start trying to count the rANS distributions from the encoding shader. --- diff --git a/coded.dat b/coded.dat index e94ba8c..469306f 100644 Binary files a/coded.dat and b/coded.dat differ diff --git a/encoder.shader b/encoder.shader index dc17453..da9e6c0 100644 --- a/encoder.shader +++ b/encoder.shader @@ -12,6 +12,27 @@ layout(r8ui) uniform restrict readonly uimage2D image_tex; shared float temp[64]; +// Symbol counts for the rANS distributions: dist[d][i] is incremented in main(), where d is a 2-bit distribution index and i is a clamped, bit-reversed coefficient magnitude. +layout(std430, binding = 9) buffer layoutName +{ + uint dist[4][256]; +}; + +// Packs eight 2-bit distribution indices into one uint, s0 in the lowest two bits. +#define MAPPING(s0, s1, s2, s3, s4, s5, s6, s7) ((s0) | ((s1) << 2) | ((s2) << 4) | ((s3) << 6) | ((s4) << 8) | ((s5) << 10) | ((s6) << 12) | ((s7) << 14)) + +// One entry per value of n; bits [2k, 2k+2) select the distribution that coefficient ck is counted in. +const uint luma_mapping[8] = { + MAPPING(0, 0, 1, 1, 2, 2, 3, 3), + MAPPING(0, 0, 1, 2, 2, 2, 3, 3), + MAPPING(1, 1, 2, 2, 2, 3, 3, 3), + MAPPING(1, 1, 2, 2, 2, 3, 3, 3), + MAPPING(1, 2, 2, 2, 2, 3, 3, 3), + MAPPING(2, 2, 2, 2, 3, 3, 3, 3), + MAPPING(2, 2, 3, 3, 3, 3, 3, 3), + MAPPING(3, 3, 3, 3, 3, 3, 3, 3), +}; + // Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1. const float sf[8] = { 1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564, @@ -166,5 +187,43 @@ void main() imageStore(ac2_ac5_tex, ivec2(sx, y + n), uvec4(pack_9_7(c2, c5), 0, 0, 0)); imageStore(ac3_tex, ivec2(sx, y + n), ivec4(c3, 0, 0, 0)); imageStore(ac4_tex, ivec2(sx, y + n), ivec4(c4, 0, 0, 0)); + + // Count frequencies, but only for every 8th block or so, randomly selected. + // Linearize the work group ID with the dispatch width (gl_NumWorkGroups.x); using the local size (gl_WorkGroupSize.x) would make different rows alias. + uint wg_index = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x; + if ((wg_index * 0x9E3779B9u) >> 29 == 0) { // Fibonacci hashing, essentially a PRNG in this context.
+ // Only magnitudes are counted, clamped to the 256 slots of a distribution. + c0 = min(abs(c0), 255); + c1 = min(abs(c1), 255); + c2 = min(abs(c2), 255); + c3 = min(abs(c3), 255); + c4 = min(abs(c4), 255); + c5 = min(abs(c5), 255); + c6 = min(abs(c6), 255); + c7 = min(abs(c7), 255); + + // Spread out the most popular elements among the cache lines by reversing the bits + // of the index, reducing false sharing. + // Reverse and shift as uint: reversal moves bit 0 into the sign bit, and a signed >> would sign-extend, giving a negative index for every odd value. + c0 = int(bitfieldReverse(uint(c0)) >> 24); + c1 = int(bitfieldReverse(uint(c1)) >> 24); + c2 = int(bitfieldReverse(uint(c2)) >> 24); + c3 = int(bitfieldReverse(uint(c3)) >> 24); + c4 = int(bitfieldReverse(uint(c4)) >> 24); + c5 = int(bitfieldReverse(uint(c5)) >> 24); + c6 = int(bitfieldReverse(uint(c6)) >> 24); + c7 = int(bitfieldReverse(uint(c7)) >> 24); + + // Two bits of m per coefficient pick the distribution whose counter is incremented. + uint m = luma_mapping[n]; + atomicAdd(dist[bitfieldExtract(m, 0, 2)][c0], 1); + atomicAdd(dist[bitfieldExtract(m, 2, 2)][c1], 1); + atomicAdd(dist[bitfieldExtract(m, 4, 2)][c2], 1); + atomicAdd(dist[bitfieldExtract(m, 6, 2)][c3], 1); + atomicAdd(dist[bitfieldExtract(m, 8, 2)][c4], 1); + atomicAdd(dist[bitfieldExtract(m, 10, 2)][c5], 1); + atomicAdd(dist[bitfieldExtract(m, 12, 2)][c6], 1); + atomicAdd(dist[bitfieldExtract(m, 14, 2)][c7], 1); + } } diff --git a/narabu-encoder.cpp b/narabu-encoder.cpp index 2ddd899..e8d20e0 100644 --- a/narabu-encoder.cpp +++ b/narabu-encoder.cpp @@ -334,6 +334,15 @@ int main(int argc, char **argv) glUseProgram(glsl_program_num); + // An SSBO for the rANS distributions; sized to match "uint dist[4][256]" at binding 9 in encoder.shader. + GLuint ssbo; + glGenBuffers(1, &ssbo); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo); + glBufferData(GL_SHADER_STORAGE_BUFFER, 4 * 256 * sizeof(uint32_t), nullptr, GL_DYNAMIC_COPY); + // glBufferData() with null data leaves the contents undefined, and the shader only ever adds to the counters, so zero them first. + glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, nullptr); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 9, ssbo); + // Upload luma.
GLuint y_tex; glGenTextures(1, &y_tex); @@ -392,7 +401,7 @@ int main(int argc, char **argv) check_error(); steady_clock::time_point start = steady_clock::now(); - unsigned num_iterations = 1000; + unsigned num_iterations = 100; for (unsigned i = 0; i < num_iterations; ++i) { glDispatchCompute(WIDTH_BLOCKS, HEIGHT_BLOCKS, 1); } @@ -575,4 +584,20 @@ int main(int argc, char **argv) printf("\n"); printf("Each iteration took %.3f ms (but note that is DCT only, no rANS).\n", 1e3 * duration(now - start).count() / num_iterations); +#if 1 + // Debug dump of the counters accumulated over all iterations, printed as a per-iteration average. + // Shader writes must be made visible before the buffer is mapped for reading. + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo); + const uint32_t *dist = (const uint32_t *)glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY); + if (dist == nullptr) { + printf("Could not map the rANS distribution buffer.\n"); + } else { + // NOTE: the second number printed is the slot index, i.e. the bit-reversed 8-bit magnitude (see encoder.shader), not the magnitude itself. + for (int i = 0; i < 4 * 256; ++i) { + printf("%d,%d: %u\n", i / 256, i % 256, dist[i] / num_iterations); + } + glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); + } +#endif }