+
+ // Count coefficient frequencies, but only for roughly every 8th workgroup,
+ // pseudo-randomly selected.
+ //
+ // Flatten the 2D workgroup ID into a single index. The row stride must be the
+ // number of workgroups per row (gl_NumWorkGroups.x), not the number of
+ // invocations per workgroup (gl_WorkGroupSize.x); otherwise distinct workgroups
+ // alias to the same index and the selection falls along diagonal lines.
+ uint wg_index = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x;
+
+ // Fibonacci hashing, essentially a PRNG in this context. Only workgroups whose
+ // hash has its top three bits clear (one in eight) take part in the count.
+ if (((wg_index * 0x9E3779B9u) >> 29) == 0u) {
+     // Fold each coefficient to its magnitude and saturate it at 255.
+     c0 = min(abs(c0), 255);
+     c1 = min(abs(c1), 255);
+     c2 = min(abs(c2), 255);
+     c3 = min(abs(c3), 255);
+     c4 = min(abs(c4), 255);
+     c5 = min(abs(c5), 255);
+     c6 = min(abs(c6), 255);
+     c7 = min(abs(c7), 255);
+
+     // Spread out the most popular elements among the cache lines by reversing the bits
+     // of the index, reducing false sharing.
+     //
+     // The reversal and shift are done on uint: after bitfieldReverse() every odd
+     // value has bit 31 set, and >> on a signed int sign-extends, which would
+     // produce a negative index (e.g. 1 -> -128) instead of a value in 0..255.
+     c0 = int(bitfieldReverse(uint(c0)) >> 24);
+     c1 = int(bitfieldReverse(uint(c1)) >> 24);
+     c2 = int(bitfieldReverse(uint(c2)) >> 24);
+     c3 = int(bitfieldReverse(uint(c3)) >> 24);
+     c4 = int(bitfieldReverse(uint(c4)) >> 24);
+     c5 = int(bitfieldReverse(uint(c5)) >> 24);
+     c6 = int(bitfieldReverse(uint(c6)) >> 24);
+     c7 = int(bitfieldReverse(uint(c7)) >> 24);
+
+     // luma_mapping[n] packs eight 2-bit fields, one per coefficient (bits 0..15);
+     // each field selects the first index of dist[][] for that coefficient.
+     uint m = luma_mapping[n];
+     atomicAdd(dist[bitfieldExtract(m, 0, 2)][c0], 1);
+     atomicAdd(dist[bitfieldExtract(m, 2, 2)][c1], 1);
+     atomicAdd(dist[bitfieldExtract(m, 4, 2)][c2], 1);
+     atomicAdd(dist[bitfieldExtract(m, 6, 2)][c3], 1);
+     atomicAdd(dist[bitfieldExtract(m, 8, 2)][c4], 1);
+     atomicAdd(dist[bitfieldExtract(m, 10, 2)][c5], 1);
+     atomicAdd(dist[bitfieldExtract(m, 12, 2)][c6], 1);
+     atomicAdd(dist[bitfieldExtract(m, 14, 2)][c7], 1);
+ }