+ // Count frequencies into four histograms. We do this to local memory first,
+ // because this is _much_ faster; then we do global atomic adds for the nonzero
+ // members.
+
+ // First take the absolute value (signs are encoded differently) and clamp,
+ // as any value over 255 is going to be encoded as an escape.
+ c0 = min(abs(c0), 255);
+ c1 = min(abs(c1), 255);
+ c2 = min(abs(c2), 255);
+ c3 = min(abs(c3), 255);
+ c4 = min(abs(c4), 255);
+ c5 = min(abs(c5), 255);
+ c6 = min(abs(c6), 255);
+ c7 = min(abs(c7), 255);
+
+ // Add up in local memory.
+ uint m = luma_mapping[n];
+ atomicAdd(temp[bitfieldExtract(m, 0, 2) * 256 + c0], 1);
+ atomicAdd(temp[bitfieldExtract(m, 2, 2) * 256 + c1], 1);
+ atomicAdd(temp[bitfieldExtract(m, 4, 2) * 256 + c2], 1);
+ atomicAdd(temp[bitfieldExtract(m, 6, 2) * 256 + c3], 1);
+ atomicAdd(temp[bitfieldExtract(m, 8, 2) * 256 + c4], 1);
+ atomicAdd(temp[bitfieldExtract(m, 10, 2) * 256 + c5], 1);
+ atomicAdd(temp[bitfieldExtract(m, 12, 2) * 256 + c6], 1);
+ atomicAdd(temp[bitfieldExtract(m, 14, 2) * 256 + c7], 1);
+
+ memoryBarrierShared();
+ barrier();
+
+ // Add from local memory to global memory.
+ if (temp[base_idx + 0] != 0) atomicAdd(dist[base_idx + 0], temp[base_idx + 0]);
+ if (temp[base_idx + 1] != 0) atomicAdd(dist[base_idx + 1], temp[base_idx + 1]);
+ if (temp[base_idx + 2] != 0) atomicAdd(dist[base_idx + 2], temp[base_idx + 2]);
+ if (temp[base_idx + 3] != 0) atomicAdd(dist[base_idx + 3], temp[base_idx + 3]);
+ if (temp[base_idx + 4] != 0) atomicAdd(dist[base_idx + 4], temp[base_idx + 4]);
+ if (temp[base_idx + 5] != 0) atomicAdd(dist[base_idx + 5], temp[base_idx + 5]);
+ if (temp[base_idx + 6] != 0) atomicAdd(dist[base_idx + 6], temp[base_idx + 6]);
+ if (temp[base_idx + 7] != 0) atomicAdd(dist[base_idx + 7], temp[base_idx + 7]);
+}