2 #extension GL_ARB_shader_clock : enable
2 // Do sixteen 8x8 blocks in a local group, because that matches up perfectly
4 // with needing 1024 coefficients for our four histograms (of 256 bins each).
// NUM_Z is defined outside this excerpt; the comment above implies
// NUM_Z == 16 (8 x-invocations * 16 z-slices * 8 rows = 1024) -- TODO confirm.
8 layout(local_size_x = 8, local_size_z = NUM_Z) in;
// Outputs: quantized DCT coefficients.  Where two coefficients share a texel,
// they are packed as a 9-bit and a 7-bit signed value (see pack_9_7() below).
10 layout(r16ui) uniform restrict writeonly uimage2D dc_ac7_tex;
11 layout(r16ui) uniform restrict writeonly uimage2D ac1_ac6_tex;
12 layout(r16ui) uniform restrict writeonly uimage2D ac2_ac5_tex;
13 layout(r8i) uniform restrict writeonly iimage2D ac3_tex;  // AC3 stored alone (signed 8-bit).
14 layout(r8i) uniform restrict writeonly iimage2D ac4_tex;  // AC4 stored alone (signed 8-bit).
15 layout(r8ui) uniform restrict readonly uimage2D image_tex;  // 8-bit single-channel source image.
// Shared scratch: first holds one 8x8 coefficient block per z-slice (for the
// transpose between the two 1D DCT passes), then is reused as local
// histogram bins (4 histograms x 256 bins when NUM_Z == 16).
17 shared uint temp[64 * NUM_Z];
// SSBO with the global histogram array dist[] used at the bottom of main()
// (member list follows on lines not shown here).
19 layout(std430, binding = 9) buffer layoutName
// Pack eight 2-bit histogram selectors (values 0..3) into one 16-bit word,
// slot k occupying bits [2k, 2k+1].  Each argument is parenthesized before
// shifting so that expression arguments (e.g. a | b) expand correctly.
#define MAPPING(s0, s1, s2, s3, s4, s5, s6, s7) ((s0) | ((s1) << 2) | ((s2) << 4) | ((s3) << 6) | ((s4) << 8) | ((s5) << 10) | ((s6) << 12) | ((s7) << 14))
// Histogram selectors for the 8x8 luma coefficient block: entry k packs,
// via MAPPING(), one 2-bit histogram index (0..3) per column of row k.
// Low-frequency positions (top-left) go to low-numbered histograms.
26 const uint luma_mapping[8] = {
27 MAPPING(0, 0, 1, 1, 2, 2, 3, 3),
28 MAPPING(0, 0, 1, 2, 2, 2, 3, 3),
29 MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
30 MAPPING(1, 1, 2, 2, 2, 3, 3, 3),
31 MAPPING(1, 2, 2, 2, 2, 3, 3, 3),
32 MAPPING(2, 2, 2, 2, 3, 3, 3, 3),
33 MAPPING(2, 2, 3, 3, 3, 3, 3, 3),
34 MAPPING(3, 3, 3, 3, 3, 3, 3, 3),
37 // Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1.
// (The fifth entry is also exactly 1.0, since sqrt(2.0) * cos(4 * M_PI / 16.0) == 1.)
// Used by QM() below to fold the DCT's internal scaling into quantization.
39 1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564,
40 1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502
// Quantization weight table, indexed as W[y*8 + x] by QM() below; weights
// grow toward the high-frequency (bottom-right) corner, i.e. higher
// frequencies are quantized more coarsely.
44 8, 16, 19, 22, 26, 27, 29, 34,
45 16, 16, 22, 24, 27, 29, 34, 37,
46 19, 22, 26, 27, 29, 34, 34, 38,
47 22, 22, 26, 27, 29, 34, 37, 40,
48 22, 26, 27, 29, 32, 35, 40, 48,
49 26, 27, 29, 32, 35, 40, 48, 58,
50 26, 27, 29, 34, 38, 46, 56, 69,
51 27, 29, 35, 38, 46, 56, 69, 83
// Overall quantizer scale: every quant_matrix entry is divided by S in QM(),
// so a larger S yields smaller multipliers and hence coarser quantization.
53 const float S = 4.0 * 0.5; // = 2.0; tuning constant.
// NOTE: Contains factors to counteract the scaling in the DCT implementation.
// Hygiene fix: arguments are parenthesized inside the expansion so that
// expression arguments (e.g. QM(i + 1, j)) index correctly; x and y are
// still evaluated more than once, so avoid side effects in arguments.
#define QM(x, y) (sf[(x)] * sf[(y)] / (W[(y) * 8 + (x)] * S))
// Per-coefficient multipliers applied to the DCT output before rounding
// (i.e. the reciprocal of the quantization step), indexed [row*8 + col].
// The DC entry is hard-coded to 1.0/64.0 rather than computed via QM(0, 0).
57 const float quant_matrix[64] = {
58 1.0 / 64.0, QM(1, 0), QM(2, 0), QM(3, 0), QM(4, 0), QM(5, 0), QM(6, 0), QM(7, 0),
59 QM(0, 1), QM(1, 1), QM(2, 1), QM(3, 1), QM(4, 1), QM(5, 1), QM(6, 1), QM(7, 1),
60 QM(0, 2), QM(1, 2), QM(2, 2), QM(3, 2), QM(4, 2), QM(5, 2), QM(6, 2), QM(7, 2),
61 QM(0, 3), QM(1, 3), QM(2, 3), QM(3, 3), QM(4, 3), QM(5, 3), QM(6, 3), QM(7, 3),
62 QM(0, 4), QM(1, 4), QM(2, 4), QM(3, 4), QM(4, 4), QM(5, 4), QM(6, 4), QM(7, 4),
63 QM(0, 5), QM(1, 5), QM(2, 5), QM(3, 5), QM(4, 5), QM(5, 5), QM(6, 5), QM(7, 5),
64 QM(0, 6), QM(1, 6), QM(2, 6), QM(3, 6), QM(4, 6), QM(5, 6), QM(6, 6), QM(7, 6),
65 QM(0, 7), QM(1, 7), QM(2, 7), QM(3, 7), QM(4, 7), QM(5, 7), QM(6, 7), QM(7, 7)
68 // Clamp and pack a 9-bit and a 7-bit signed value into a 16-bit word.
// v9 is clamped to [-256, 255] and occupies bits 0..8; v7 is clamped to
// [-64, 63] and occupies bits 9..15.  The int -> uint conversion wraps
// negative values, which the bit masks then truncate to two's complement.
69 uint pack_9_7(int v9, int v7)
71 return (uint(clamp(v9, -256, 255)) & 0x1ffu) | ((uint(clamp(v7, -64, 63)) & 0x7fu) << 9);
74 // Scaled 1D DCT (AA&N). y0 is correctly scaled, all other y_k are scaled by sqrt(2) cos(k * Pi / 16).
// In-place 8-point forward DCT on the eight inout values; the residual
// scale factors are undone later by the sf[]/QM() quantization tables.
75 void dct_1d(inout float y0, inout float y1, inout float y2, inout float y3, inout float y4, inout float y5, inout float y6, inout float y7)
77 const float a1 = 0.7071067811865474; // 1/sqrt(2) (the original comment said sqrt(2), but the value is its reciprocal)
78 const float a2 = 0.5411961001461971; // cos(3/8 pi) * sqrt(2)
79 const float a4 = 1.3065629648763766; // cos(pi/8) * sqrt(2)
80 // static const float a5 = 0.5 * (a4 - a2);
81 const float a5 = 0.3826834323650897;
// Stage 1: butterfly of mirrored input pairs (y_k with y_{7-k}).
84 const float p1_0 = y0 + y7;
85 const float p1_1 = y1 + y6;
86 const float p1_2 = y2 + y5;
87 const float p1_3 = y3 + y4;
88 const float p1_4 = y3 - y4;
89 const float p1_5 = y2 - y5;
90 const float p1_6 = y1 - y6;
91 const float p1_7 = y0 - y7;
// Stage 2: second butterfly; p2_0..p2_3 form the even half, p2_4..p2_6 the odd half.
94 const float p2_0 = p1_0 + p1_3;
95 const float p2_1 = p1_1 + p1_2;
96 const float p2_2 = p1_1 - p1_2;
97 const float p2_3 = p1_0 - p1_3;
98 const float p2_4 = p1_4 + p1_5; // Inverted.
99 const float p2_5 = p1_5 + p1_6;
100 const float p2_6 = p1_6 + p1_7;
// Stage 3.
103 const float p3_0 = p2_0 + p2_1;
104 const float p3_1 = p2_0 - p2_1;
105 const float p3_2 = p2_2 + p2_3;
// Stage 4: the only multiplications (rotations by the a* constants above).
108 const float p4_2 = p3_2 * a1;
109 const float p4_4 = p2_4 * a2 + (p2_4 - p2_6) * a5;
110 const float p4_5 = p2_5 * a1;
111 const float p4_6 = p2_6 * a4 + (p2_4 - p2_6) * a5;
// Stage 5: final additions (the output assignments follow below).
114 const float p5_2 = p2_3 + p4_2;
115 const float p5_3 = p2_3 - p4_2;
116 const float p5_5 = p1_7 + p4_5;
117 const float p5_7 = p1_7 - p4_5;
132 uint sx = gl_WorkGroupID.x * NUM_Z + gl_LocalInvocationID.z;  // Output column: one per 8x8 block.
134 uint y = 8 * gl_WorkGroupID.y;  // Top row of this 8x8 block.
135 uint n = gl_LocalInvocationID.x;  // Position within the block, 0..7 (column in pass 1, row in pass 2).
136 uint z = gl_LocalInvocationID.z;  // Which of the NUM_Z blocks handled by this workgroup.
// Load one 8-pixel column of the block and run the vertical DCT pass on it.
// NOTE(review): `x` is declared on a line elided from this excerpt,
// presumably the pixel x-coordinate derived from sx -- verify.
139 float y0 = imageLoad(image_tex, ivec2(x + n, y + 0)).x;
140 float y1 = imageLoad(image_tex, ivec2(x + n, y + 1)).x;
141 float y2 = imageLoad(image_tex, ivec2(x + n, y + 2)).x;
142 float y3 = imageLoad(image_tex, ivec2(x + n, y + 3)).x;
143 float y4 = imageLoad(image_tex, ivec2(x + n, y + 4)).x;
144 float y5 = imageLoad(image_tex, ivec2(x + n, y + 5)).x;
145 float y6 = imageLoad(image_tex, ivec2(x + n, y + 6)).x;
146 float y7 = imageLoad(image_tex, ivec2(x + n, y + 7)).x;
149 dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);
151 // Communicate with the other shaders in the group.
152 uint base_idx = 64 * z;  // Each z-slice owns a 64-word slab of temp.
153 temp[base_idx + 0 * 8 + n] = floatBitsToUint(y0);
154 temp[base_idx + 1 * 8 + n] = floatBitsToUint(y1);
155 temp[base_idx + 2 * 8 + n] = floatBitsToUint(y2);
156 temp[base_idx + 3 * 8 + n] = floatBitsToUint(y3);
157 temp[base_idx + 4 * 8 + n] = floatBitsToUint(y4);
158 temp[base_idx + 5 * 8 + n] = floatBitsToUint(y5);
159 temp[base_idx + 6 * 8 + n] = floatBitsToUint(y6);
160 temp[base_idx + 7 * 8 + n] = floatBitsToUint(y7);
162 memoryBarrierShared();
// NOTE(review): memoryBarrierShared() orders shared-memory accesses but does
// not synchronize execution; the matching barrier() is presumably on an
// elided adjacent line -- verify (same applies at the two barriers below).
165 // Load row (so transpose, in a sense).
166 y0 = uintBitsToFloat(temp[base_idx + n * 8 + 0]);
167 y1 = uintBitsToFloat(temp[base_idx + n * 8 + 1]);
168 y2 = uintBitsToFloat(temp[base_idx + n * 8 + 2]);
169 y3 = uintBitsToFloat(temp[base_idx + n * 8 + 3]);
170 y4 = uintBitsToFloat(temp[base_idx + n * 8 + 4]);
171 y5 = uintBitsToFloat(temp[base_idx + n * 8 + 5]);
172 y6 = uintBitsToFloat(temp[base_idx + n * 8 + 6]);
173 y7 = uintBitsToFloat(temp[base_idx + n * 8 + 7]);
// Horizontal DCT pass, completing the 2D transform.
176 dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);
// Quantize row n: multiply by the per-coefficient reciprocal step and round.
179 int c0 = int(round(y0 * quant_matrix[n * 8 + 0]));
180 int c1 = int(round(y1 * quant_matrix[n * 8 + 1]));
181 int c2 = int(round(y2 * quant_matrix[n * 8 + 2]));
182 int c3 = int(round(y3 * quant_matrix[n * 8 + 3]));
183 int c4 = int(round(y4 * quant_matrix[n * 8 + 4]));
184 int c5 = int(round(y5 * quant_matrix[n * 8 + 5]));
185 int c6 = int(round(y6 * quant_matrix[n * 8 + 6]));
186 int c7 = int(round(y7 * quant_matrix[n * 8 + 7]));
188 // Clamp, pack and store.
189 imageStore(dc_ac7_tex, ivec2(sx, y + n), uvec4(pack_9_7(c0, c7), 0, 0, 0));
190 imageStore(ac1_ac6_tex, ivec2(sx, y + n), uvec4(pack_9_7(c1, c6), 0, 0, 0));
191 imageStore(ac2_ac5_tex, ivec2(sx, y + n), uvec4(pack_9_7(c2, c5), 0, 0, 0));
192 imageStore(ac3_tex, ivec2(sx, y + n), ivec4(c3, 0, 0, 0));
193 imageStore(ac4_tex, ivec2(sx, y + n), ivec4(c4, 0, 0, 0));
195 // Zero out the temporary area in preparation for counting up the histograms.
// NOTE(review): for the invocations to jointly clear the full 64 * NUM_Z
// words of temp, base_idx is presumably advanced by n * 8 on an elided line
// just above (otherwise all 8 x-invocations of a slice would clear the same
// 8 words) -- verify.
197 temp[base_idx + 0] = 0;
198 temp[base_idx + 1] = 0;
199 temp[base_idx + 2] = 0;
200 temp[base_idx + 3] = 0;
201 temp[base_idx + 4] = 0;
202 temp[base_idx + 5] = 0;
203 temp[base_idx + 6] = 0;
204 temp[base_idx + 7] = 0;
206 memoryBarrierShared();
209 // Count frequencies into four histograms. We do this to local memory first,
210 // because this is _much_ faster; then we do global atomic adds for the nonzero
213 // First take the absolute value (signs are encoded differently) and clamp,
214 // as any value over 255 is going to be encoded as an escape.
215 c0 = min(abs(c0), 255);
216 c1 = min(abs(c1), 255);
217 c2 = min(abs(c2), 255);
218 c3 = min(abs(c3), 255);
219 c4 = min(abs(c4), 255);
220 c5 = min(abs(c5), 255);
221 c6 = min(abs(c6), 255);
222 c7 = min(abs(c7), 255);
224 // Add up in local memory.
225 uint m = luma_mapping[n];  // Packed 2-bit histogram selectors for row n.
226 atomicAdd(temp[bitfieldExtract(m, 0, 2) * 256 + c0], 1);
227 atomicAdd(temp[bitfieldExtract(m, 2, 2) * 256 + c1], 1);
228 atomicAdd(temp[bitfieldExtract(m, 4, 2) * 256 + c2], 1);
229 atomicAdd(temp[bitfieldExtract(m, 6, 2) * 256 + c3], 1);
230 atomicAdd(temp[bitfieldExtract(m, 8, 2) * 256 + c4], 1);
231 atomicAdd(temp[bitfieldExtract(m, 10, 2) * 256 + c5], 1);
232 atomicAdd(temp[bitfieldExtract(m, 12, 2) * 256 + c6], 1);
233 atomicAdd(temp[bitfieldExtract(m, 14, 2) * 256 + c7], 1);
235 memoryBarrierShared();
238 // Add from local memory to global memory.
// Only nonzero bins are flushed, to cut down on global atomic traffic.
// dist[] lives in the std430 SSBO declared near the top of the file.
239 if (temp[base_idx + 0] != 0) atomicAdd(dist[base_idx + 0], temp[base_idx + 0]);
240 if (temp[base_idx + 1] != 0) atomicAdd(dist[base_idx + 1], temp[base_idx + 1]);
241 if (temp[base_idx + 2] != 0) atomicAdd(dist[base_idx + 2], temp[base_idx + 2]);
242 if (temp[base_idx + 3] != 0) atomicAdd(dist[base_idx + 3], temp[base_idx + 3]);
243 if (temp[base_idx + 4] != 0) atomicAdd(dist[base_idx + 4], temp[base_idx + 4]);
244 if (temp[base_idx + 5] != 0) atomicAdd(dist[base_idx + 5], temp[base_idx + 5]);
245 if (temp[base_idx + 6] != 0) atomicAdd(dist[base_idx + 6], temp[base_idx + 6]);
246 if (temp[base_idx + 7] != 0) atomicAdd(dist[base_idx + 7], temp[base_idx + 7]);