2 #extension GL_ARB_shader_clock : enable
4 layout(local_size_x = 8) in;
6 layout(r16ui) uniform restrict writeonly uimage2D dc_ac7_tex;
7 layout(r16ui) uniform restrict writeonly uimage2D ac1_ac6_tex;
8 layout(r16ui) uniform restrict writeonly uimage2D ac2_ac5_tex;
9 layout(r8i) uniform restrict writeonly iimage2D ac3_tex;
10 layout(r8i) uniform restrict writeonly iimage2D ac4_tex;
11 layout(r8ui) uniform restrict readonly uimage2D image_tex;
13 shared float temp[64];
15 // Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1.
17 1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564,
18 1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502
22 8, 16, 19, 22, 26, 27, 29, 34,
23 16, 16, 22, 24, 27, 29, 34, 37,
24 19, 22, 26, 27, 29, 34, 34, 38,
25 22, 22, 26, 27, 29, 34, 37, 40,
26 22, 26, 27, 29, 32, 35, 40, 48,
27 26, 27, 29, 32, 35, 40, 48, 58,
28 26, 27, 29, 34, 38, 46, 56, 69,
29 27, 29, 35, 38, 46, 56, 69, 83
31 const float S = 4.0 * 0.5; // whatever?
33 // NOTE: Contains factors to counteract the scaling in the DCT implementation.
34 const float quant_matrix[64] = {
35 sf[0] * sf[0] / 64.0, sf[1] * sf[0] / (W[ 1] * S), sf[2] * sf[0] / (W[ 2] * S), sf[3] * sf[0] / (W[ 3] * S), sf[4] * sf[0] / (W[ 4] * S), sf[5] * sf[0] / (W[ 5] * S), sf[6] * sf[0] / (W[ 6] * S), sf[7] * sf[0] / (W[ 7] * S),
36 sf[0] * sf[1] / (W[ 8] * S), sf[1] * sf[1] / (W[ 9] * S), sf[2] * sf[1] / (W[10] * S), sf[3] * sf[1] / (W[11] * S), sf[4] * sf[1] / (W[12] * S), sf[5] * sf[1] / (W[13] * S), sf[6] * sf[1] / (W[14] * S), sf[7] * sf[1] / (W[15] * S),
37 sf[0] * sf[2] / (W[16] * S), sf[1] * sf[2] / (W[17] * S), sf[2] * sf[2] / (W[18] * S), sf[3] * sf[2] / (W[19] * S), sf[4] * sf[2] / (W[20] * S), sf[5] * sf[2] / (W[21] * S), sf[6] * sf[2] / (W[22] * S), sf[7] * sf[2] / (W[23] * S),
38 sf[0] * sf[3] / (W[24] * S), sf[1] * sf[3] / (W[25] * S), sf[2] * sf[3] / (W[26] * S), sf[3] * sf[3] / (W[27] * S), sf[4] * sf[3] / (W[28] * S), sf[5] * sf[3] / (W[29] * S), sf[6] * sf[3] / (W[30] * S), sf[7] * sf[3] / (W[31] * S),
39 sf[0] * sf[4] / (W[32] * S), sf[1] * sf[4] / (W[33] * S), sf[2] * sf[4] / (W[34] * S), sf[3] * sf[4] / (W[35] * S), sf[4] * sf[4] / (W[36] * S), sf[5] * sf[4] / (W[37] * S), sf[6] * sf[4] / (W[38] * S), sf[7] * sf[4] / (W[39] * S),
40 sf[0] * sf[5] / (W[40] * S), sf[1] * sf[5] / (W[41] * S), sf[2] * sf[5] / (W[42] * S), sf[3] * sf[5] / (W[43] * S), sf[4] * sf[5] / (W[44] * S), sf[5] * sf[5] / (W[45] * S), sf[6] * sf[5] / (W[46] * S), sf[7] * sf[5] / (W[47] * S),
41 sf[0] * sf[6] / (W[48] * S), sf[1] * sf[6] / (W[49] * S), sf[2] * sf[6] / (W[50] * S), sf[3] * sf[6] / (W[51] * S), sf[4] * sf[6] / (W[52] * S), sf[5] * sf[6] / (W[53] * S), sf[6] * sf[6] / (W[54] * S), sf[7] * sf[6] / (W[55] * S),
42 sf[0] * sf[7] / (W[56] * S), sf[1] * sf[7] / (W[57] * S), sf[2] * sf[7] / (W[58] * S), sf[3] * sf[7] / (W[59] * S), sf[4] * sf[7] / (W[60] * S), sf[5] * sf[7] / (W[61] * S), sf[6] * sf[7] / (W[62] * S), sf[7] * sf[7] / (W[63] * S)
45 // Clamp and pack a 9-bit and a 7-bit signed value into a 16-bit word.
46 uint pack_9_7(int v9, int v7)
48 return (uint(clamp(v9, -256, 255)) & 0x1ffu) | ((uint(clamp(v7, -64, 63)) & 0x7fu) << 9);
51 // Scaled 1D DCT (AA&N). y0 is correctly scaled, all other y_k are scaled by sqrt(2) cos(k * Pi / 16).
52 void dct_1d(inout float y0, inout float y1, inout float y2, inout float y3, inout float y4, inout float y5, inout float y6, inout float y7)
54 const float a1 = 0.7071067811865474; // sqrt(2)
55 const float a2 = 0.5411961001461971; // cos(3/8 pi) * sqrt(2)
56 const float a4 = 1.3065629648763766; // cos(pi/8) * sqrt(2)
57 // static const float a5 = 0.5 * (a4 - a2);
58 const float a5 = 0.3826834323650897;
61 const float p1_0 = y0 + y7;
62 const float p1_1 = y1 + y6;
63 const float p1_2 = y2 + y5;
64 const float p1_3 = y3 + y4;
65 const float p1_4 = y3 - y4;
66 const float p1_5 = y2 - y5;
67 const float p1_6 = y1 - y6;
68 const float p1_7 = y0 - y7;
71 const float p2_0 = p1_0 + p1_3;
72 const float p2_1 = p1_1 + p1_2;
73 const float p2_2 = p1_1 - p1_2;
74 const float p2_3 = p1_0 - p1_3;
75 const float p2_4 = p1_4 + p1_5; // Inverted.
76 const float p2_5 = p1_5 + p1_6;
77 const float p2_6 = p1_6 + p1_7;
80 const float p3_0 = p2_0 + p2_1;
81 const float p3_1 = p2_0 - p2_1;
82 const float p3_2 = p2_2 + p2_3;
85 const float p4_2 = p3_2 * a1;
86 const float p4_4 = p2_4 * a2 + (p2_4 - p2_6) * a5;
87 const float p4_5 = p2_5 * a1;
88 const float p4_6 = p2_6 * a4 + (p2_4 - p2_6) * a5;
91 const float p5_2 = p2_3 + p4_2;
92 const float p5_3 = p2_3 - p4_2;
93 const float p5_5 = p1_7 + p4_5;
94 const float p5_7 = p1_7 - p4_5;
109 uint x = 8 * gl_WorkGroupID.x;
110 uint y = 8 * gl_WorkGroupID.y;
111 uint n = gl_LocalInvocationID.x;
114 float y0 = imageLoad(image_tex, ivec2(x + n, y + 0)).x;
115 float y1 = imageLoad(image_tex, ivec2(x + n, y + 1)).x;
116 float y2 = imageLoad(image_tex, ivec2(x + n, y + 2)).x;
117 float y3 = imageLoad(image_tex, ivec2(x + n, y + 3)).x;
118 float y4 = imageLoad(image_tex, ivec2(x + n, y + 4)).x;
119 float y5 = imageLoad(image_tex, ivec2(x + n, y + 5)).x;
120 float y6 = imageLoad(image_tex, ivec2(x + n, y + 6)).x;
121 float y7 = imageLoad(image_tex, ivec2(x + n, y + 7)).x;
124 dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);
126 // Communicate with the other shaders in the group.
127 temp[n + 0 * 8] = y0;
128 temp[n + 1 * 8] = y1;
129 temp[n + 2 * 8] = y2;
130 temp[n + 3 * 8] = y3;
131 temp[n + 4 * 8] = y4;
132 temp[n + 5 * 8] = y5;
133 temp[n + 6 * 8] = y6;
134 temp[n + 7 * 8] = y7;
136 memoryBarrierShared();
139 // Load row (so transpose, in a sense).
140 y0 = temp[n * 8 + 0];
141 y1 = temp[n * 8 + 1];
142 y2 = temp[n * 8 + 2];
143 y3 = temp[n * 8 + 3];
144 y4 = temp[n * 8 + 4];
145 y5 = temp[n * 8 + 5];
146 y6 = temp[n * 8 + 6];
147 y7 = temp[n * 8 + 7];
150 dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);
153 int c0 = int(round(y0 * quant_matrix[n * 8 + 0]));
154 int c1 = int(round(y1 * quant_matrix[n * 8 + 1]));
155 int c2 = int(round(y2 * quant_matrix[n * 8 + 2]));
156 int c3 = int(round(y3 * quant_matrix[n * 8 + 3]));
157 int c4 = int(round(y4 * quant_matrix[n * 8 + 4]));
158 int c5 = int(round(y5 * quant_matrix[n * 8 + 5]));
159 int c6 = int(round(y6 * quant_matrix[n * 8 + 6]));
160 int c7 = int(round(y7 * quant_matrix[n * 8 + 7]));
162 // Clamp, pack and store.
163 uint sx = gl_WorkGroupID.x;
164 imageStore(dc_ac7_tex, ivec2(sx, y + n), uvec4(pack_9_7(c0, c7), 0, 0, 0));
165 imageStore(ac1_ac6_tex, ivec2(sx, y + n), uvec4(pack_9_7(c1, c6), 0, 0, 0));
166 imageStore(ac2_ac5_tex, ivec2(sx, y + n), uvec4(pack_9_7(c2, c5), 0, 0, 0));
167 imageStore(ac3_tex, ivec2(sx, y + n), ivec4(c3, 0, 0, 0));
168 imageStore(ac4_tex, ivec2(sx, y + n), ivec4(c4, 0, 0, 0));