#version 440
// NOTE(review): nothing in this shader appears to use the shader clock;
// presumably left over from profiling -- confirm before removing.
#extension GL_ARB_shader_clock : enable

// One workgroup handles one 8x8 block; each of the 8 invocations first
// transforms one column of the block and then one row.
layout(local_size_x = 8) in;

// Outputs. Each texel holds the coefficients of one row of one 8x8 block:
// the x coordinate is the block index, the y coordinate is the pixel row.
// The three 16-bit images each carry a 9-bit/7-bit pair of coefficients.
layout(r16ui) uniform restrict writeonly uimage2D dc_ac7_tex;   // coefficients 0 and 7
layout(r16ui) uniform restrict writeonly uimage2D ac1_ac6_tex;  // coefficients 1 and 6
layout(r16ui) uniform restrict writeonly uimage2D ac2_ac5_tex;  // coefficients 2 and 5
layout(r8i) uniform restrict writeonly iimage2D ac3_tex;        // coefficient 3
layout(r8i) uniform restrict writeonly iimage2D ac4_tex;        // coefficient 4

// Input: 8-bit unsigned samples.
layout(r8ui) uniform restrict readonly uimage2D image_tex;

// Scratch for the 8x8 block: filled after the vertical pass and read back
// transposed for the horizontal pass.
shared float temp[64];

// Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the
// first which is 1.
const float sf[8] = {
    1.0,
    0.7209598220069479,
    0.765366864730180,
    0.8504300947672564,
    1.0,
    1.2727585805728336,
    1.847759065022573,
    3.6245097854115502
};

// Per-coefficient quantization weights, row-major (index = row * 8 + column).
// NOTE(review): the values look like the MPEG-2 default intra matrix --
// confirm provenance.
const float W[64] = {
     8, 16, 19, 22, 26, 27, 29, 34,
    16, 16, 22, 24, 27, 29, 34, 37,
    19, 22, 26, 27, 29, 34, 34, 38,
    22, 22, 26, 27, 29, 34, 37, 40,
    22, 26, 27, 29, 32, 35, 40, 48,
    26, 27, 29, 32, 35, 40, 48, 58,
    26, 27, 29, 34, 38, 46, 56, 69,
    27, 29, 35, 38, 46, 56, 69, 83
};

// Common scale multiplied onto every AC weight in quant_matrix (evaluates
// to 2.0).
// NOTE(review): the original comment here was only "whatever?"; where the
// factors 4.0 and 0.5 come from is not documented.
const float S = 4.0 * 0.5;

// NOTE: Contains factors to counteract the scaling in the DCT implementation.
// Multiplier applied to each DCT output before rounding, row-major.
// Entry [v * 8 + u] is sf[u] * sf[v] / (W[v * 8 + u] * S); the first entry
// (DC) uses a fixed divisor of 64.0 instead of its W entry.
const float quant_matrix[64] = {
    // v = 0
    sf[0] * sf[0] / 64.0,         sf[1] * sf[0] / (W[ 1] * S),
    sf[2] * sf[0] / (W[ 2] * S),  sf[3] * sf[0] / (W[ 3] * S),
    sf[4] * sf[0] / (W[ 4] * S),  sf[5] * sf[0] / (W[ 5] * S),
    sf[6] * sf[0] / (W[ 6] * S),  sf[7] * sf[0] / (W[ 7] * S),

    // v = 1
    sf[0] * sf[1] / (W[ 8] * S),  sf[1] * sf[1] / (W[ 9] * S),
    sf[2] * sf[1] / (W[10] * S),  sf[3] * sf[1] / (W[11] * S),
    sf[4] * sf[1] / (W[12] * S),  sf[5] * sf[1] / (W[13] * S),
    sf[6] * sf[1] / (W[14] * S),  sf[7] * sf[1] / (W[15] * S),

    // v = 2
    sf[0] * sf[2] / (W[16] * S),  sf[1] * sf[2] / (W[17] * S),
    sf[2] * sf[2] / (W[18] * S),  sf[3] * sf[2] / (W[19] * S),
    sf[4] * sf[2] / (W[20] * S),  sf[5] * sf[2] / (W[21] * S),
    sf[6] * sf[2] / (W[22] * S),  sf[7] * sf[2] / (W[23] * S),

    // v = 3
    sf[0] * sf[3] / (W[24] * S),  sf[1] * sf[3] / (W[25] * S),
    sf[2] * sf[3] / (W[26] * S),  sf[3] * sf[3] / (W[27] * S),
    sf[4] * sf[3] / (W[28] * S),  sf[5] * sf[3] / (W[29] * S),
    sf[6] * sf[3] / (W[30] * S),  sf[7] * sf[3] / (W[31] * S),

    // v = 4
    sf[0] * sf[4] / (W[32] * S),  sf[1] * sf[4] / (W[33] * S),
    sf[2] * sf[4] / (W[34] * S),  sf[3] * sf[4] / (W[35] * S),
    sf[4] * sf[4] / (W[36] * S),  sf[5] * sf[4] / (W[37] * S),
    sf[6] * sf[4] / (W[38] * S),  sf[7] * sf[4] / (W[39] * S),

    // v = 5
    sf[0] * sf[5] / (W[40] * S),  sf[1] * sf[5] / (W[41] * S),
    sf[2] * sf[5] / (W[42] * S),  sf[3] * sf[5] / (W[43] * S),
    sf[4] * sf[5] / (W[44] * S),  sf[5] * sf[5] / (W[45] * S),
    sf[6] * sf[5] / (W[46] * S),  sf[7] * sf[5] / (W[47] * S),

    // v = 6
    sf[0] * sf[6] / (W[48] * S),  sf[1] * sf[6] / (W[49] * S),
    sf[2] * sf[6] / (W[50] * S),  sf[3] * sf[6] / (W[51] * S),
    sf[4] * sf[6] / (W[52] * S),  sf[5] * sf[6] / (W[53] * S),
    sf[6] * sf[6] / (W[54] * S),  sf[7] * sf[6] / (W[55] * S),

    // v = 7
    sf[0] * sf[7] / (W[56] * S),  sf[1] * sf[7] / (W[57] * S),
    sf[2] * sf[7] / (W[58] * S),  sf[3] * sf[7] / (W[59] * S),
    sf[4] * sf[7] / (W[60] * S),  sf[5] * sf[7] / (W[61] * S),
    sf[6] * sf[7] / (W[62] * S),  sf[7] * sf[7] / (W[63] * S)
};

// Clamp and pack a 9-bit and a 7-bit signed value into a 16-bit word.
// Result layout: bits 0..8 hold v9 after clamping to [-256, 255], bits 9..15
// hold v7 after clamping to [-64, 63]. Only the low 9 / 7 bits of each
// clamped value are kept, so negative values keep their sign bit in the field.
uint pack_9_7(int v9, int v7)
{
    return (uint(clamp(v9, -256, 255)) & 0x1ffu) |
           ((uint(clamp(v7, -64, 63)) & 0x7fu) << 9);
}

// Scaled 1D DCT (AA&N), computed in place on the eight arguments.
// y0 is correctly scaled, all other y_k are scaled by sqrt(2) cos(k * Pi / 16).
// The caller is expected to fold the inverse of that scaling into its own
// multipliers (quant_matrix does this via sf[]).
void dct_1d(inout float y0, inout float y1, inout float y2, inout float y3,
            inout float y4, inout float y5, inout float y6, inout float y7)
{
    const float a1 = 0.7071067811865474;  // sqrt(2) / 2
    const float a2 = 0.5411961001461971;  // cos(3/8 pi) * sqrt(2)
    const float a4 = 1.3065629648763766;  // cos(pi/8) * sqrt(2)
    const float a5 = 0.3826834323650897;  // 0.5 * (a4 - a2)

    // phase 1
    const float p1_0 = y0 + y7;
    const float p1_1 = y1 + y6;
    const float p1_2 = y2 + y5;
    const float p1_3 = y3 + y4;
    const float p1_4 = y3 - y4;
    const float p1_5 = y2 - y5;
    const float p1_6 = y1 - y6;
    const float p1_7 = y0 - y7;

    // phase 2
    const float p2_0 = p1_0 + p1_3;
    const float p2_1 = p1_1 + p1_2;
    const float p2_2 = p1_1 - p1_2;
    const float p2_3 = p1_0 - p1_3;
    const float p2_4 = p1_4 + p1_5;  // Inverted.
    const float p2_5 = p1_5 + p1_6;
    const float p2_6 = p1_6 + p1_7;

    // phase 3
    const float p3_0 = p2_0 + p2_1;
    const float p3_1 = p2_0 - p2_1;
    const float p3_2 = p2_2 + p2_3;

    // phase 4
    const float p4_2 = p3_2 * a1;
    const float p4_4 = p2_4 * a2 + (p2_4 - p2_6) * a5;
    const float p4_5 = p2_5 * a1;
    const float p4_6 = p2_6 * a4 + (p2_4 - p2_6) * a5;

    // phase 5
    const float p5_2 = p2_3 + p4_2;
    const float p5_3 = p2_3 - p4_2;
    const float p5_5 = p1_7 + p4_5;
    const float p5_7 = p1_7 - p4_5;

    // phase 6: write the results back in frequency order.
    y0 = p3_0;
    y4 = p3_1;
    y2 = p5_2;
    y6 = p5_3;
    y5 = p4_4 + p5_7;
    y1 = p5_5 + p4_6;
    y7 = p5_5 - p4_6;
    y3 = p5_7 - p4_4;
}

// Transforms, quantizes and stores one 8x8 block per workgroup.
// Invocation n first runs the vertical DCT on column n, then (after the
// exchange through shared memory) the horizontal DCT on row n, and finally
// writes that row's eight quantized coefficients to the output images.
void main()
{
    // Top-left pixel of this workgroup's block, and this invocation's index.
    uint x = 8 * gl_WorkGroupID.x;
    uint y = 8 * gl_WorkGroupID.y;
    uint n = gl_LocalInvocationID.x;

    // Load column.
    float y0 = imageLoad(image_tex, ivec2(x + n, y + 0)).x;
    float y1 = imageLoad(image_tex, ivec2(x + n, y + 1)).x;
    float y2 = imageLoad(image_tex, ivec2(x + n, y + 2)).x;
    float y3 = imageLoad(image_tex, ivec2(x + n, y + 3)).x;
    float y4 = imageLoad(image_tex, ivec2(x + n, y + 4)).x;
    float y5 = imageLoad(image_tex, ivec2(x + n, y + 5)).x;
    float y6 = imageLoad(image_tex, ivec2(x + n, y + 6)).x;
    float y7 = imageLoad(image_tex, ivec2(x + n, y + 7)).x;

    // Vertical DCT.
    dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);

    // Communicate with the other invocations in the group: element k of
    // column n goes to temp[k * 8 + n].
    temp[n + 0 * 8] = y0;
    temp[n + 1 * 8] = y1;
    temp[n + 2 * 8] = y2;
    temp[n + 3 * 8] = y3;
    temp[n + 4 * 8] = y4;
    temp[n + 5 * 8] = y5;
    temp[n + 6 * 8] = y6;
    temp[n + 7 * 8] = y7;

    // Every invocation must have finished writing before anyone reads.
    memoryBarrierShared();
    barrier();

    // Load row (so transpose, in a sense).
    y0 = temp[n * 8 + 0];
    y1 = temp[n * 8 + 1];
    y2 = temp[n * 8 + 2];
    y3 = temp[n * 8 + 3];
    y4 = temp[n * 8 + 4];
    y5 = temp[n * 8 + 5];
    y6 = temp[n * 8 + 6];
    y7 = temp[n * 8 + 7];

    // Horizontal DCT.
    dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);

    // Quantize. This invocation now holds row n, so coefficient k uses
    // quant_matrix[n * 8 + k].
    int c0 = int(round(y0 * quant_matrix[n * 8 + 0]));
    int c1 = int(round(y1 * quant_matrix[n * 8 + 1]));
    int c2 = int(round(y2 * quant_matrix[n * 8 + 2]));
    int c3 = int(round(y3 * quant_matrix[n * 8 + 3]));
    int c4 = int(round(y4 * quant_matrix[n * 8 + 4]));
    int c5 = int(round(y5 * quant_matrix[n * 8 + 5]));
    int c6 = int(round(y6 * quant_matrix[n * 8 + 6]));
    int c7 = int(round(y7 * quant_matrix[n * 8 + 7]));

    // Clamp, pack and store. The output x coordinate is the block index, not
    // a pixel position: one texel per block row per image.
    uint sx = gl_WorkGroupID.x;
    imageStore(dc_ac7_tex,  ivec2(sx, y + n), uvec4(pack_9_7(c0, c7), 0, 0, 0));
    imageStore(ac1_ac6_tex, ivec2(sx, y + n), uvec4(pack_9_7(c1, c6), 0, 0, 0));
    imageStore(ac2_ac5_tex, ivec2(sx, y + n), uvec4(pack_9_7(c2, c5), 0, 0, 0));

    // c3 and c4 go to signed 8-bit images. Saturate them explicitly, like the
    // packed coefficients, so the stored value does not depend on how the
    // implementation converts out-of-range integers on imageStore.
    imageStore(ac3_tex, ivec2(sx, y + n), ivec4(clamp(c3, -128, 127), 0, 0, 0));
    imageStore(ac4_tex, ivec2(sx, y + n), ivec4(clamp(c4, -128, 127), 0, 0, 0));
}