git.sesse.net Git - narabu/blob - encoder.shader

   1 #version 440
   2 #extension GL_ARB_shader_clock : enable
   3
   4 layout(local_size_x = 8) in;
   5
   6 layout(r16ui) uniform restrict writeonly uimage2D dc_ac7_tex;
   7 layout(r16ui) uniform restrict writeonly uimage2D ac1_ac6_tex;
   8 layout(r16ui) uniform restrict writeonly uimage2D ac2_ac5_tex;
   9 layout(r8i) uniform restrict writeonly iimage2D ac3_tex;
  10 layout(r8i) uniform restrict writeonly iimage2D ac4_tex;
  11 layout(r8ui) uniform restrict readonly uimage2D image_tex;
  12
  13 shared float temp[64];
  14
  15 const float W[64] = {
  16          8, 16, 19, 22, 26, 27, 29, 34,
  17         16, 16, 22, 24, 27, 29, 34, 37,
  18         19, 22, 26, 27, 29, 34, 34, 38,
  19         22, 22, 26, 27, 29, 34, 37, 40,
  20         22, 26, 27, 29, 32, 35, 40, 48,
  21         26, 27, 29, 32, 35, 40, 48, 58,
  22         26, 27, 29, 34, 38, 46, 56, 69,
  23         27, 29, 35, 38, 46, 56, 69, 83
  24 };
  25 const float S = 4.0;  // whatever?
  26
  27 // NOTE: Contains factors to counteract the scaling in the DCT implementation.
  28 const float quant_matrix[64] = {
  29         1.0 / 64.0,         1.0 / (W[ 1] * S),  1.0 / (W[ 2] * S),  1.0 / (W[ 3] * S),  1.0 / (W[ 4] * S),  1.0 / (W[ 5] * S),  1.0 / (W[ 6] * S),  1.0 / (W[ 7] * S),
  30         1.0 / (W[ 8] * S),  2.0 / (W[ 9] * S),  2.0 / (W[10] * S),  2.0 / (W[11] * S),  2.0 / (W[12] * S),  2.0 / (W[13] * S),  2.0 / (W[14] * S),  2.0 / (W[15] * S),
  31         1.0 / (W[16] * S),  2.0 / (W[17] * S),  2.0 / (W[18] * S),  2.0 / (W[19] * S),  2.0 / (W[20] * S),  2.0 / (W[21] * S),  2.0 / (W[22] * S),  2.0 / (W[23] * S),
  32         1.0 / (W[24] * S),  2.0 / (W[25] * S),  2.0 / (W[26] * S),  2.0 / (W[27] * S),  2.0 / (W[28] * S),  2.0 / (W[29] * S),  2.0 / (W[30] * S),  2.0 / (W[31] * S),
  33         1.0 / (W[32] * S),  2.0 / (W[33] * S),  2.0 / (W[34] * S),  2.0 / (W[35] * S),  2.0 / (W[36] * S),  2.0 / (W[37] * S),  2.0 / (W[38] * S),  2.0 / (W[39] * S),
  34         1.0 / (W[40] * S),  2.0 / (W[41] * S),  2.0 / (W[42] * S),  2.0 / (W[43] * S),  2.0 / (W[44] * S),  2.0 / (W[45] * S),  2.0 / (W[46] * S),  2.0 / (W[47] * S),
  35         1.0 / (W[48] * S),  2.0 / (W[49] * S),  2.0 / (W[50] * S),  2.0 / (W[51] * S),  2.0 / (W[52] * S),  2.0 / (W[53] * S),  2.0 / (W[54] * S),  2.0 / (W[55] * S),
  36         1.0 / (W[56] * S),  2.0 / (W[57] * S),  2.0 / (W[58] * S),  2.0 / (W[59] * S),  2.0 / (W[60] * S),  2.0 / (W[61] * S),  2.0 / (W[62] * S),  2.0 / (W[63] * S)
  37 };
  38
  39 // Clamp and pack a 9-bit and a 7-bit signed value into a 16-bit word.
  40 uint pack_9_7(int v9, int v7)
  41 {
  42         return (uint(clamp(v9, -256, 255)) & 0x1ffu) | ((uint(clamp(v7, -64, 63)) & 0x7fu) << 9);
  43 }
  44
  45 // Scaled 1D DCT. y0 output is scaled by 8, everything else is scaled by 16.
  46 void dct_1d(inout float y0, inout float y1, inout float y2, inout float y3, inout float y4, inout float y5, inout float y6, inout float y7)
  47 {
  48         const float a1 = 0.7071067811865474;   // sqrt(2)
  49         const float a2 = 0.5411961001461971;   // cos(3/8 pi) * sqrt(2)
  50         const float a4 = 1.3065629648763766;   // cos(pi/8) * sqrt(2)
  51         // static const float a5 = 0.5 * (a4 - a2);
  52         const float a5 = 0.3826834323650897;
  53
  54         // phase 1
  55         const float p1_0 = y0 + y7;
  56         const float p1_1 = y1 + y6;
  57         const float p1_2 = y2 + y5;
  58         const float p1_3 = y3 + y4;
  59         const float p1_4 = y3 - y4;
  60         const float p1_5 = y2 - y5;
  61         const float p1_6 = y1 - y6;
  62         const float p1_7 = y0 - y7;
  63
  64         // phase 2
  65         const float p2_0 = p1_0 + p1_3;
  66         const float p2_1 = p1_1 + p1_2;
  67         const float p2_2 = p1_1 - p1_2;
  68         const float p2_3 = p1_0 - p1_3;
  69         const float p2_4 = p1_4 + p1_5;  // Inverted.
  70         const float p2_5 = p1_5 + p1_6;
  71         const float p2_6 = p1_6 + p1_7;
  72
  73         // phase 3
  74         const float p3_0 = p2_0 + p2_1;
  75         const float p3_1 = p2_0 - p2_1;
  76         const float p3_2 = p2_2 + p2_3;
  77
  78         // phase 4
  79         const float p4_2 = p3_2 * a1;
  80         const float p4_4 = p2_4 * a2 + (p2_4 - p2_6) * a5;
  81         const float p4_5 = p2_5 * a1;
  82         const float p4_6 = p2_6 * a4 + (p2_4 - p2_6) * a5;
  83
  84         // phase 5
  85         const float p5_2 = p2_3 + p4_2;
  86         const float p5_3 = p2_3 - p4_2;
  87         const float p5_5 = p1_7 + p4_5;
  88         const float p5_7 = p1_7 - p4_5;
  89
  90         // phase 6
  91         y0 = p3_0;
  92         y4 = p3_1;
  93         y2 = p5_2;
  94         y6 = p5_3;
  95         y5 = p4_4 + p5_7;
  96         y1 = p5_5 + p4_6;
  97         y7 = p5_5 - p4_6;
  98         y3 = p5_7 - p4_4;
  99 }
 100 void main()
 101 {
 102         uint x = 8 * gl_WorkGroupID.x;
 103         uint y = 8 * gl_WorkGroupID.y;
 104         uint n = gl_LocalInvocationID.x;
 105
 106         // Load column.
 107         float y0 = imageLoad(image_tex, ivec2(x + n, y + 0)).x;
 108         float y1 = imageLoad(image_tex, ivec2(x + n, y + 1)).x;
 109         float y2 = imageLoad(image_tex, ivec2(x + n, y + 2)).x;
 110         float y3 = imageLoad(image_tex, ivec2(x + n, y + 3)).x;
 111         float y4 = imageLoad(image_tex, ivec2(x + n, y + 4)).x;
 112         float y5 = imageLoad(image_tex, ivec2(x + n, y + 5)).x;
 113         float y6 = imageLoad(image_tex, ivec2(x + n, y + 6)).x;
 114         float y7 = imageLoad(image_tex, ivec2(x + n, y + 7)).x;
 115
 116         // Vertical DCT.
 117         dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);
 118
 119         // Communicate with the other shaders in the group.
 120         temp[n + 0 * 8] = y0;
 121         temp[n + 1 * 8] = y1;
 122         temp[n + 2 * 8] = y2;
 123         temp[n + 3 * 8] = y3;
 124         temp[n + 4 * 8] = y4;
 125         temp[n + 5 * 8] = y5;
 126         temp[n + 6 * 8] = y6;
 127         temp[n + 7 * 8] = y7;
 128
 129         memoryBarrierShared();
 130         barrier();
 131
 132         // Load row (so transpose, in a sense).
 133         y0 = temp[n * 8 + 0];
 134         y1 = temp[n * 8 + 1];
 135         y2 = temp[n * 8 + 2];
 136         y3 = temp[n * 8 + 3];
 137         y4 = temp[n * 8 + 4];
 138         y5 = temp[n * 8 + 5];
 139         y6 = temp[n * 8 + 6];
 140         y7 = temp[n * 8 + 7];
 141
 142         // Horizontal DCT.
 143         dct_1d(y0, y1, y2, y3, y4, y5, y6, y7);
 144
 145         // Quantize.
 146         int c0 = int(round(y0 * quant_matrix[n * 8 + 0]));
 147         int c1 = int(round(y1 * quant_matrix[n * 8 + 1]));
 148         int c2 = int(round(y2 * quant_matrix[n * 8 + 2]));
 149         int c3 = int(round(y3 * quant_matrix[n * 8 + 3]));
 150         int c4 = int(round(y4 * quant_matrix[n * 8 + 4]));
 151         int c5 = int(round(y5 * quant_matrix[n * 8 + 5]));
 152         int c6 = int(round(y6 * quant_matrix[n * 8 + 6]));
 153         int c7 = int(round(y7 * quant_matrix[n * 8 + 7]));
 154
 155         // Clamp, pack and store.
 156         uint sx = gl_WorkGroupID.x;
 157         imageStore(dc_ac7_tex,  ivec2(sx, y + n), uvec4(pack_9_7(c0, c7), 0, 0, 0));
 158         imageStore(ac1_ac6_tex, ivec2(sx, y + n), uvec4(pack_9_7(c1, c6), 0, 0, 0));
 159         imageStore(ac2_ac5_tex, ivec2(sx, y + n), uvec4(pack_9_7(c2, c5), 0, 0, 0));
 160         imageStore(ac3_tex,     ivec2(sx, y + n), ivec4(c3, 0, 0, 0));
 161         imageStore(ac4_tex,     ivec2(sx, y + n), ivec4(c4, 0, 0, 0));
 162 }
 163