X-Git-Url: https://git.sesse.net/?p=fjl;a=blobdiff_plain;f=idct.c;h=c71d599833affb62944c4f915f89c9009aed034b;hp=54a9dc9a9ecaa6a0e3bea13b33cc626c42587309;hb=def75cfab6ca4da59ddc2e924898f3a8ed11ec89;hpb=039d3c953460ec41fa8b1f9d26e9cf62a344c8ef

diff --git a/idct.c b/idct.c
index 54a9dc9..c71d599 100644
--- a/idct.c
+++ b/idct.c
@@ -1,9 +1,27 @@
 #include <math.h>
+#include <string.h>
+#include <stdlib.h>
 
 #include "idct.h"
 
-void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output)
+void* idct_reference_alloc(const uint32_t* quant_table)
 {
+	uint32_t* qt_copy = (uint32_t*)malloc(DCTSIZE2 * sizeof(uint32_t));
+	// FIXME: check for NULL return
+
+	memcpy(qt_copy, quant_table, DCTSIZE2 * sizeof(uint32_t));
+
+	return qt_copy;
+}
+
+void idct_reference_free(void* userdata)
+{
+	free(userdata);
+}
+
+void idct_reference(const int16_t* input, const void* userdata, uint8_t* output)
+{
+	const uint32_t* quant_table = (const uint32_t*)userdata;
 	double temp[DCTSIZE2];
 
 	for (unsigned y = 0; y < 8; ++y) {
@@ -37,3 +55,166 @@ void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t*
 	}
 }
 
+// AA&N (Arai, Agui and Nakajima) floating-point IDCT.
+// This IDCT is based on the same DCT that libjpeg uses -- in fact, exactly the
+// same figure from the same book ("JPEG: Still Image Data Compression Standard",
+// page 52, figure 4-8). However, it is coded from scratch, and uses the
+// transposition method for converting DCT -> IDCT suggested in the book.
+// (libjpeg seems to use some other method that yields similar, but not
+// the same, code.) 
+
+// As this is generally meant as a reference and not useful code (we expect
+// a SIMD fixed-point algorithm to be used in most cases), it has not been
+// attempted significantly optimized. We assume the compiler will be smart
+// enough to do all the variable propagation for us anyway.
+
+// Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1.
+static const double scalefac[] = {
+	1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564,
+	1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502
+};
+
+// Premultiply the scale factors and the overall 1/8 factor into the quantization
+// table entries (and convert to double).
+void* idct_float_alloc(const uint32_t* quant_table)
+{
+	double* qt_copy = (double*)malloc(DCTSIZE2 * sizeof(double));
+
+	for (unsigned y = 0; y < DCTSIZE; ++y) {
+		for (unsigned x = 0; x < DCTSIZE; ++x) {
+			qt_copy[y * DCTSIZE + x] = (1.0/DCTSIZE) * quant_table[y * DCTSIZE + x] *
+				scalefac[x] * scalefac[y];
+		}
+	}
+
+	return qt_copy;
+}
+
+void idct_float_free(void* userdata)
+{
+	free(userdata);
+}
+
+// 1D 8-point DCT.
+static inline void idct1d_float(double y0, double y1, double y2, double y3, double y4, double y5, double y6, double y7, double *x)
+{
+	// constants
+	static const double a1 = 0.7071067811865474;   // sqrt(2)
+	static const double a2 = 0.5411961001461971;   // cos(3/8 pi) * sqrt(2)
+	static const double a3 = a1;
+	static const double a4 = 1.3065629648763766;   // cos(pi/8) * sqrt(2)
+	static const double a5 = 0.5 * (a4 - a2);
+
+	// phase 1
+	const double p1_0 = y0;
+	const double p1_1 = y4;
+	const double p1_2 = y2;
+	const double p1_3 = y6;
+	const double p1_4 = y5;
+	const double p1_5 = y1;
+	const double p1_6 = y7;
+	const double p1_7 = y3;
+
+	// phase 2
+	const double p2_0 = p1_0;
+	const double p2_1 = p1_1;
+	const double p2_2 = p1_2;
+	const double p2_3 = p1_3;
+	const double p2_4 = p1_4 - p1_7;
+	const double p2_5 = p1_5 + p1_6;
+	const double p2_6 = p1_5 - p1_6;
+	const double p2_7 = p1_4 + p1_7;
+
+	// phase 3
+	const double p3_0 = p2_0;
+	const double p3_1 = p2_1;
+	const double p3_2 = p2_2 - p2_3;
+	const double p3_3 = p2_2 + p2_3;
+	const double p3_4 = p2_4;
+	const double p3_5 = p2_5 - p2_7;
+	const double p3_6 = p2_6;
+	const double p3_7 = p2_5 + p2_7;
+	
+	// phase 4
+	const double p4_0 = p3_0;
+	const double p4_1 = p3_1;
+	const double p4_2 = a1 * p3_2;
+	const double p4_3 = p3_3;
+	const double p4_4 = p3_4 * -a2 + (p3_4 + p3_6) * -a5;
+	const double p4_5 = a3 * p3_5;
+	const double p4_6 = p3_6 * a4 + (p3_4 + p3_6) * -a5;
+	const double p4_7 = p3_7;
+
+	// phase 5
+	const double p5_0 = p4_0 + p4_1;
+	const double p5_1 = p4_0 - p4_1;
+	const double p5_2 = p4_2;
+	const double p5_3 = p4_2 + p4_3;
+	const double p5_4 = p4_4;
+	const double p5_5 = p4_5;
+	const double p5_6 = p4_6;
+	const double p5_7 = p4_7;
+
+	// phase 6
+	const double p6_0 = p5_0 + p5_3;
+	const double p6_1 = p5_1 + p5_2;
+	const double p6_2 = p5_1 - p5_2;
+	const double p6_3 = p5_0 - p5_3;
+	const double p6_4 = -p5_4;
+	const double p6_5 = p5_5 - p5_4;
+	const double p6_6 = p5_5 + p5_6;
+	const double p6_7 = p5_6 + p5_7;
+
+	// phase 7
+	x[0] = p6_0 + p6_7;
+	x[1] = p6_1 + p6_6;
+	x[2] = p6_2 + p6_5;
+	x[3] = p6_3 + p6_4;
+	x[4] = p6_3 - p6_4;
+	x[5] = p6_2 - p6_5;
+	x[6] = p6_1 - p6_6;
+	x[7] = p6_0 - p6_7;
+}
+
+void idct_float(const int16_t* input, const void* userdata, uint8_t* output)
+{
+	const double* quant_table = (const double*)userdata;
+	double temp[DCTSIZE2];
+
+	// IDCT columns.
+	for (unsigned x = 0; x < DCTSIZE; ++x) {
+		idct1d_float(input[DCTSIZE * 0 + x] * quant_table[DCTSIZE * 0 + x],
+		             input[DCTSIZE * 1 + x] * quant_table[DCTSIZE * 1 + x],
+		             input[DCTSIZE * 2 + x] * quant_table[DCTSIZE * 2 + x],
+		             input[DCTSIZE * 3 + x] * quant_table[DCTSIZE * 3 + x],
+		             input[DCTSIZE * 4 + x] * quant_table[DCTSIZE * 4 + x],
+		             input[DCTSIZE * 5 + x] * quant_table[DCTSIZE * 5 + x],
+		             input[DCTSIZE * 6 + x] * quant_table[DCTSIZE * 6 + x],
+		             input[DCTSIZE * 7 + x] * quant_table[DCTSIZE * 7 + x],
+		             temp + x * DCTSIZE);
+	}
+	
+	// IDCT rows.
+	for (unsigned y = 0; y < DCTSIZE; ++y) {
+		double temp2[DCTSIZE];
+		idct1d_float(temp[DCTSIZE * 0 + y],
+		             temp[DCTSIZE * 1 + y],
+		             temp[DCTSIZE * 2 + y],
+		             temp[DCTSIZE * 3 + y],
+		             temp[DCTSIZE * 4 + y],
+		             temp[DCTSIZE * 5 + y],
+		             temp[DCTSIZE * 6 + y],
+		             temp[DCTSIZE * 7 + y],
+		             temp2);
+		for (unsigned x = 0; x < DCTSIZE; ++x) {
+			const double val = temp2[x];
+			if (val < 0.0) {
+				output[y * DCTSIZE + x] = 0;
+			} else if (val >= 255.0) {
+				output[y * DCTSIZE + x] = 255;
+			} else {
+				output[y * DCTSIZE + x] = (uint8_t)(val + 0.5);
+			}
+		}
+	}
+}