From: Steinar H. Gunderson <sesse@debian.org>
Date: Sun, 11 Jan 2009 21:03:54 +0000 (+0100)
Subject: Let IDCTs do precalculation outside the inner loops. Speeds up (as expected)
X-Git-Url: https://git.sesse.net/?p=fjl;a=commitdiff_plain;h=13b48c4e2de5ceb4922ee47db32dd1a18edfffe4;hp=4b87e80c1ee4dd6a5d5c13e7b4321a956f53378f

Let IDCTs do precalculation outside the inner loops. Speeds up (as expected)
the AA&N IDCT by about 50%, as almost all the multiplications can go into
the quantization table.
---

diff --git a/idct.c b/idct.c
index b187a90..ad1cc36 100644
--- a/idct.c
+++ b/idct.c
@@ -1,9 +1,27 @@
 #include <math.h>
+#include <string.h>
+#include <stdlib.h>
 
 #include "idct.h"
 
-void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output)
+void* idct_reference_alloc(const uint32_t* quant_table)
 {
+	uint32_t* qt_copy = (uint32_t*)malloc(DCTSIZE2 * sizeof(uint32_t));
+	// FIXME: malloc() may return NULL; the memcpy() below would then write through a null pointer.
+
+	memcpy(qt_copy, quant_table, DCTSIZE2 * sizeof(uint32_t));
+
+	return qt_copy;
+}
+
+void idct_reference_free(void* userdata)
+{
+	free(userdata);
+}
+
+void idct_reference(const int16_t* input, const void* userdata, uint8_t* output)
+{
+	const uint32_t* quant_table = (const uint32_t*)userdata;
 	double temp[DCTSIZE2];
 
 	for (unsigned y = 0; y < 8; ++y) {
@@ -56,6 +74,27 @@ static const double scalefac[] = {
 	1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502
 };
 
+// Premultiply the scale factors and the overall 1/8 factor into the quantization
+// table entries (and convert to double).
+void* idct_float_alloc(const uint32_t* quant_table)
+{
+	double* qt_copy = (double*)malloc(DCTSIZE2 * sizeof(double));
+
+	for (unsigned y = 0; y < DCTSIZE; ++y) {
+		for (unsigned x = 0; x < DCTSIZE; ++x) {
+			qt_copy[y * DCTSIZE + x] = 0.125 * quant_table[y * DCTSIZE + x] *
+				scalefac[x] * scalefac[y];
+		}
+	}
+
+	return qt_copy;
+}
+
+void idct_float_free(void* userdata)
+{
+	free(userdata);
+}
+
 // 1D 8-point DCT.
 static inline void idct1d_float(double y0, double y1, double y2, double y3, double y4, double y5, double y6, double y7, double *x)
 {
@@ -67,14 +106,14 @@ static inline void idct1d_float(double y0, double y1, double y2, double y3, doub
 	static const double a5 = 0.5 * (a4 - a2);
 
 	// phase 1
-	const double p1_0 = y0 * scalefac[0];
-	const double p1_1 = y4 * scalefac[4];
-	const double p1_2 = y2 * scalefac[2];
-	const double p1_3 = y6 * scalefac[6];
-	const double p1_4 = y5 * scalefac[5];
-	const double p1_5 = y1 * scalefac[1];
-	const double p1_6 = y7 * scalefac[7];
-	const double p1_7 = y3 * scalefac[3];
+	const double p1_0 = y0;
+	const double p1_1 = y4;
+	const double p1_2 = y2;
+	const double p1_3 = y6;
+	const double p1_4 = y5;
+	const double p1_5 = y1;
+	const double p1_6 = y7;
+	const double p1_7 = y3;
 
 	// phase 2
 	const double p2_0 = p1_0;
@@ -137,20 +176,21 @@ static inline void idct1d_float(double y0, double y1, double y2, double y3, doub
 	x[7] = p6_0 - p6_7;
 }
 
-void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output)
+void idct_float(const int16_t* input, const void* userdata, uint8_t* output)
 {
+	const double* quant_table = (const double*)userdata;
 	double temp[DCTSIZE2];
 
 	// IDCT columns.
 	for (unsigned x = 0; x < DCTSIZE; ++x) {
-		idct1d_float(input[DCTSIZE * 0 + x] * (int32_t)quant_table[DCTSIZE * 0 + x],
-		             input[DCTSIZE * 1 + x] * (int32_t)quant_table[DCTSIZE * 1 + x],
-		             input[DCTSIZE * 2 + x] * (int32_t)quant_table[DCTSIZE * 2 + x],
-		             input[DCTSIZE * 3 + x] * (int32_t)quant_table[DCTSIZE * 3 + x],
-		             input[DCTSIZE * 4 + x] * (int32_t)quant_table[DCTSIZE * 4 + x],
-		             input[DCTSIZE * 5 + x] * (int32_t)quant_table[DCTSIZE * 5 + x],
-		             input[DCTSIZE * 6 + x] * (int32_t)quant_table[DCTSIZE * 6 + x],
-		             input[DCTSIZE * 7 + x] * (int32_t)quant_table[DCTSIZE * 7 + x],
+		idct1d_float(input[DCTSIZE * 0 + x] * quant_table[DCTSIZE * 0 + x],
+		             input[DCTSIZE * 1 + x] * quant_table[DCTSIZE * 1 + x],
+		             input[DCTSIZE * 2 + x] * quant_table[DCTSIZE * 2 + x],
+		             input[DCTSIZE * 3 + x] * quant_table[DCTSIZE * 3 + x],
+		             input[DCTSIZE * 4 + x] * quant_table[DCTSIZE * 4 + x],
+		             input[DCTSIZE * 5 + x] * quant_table[DCTSIZE * 5 + x],
+		             input[DCTSIZE * 6 + x] * quant_table[DCTSIZE * 6 + x],
+		             input[DCTSIZE * 7 + x] * quant_table[DCTSIZE * 7 + x],
 		             temp + x * DCTSIZE);
 	}
 	
@@ -167,7 +207,7 @@ void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* outp
 		             temp[DCTSIZE * 7 + y],
 		             temp2);
 		for (unsigned x = 0; x < DCTSIZE; ++x) {
-			double val = (1.0/8.0) * temp2[x];
+			const double val = temp2[x];
 			if (val < 0.0) {
 				output[y * DCTSIZE + x] = 0;
 			} else if (val >= 255.0) {
diff --git a/idct.h b/idct.h
index 97a6a5c..4a16ea0 100644
--- a/idct.h
+++ b/idct.h
@@ -6,13 +6,26 @@
 #define DCTSIZE 8
 #define DCTSIZE2 64
 
-typedef void (idct_func_t)(const int16_t*, const uint32_t*, uint8_t*);
+// void* idct_example_alloc(const uint32_t* quant_table);
+typedef void* (idct_alloc_t)(const uint32_t*);
+
+// void idct_example_free(void* userdata);
+// userdata is the same as returned by the alloc function.
+typedef void (idct_free_t)(void*);
+
+// void idct_example(const int16_t* input, const void* userdata, uint8_t* output);
+// userdata is the same as returned by the alloc function.
+typedef void (idct_func_t)(const int16_t*, const void*, uint8_t*);
 
 // Non-factorized reference version (section A.3.3 of the JPEG standard).
-void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output);
+void* idct_reference_alloc(const uint32_t* quant_table);
+void idct_reference_free(void* userdata);
+void idct_reference(const int16_t* input, const void* userdata, uint8_t* output);
 
 // Floating-point IDCT due to Arai, Agui and Nakajima (also known as AA&N).
 // See idct.c for more details.
-void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output);
+void* idct_float_alloc(const uint32_t* quant_table);
+void idct_float_free(void* userdata);
+void idct_float(const int16_t* input, const void* userdata, uint8_t* output);
 
 #endif /* !defined(_IDCT_H) */
diff --git a/idct_test.c b/idct_test.c
index cf2556b..78374f8 100644
--- a/idct_test.c
+++ b/idct_test.c
@@ -25,23 +25,26 @@ void gen_random_coeffs(int16_t* dst, size_t len)
 // Test that the input is pretty close to the reference for random inputs. 
 // (If the reference funtion is given in, this becomes a simple test of its
 // determinism.)
-void test_random_inputs(idct_func_t* idct)
+void test_random_inputs(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
 {
 	int16_t coeff[DCTSIZE2]; 
 	uint32_t quant[DCTSIZE2];
 	uint8_t output[DCTSIZE2];
 	uint8_t reference[DCTSIZE2];
-		
+
 	// Unit quantization (ie., no scaling).
 	for (unsigned i = 0; i < DCTSIZE2; ++i) {
 		quant[i] = 1;
 	}
+	
+	void* userdata_reference = idct_reference_alloc(quant);
+	void* userdata = idct_alloc(quant);
 
 	for (unsigned i = 0; i < 1000; ++i) {	
 		gen_random_coeffs(coeff, DCTSIZE2);
 
-		(*idct)(coeff, quant, output);
-		(idct_reference)(coeff, quant, reference);
+		(*idct)(coeff, userdata, output);
+		(idct_reference)(coeff, userdata_reference, reference);
 
 		// Find the RMS difference.
 		int diff_squared = 0;
@@ -51,30 +54,37 @@ void test_random_inputs(idct_func_t* idct)
 
 		assert(diff_squared <= 5);
 	}
+
+	idct_reference_free(userdata_reference);
+	idct_free(userdata);
 }
 
 // Test that a single DC coefficient becomes spread out to all blocks.
-void test_dc_becomes_spread_out(idct_func_t* idct)
+void test_dc_becomes_spread_out(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
 {
 	int16_t coeff[DCTSIZE2] = { 0 }; 
 	uint32_t quant[DCTSIZE2];
 	uint8_t output[DCTSIZE2];
-
+	
 	// Unit quantization (ie., no scaling).
 	for (unsigned i = 0; i < DCTSIZE2; ++i) {
 		quant[i] = 1;
 	}
 
+	void* userdata = idct_alloc(quant);
+
 	for (unsigned i = 0; i < 255*8; ++i) {	
 		uint32_t reference_value = i / 8;
 		coeff[0] = i;
 
-		(*idct)(coeff, quant, output);
+		(*idct)(coeff, userdata, output);
 
 		for (unsigned i = 0; i < DCTSIZE2; ++i) {
 			assert(abs(output[i] - reference_value) <= 1);
 		}
 	}
+	
+	idct_free(userdata);
 }
 
 double timediff(const struct timeval* a, const struct timeval* b)
@@ -83,7 +93,7 @@ double timediff(const struct timeval* a, const struct timeval* b)
 		(double)(b->tv_usec - a->tv_usec) * 1e-6;
 }
 
-void test_performance(idct_func_t* idct)
+void test_performance(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
 {
 	const unsigned num_runs = (idct == idct_reference) ? 5000 : 5000000;
 
@@ -98,36 +108,40 @@ void test_performance(idct_func_t* idct)
 		quant[i] = 1;
 	}
 
+	void* userdata = idct_alloc(quant);
+
 	start_benchmark_timer();
 
 	for (unsigned i = 0; i < num_runs; ++i) {
-		(*idct)(coeff, quant, output);
+		(*idct)(coeff, userdata, output);
 	}
 	
 	double diff = stop_benchmark_timer();
 	printf("%u runs in %.2f CPU seconds = %.2f IDCTs/sec\n",
 		num_runs, diff, num_runs / diff);
+
+	idct_free(userdata);
 }
 
-void test_all_idct(idct_func_t* idct)
+void test_all_idct(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
 {
 	printf("  test_dc_becomes_spread_out()\n");
-	test_dc_becomes_spread_out(idct);	
+	test_dc_becomes_spread_out(idct_alloc, idct_free, idct);	
 
 	printf("  test_random_inputs()\n");
-	test_random_inputs(idct);	
+	test_random_inputs(idct_alloc, idct_free, idct);	
 
 	printf("  performance test: ");
-	test_performance(idct);
+	test_performance(idct_alloc, idct_free, idct);
 }
 
 int main(void)
 {
 	printf("idct_reference:\n");
-	test_all_idct(idct_reference);
+	test_all_idct(idct_reference_alloc, idct_reference_free, idct_reference);
 
 	printf("idct_float:\n");
-	test_all_idct(idct_float);
+	test_all_idct(idct_float_alloc, idct_float_free, idct_float);
 
 	printf("All tests pass.\n");
 	return 0;