From: Steinar H. Gunderson <sesse@debian.org>
Date: Wed, 28 Jan 2009 21:06:59 +0000 (+0100)
Subject: Add a stupid integerization of the AA&N IDCT -- 30% faster or so, mostly
X-Git-Url: https://git.sesse.net/?p=fjl;a=commitdiff_plain;h=85cda6b1da31916b0127d1217825dea725619245;ds=sidebyside

Add a stupid integerization of the AA&N IDCT -- 30% faster or so, mostly
because of the use of a limiter table.
---

diff --git a/Makefile b/Makefile
index 3fc9768..526a9cc 100644
--- a/Makefile
+++ b/Makefile
@@ -20,7 +20,7 @@ BYTESOURCE_TEST_OBJS=bytesource.o choice.o unstuff.o bytesource_test.o
 bytesource_test: $(BYTESOURCE_TEST_OBJS)
 	$(CC) $(LDFLAGS) -o $@ $(BYTESOURCE_TEST_OBJS)
 
-IDCT_TEST_OBJS=idct_float.o idct_reference.o idct_test.o benchmark.o
+IDCT_TEST_OBJS=idct_float.o idct_imprecise_int.o idct_reference.o idct_test.o benchmark.o
 idct_test: $(IDCT_TEST_OBJS)
 	$(CC) $(LDFLAGS) -o $@ $(IDCT_TEST_OBJS)
 
diff --git a/idct_imprecise_int.c b/idct_imprecise_int.c
new file mode 100644
index 0000000..6dd6983
--- /dev/null
+++ b/idct_imprecise_int.c
@@ -0,0 +1,175 @@
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "idct.h"
+
+#define TRUNCATE_BITS 11  // log2 of the number of entries in the clamp table
+#define TRUNCATE_TABLE_SIZE (1 << TRUNCATE_BITS)  // number of clamp-table entries (2048)
+#define TRUNCATE_TABLE_BIAS (1 << (TRUNCATE_BITS - 1))  // table index that represents the value 0 (1024)
+
+struct idct_imprecise_int_userdata {
+	int32_t qt_copy[DCTSIZE2];  // quantization table, prescaled and converted to fixed point
+	uint8_t truncate_table[TRUNCATE_TABLE_SIZE];  // entry i holds (i - TRUNCATE_TABLE_BIAS) clamped to 0..255
+};
+
+#define PRECISION 12  // number of fraction bits in the fixed-point values
+#define ROUND_BIAS (1LL << (PRECISION - 1))  // 0.5 in fixed point; added before the final shift so it rounds
+
+#define FIX(x) ((int32_t)((x) * (1LL << PRECISION) + 0.5))  // real -> fixed point (rounds to nearest for x >= 0)
+
+// Scale factors: scalefac[k] = 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)) for k = 1..7; scalefac[0] is 1.
+static const double scalefac[] = {
+	1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564,
+	1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502
+};
+
+// Premultiply the scale factors and the overall 1/DCTSIZE factor into the quantization table
+// entries (converted to fixed point) and build the clamp table. Returns NULL if malloc() fails.
+void* idct_imprecise_int_alloc(const uint32_t* quant_table)
+{
+	struct idct_imprecise_int_userdata* userdata = malloc(sizeof *userdata);
+	if (userdata == NULL) {
+		return NULL;
+	}
+
+	for (unsigned y = 0; y < DCTSIZE; ++y) {
+		for (unsigned x = 0; x < DCTSIZE; ++x) {
+			userdata->qt_copy[y * DCTSIZE + x] = FIX((1.0/DCTSIZE) * quant_table[y * DCTSIZE + x] * scalefac[x] * scalefac[y]);
+		}
+	}
+	for (unsigned i = 0; i < TRUNCATE_TABLE_SIZE; ++i) {
+		int source_val = (int)i - TRUNCATE_TABLE_BIAS;  // cast first: the subtraction must be signed to go negative
+		if (source_val < 0) {
+			source_val = 0;
+		} else if (source_val > 255) {
+			source_val = 255;
+		}
+		userdata->truncate_table[i] = (uint8_t)source_val;
+	}
+	return userdata;
+}
+
+void idct_imprecise_int_free(void* userdata)
+{
+	free(userdata);  // releases the malloc()ed state block; free(NULL) is a no-op
+}
+
+// 1D 8-point inverse DCT on fixed-point values; the eight results are written to x[0..7].
+static inline void idct1d_int(int32_t y0, int32_t y1, int32_t y2, int32_t y3, int32_t y4, int32_t y5, int32_t y6, int32_t y7, int32_t *x)
+{
+	// constants
+	static const int32_t a1 = FIX(0.7071067811865474);   // 1/sqrt(2) = cos(pi/4)
+	static const int32_t a2 = FIX(0.5411961001461971);   // cos(3/8 pi) * sqrt(2)
+	static const int32_t a3 = FIX(0.7071067811865474);   // same value as a1, spelled out: a1 is not a constant expression in ISO C
+	static const int32_t a4 = FIX(1.3065629648763766);   // cos(pi/8) * sqrt(2)
+	static const int32_t a5 = FIX(0.5 * (1.3065629648763766 - 0.5411961001461971));   // (a4 - a2) / 2
+
+	// phase 1: reorder the inputs
+	const int32_t p1_0 = y0;
+	const int32_t p1_1 = y4;
+	const int32_t p1_2 = y2;
+	const int32_t p1_3 = y6;
+	const int32_t p1_4 = y5;
+	const int32_t p1_5 = y1;
+	const int32_t p1_6 = y7;
+	const int32_t p1_7 = y3;
+
+	// phase 2
+	const int32_t p2_0 = p1_0;
+	const int32_t p2_1 = p1_1;
+	const int32_t p2_2 = p1_2;
+	const int32_t p2_3 = p1_3;
+	const int32_t p2_4 = p1_4 - p1_7;
+	const int32_t p2_5 = p1_5 + p1_6;
+	const int32_t p2_6 = p1_5 - p1_6;
+	const int32_t p2_7 = p1_4 + p1_7;
+
+	// phase 3
+	const int32_t p3_0 = p2_0;
+	const int32_t p3_1 = p2_1;
+	const int32_t p3_2 = p2_2 - p2_3;
+	const int32_t p3_3 = p2_2 + p2_3;
+	const int32_t p3_4 = p2_4;
+	const int32_t p3_5 = p2_5 - p2_7;
+	const int32_t p3_6 = p2_6;
+	const int32_t p3_7 = p2_5 + p2_7;
+	
+	// phase 4: the multiplications. NOTE(review): products are int32_t and can overflow for large inputs -- confirm range.
+	const int32_t p4_0 = p3_0;
+	const int32_t p4_1 = p3_1;
+	const int32_t p4_2 = (a1 * p3_2) >> PRECISION;
+	const int32_t p4_3 = p3_3;
+	const int32_t p4_4 = (p3_4 * -a2 + (p3_4 + p3_6) * -a5) >> PRECISION;
+	const int32_t p4_5 = (a3 * p3_5) >> PRECISION;
+	const int32_t p4_6 = (p3_6 * a4 + (p3_4 + p3_6) * -a5) >> PRECISION;
+	const int32_t p4_7 = p3_7;
+
+	// phase 5
+	const int32_t p5_0 = p4_0 + p4_1;
+	const int32_t p5_1 = p4_0 - p4_1;
+	const int32_t p5_2 = p4_2;
+	const int32_t p5_3 = p4_2 + p4_3;
+	const int32_t p5_4 = p4_4;
+	const int32_t p5_5 = p4_5;
+	const int32_t p5_6 = p4_6;
+	const int32_t p5_7 = p4_7;
+
+	// phase 6
+	const int32_t p6_0 = p5_0 + p5_3;
+	const int32_t p6_1 = p5_1 + p5_2;
+	const int32_t p6_2 = p5_1 - p5_2;
+	const int32_t p6_3 = p5_0 - p5_3;
+	const int32_t p6_4 = -p5_4;
+	const int32_t p6_5 = p5_5 - p5_4;
+	const int32_t p6_6 = p5_5 + p5_6;
+	const int32_t p6_7 = p5_6 + p5_7;
+
+	// phase 7: final butterflies produce the outputs
+	x[0] = p6_0 + p6_7;
+	x[1] = p6_1 + p6_6;
+	x[2] = p6_2 + p6_5;
+	x[3] = p6_3 + p6_4;
+	x[4] = p6_3 - p6_4;
+	x[5] = p6_2 - p6_5;
+	x[6] = p6_1 - p6_6;
+	x[7] = p6_0 - p6_7;
+}
+// Dequantize one coefficient block and run the 2D IDCT (columns, then rows), writing clamped uint8_t samples to output.
+void idct_imprecise_int(const int16_t* input, const void* userdata, uint8_t* output)
+{
+	const struct idct_imprecise_int_userdata* my_userdata = (const struct idct_imprecise_int_userdata*)userdata;
+	const int32_t* quant_table = my_userdata->qt_copy;  // prescaled fixed-point table, as filled in by idct_imprecise_int_alloc()
+	int32_t temp[DCTSIZE2];  // column-pass results; the outputs for column x start at temp[x * DCTSIZE]
+
+	// IDCT columns. Each coefficient is dequantized (input * quant_table entry) as it is passed in.
+	for (unsigned x = 0; x < DCTSIZE; ++x) {
+		idct1d_int(input[DCTSIZE * 0 + x] * quant_table[DCTSIZE * 0 + x],
+		           input[DCTSIZE * 1 + x] * quant_table[DCTSIZE * 1 + x],
+		           input[DCTSIZE * 2 + x] * quant_table[DCTSIZE * 2 + x],
+		           input[DCTSIZE * 3 + x] * quant_table[DCTSIZE * 3 + x],
+		           input[DCTSIZE * 4 + x] * quant_table[DCTSIZE * 4 + x],
+		           input[DCTSIZE * 5 + x] * quant_table[DCTSIZE * 5 + x],
+		           input[DCTSIZE * 6 + x] * quant_table[DCTSIZE * 6 + x],
+		           input[DCTSIZE * 7 + x] * quant_table[DCTSIZE * 7 + x],
+		           temp + x * DCTSIZE);
+	}
+	
+	// IDCT rows. Output row y takes element y of each column's results stored in temp.
+	for (unsigned y = 0; y < DCTSIZE; ++y) {
+		int32_t temp2[DCTSIZE];
+		idct1d_int(temp[DCTSIZE * 0 + y],
+		           temp[DCTSIZE * 1 + y],
+		           temp[DCTSIZE * 2 + y],
+		           temp[DCTSIZE * 3 + y],
+		           temp[DCTSIZE * 4 + y],
+		           temp[DCTSIZE * 5 + y],
+		           temp[DCTSIZE * 6 + y],
+		           temp[DCTSIZE * 7 + y],
+		           temp2);
+		for (unsigned x = 0; x < DCTSIZE; ++x) {
+			const int32_t val = (temp2[x] + ROUND_BIAS + FIX(TRUNCATE_TABLE_BIAS)) >> PRECISION;  // round, drop fraction bits, add the table bias
+			output[y * DCTSIZE + x] = my_userdata->truncate_table[val & ((1 << TRUNCATE_BITS)-1)];  // mask keeps the index inside the table
+		}
+	}
+}
diff --git a/idct_imprecise_int.h b/idct_imprecise_int.h
new file mode 100644
index 0000000..f103270
--- /dev/null
+++ b/idct_imprecise_int.h
@@ -0,0 +1,15 @@
+#ifndef _IDCT_IMPRECISE_INT_H
+#define _IDCT_IMPRECISE_INT_H
+
+#include "idct.h"
+
+// Straightforward, stupid integerization of idct_float. (Better integer
+// IDCTs quite possibly exist.)
+//
+// NOTE: This routine most likely does not conform to the precision and/or
+// range demands set forth by the JPEG standard. Caveat emptor.
+void* idct_imprecise_int_alloc(const uint32_t* quant_table);  // builds the malloc()ed state for one quantization table
+void idct_imprecise_int_free(void* userdata);  // frees the state block passed in
+void idct_imprecise_int(const int16_t* input, const void* userdata, uint8_t* output);  // coefficients in, 8-bit samples out
+
+#endif /* !defined(_IDCT_IMPRECISE_INT_H) */
diff --git a/idct_test.c b/idct_test.c
index dd15270..d7b543c 100644
--- a/idct_test.c
+++ b/idct_test.c
@@ -8,6 +8,7 @@
 #include "idct.h"
 #include "idct_reference.h"
 #include "idct_float.h"
+#include "idct_imprecise_int.h"
 
 // Generate random coefficients in the range [-15..15].
 void gen_random_coeffs(int16_t* dst, size_t len)
@@ -144,6 +145,9 @@ int main(void)
 
 	printf("idct_float:\n");
 	test_all_idct(idct_float_alloc, idct_float_free, idct_float);
+	
+	printf("idct_imprecise_int:\n");
+	test_all_idct(idct_imprecise_int_alloc, idct_imprecise_int_free, idct_imprecise_int);
 
 	printf("All tests pass.\n");
 	return 0;