From: Steinar H. Gunderson Date: Wed, 28 Jan 2009 21:06:59 +0000 (+0100) Subject: Add a stupid integerization of the AA&N IDCT -- 30% faster or so, mostly X-Git-Url: https://git.sesse.net/?p=fjl;a=commitdiff_plain;h=85cda6b1da31916b0127d1217825dea725619245;ds=sidebyside Add a stupid integerization of the AA&N IDCT -- 30% faster or so, mostly because of the use of a limiter table. --- diff --git a/Makefile b/Makefile index 3fc9768..526a9cc 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ BYTESOURCE_TEST_OBJS=bytesource.o choice.o unstuff.o bytesource_test.o bytesource_test: $(BYTESOURCE_TEST_OBJS) $(CC) $(LDFLAGS) -o $@ $(BYTESOURCE_TEST_OBJS) -IDCT_TEST_OBJS=idct_float.o idct_reference.o idct_test.o benchmark.o +IDCT_TEST_OBJS=idct_float.o idct_imprecise_int.o idct_reference.o idct_test.o benchmark.o idct_test: $(IDCT_TEST_OBJS) $(CC) $(LDFLAGS) -o $@ $(IDCT_TEST_OBJS) diff --git a/idct_imprecise_int.c b/idct_imprecise_int.c new file mode 100644 index 0000000..6dd6983 --- /dev/null +++ b/idct_imprecise_int.c @@ -0,0 +1,175 @@ +#include +#include +#include + +#include "idct.h" + +#define TRUNCATE_BITS 11 +#define TRUNCATE_TABLE_SIZE (1 << TRUNCATE_BITS) +#define TRUNCATE_TABLE_BIAS (1 << (TRUNCATE_BITS - 1)) + +struct idct_imprecise_int_userdata { + int32_t qt_copy[DCTSIZE2]; // quantization table premultiplied by the scale factors and 1/DCTSIZE, stored in fixed point + uint8_t truncate_table[TRUNCATE_TABLE_SIZE]; // limiter table: entry i holds (i - TRUNCATE_TABLE_BIAS) clamped to 0..255 +}; + +#define PRECISION 12 +#define ROUND_BIAS (1LL << (PRECISION - 1)) + +#define FIX(x) ((int32_t)((x) * (1LL << PRECISION) + 0.5)) // converts x to fixed point with PRECISION fractional bits; the +0.5 rounds to nearest for non-negative x + +// Scale factors for k = 0..7; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1. +static const double scalefac[] = { + 1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564, + 1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502 +}; + +// Premultiply the scale factors and the overall 1/8 factor into the quantization +// table entries (and convert to fixed-point). Also fills truncate_table with (i - TRUNCATE_TABLE_BIAS) clamped to 0..255.
+void* idct_imprecise_int_alloc(const uint32_t* quant_table) +{ + struct idct_imprecise_int_userdata* userdata = (struct idct_imprecise_int_userdata*)malloc(sizeof(struct idct_imprecise_int_userdata)); /* NOTE(review): the malloc() result is written through below without a NULL check */ + + for (unsigned y = 0; y < DCTSIZE; ++y) { + for (unsigned x = 0; x < DCTSIZE; ++x) { + userdata->qt_copy[y * DCTSIZE + x] = FIX((1.0/DCTSIZE) * quant_table[y * DCTSIZE + x] * + scalefac[x] * scalefac[y]); + } + } + for (unsigned i = 0; i < TRUNCATE_TABLE_SIZE; ++i) { + int source_val = i - TRUNCATE_TABLE_BIAS; /* computed in unsigned arithmetic; relies on the conversion to int producing negative values for i below the bias */ + if (source_val < 0) { + userdata->truncate_table[i] = 0; + } else if (source_val >= 255) { + userdata->truncate_table[i] = 255; + } else { + userdata->truncate_table[i] = source_val; + } + } + + return userdata; +} + +void idct_imprecise_int_free(void* userdata) +{ + free(userdata); +} + +// 1D 8-point inverse DCT in fixed point: y0..y7 are the input coefficients and x[0..7] receives the outputs. Products with the FIX() constants are shifted right by PRECISION, so outputs keep the scale of the inputs. +static inline void idct1d_int(int32_t y0, int32_t y1, int32_t y2, int32_t y3, int32_t y4, int32_t y5, int32_t y6, int32_t y7, int32_t *x) +{ + // constants + static const int32_t a1 = FIX(0.7071067811865474); // 1/sqrt(2), i.e. cos(pi/4) + static const int32_t a2 = FIX(0.5411961001461971); // cos(3/8 pi) * sqrt(2) + static const int32_t a3 = a1; // same value as a1. NOTE(review): ISO C does not treat a const variable as a constant expression, so this static initializer relies on a compiler extension + static const int32_t a4 = FIX(1.3065629648763766); // cos(pi/8) * sqrt(2) + static const int32_t a5 = FIX(0.5 * (1.3065629648763766 - 0.5411961001461971)); // half the difference of the a4 and a2 values + + // phase 1: input permutation only + const int32_t p1_0 = y0; + const int32_t p1_1 = y4; + const int32_t p1_2 = y2; + const int32_t p1_3 = y6; + const int32_t p1_4 = y5; + const int32_t p1_5 = y1; + const int32_t p1_6 = y7; + const int32_t p1_7 = y3; + + // phase 2 + const int32_t p2_0 = p1_0; + const int32_t p2_1 = p1_1; + const int32_t p2_2 = p1_2; + const int32_t p2_3 = p1_3; + const int32_t p2_4 = p1_4 - p1_7; + const int32_t p2_5 = p1_5 + p1_6; + const int32_t p2_6 = p1_5 - p1_6; + const int32_t p2_7 = p1_4 + p1_7; + + // phase 3 + const int32_t p3_0 = p2_0; + const int32_t p3_1 = p2_1; + const int32_t p3_2 = p2_2 - p2_3; + const int32_t p3_3 = p2_2 + p2_3; + const int32_t p3_4 = p2_4; +
const int32_t p3_5 = p2_5 - p2_7; + const int32_t p3_6 = p2_6; + const int32_t p3_7 = p2_5 + p2_7; + + // phase 4: the only multiplications; each product carries PRECISION extra fractional bits that the right shift removes (NOTE(review): right-shifting a negative value is implementation-defined in C; an arithmetic shift is assumed) + const int32_t p4_0 = p3_0; + const int32_t p4_1 = p3_1; + const int32_t p4_2 = (a1 * p3_2) >> PRECISION; + const int32_t p4_3 = p3_3; + const int32_t p4_4 = (p3_4 * -a2 + (p3_4 + p3_6) * -a5) >> PRECISION; + const int32_t p4_5 = (a3 * p3_5) >> PRECISION; + const int32_t p4_6 = (p3_6 * a4 + (p3_4 + p3_6) * -a5) >> PRECISION; + const int32_t p4_7 = p3_7; + + // phase 5 + const int32_t p5_0 = p4_0 + p4_1; + const int32_t p5_1 = p4_0 - p4_1; + const int32_t p5_2 = p4_2; + const int32_t p5_3 = p4_2 + p4_3; + const int32_t p5_4 = p4_4; + const int32_t p5_5 = p4_5; + const int32_t p5_6 = p4_6; + const int32_t p5_7 = p4_7; + + // phase 6 + const int32_t p6_0 = p5_0 + p5_3; + const int32_t p6_1 = p5_1 + p5_2; + const int32_t p6_2 = p5_1 - p5_2; + const int32_t p6_3 = p5_0 - p5_3; + const int32_t p6_4 = -p5_4; + const int32_t p6_5 = p5_5 - p5_4; + const int32_t p6_6 = p5_5 + p5_6; + const int32_t p6_7 = p5_6 + p5_7; + + // phase 7 + x[0] = p6_0 + p6_7; + x[1] = p6_1 + p6_6; + x[2] = p6_2 + p6_5; + x[3] = p6_3 + p6_4; + x[4] = p6_3 - p6_4; + x[5] = p6_2 - p6_5; + x[6] = p6_1 - p6_6; + x[7] = p6_0 - p6_7; +} + +void idct_imprecise_int(const int16_t* input, const void* userdata, uint8_t* output) +{ + const struct idct_imprecise_int_userdata* my_userdata = (const struct idct_imprecise_int_userdata*)userdata; + const int32_t* quant_table = my_userdata->qt_copy; + int32_t temp[DCTSIZE2]; + + // IDCT columns. Each coefficient is dequantized by multiplying with its premultiplied fixed-point table entry; the eight outputs of column x are stored contiguously at temp + x * DCTSIZE, i.e. transposed.
+ for (unsigned x = 0; x < DCTSIZE; ++x) { + idct1d_int(input[DCTSIZE * 0 + x] * quant_table[DCTSIZE * 0 + x], + input[DCTSIZE * 1 + x] * quant_table[DCTSIZE * 1 + x], + input[DCTSIZE * 2 + x] * quant_table[DCTSIZE * 2 + x], + input[DCTSIZE * 3 + x] * quant_table[DCTSIZE * 3 + x], + input[DCTSIZE * 4 + x] * quant_table[DCTSIZE * 4 + x], + input[DCTSIZE * 5 + x] * quant_table[DCTSIZE * 5 + x], + input[DCTSIZE * 6 + x] * quant_table[DCTSIZE * 6 + x], + input[DCTSIZE * 7 + x] * quant_table[DCTSIZE * 7 + x], + temp + x * DCTSIZE); + } + + // IDCT rows. temp[DCTSIZE * k + y] is output y of column k, so each call consumes one row. The result is rounded, offset by TRUNCATE_TABLE_BIAS and limited to 0..255 through truncate_table; the index is masked to TRUNCATE_BITS bits, so values far outside the table range wrap around instead of clamping. + for (unsigned y = 0; y < DCTSIZE; ++y) { + int32_t temp2[DCTSIZE]; + idct1d_int(temp[DCTSIZE * 0 + y], + temp[DCTSIZE * 1 + y], + temp[DCTSIZE * 2 + y], + temp[DCTSIZE * 3 + y], + temp[DCTSIZE * 4 + y], + temp[DCTSIZE * 5 + y], + temp[DCTSIZE * 6 + y], + temp[DCTSIZE * 7 + y], + temp2); + for (unsigned x = 0; x < DCTSIZE; ++x) { + const int32_t val = (temp2[x] + ROUND_BIAS + FIX(TRUNCATE_TABLE_BIAS)) >> PRECISION; + output[y * DCTSIZE + x] = my_userdata->truncate_table[val & ((1 << TRUNCATE_BITS)-1)]; + } + } +} diff --git a/idct_imprecise_int.h b/idct_imprecise_int.h new file mode 100644 index 0000000..f103270 --- /dev/null +++ b/idct_imprecise_int.h @@ -0,0 +1,15 @@ +#ifndef _IDCT_IMPRECISE_INT_H +#define _IDCT_IMPRECISE_INT_H + +#include "idct.h" + +// Straight-forward, stupid integerization of idct_float. (Better integer +// IDCTs quite possibly exist.) +// +// NOTE: This routine most likely does not conform to the precision and/or +// range demands set forth by the JPEG standard. Caveat emptor.
+void* idct_imprecise_int_alloc(const uint32_t* quant_table); /* returns malloc()ed state built from quant_table; release it with idct_imprecise_int_free() */ +void idct_imprecise_int_free(void* userdata); /* frees state returned by idct_imprecise_int_alloc() */ +void idct_imprecise_int(const int16_t* input, const void* userdata, uint8_t* output); /* dequantizes and inverse-transforms one DCTSIZE x DCTSIZE block from input into output, limited to 0..255 */ + +#endif /* !defined(_IDCT_IMPRECISE_INT_H) */ diff --git a/idct_test.c b/idct_test.c index dd15270..d7b543c 100644 --- a/idct_test.c +++ b/idct_test.c @@ -8,6 +8,7 @@ #include "idct.h" #include "idct_reference.h" #include "idct_float.h" +#include "idct_imprecise_int.h" // Generate random coefficients in the range [-15..15]. void gen_random_coeffs(int16_t* dst, size_t len) @@ -144,6 +145,9 @@ int main(void) printf("idct_float:\n"); test_all_idct(idct_float_alloc, idct_float_free, idct_float); + + printf("idct_imprecise_int:\n"); + test_all_idct(idct_imprecise_int_alloc, idct_imprecise_int_free, idct_imprecise_int); printf("All tests pass.\n"); return 0;