From: Steinar H. Gunderson Date: Sun, 11 Jan 2009 21:03:54 +0000 (+0100) Subject: Let IDCTs do precalculation outside the inner loops. Speeds up (as expected) X-Git-Url: https://git.sesse.net/?p=fjl;a=commitdiff_plain;h=13b48c4e2de5ceb4922ee47db32dd1a18edfffe4;hp=4b87e80c1ee4dd6a5d5c13e7b4321a956f53378f Let IDCTs do precalculation outside the inner loops. Speeds up (as expected) the AA&N IDCT by about 50%, as almost all the multiplications can go into the quantization table. --- diff --git a/idct.c b/idct.c index b187a90..ad1cc36 100644 --- a/idct.c +++ b/idct.c @@ -1,9 +1,27 @@ #include +#include +#include #include "idct.h" -void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output) +void* idct_reference_alloc(const uint32_t* quant_table) { + uint32_t* qt_copy = (uint32_t*)malloc(DCTSIZE2 * sizeof(uint32_t)); + // FIXME: check for NULL return + + memcpy(qt_copy, quant_table, DCTSIZE2 * sizeof(uint32_t)); + + return qt_copy; +} + +void idct_reference_free(void* userdata) +{ + free(userdata); +} + +void idct_reference(const int16_t* input, const void* userdata, uint8_t* output) +{ + const uint32_t* quant_table = (const uint32_t*)userdata; double temp[DCTSIZE2]; for (unsigned y = 0; y < 8; ++y) { @@ -56,6 +74,27 @@ static const double scalefac[] = { 1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502 }; +// Premultiply the scale factors and the overall 1/8 factor into the quantization +// table entries (and convert to double). +void* idct_float_alloc(const uint32_t* quant_table) +{ + double* qt_copy = (double*)malloc(DCTSIZE2 * sizeof(double)); + + for (unsigned y = 0; y < DCTSIZE; ++y) { + for (unsigned x = 0; x < DCTSIZE; ++x) { + qt_copy[y * DCTSIZE + x] = 0.125 * quant_table[y * DCTSIZE + x] * + scalefac[x] * scalefac[y]; + } + } + + return qt_copy; +} + +void idct_float_free(void* userdata) +{ + free(userdata); +} + // 1D 8-point DCT. 
static inline void idct1d_float(double y0, double y1, double y2, double y3, double y4, double y5, double y6, double y7, double *x) { @@ -67,14 +106,14 @@ static inline void idct1d_float(double y0, double y1, double y2, double y3, doub static const double a5 = 0.5 * (a4 - a2); // phase 1 - const double p1_0 = y0 * scalefac[0]; - const double p1_1 = y4 * scalefac[4]; - const double p1_2 = y2 * scalefac[2]; - const double p1_3 = y6 * scalefac[6]; - const double p1_4 = y5 * scalefac[5]; - const double p1_5 = y1 * scalefac[1]; - const double p1_6 = y7 * scalefac[7]; - const double p1_7 = y3 * scalefac[3]; + const double p1_0 = y0; + const double p1_1 = y4; + const double p1_2 = y2; + const double p1_3 = y6; + const double p1_4 = y5; + const double p1_5 = y1; + const double p1_6 = y7; + const double p1_7 = y3; // phase 2 const double p2_0 = p1_0; @@ -137,20 +176,21 @@ static inline void idct1d_float(double y0, double y1, double y2, double y3, doub x[7] = p6_0 - p6_7; } -void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output) +void idct_float(const int16_t* input, const void* userdata, uint8_t* output) { + const double* quant_table = (const double*)userdata; double temp[DCTSIZE2]; // IDCT columns. 
for (unsigned x = 0; x < DCTSIZE; ++x) { - idct1d_float(input[DCTSIZE * 0 + x] * (int32_t)quant_table[DCTSIZE * 0 + x], - input[DCTSIZE * 1 + x] * (int32_t)quant_table[DCTSIZE * 1 + x], - input[DCTSIZE * 2 + x] * (int32_t)quant_table[DCTSIZE * 2 + x], - input[DCTSIZE * 3 + x] * (int32_t)quant_table[DCTSIZE * 3 + x], - input[DCTSIZE * 4 + x] * (int32_t)quant_table[DCTSIZE * 4 + x], - input[DCTSIZE * 5 + x] * (int32_t)quant_table[DCTSIZE * 5 + x], - input[DCTSIZE * 6 + x] * (int32_t)quant_table[DCTSIZE * 6 + x], - input[DCTSIZE * 7 + x] * (int32_t)quant_table[DCTSIZE * 7 + x], + idct1d_float(input[DCTSIZE * 0 + x] * quant_table[DCTSIZE * 0 + x], + input[DCTSIZE * 1 + x] * quant_table[DCTSIZE * 1 + x], + input[DCTSIZE * 2 + x] * quant_table[DCTSIZE * 2 + x], + input[DCTSIZE * 3 + x] * quant_table[DCTSIZE * 3 + x], + input[DCTSIZE * 4 + x] * quant_table[DCTSIZE * 4 + x], + input[DCTSIZE * 5 + x] * quant_table[DCTSIZE * 5 + x], + input[DCTSIZE * 6 + x] * quant_table[DCTSIZE * 6 + x], + input[DCTSIZE * 7 + x] * quant_table[DCTSIZE * 7 + x], temp + x * DCTSIZE); } @@ -167,7 +207,7 @@ void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* outp temp[DCTSIZE * 7 + y], temp2); for (unsigned x = 0; x < DCTSIZE; ++x) { - double val = (1.0/8.0) * temp2[x]; + const double val = temp2[x]; if (val < 0.0) { output[y * DCTSIZE + x] = 0; } else if (val >= 255.0) { diff --git a/idct.h b/idct.h index 97a6a5c..4a16ea0 100644 --- a/idct.h +++ b/idct.h @@ -6,13 +6,26 @@ #define DCTSIZE 8 #define DCTSIZE2 64 -typedef void (idct_func_t)(const int16_t*, const uint32_t*, uint8_t*); +// void* idct_example_alloc(const uint32_t* quant_table); +typedef void* (idct_alloc_t)(const uint32_t*); + +// void idct_example_free(void* userdata); +// userdata is the same as returned by the alloc function. 
+typedef void (idct_free_t)(void*); + +// void idct_example(const int16_t* input, const void* userdata, uint8_t* output); +// userdata is the same as returned by the alloc function. +typedef void (idct_func_t)(const int16_t*, const void*, uint8_t*); // Non-factorized reference version (section A.3.3 of the JPEG standard). -void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output); +void* idct_reference_alloc(const uint32_t* quant_table); +void idct_reference_free(void* userdata); +void idct_reference(const int16_t* input, const void* userdata, uint8_t* output); // Floating-point IDCT due to Arai, Agui and Nakajima (also known as AA&N). // See idct.c for more details. -void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output); +void* idct_float_alloc(const uint32_t* quant_table); +void idct_float_free(void* userdata); +void idct_float(const int16_t* input, const void* userdata, uint8_t* output); #endif /* !defined(_IDCT_H) */ diff --git a/idct_test.c b/idct_test.c index cf2556b..78374f8 100644 --- a/idct_test.c +++ b/idct_test.c @@ -25,23 +25,26 @@ void gen_random_coeffs(int16_t* dst, size_t len) // Test that the input is pretty close to the reference for random inputs. // (If the reference funtion is given in, this becomes a simple test of its // determinism.) -void test_random_inputs(idct_func_t* idct) +void test_random_inputs(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct) { int16_t coeff[DCTSIZE2]; uint32_t quant[DCTSIZE2]; uint8_t output[DCTSIZE2]; uint8_t reference[DCTSIZE2]; - + // Unit quantization (ie., no scaling). 
for (unsigned i = 0; i < DCTSIZE2; ++i) { quant[i] = 1; } + + void* userdata_reference = idct_reference_alloc(quant); + void* userdata = idct_alloc(quant); for (unsigned i = 0; i < 1000; ++i) { gen_random_coeffs(coeff, DCTSIZE2); - (*idct)(coeff, quant, output); - (idct_reference)(coeff, quant, reference); + (*idct)(coeff, userdata, output); + (idct_reference)(coeff, userdata_reference, reference); // Find the RMS difference. int diff_squared = 0; @@ -51,30 +54,37 @@ void test_random_inputs(idct_func_t* idct) assert(diff_squared <= 5); } + + idct_reference_free(userdata_reference); + idct_free(userdata); } // Test that a single DC coefficient becomes spread out to all blocks. -void test_dc_becomes_spread_out(idct_func_t* idct) +void test_dc_becomes_spread_out(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct) { int16_t coeff[DCTSIZE2] = { 0 }; uint32_t quant[DCTSIZE2]; uint8_t output[DCTSIZE2]; - + // Unit quantization (ie., no scaling). for (unsigned i = 0; i < DCTSIZE2; ++i) { quant[i] = 1; } + void* userdata = idct_alloc(quant); + for (unsigned i = 0; i < 255*8; ++i) { uint32_t reference_value = i / 8; coeff[0] = i; - (*idct)(coeff, quant, output); + (*idct)(coeff, userdata, output); for (unsigned i = 0; i < DCTSIZE2; ++i) { assert(abs(output[i] - reference_value) <= 1); } } + + idct_free(userdata); } double timediff(const struct timeval* a, const struct timeval* b) @@ -83,7 +93,7 @@ double timediff(const struct timeval* a, const struct timeval* b) (double)(b->tv_usec - a->tv_usec) * 1e-6; } -void test_performance(idct_func_t* idct) +void test_performance(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct) { const unsigned num_runs = (idct == idct_reference) ? 
5000 : 5000000; @@ -98,36 +108,40 @@ void test_performance(idct_func_t* idct) quant[i] = 1; } + void* userdata = idct_alloc(quant); + start_benchmark_timer(); for (unsigned i = 0; i < num_runs; ++i) { - (*idct)(coeff, quant, output); + (*idct)(coeff, userdata, output); } double diff = stop_benchmark_timer(); printf("%u runs in %.2f CPU seconds = %.2f IDCTs/sec\n", num_runs, diff, num_runs / diff); + + idct_free(userdata); } -void test_all_idct(idct_func_t* idct) +void test_all_idct(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct) { printf(" test_dc_becomes_spread_out()\n"); - test_dc_becomes_spread_out(idct); + test_dc_becomes_spread_out(idct_alloc, idct_free, idct); printf(" test_random_inputs()\n"); - test_random_inputs(idct); + test_random_inputs(idct_alloc, idct_free, idct); printf(" performance test: "); - test_performance(idct); + test_performance(idct_alloc, idct_free, idct); } int main(void) { printf("idct_reference:\n"); - test_all_idct(idct_reference); + test_all_idct(idct_reference_alloc, idct_reference_free, idct_reference); printf("idct_float:\n"); - test_all_idct(idct_float); + test_all_idct(idct_float_alloc, idct_float_free, idct_float); printf("All tests pass.\n"); return 0;