From f2779423add3f07faccc8332914428fc6eea6652 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Fri, 9 Jan 2009 21:01:05 +0100 Subject: [PATCH] Floating-point implementation of AA&N IDCT. Does not yet fold the scale factors into the quantization table. --- idct.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++++++ idct.h | 4 ++ idct_test.c | 3 ++ 3 files changed, 148 insertions(+) diff --git a/idct.c b/idct.c index 54a9dc9..056f451 100644 --- a/idct.c +++ b/idct.c @@ -37,3 +37,144 @@ void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* } } +// AA&N (Arai, Agui and Nakajima) floating-point IDCT. +// This IDCT is based on the same DCT that libjpeg uses -- in fact, exactly the +// same figure from the same book ("JPEG: Still Image Data Compression Standard", +// page 52, figure 4-8). However, it is coded from scratch, and uses the +// transposition method for converting DCT -> IDCT suggested in the book. +// (libjpeg seems to use some other method that yields similar, but not +// the same, code.) + +// As this is generally meant as a reference and not useful code (we expect +// a SIMD fixed-point algorithm to be used in most cases), it has not been +// attempted significantly optimized. We assume the compiler will be smart +// enough to do all the variable propagation for us anyway. + +// Scale factors; 1.0 / (sqrt(2.0) * cos(k * M_PI / 16.0)), except for the first which is 1. +static const double scalefac[] = { + 1.0, 0.7209598220069479, 0.765366864730180, 0.8504300947672564, + 1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502 +}; + +// 1D 8-point DCT. +static inline void idct1d_float(double y0, double y1, double y2, double y3, double y4, double y5, double y6, double y7, double *x) +{ + // constants + const double a1 = 0.7071067811865474; // sqrt(2) + const double a2 = 0.5411961001461971; // cos(3/8 pi) * sqrt(2) + const double a3 = a1; + const double a4 = 1.3065629648763766; // cos(pi/8) * sqrt(2) + const double a5 = 0.5 * (a4 - a2); + + // phase 1 + const double p1_0 = y0 * scalefac[0]; + const double p1_1 = y4 * scalefac[4]; + const double p1_2 = y2 * scalefac[2]; + const double p1_3 = y6 * scalefac[6]; + const double p1_4 = y5 * scalefac[5]; + const double p1_5 = y1 * scalefac[1]; + const double p1_6 = y7 * scalefac[7]; + const double p1_7 = y3 * scalefac[3]; + + // phase 2 + const double p2_0 = p1_0; + const double p2_1 = p1_1; + const double p2_2 = p1_2; + const double p2_3 = p1_3; + const double p2_4 = p1_4 - p1_7; + const double p2_5 = p1_5 + p1_6; + const double p2_6 = p1_5 - p1_6; + const double p2_7 = p1_4 + p1_7; + + // phase 3 + const double p3_0 = p2_0; + const double p3_1 = p2_1; + const double p3_2 = p2_2 - p2_3; + const double p3_3 = p2_2 + p2_3; + const double p3_4 = p2_4; + const double p3_5 = p2_5 - p2_7; + const double p3_6 = p2_6; + const double p3_7 = p2_5 + p2_7; + + // phase 4 + const double p4_0 = p3_0; + const double p4_1 = p3_1; + const double p4_2 = a1 * p3_2; + const double p4_3 = p3_3; + const double p4_4 = p3_4 * -a2 + (p3_4 + p3_6) * -a5; + const double p4_5 = a3 * p3_5; + const double p4_6 = p3_6 * a4 + (p3_4 + p3_6) * -a5; + const double p4_7 = p3_7; + + // phase 5 + const double p5_0 = p4_0 + p4_1; + const double p5_1 = p4_0 - p4_1; + const double p5_2 = p4_2; + const double p5_3 = p4_2 + p4_3; + const double p5_4 = p4_4; + const double p5_5 = p4_5; + const double p5_6 = p4_6; + const double p5_7 = p4_7; + + // phase 6 + const double p6_0 = p5_0 + p5_3; + const double p6_1 = p5_1 + p5_2; + const double p6_2 = p5_1 - p5_2; + const double p6_3 = p5_0 - p5_3; + const double p6_4 = -p5_4; + const double p6_5 = p5_5 - p5_4; + const double p6_6 = p5_5 + p5_6; + const double p6_7 = p5_6 + p5_7; + + // phase 7 + x[0] = p6_0 + p6_7; + x[1] = p6_1 + p6_6; + x[2] = p6_2 + p6_5; + x[3] = p6_3 + p6_4; + x[4] = p6_3 - p6_4; + x[5] = p6_2 - p6_5; + x[6] = p6_1 - p6_6; + x[7] = p6_0 - p6_7; +} + +void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output) +{ + double temp[DCTSIZE2]; + + // IDCT columns. + for (unsigned x = 0; x < DCTSIZE; ++x) { + idct1d_float(input[DCTSIZE * 0 + x] * (int32_t)quant_table[DCTSIZE * 0 + x], + input[DCTSIZE * 1 + x] * (int32_t)quant_table[DCTSIZE * 1 + x], + input[DCTSIZE * 2 + x] * (int32_t)quant_table[DCTSIZE * 2 + x], + input[DCTSIZE * 3 + x] * (int32_t)quant_table[DCTSIZE * 3 + x], + input[DCTSIZE * 4 + x] * (int32_t)quant_table[DCTSIZE * 4 + x], + input[DCTSIZE * 5 + x] * (int32_t)quant_table[DCTSIZE * 5 + x], + input[DCTSIZE * 6 + x] * (int32_t)quant_table[DCTSIZE * 6 + x], + input[DCTSIZE * 7 + x] * (int32_t)quant_table[DCTSIZE * 7 + x], + temp + x * DCTSIZE); + } + + // IDCT rows. + for (unsigned y = 0; y < DCTSIZE; ++y) { + double temp2[DCTSIZE]; + idct1d_float(temp[DCTSIZE * 0 + y], + temp[DCTSIZE * 1 + y], + temp[DCTSIZE * 2 + y], + temp[DCTSIZE * 3 + y], + temp[DCTSIZE * 4 + y], + temp[DCTSIZE * 5 + y], + temp[DCTSIZE * 6 + y], + temp[DCTSIZE * 7 + y], + temp2); + for (unsigned x = 0; x < DCTSIZE; ++x) { + double val = (1.0/8.0) * temp2[x]; + if (val < 0.0) { + output[y * DCTSIZE + x] = 0; + } else if (val >= 255.0) { + output[y * DCTSIZE + x] = 255; + } else { + output[y * DCTSIZE + x] = (uint8_t)(val + 0.5); + } + } + } +} diff --git a/idct.h b/idct.h index ec41993..97a6a5c 100644 --- a/idct.h +++ b/idct.h @@ -11,4 +11,8 @@ typedef void (idct_func_t)(const int16_t*, const uint32_t*, uint8_t*); // Non-factorized reference version (section A.3.3 of the JPEG standard). void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output); +// Floating-point IDCT due to Arai, Agui and Nakajima (also known as AA&N). +// See idct.c for more details. +void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output); + #endif /* !defined(_IDCT_H) */ diff --git a/idct_test.c b/idct_test.c index b4cec38..d6dc8b2 100644 --- a/idct_test.c +++ b/idct_test.c @@ -130,6 +130,9 @@ int main(void) printf("idct_reference:\n"); test_all_idct(idct_reference); + printf("idct_float:\n"); + test_all_idct(idct_float); + printf("All tests pass.\n"); return 0; } -- 2.39.2