Let IDCTs do precalculation outside the inner loops. Speeds things up (as expected).

author Steinar H. Gunderson <sesse@debian.org>

Sun, 11 Jan 2009 21:03:54 +0000 (22:03 +0100)

committer Steinar H. Gunderson <sesse@debian.org>

Sun, 11 Jan 2009 21:03:54 +0000 (22:03 +0100)
author Steinar H. Gunderson <sesse@debian.org>
Sun, 11 Jan 2009 21:03:54 +0000 (22:03 +0100)
committer Steinar H. Gunderson <sesse@debian.org>
Sun, 11 Jan 2009 21:03:54 +0000 (22:03 +0100)
diff --git a/idct.c b/idct.c

index b187a90c07996b4af5e8a6cd0ba5d954688b4293..ad1cc364642582b1711fb57122a7df186214f72c 100644 (file)
--- a/idct.c
+++ b/idct.c
@@ -1,9 +1,27 @@
  #include <math.h>
+#include <string.h>
+#include <stdlib.h>
  
  #include "idct.h"
  
-void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output)
+void* idct_reference_alloc(const uint32_t* quant_table)
  {
+       uint32_t* qt_copy = (uint32_t*)malloc(DCTSIZE2 * sizeof(uint32_t));
+       // FIXME: check for NULL return
+
+       memcpy(qt_copy, quant_table, DCTSIZE2 * sizeof(uint32_t));
+
+       return qt_copy;
+}
+
+void idct_reference_free(void* userdata)
+{
+       free(userdata);
+}
+
+void idct_reference(const int16_t* input, const void* userdata, uint8_t* output)
+{
+       const uint32_t* quant_table = (const uint32_t*)userdata;
         double temp[DCTSIZE2];
  
         for (unsigned y = 0; y < 8; ++y) {
@@ -56,6 +74,27 @@ static const double scalefac[] = {
         1.0, 1.2727585805728336, 1.847759065022573, 3.6245097854115502
  };
  
+// Premultiply the scale factors and the overall 1/8 factor into the quantization
+// table entries (and convert to double).
+void* idct_float_alloc(const uint32_t* quant_table)
+{
+       double* qt_copy = (double*)malloc(DCTSIZE2 * sizeof(double));
+
+       for (unsigned y = 0; y < DCTSIZE; ++y) {
+               for (unsigned x = 0; x < DCTSIZE; ++x) {
+                       qt_copy[y * DCTSIZE + x] = 0.125 * quant_table[y * DCTSIZE + x] *
+                               scalefac[x] * scalefac[y];
+               }
+       }
+
+       return qt_copy;
+}
+
+void idct_float_free(void* userdata)
+{
+       free(userdata);
+}
+
  // 1D 8-point IDCT.
  static inline void idct1d_float(double y0, double y1, double y2, double y3, double y4, double y5, double y6, double y7, double *x)
  {
@@ -67,14 +106,14 @@ static inline void idct1d_float(double y0, double y1, double y2, double y3, doub
         static const double a5 = 0.5 * (a4 - a2);
  
         // phase 1
-       const double p1_0 = y0 * scalefac[0];
-       const double p1_1 = y4 * scalefac[4];
-       const double p1_2 = y2 * scalefac[2];
-       const double p1_3 = y6 * scalefac[6];
-       const double p1_4 = y5 * scalefac[5];
-       const double p1_5 = y1 * scalefac[1];
-       const double p1_6 = y7 * scalefac[7];
-       const double p1_7 = y3 * scalefac[3];
+       const double p1_0 = y0;
+       const double p1_1 = y4;
+       const double p1_2 = y2;
+       const double p1_3 = y6;
+       const double p1_4 = y5;
+       const double p1_5 = y1;
+       const double p1_6 = y7;
+       const double p1_7 = y3;
  
         // phase 2
         const double p2_0 = p1_0;
@@ -137,20 +176,21 @@ static inline void idct1d_float(double y0, double y1, double y2, double y3, doub
         x[7] = p6_0 - p6_7;
  }
  
-void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output)
+void idct_float(const int16_t* input, const void* userdata, uint8_t* output)
  {
+       const double* quant_table = (const double*)userdata;
         double temp[DCTSIZE2];
  
         // IDCT columns.
         for (unsigned x = 0; x < DCTSIZE; ++x) {
-               idct1d_float(input[DCTSIZE * 0 + x] * (int32_t)quant_table[DCTSIZE * 0 + x],
-                            input[DCTSIZE * 1 + x] * (int32_t)quant_table[DCTSIZE * 1 + x],
-                            input[DCTSIZE * 2 + x] * (int32_t)quant_table[DCTSIZE * 2 + x],
-                            input[DCTSIZE * 3 + x] * (int32_t)quant_table[DCTSIZE * 3 + x],
-                            input[DCTSIZE * 4 + x] * (int32_t)quant_table[DCTSIZE * 4 + x],
-                            input[DCTSIZE * 5 + x] * (int32_t)quant_table[DCTSIZE * 5 + x],
-                            input[DCTSIZE * 6 + x] * (int32_t)quant_table[DCTSIZE * 6 + x],
-                            input[DCTSIZE * 7 + x] * (int32_t)quant_table[DCTSIZE * 7 + x],
+               idct1d_float(input[DCTSIZE * 0 + x] * quant_table[DCTSIZE * 0 + x],
+                            input[DCTSIZE * 1 + x] * quant_table[DCTSIZE * 1 + x],
+                            input[DCTSIZE * 2 + x] * quant_table[DCTSIZE * 2 + x],
+                            input[DCTSIZE * 3 + x] * quant_table[DCTSIZE * 3 + x],
+                            input[DCTSIZE * 4 + x] * quant_table[DCTSIZE * 4 + x],
+                            input[DCTSIZE * 5 + x] * quant_table[DCTSIZE * 5 + x],
+                            input[DCTSIZE * 6 + x] * quant_table[DCTSIZE * 6 + x],
+                            input[DCTSIZE * 7 + x] * quant_table[DCTSIZE * 7 + x],
                              temp + x * DCTSIZE);
         }
         
@@ -167,7 +207,7 @@ void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* outp
                              temp[DCTSIZE * 7 + y],
                              temp2);
                 for (unsigned x = 0; x < DCTSIZE; ++x) {
-                       double val = (1.0/8.0) * temp2[x];
+                       const double val = temp2[x];
                         if (val < 0.0) {
                                 output[y * DCTSIZE + x] = 0;
                         } else if (val >= 255.0) {
diff --git a/idct.h b/idct.h

index 97a6a5cc1f0ea7c9430748783754ef4f36e6bc42..4a16ea004fd8b440965de598cea2493126dcecaf 100644 (file)
--- a/idct.h
+++ b/idct.h
@@ -6,13 +6,26 @@
  #define DCTSIZE 8
  #define DCTSIZE2 64
  
-typedef void (idct_func_t)(const int16_t*, const uint32_t*, uint8_t*);
+// void* idct_example_alloc(const uint32_t* quant_table);
+typedef void* (idct_alloc_t)(const uint32_t*);
+
+// void idct_example_free(void* userdata);
+// userdata is the same as returned by the alloc function.
+typedef void (idct_free_t)(void*);
+
+// void idct_example(const int16_t* input, const void* userdata, uint8_t* output);
+// userdata is the same as returned by the alloc function.
+typedef void (idct_func_t)(const int16_t*, const void*, uint8_t*);
  
  // Non-factorized reference version (section A.3.3 of the JPEG standard).
-void idct_reference(const int16_t* input, const uint32_t* quant_table, uint8_t* output);
+void* idct_reference_alloc(const uint32_t* quant_table);
+void idct_reference_free(void* userdata);
+void idct_reference(const int16_t* input, const void* userdata, uint8_t* output);
  
  // Floating-point IDCT due to Arai, Agui and Nakajima (also known as AA&N).
  // See idct.c for more details.
-void idct_float(const int16_t* input, const uint32_t* quant_table, uint8_t* output);
+void* idct_float_alloc(const uint32_t* quant_table);
+void idct_float_free(void* userdata);
+void idct_float(const int16_t* input, const void* userdata, uint8_t* output);
  
  #endif /* !defined(_IDCT_H) */
diff --git a/idct_test.c b/idct_test.c

index cf2556bd2fc5ee87f85d2eac09d62d48e46e5955..78374f89860b4f788bcee3d2c62fed8c9ac8a3bb 100644 (file)
--- a/idct_test.c
+++ b/idct_test.c
@@ -25,23 +25,26 @@ void gen_random_coeffs(int16_t* dst, size_t len)
  // Test that the input is pretty close to the reference for random inputs. 
  // (If the reference function is given in, this becomes a simple test of its
  // determinism.)
-void test_random_inputs(idct_func_t* idct)
+void test_random_inputs(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
  {
         int16_t coeff[DCTSIZE2]; 
         uint32_t quant[DCTSIZE2];
         uint8_t output[DCTSIZE2];
         uint8_t reference[DCTSIZE2];
-               
+
         // Unit quantization (i.e., no scaling).
         for (unsigned i = 0; i < DCTSIZE2; ++i) {
                 quant[i] = 1;
         }
+       
+       void* userdata_reference = idct_reference_alloc(quant);
+       void* userdata = idct_alloc(quant);
  
         for (unsigned i = 0; i < 1000; ++i) {   
                 gen_random_coeffs(coeff, DCTSIZE2);
  
-               (*idct)(coeff, quant, output);
-               (idct_reference)(coeff, quant, reference);
+               (*idct)(coeff, userdata, output);
+               (idct_reference)(coeff, userdata_reference, reference);
  
                 // Find the RMS difference.
                 int diff_squared = 0;
@@ -51,30 +54,37 @@ void test_random_inputs(idct_func_t* idct)
  
                 assert(diff_squared <= 5);
         }
+
+       idct_reference_free(userdata_reference);
+       idct_free(userdata);
  }
  
  // Test that a single DC coefficient becomes spread out to all output samples of the block.
-void test_dc_becomes_spread_out(idct_func_t* idct)
+void test_dc_becomes_spread_out(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
  {
         int16_t coeff[DCTSIZE2] = { 0 }; 
         uint32_t quant[DCTSIZE2];
         uint8_t output[DCTSIZE2];
-
+       
         // Unit quantization (i.e., no scaling).
         for (unsigned i = 0; i < DCTSIZE2; ++i) {
                 quant[i] = 1;
         }
  
+       void* userdata = idct_alloc(quant);
+
         for (unsigned i = 0; i < 255*8; ++i) {  
                 uint32_t reference_value = i / 8;
                 coeff[0] = i;
  
-               (*idct)(coeff, quant, output);
+               (*idct)(coeff, userdata, output);
  
                 for (unsigned i = 0; i < DCTSIZE2; ++i) {
                         assert(abs(output[i] - reference_value) <= 1);
                 }
         }
+       
+       idct_free(userdata);
  }
  
  double timediff(const struct timeval* a, const struct timeval* b)
@@ -83,7 +93,7 @@ double timediff(const struct timeval* a, const struct timeval* b)
                 (double)(b->tv_usec - a->tv_usec) * 1e-6;
  }
  
-void test_performance(idct_func_t* idct)
+void test_performance(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
  {
         const unsigned num_runs = (idct == idct_reference) ? 5000 : 5000000;
  
@@ -98,36 +108,40 @@ void test_performance(idct_func_t* idct)
                 quant[i] = 1;
         }
  
+       void* userdata = idct_alloc(quant);
+
         start_benchmark_timer();
  
         for (unsigned i = 0; i < num_runs; ++i) {
-               (*idct)(coeff, quant, output);
+               (*idct)(coeff, userdata, output);
         }
         
         double diff = stop_benchmark_timer();
         printf("%u runs in %.2f CPU seconds = %.2f IDCTs/sec\n",
                 num_runs, diff, num_runs / diff);
+
+       idct_free(userdata);
  }
  
-void test_all_idct(idct_func_t* idct)
+void test_all_idct(idct_alloc_t* idct_alloc, idct_free_t* idct_free, idct_func_t* idct)
  {
         printf("  test_dc_becomes_spread_out()\n");
-       test_dc_becomes_spread_out(idct);       
+       test_dc_becomes_spread_out(idct_alloc, idct_free, idct);        
  
         printf("  test_random_inputs()\n");
-       test_random_inputs(idct);       
+       test_random_inputs(idct_alloc, idct_free, idct);        
  
         printf("  performance test: ");
-       test_performance(idct);
+       test_performance(idct_alloc, idct_free, idct);
  }
  
  int main(void)
  {
         printf("idct_reference:\n");
-       test_all_idct(idct_reference);
+       test_all_idct(idct_reference_alloc, idct_reference_free, idct_reference);
  
         printf("idct_float:\n");
-       test_all_idct(idct_float);
+       test_all_idct(idct_float_alloc, idct_float_free, idct_float);
  
         printf("All tests pass.\n");
         return 0;
author	Steinar H. Gunderson <sesse@debian.org>
	Sun, 11 Jan 2009 21:03:54 +0000 (22:03 +0100)
committer	Steinar H. Gunderson <sesse@debian.org>
	Sun, 11 Jan 2009 21:03:54 +0000 (22:03 +0100)
idct.c		patch \| blob \| history
idct.h		patch \| blob \| history
idct_test.c		patch \| blob \| history