git.sesse.net Git - vlc/blob - doc/transforms.py

   1 # Lossy compression algorithms very often make use of DCT or DFT calculations,
   2 # or variations of these calculations. This file is intended to be a short
   3 # reference about classical DCT and DFT algorithms.
   4
   5
   6 from random import random
   7 from math import pi, sin, cos, sqrt
   8 from cmath import exp
   9
  10 def exp_j (alpha):
  11     return exp (alpha * 1j)
  12
  13 def conjugate (c):
  14     c = c + 0j
  15     return c.real - 1j * c.imag
  16
  17 def vector (N):
  18     return [0j] * N
  19
  20
  21 # Let us start withthe canonical definition of the unscaled DFT algorithm :
  22 # (I can not draw sigmas in a text file so I'll use python code instead)  :)
  23
  24 def W (k, N):
  25     return exp_j ((-2*pi*k)/N)
  26
  27 def unscaled_DFT (N, input, output):
  28     for o in range(N):          # o is output index
  29         output[o] = 0
  30         for i in range(N):
  31             output[o] = output[o] + input[i] * W (i*o, N)
  32
  33 # This algorithm takes complex input and output. There are N*N complex
  34 # multiplications and N*(N-1) complex additions.
  35
  36
  37 # Of course this algorithm is an extremely naive implementation and there are
  38 # some ways to use the trigonometric properties of the coefficients to find
  39 # some decompositions that can accelerate the calculation by several orders
  40 # of magnitude... This is a well known and studied problem. One of the
  41 # available explanations of this process is at this url :
  42 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html
  43
  44
  45 # Let's start with the radix-2 decimation-in-time algorithm :
  46
  47 def unscaled_DFT_radix2_time (N, input, output):
  48     even_input = vector(N/2)
  49     odd_input = vector(N/2)
  50     even_output = vector(N/2)
  51     odd_output = vector(N/2)
  52
  53     for i in range(N/2):
  54         even_input[i] = input[2*i]
  55         odd_input[i] = input[2*i+1]
  56
  57     unscaled_DFT (N/2, even_input, even_output)
  58     unscaled_DFT (N/2, odd_input, odd_output)
  59
  60     for i in range(N/2):
  61         odd_output[i] = odd_output[i] * W (i, N)
  62
  63     for i in range(N/2):
  64         output[i] = even_output[i] + odd_output[i]
  65         output[i+N/2] = even_output[i] - odd_output[i]
  66
  67 # This algorithm takes complex input and output.
  68
  69 # We divide the DFT calculation into 2 DFT calculations of size N/2
  70 # We then do N/2 complex multiplies followed by N complex additions.
  71 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
  72 # multiplies... we will skip 1 for i=0 and 1 for i=N/4. Also for i=N/8 and for
  73 # i=3*N/8 the W(i,N) values can be special-cased to implement the complex
  74 # multiplication using only 2 real additions and 2 real multiplies)
  75
  76 # Also note that all the basic stages of this DFT algorithm are easily
  77 # reversible, so we can calculate the IDFT with the same complexity.
  78
  79
  80 # A varient of this is the radix-2 decimation-in-frequency algorithm :
  81
  82 def unscaled_DFT_radix2_freq (N, input, output):
  83     even_input = vector(N/2)
  84     odd_input = vector(N/2)
  85     even_output = vector(N/2)
  86     odd_output = vector(N/2)
  87
  88     for i in range(N/2):
  89         even_input[i] = input[i] + input[i+N/2]
  90         odd_input[i] = input[i] - input[i+N/2]
  91
  92     for i in range(N/2):
  93         odd_input[i] = odd_input[i] * W (i, N)
  94
  95     unscaled_DFT (N/2, even_input, even_output)
  96     unscaled_DFT (N/2, odd_input, odd_output)
  97
  98     for i in range(N/2):
  99         output[2*i] = even_output[i]
 100         output[2*i+1] = odd_output[i]
 101
 102 # Note that the decimation-in-time and the decimation-in-frequency varients
 103 # have exactly the same complexity, they only do the operations in a different
 104 # order.
 105
 106 # Actually, if you look at the decimation-in-time varient of the DFT, and
 107 # reverse it to calculate an IDFT, you get something that is extremely close
 108 # to the decimation-in-frequency DFT algorithm...
 109
 110
 111 # The radix-4 algorithms are slightly more efficient : they take into account
 112 # the fact that with complex numbers, multiplications by j and -j are also
 113 # "free"... i.e. when you code them using real numbers, they translate into
 114 # a few data moves but no real operation.
 115
 116 # Let's start with the radix-4 decimation-in-time algorithm :
 117
 118 def unscaled_DFT_radix4_time (N, input, output):
 119     input_0 = vector(N/4)
 120     input_1 = vector(N/4)
 121     input_2 = vector(N/4)
 122     input_3 = vector(N/4)
 123     output_0 = vector(N/4)
 124     output_1 = vector(N/4)
 125     output_2 = vector(N/4)
 126     output_3 = vector(N/4)
 127     tmp_0 = vector(N/4)
 128     tmp_1 = vector(N/4)
 129     tmp_2 = vector(N/4)
 130     tmp_3 = vector(N/4)
 131
 132     for i in range(N/4):
 133         input_0[i] = input[4*i]
 134         input_1[i] = input[4*i+1]
 135         input_2[i] = input[4*i+2]
 136         input_3[i] = input[4*i+3]
 137
 138     unscaled_DFT (N/4, input_0, output_0)
 139     unscaled_DFT (N/4, input_1, output_1)
 140     unscaled_DFT (N/4, input_2, output_2)
 141     unscaled_DFT (N/4, input_3, output_3)
 142
 143     for i in range(N/4):
 144         output_1[i] = output_1[i] * W (i, N)
 145         output_2[i] = output_2[i] * W (2*i, N)
 146         output_3[i] = output_3[i] * W (3*i, N)
 147
 148     for i in range(N/4):
 149         tmp_0[i] = output_0[i] + output_2[i]
 150         tmp_1[i] = output_0[i] - output_2[i]
 151         tmp_2[i] = output_1[i] + output_3[i]
 152         tmp_3[i] = output_1[i] - output_3[i]
 153
 154     for i in range(N/4):
 155         output[i] = tmp_0[i] + tmp_2[i]
 156         output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
 157         output[i+N/2] = tmp_0[i] - tmp_2[i]
 158         output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
 159
 160 # This algorithm takes complex input and output.
 161
 162 # We divide the DFT calculation into 4 DFT calculations of size N/4
 163 # We then do 3*N/4 complex multiplies followed by 2*N complex additions.
 164 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
 165 # multiplies... we will skip 3 for i=0 and 1 for i=N/8. Also for i=N/8
 166 # the remaining W(i,N) and W(3*i,N) multiplies can be implemented using only
 167 # 2 real additions and 2 real multiplies. For i=N/16 and i=3*N/16 we can also
 168 # optimise the W(2*i/N) multiply this way.
 169
 170 # If we wanted to do the same decomposition with one radix-2 decomposition
 171 # of size N and 2 radix-2 decompositions of size N/2, the total cost would be
 172 # N complex multiplies and 2*N complex additions. Thus we see that the
 173 # decomposition of one DFT calculation of size N into 4 calculations of size
 174 # N/4 using the radix-4 algorithm instead of the radix-2 algorithm saved N/4
 175 # complex multiplies...
 176
 177
 178 # The radix-4 decimation-in-frequency algorithm is similar :
 179
 180 def unscaled_DFT_radix4_freq (N, input, output):
 181     input_0 = vector(N/4)
 182     input_1 = vector(N/4)
 183     input_2 = vector(N/4)
 184     input_3 = vector(N/4)
 185     output_0 = vector(N/4)
 186     output_1 = vector(N/4)
 187     output_2 = vector(N/4)
 188     output_3 = vector(N/4)
 189     tmp_0 = vector(N/4)
 190     tmp_1 = vector(N/4)
 191     tmp_2 = vector(N/4)
 192     tmp_3 = vector(N/4)
 193
 194     for i in range(N/4):
 195         tmp_0[i] = input[i] + input[i+N/2]
 196         tmp_1[i] = input[i+N/4] + input[i+3*N/4]
 197         tmp_2[i] = input[i] - input[i+N/2]
 198         tmp_3[i] = input[i+N/4] - input[i+3*N/4]
 199
 200     for i in range(N/4):
 201         input_0[i] = tmp_0[i] + tmp_1[i]
 202         input_1[i] = tmp_2[i] - 1j * tmp_3[i]
 203         input_2[i] = tmp_0[i] - tmp_1[i]
 204         input_3[i] = tmp_2[i] + 1j * tmp_3[i]
 205
 206     for i in range(N/4):
 207         input_1[i] = input_1[i] * W (i, N)
 208         input_2[i] = input_2[i] * W (2*i, N)
 209         input_3[i] = input_3[i] * W (3*i, N)
 210
 211     unscaled_DFT (N/4, input_0, output_0)
 212     unscaled_DFT (N/4, input_1, output_1)
 213     unscaled_DFT (N/4, input_2, output_2)
 214     unscaled_DFT (N/4, input_3, output_3)
 215
 216     for i in range(N/4):
 217         output[4*i] = output_0[i]
 218         output[4*i+1] = output_1[i]
 219         output[4*i+2] = output_2[i]
 220         output[4*i+3] = output_3[i]
 221
 222 # Once again the complexity is exactly the same as for the radix-4
 223 # decimation-in-time DFT algorithm, only the order of the operations is
 224 # different.
 225
 226
 227 # Now let us reorder the radix-4 algorithms in a different way :
 228
 229 #def unscaled_DFT_radix4_time (N, input, output):
 230 #   input_0 = vector(N/4)
 231 #   input_1 = vector(N/4)
 232 #   input_2 = vector(N/4)
 233 #   input_3 = vector(N/4)
 234 #   output_0 = vector(N/4)
 235 #   output_1 = vector(N/4)
 236 #   output_2 = vector(N/4)
 237 #   output_3 = vector(N/4)
 238 #   tmp_0 = vector(N/4)
 239 #   tmp_1 = vector(N/4)
 240 #   tmp_2 = vector(N/4)
 241 #   tmp_3 = vector(N/4)
 242 #
 243 #   for i in range(N/4):
 244 #       input_0[i] = input[4*i]
 245 #       input_2[i] = input[4*i+2]
 246 #
 247 #   unscaled_DFT (N/4, input_0, output_0)
 248 #   unscaled_DFT (N/4, input_2, output_2)
 249 #
 250 #   for i in range(N/4):
 251 #       output_2[i] = output_2[i] * W (2*i, N)
 252 #
 253 #   for i in range(N/4):
 254 #       tmp_0[i] = output_0[i] + output_2[i]
 255 #       tmp_1[i] = output_0[i] - output_2[i]
 256 #
 257 #   for i in range(N/4):
 258 #       input_1[i] = input[4*i+1]
 259 #       input_3[i] = input[4*i+3]
 260 #
 261 #   unscaled_DFT (N/4, input_1, output_1)
 262 #   unscaled_DFT (N/4, input_3, output_3)
 263 #
 264 #   for i in range(N/4):
 265 #       output_1[i] = output_1[i] * W (i, N)
 266 #       output_3[i] = output_3[i] * W (3*i, N)
 267 #
 268 #   for i in range(N/4):
 269 #       tmp_2[i] = output_1[i] + output_3[i]
 270 #       tmp_3[i] = output_1[i] - output_3[i]
 271 #
 272 #   for i in range(N/4):
 273 #       output[i] = tmp_0[i] + tmp_2[i]
 274 #       output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
 275 #       output[i+N/2] = tmp_0[i] - tmp_2[i]
 276 #       output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
 277
 278 # We didnt do anything here, only reorder the operations. But now, look at the
 279 # first part of this function, up to the calculations of tmp0 and tmp1 : this
 280 # is extremely similar to the radix-2 decimation-in-time algorithm ! or more
 281 # precisely, it IS the radix-2 decimation-in-time algorithm, with size N/2,
 282 # applied on a vector representing the even input coefficients, and giving
 283 # an output vector that is the concatenation of tmp0 and tmp1.
 284 # This is important to notice, because this means we can now choose to
 285 # calculate tmp0 and tmp1 using any DFT algorithm that we want, and we know
 286 # that some of them are more efficient than radix-2...
 287
 288 # This leads us directly to the split-radix decimation-in-time algorithm :
 289
 290 def unscaled_DFT_split_radix_time (N, input, output):
 291     even_input = vector(N/2)
 292     input_1 = vector(N/4)
 293     input_3 = vector(N/4)
 294     even_output = vector(N/2)
 295     output_1 = vector(N/4)
 296     output_3 = vector(N/4)
 297     tmp_0 = vector(N/4)
 298     tmp_1 = vector(N/4)
 299
 300     for i in range(N/2):
 301         even_input[i] = input[2*i]
 302
 303     for i in range(N/4):
 304         input_1[i] = input[4*i+1]
 305         input_3[i] = input[4*i+3]
 306
 307     unscaled_DFT (N/2, even_input, even_output)
 308     unscaled_DFT (N/4, input_1, output_1)
 309     unscaled_DFT (N/4, input_3, output_3)
 310
 311     for i in range(N/4):
 312         output_1[i] = output_1[i] * W (i, N)
 313         output_3[i] = output_3[i] * W (3*i, N)
 314
 315     for i in range(N/4):
 316         tmp_0[i] = output_1[i] + output_3[i]
 317         tmp_1[i] = output_1[i] - output_3[i]
 318
 319     for i in range(N/4):
 320         output[i] = even_output[i] + tmp_0[i]
 321         output[i+N/4] = even_output[i+N/4] - 1j * tmp_1[i]
 322         output[i+N/2] = even_output[i] - tmp_0[i]
 323         output[i+3*N/4] = even_output[i+N/4] + 1j * tmp_1[i]
 324
 325 # This function performs one DFT of size N/2 and two of size N/4, followed by
 326 # N/2 complex multiplies and 3*N/2 complex additions.
 327 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
 328 # multiplies... we will skip 2 for i=0. Also for i=N/8 the W(i,N) and W(3*i,N)
 329 # multiplies can be implemented using only 2 real additions and 2 real
 330 # multiplies)
 331
 332
 333 # We can similarly define the split-radix decimation-in-frequency DFT :
 334
 335 def unscaled_DFT_split_radix_freq (N, input, output):
 336     even_input = vector(N/2)
 337     input_1 = vector(N/4)
 338     input_3 = vector(N/4)
 339     even_output = vector(N/2)
 340     output_1 = vector(N/4)
 341     output_3 = vector(N/4)
 342     tmp_0 = vector(N/4)
 343     tmp_1 = vector(N/4)
 344
 345     for i in range(N/2):
 346         even_input[i] = input[i] + input[i+N/2]
 347
 348     for i in range(N/4):
 349         tmp_0[i] = input[i] - input[i+N/2]
 350         tmp_1[i] = input[i+N/4] - input[i+3*N/4]
 351
 352     for i in range(N/4):
 353         input_1[i] = tmp_0[i] - 1j * tmp_1[i]
 354         input_3[i] = tmp_0[i] + 1j * tmp_1[i]
 355
 356     for i in range(N/4):
 357         input_1[i] = input_1[i] * W (i, N)
 358         input_3[i] = input_3[i] * W (3*i, N)
 359
 360     unscaled_DFT (N/2, even_input, even_output)
 361     unscaled_DFT (N/4, input_1, output_1)
 362     unscaled_DFT (N/4, input_3, output_3)
 363
 364     for i in range(N/2):
 365         output[2*i] = even_output[i]
 366
 367     for i in range(N/4):
 368         output[4*i+1] = output_1[i]
 369         output[4*i+3] = output_3[i]
 370
 371 # The complexity is again the same as for the decimation-in-time varient.
 372
 373
 374 # Now let us now summarize our various algorithms for DFT decomposition :
 375
 376 # radix-2 : DFT(N) -> 2*DFT(N/2) using N/2 multiplies and N additions
 377 # radix-4 : DFT(N) -> 4*DFT(N/2) using 3*N/4 multiplies and 2*N additions
 378 # split-radix : DFT(N) -> DFT(N/2) + 2*DFT(N/4) using N/2 muls and 3*N/2 adds
 379
 380 # (we are always speaking of complex multiplies and complex additions... a
 381 # complex addition is implemented with 2 real additions, and a complex
 382 # multiply is implemented with either 2 adds and 4 muls or 3 adds and 3 muls,
 383 # so we will keep a separate count of these)
 384
 385 # If we want to take into account the special values of W(i,N), we can remove
 386 # a few complex multiplies. Supposing N>=16 we can remove :
 387 # radix-2 : remove 2 complex multiplies, simplify 2 others
 388 # radix-4 : remove 4 complex multiplies, simplify 4 others
 389 # split-radix : remove 2 complex multiplies, simplify 2 others
 390
 391 # This gives the following table for the complexity of a complex DFT :
 392 #       N       real additions  real multiplies complex multiplies
 393 #       1               0               0               0
 394 #       2               4               0               0
 395 #       4              16               0               0
 396 #       8              52               4               0
 397 #      16             136               8               4
 398 #      32             340              20              16
 399 #      64             808              40              52
 400 #     128            1876              84             144
 401 #     256            4264             168             372
 402 #     512            9556             340             912
 403 #    1024           21160             680            2164
 404 #    2048           46420            1364            5008
 405 #    4096          101032            2728           11380
 406 #    8192          218452            5460           25488
 407 #   16384          469672           10920           56436
 408 #   32768         1004884           21844          123792
 409 #   65536         2140840           43688          269428
 410
 411 # If we chose to implement complex multiplies with 3 real muls + 3 real adds,
 412 # then these results are consistent with the table at the end of the
 413 # www.cmlab.csie.ntu.edu.tw DFT tutorial that I mentionned earlier.
 414
 415
 416 # Now another important case for the DFT is the one where the inputs are
 417 # real numbers instead of complex ones. We have to find ways to optimize for
 418 # this important case.
 419
 420 # If the DFT inputs are real-valued, then the DFT outputs have nice properties
 421 # too : output[0] and output[N/2] will be real numbers, and output[N-i] will
 422 # be the conjugate of output[i] for i in 0...N/2-1
 423
 424 # Likewise, if the DFT inputs are purely imaginary numbers, then the DFT
 425 # outputs will have special properties too : output[0] and output[N/2] will be
 426 # purely imaginary, and output[N-i] will be the opposite of the conjugate of
 427 # output[i] for i in 0...N/2-1
 428
 429 # We can use these properties to calculate two real-valued DFT at once :
 430
 431 def two_real_unscaled_DFT (N, input1, input2, output1, output2):
 432     input = vector(N)
 433     output = vector(N)
 434
 435     for i in range(N):
 436         input[i] = input1[i] + 1j * input2[i]
 437
 438     unscaled_DFT (N, input, output)
 439
 440     output1[0] = output[0].real + 0j
 441     output2[0] = output[0].imag + 0j
 442
 443     for i in range(N/2)[1:]:
 444         output1[i] = 0.5 * (output[i] + conjugate(output[N-i]))
 445         output2[i] = -0.5j * (output[i] - conjugate(output[N-i]))
 446
 447         output1[N-i] = conjugate(output1[i])
 448         output2[N-i] = conjugate(output2[i])
 449
 450     output1[N/2] = output[N/2].real + 0j
 451     output2[N/2] = output[N/2].imag + 0j
 452
 453 # This routine does a total of N-2 complex additions and N-2 complex
 454 # multiplies by 0.5
 455
 456 # This routine can also be inverted to calculate the IDFT of two vectors at
 457 # once if we know that the outputs will be real-valued.
 458
 459
 460 # If we have only one real-valued DFT calculation to do, we can still cut this
 461 # calculation in several parts using one of the decimate-in-time methods
 462 # (so that the different parts are still real-valued)
 463
 464 # As with complex DFT calculations, the best method is to use a split radix.
 465 # There are a lot of symetries in the DFT outputs that we can exploit to
 466 # reduce the number of operations...
 467
 468 def real_unscaled_DFT_split_radix_time_1 (N, input, output):
 469     even_input = vector(N/2)
 470     even_output = vector(N/2)
 471     input_1 = vector(N/4)
 472     output_1 = vector(N/4)
 473     input_3 = vector(N/4)
 474     output_3 = vector(N/4)
 475     tmp_0 = vector(N/4)
 476     tmp_1 = vector(N/4)
 477
 478     for i in range(N/2):
 479         even_input[i] = input[2*i]
 480
 481     for i in range(N/4):
 482         input_1[i] = input[4*i+1]
 483         input_3[i] = input[4*i+3]
 484
 485     unscaled_DFT (N/2, even_input, even_output)
 486     # this is again a real DFT !
 487     # we will only use even_output[i] for i in 0 ... N/4 included. we know that
 488     # even_output[N/2-i] is the conjugate of even_output[i]... also we know
 489     # that even_output[0] and even_output[N/4] are purely real.
 490
 491     unscaled_DFT (N/4, input_1, output_1)
 492     unscaled_DFT (N/4, input_3, output_3)
 493     # these are real DFTs too... with symetries in the outputs... once again
 494
 495     tmp_0[0] = output_1[0] + output_3[0]        # real numbers
 496     tmp_1[0] = output_1[0] - output_3[0]        # real numbers
 497
 498     tmp__0 = (output_1[N/8] + output_3[N/8]) * sqrt(0.5)        # real numbers
 499     tmp__1 = (output_1[N/8] - output_3[N/8]) * sqrt(0.5)        # real numbers
 500     tmp_0[N/8] = tmp__1 - 1j * tmp__0           # real + 1j * real
 501     tmp_1[N/8] = tmp__0 - 1j * tmp__1           # real + 1j * real
 502
 503     for i in range(N/8)[1:]:
 504         output_1[i] = output_1[i] * W (i, N)
 505         output_3[i] = output_3[i] * W (3*i, N)
 506
 507         tmp_0[i] = output_1[i] + output_3[i]
 508         tmp_1[i] = output_1[i] - output_3[i]
 509
 510         tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
 511         tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])
 512
 513     output[0] = even_output[0] + tmp_0[0]               # real numbers
 514     output[N/4] = even_output[N/4] - 1j * tmp_1[0]      # real + 1j * real
 515     output[N/2] = even_output[0] - tmp_0[0]             # real numbers
 516     output[3*N/4] = even_output[N/4] + 1j * tmp_1[0]    # real + 1j * real
 517
 518     for i in range(N/4)[1:]:
 519         output[i] = even_output[i] + tmp_0[i]
 520         output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]
 521
 522         output[N-i] = conjugate(output[i])
 523         output[3*N/4-i] = conjugate(output[i+N/4])
 524
 525 # This function performs one real DFT of size N/2 and two real DFT of size
 526 # N/4, followed by 6 real additions, 2 real multiplies, 3*N/4-4 complex
 527 # additions and N/4-2 complex multiplies.
 528
 529
 530 # We can also try to combine the two real DFT of size N/4 into a single complex
 531 # DFT :
 532
 533 def real_unscaled_DFT_split_radix_time_2 (N, input, output):
 534     even_input = vector(N/2)
 535     even_output = vector(N/2)
 536     odd_input = vector(N/4)
 537     odd_output = vector(N/4)
 538     tmp_0 = vector(N/4)
 539     tmp_1 = vector(N/4)
 540
 541     for i in range(N/2):
 542         even_input[i] = input[2*i]
 543
 544     for i in range(N/4):
 545         odd_input[i] = input[4*i+1] + 1j * input[4*i+3]
 546
 547     unscaled_DFT (N/2, even_input, even_output)
 548     # this is again a real DFT !
 549     # we will only use even_output[i] for i in 0 ... N/4 included. we know that
 550     # even_output[N/2-i] is the conjugate of even_output[i]... also we know
 551     # that even_output[0] and even_output[N/4] are purely real.
 552
 553     unscaled_DFT (N/4, odd_input, odd_output)
 554     # but this one is a complex DFT so no special properties here
 555
 556     output_1 = odd_output[0].real
 557     output_3 = odd_output[0].imag
 558     tmp_0[0] = output_1 + output_3      # real numbers
 559     tmp_1[0] = output_1 - output_3      # real numbers
 560
 561     output_1 = odd_output[N/8].real
 562     output_3 = odd_output[N/8].imag
 563     tmp__0 = (output_1 + output_3) * sqrt(0.5)  # real numbers
 564     tmp__1 = (output_1 - output_3) * sqrt(0.5)  # real numbers
 565     tmp_0[N/8] = tmp__1 - 1j * tmp__0           # real + 1j * real
 566     tmp_1[N/8] = tmp__0 - 1j * tmp__1           # real + 1j * real
 567
 568     for i in range(N/8)[1:]:
 569         output_1 = odd_output[i] + conjugate(odd_output[N/4-i])
 570         output_3 = odd_output[i] - conjugate(odd_output[N/4-i])
 571
 572         output_1 = output_1 * 0.5 * W (i, N)
 573         output_3 = output_3 * -0.5j * W (3*i, N)
 574
 575         tmp_0[i] = output_1 + output_3
 576         tmp_1[i] = output_1 - output_3
 577
 578         tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
 579         tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])
 580
 581     output[0] = even_output[0] + tmp_0[0]               # real numbers
 582     output[N/4] = even_output[N/4] - 1j * tmp_1[0]      # real + 1j * real
 583     output[N/2] = even_output[0] - tmp_0[0]             # real numbers
 584     output[3*N/4] = even_output[N/4] + 1j * tmp_1[0]    # real + 1j * real
 585
 586     for i in range(N/4)[1:]:
 587         output[i] = even_output[i] + tmp_0[i]
 588         output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]
 589
 590         output[N-i] = conjugate(output[i])
 591         output[3*N/4-i] = conjugate(output[i+N/4])
 592
 593 # This function performs one real DFT of size N/2 and one complex DFT of size
 594 # N/4, followed by 6 real additions, 2 real multiplies, N-6 complex additions
 595 # and N/4-2 complex multiplies.
 596
 597 # After comparing the performance, it turns out that for real-valued DFT, the
 598 # version of the algorithm that subdivides the calculation into one real
 599 # DFT of size N/2 and two real DFT of size N/4 is the most efficient one.
 600 # The other version gives exactly the same number of multiplies and a few more
 601 # real additions.
 602
 603
 604 # Now we can also try the decimate-in-frequency method for a real-valued DFT.
 605 # Using the split-radix algorithm, and by taking into account the symetries of
 606 # the outputs :
 607
 608 def real_unscaled_DFT_split_radix_freq (N, input, output):
 609     even_input = vector(N/2)
 610     input_1 = vector(N/4)
 611     even_output = vector(N/2)
 612     output_1 = vector(N/4)
 613     tmp_0 = vector(N/4)
 614     tmp_1 = vector(N/4)
 615
 616     for i in range(N/2):
 617         even_input[i] = input[i] + input[i+N/2]
 618
 619     for i in range(N/4):
 620         tmp_0[i] = input[i] - input[i+N/2]
 621         tmp_1[i] = input[i+N/4] - input[i+3*N/4]
 622
 623     for i in range(N/4):
 624         input_1[i] = tmp_0[i] - 1j * tmp_1[i]
 625
 626     for i in range(N/4):
 627         input_1[i] = input_1[i] * W (i, N)
 628
 629     unscaled_DFT (N/2, even_input, even_output)
 630     # This is still a real-valued DFT
 631
 632     unscaled_DFT (N/4, input_1, output_1)
 633     # But that one is a complex-valued DFT
 634
 635     for i in range(N/2):
 636         output[2*i] = even_output[i]
 637
 638     for i in range(N/4):
 639         output[4*i+1] = output_1[i]
 640         output[N-1-4*i] = conjugate(output_1[i])
 641
 642 # I think this implementation is much more elegant than the decimate-in-time
 643 # version ! It looks very much like the complex-valued version, all we had to
 644 # do was remove one of the complex-valued internal DFT calls because we could
 645 # deduce the outputs by using the symetries of the problem.
 646
 647 # As for performance, we did N real additions, N/4 complex multiplies (a bit
 648 # less actually, because W(0,N) = 1 and W(N/8,N) is a "simple" multiply), then
 649 # one real DFT of size N/2 and one complex DFT of size N/4.
 650
 651 # It turns out that even if the methods are so different, the number of
 652 # operations is exactly the same as for the best of the two decimation-in-time
 653 # methods that we tried.
 654
 655
 656 # This gives us the following performance for real-valued DFT :
 657 #       N       real additions  real multiplies complex multiplies
 658 #       2               2               0               0
 659 #       4               6               0               0
 660 #       8              20               2               0
 661 #      16              54               4               2
 662 #      32             140              10               8
 663 #      64             342              20              26
 664 #     128             812              42              72
 665 #     256            1878              84             186
 666 #     512            4268             170             456
 667 #    1024            9558             340            1082
 668 #    2048           21164             682            2504
 669 #    4096           46422            1364            5690
 670 #    8192          101036            2730           12744
 671 #   16384          218454            5460           28218
 672 #   32768          469676           10922           61896
 673 #   65536         1004886           21844          134714
 674
 675
 676 # As an example, this is an implementation of the real-valued DFT8 :
 677
 678 def DFT8 (input, output):
 679     even_0 = input[0] + input[4]
 680     even_1 = input[1] + input[5]
 681     even_2 = input[2] + input[6]
 682     even_3 = input[3] + input[7]
 683
 684     tmp_0 = even_0 + even_2
 685     tmp_1 = even_0 - even_2
 686     tmp_2 = even_1 + even_3
 687     tmp_3 = even_1 - even_3
 688
 689     output[0] = tmp_0 + tmp_2
 690     output[2] = tmp_1 - 1j * tmp_3
 691     output[4] = tmp_0 - tmp_2
 692
 693     odd_0_r = input[0] - input[4]
 694     odd_0_i = input[2] - input[6]
 695
 696     tmp_0 = input[1] - input[5]
 697     tmp_1 = input[3] - input[7]
 698     odd_1_r = (tmp_0 - tmp_1) * sqrt(0.5)
 699     odd_1_i = (tmp_0 + tmp_1) * sqrt(0.5)
 700
 701     output[1] = (odd_0_r + odd_1_r) - 1j * (odd_0_i + odd_1_i)
 702     output[5] = (odd_0_r - odd_1_r) - 1j * (odd_0_i - odd_1_i)
 703
 704     output[3] = conjugate(output[5])
 705     output[6] = conjugate(output[2])
 706     output[7] = conjugate(output[1])
 707
 708
 709 # Also a basic implementation of the real-valued DFT4 :
 710
 711 def DFT4 (input, output):
 712     tmp_0 = input[0] + input[2]
 713     tmp_1 = input[0] - input[2]
 714     tmp_2 = input[1] + input[3]
 715     tmp_3 = input[1] - input[3]
 716
 717     output[0] = tmp_0 + tmp_2
 718     output[1] = tmp_1 - 1j * tmp_3
 719     output[2] = tmp_0 - tmp_2
 720     output[3] = tmp_1 + 1j * tmp_3
 721
 722
 723 # A similar idea might be used to calculate only the real part of the output
 724 # of a complex DFT : we take an DFT algorithm for real inputs and complex
 725 # outputs and we simply reverse it. The resulting algorithm will only work
 726 # with inputs that satisfy the conjugaison rule (input[i] is the conjugate of
 727 # input[N-i]) so we can do a first pass to modify the input so that it follows
 728 # this rule. An example implementation is as follows (adapted from the
 729 # unscaled_DFT_split_radix_time algorithm) :
 730
 731 def complex2real_unscaled_DFT_split_radix_time (N, input, output):
 732     even_input = vector(N/2)
 733     input_1 = vector(N/4)
 734     even_output = vector(N/2)
 735     output_1 = vector(N/4)
 736
 737     for i in range(N/2):
 738         even_input[i] = input[2*i]
 739
 740     for i in range(N/4):
 741         input_1[i] = input[4*i+1] + conjugate(input[N-1-4*i])
 742
 743     unscaled_DFT (N/2, even_input, even_output)
 744     unscaled_DFT (N/4, input_1, output_1)
 745
 746     for i in range(N/4):
 747         output_1[i] = output_1[i] * W (i, N)
 748
 749     for i in range(N/4):
 750         output[i] = even_output[i] + output_1[i].real
 751         output[i+N/4] = even_output[i+N/4] + output_1[i].imag
 752         output[i+N/2] = even_output[i] - output_1[i].real
 753         output[i+3*N/4] = even_output[i+N/4] - output_1[i].imag
 754
 755 # This algorithm does N/4 complex additions, N/4-1 complex multiplies
 756 # (including one "simple" multiply for i=N/8), N real additions, one
 757 # "complex-to-real" DFT of size N/2, and one complex DFT of size N/4.
 758 # Also, in the complex DFT of size N/4, we do not care about the imaginary
 759 # part of output_1[0], which in practice allows us to save one real addition.
 760
 761 # This gives us the following performance for complex DFT with real outputs :
 762 #       N       real additions  real multiplies complex multiplies
 763 #       1               0               0               0
 764 #       2               2               0               0
 765 #       4               8               0               0
 766 #       8              25               2               0
 767 #      16              66               4               2
 768 #      32             167              10               8
 769 #      64             400              20              26
 770 #     128             933              42              72
 771 #     256            2126              84             186
 772 #     512            4771             170             456
 773 #    1024           10572             340            1082
 774 #    2048           23201             682            2504
 775 #    4096           50506            1364            5690
 776 #    8192          109215            2730           12744
 777 #   16384          234824            5460           28218
 778 #   32768          502429           10922           61896
 779 #   65536         1070406           21844          134714
 780
 781
 782 # Now let's talk about the DCT algorithm. The canonical definition for it is
 783 # as follows :
 784
 785 def C (k, N):
 786     return cos ((k*pi)/(2*N))
 787
 788 def unscaled_DCT (N, input, output):
 789     for o in range(N):          # o is output index
 790         output[o] = 0
 791         for i in range(N):      # i is input index
 792             output[o] = output[o] + input[i] * C ((2*i+1)*o, N)
 793
 794 # This trivial algorithm uses N*N multiplications and N*(N-1) additions.
 795
 796
 797 # One possible decomposition on this calculus is to use the fact that C (i, N)
 798 # and C (2*N-i, N) are opposed. This can lead to this decomposition :
 799
 800 #def unscaled_DCT (N, input, output):
 801 #   even_input = vector (N)
 802 #   odd_input = vector (N)
 803 #   even_output = vector (N)
 804 #   odd_output = vector (N)
 805 #
 806 #   for i in range(N/2):
 807 #       even_input[i] = input[i] + input[N-1-i]
 808 #       odd_input[i] = input[i] - input[N-1-i]
 809 #
 810 #   unscaled_DCT (N, even_input, even_output)
 811 #   unscaled_DCT (N, odd_input, odd_output)
 812 #
 813 #   for i in range(N/2):
 814 #       output[2*i] = even_output[2*i]
 815 #       output[2*i+1] = odd_output[2*i+1]
 816
 817 # Now the even part can easily be calculated : by looking at the C(k,N)
 818 # formula, we see that the even part is actually an unscaled DCT of size N/2.
 819 # The odd part looks like a DCT of size N/2, but the coefficients are
 820 # actually C ((2*i+1)*(2*o+1), 2*N) instead of C ((2*i+1)*o, N).
 821
 822 # We use a trigonometric relation here :
 823 # 2 * C ((a+b)/2, N) * C ((a-b)/2, N) = C (a, N) + C (b, N)
 824 # Thus with a = (2*i+1)*o and b = (2*i+1)*(o+1) :
 825 # 2 * C((2*i+1)*(2*o+1),2N) * C(2*i+1,2N) = C((2*i+1)*o,N) + C((2*i+1)*(o+1),N)
 826
 827 # This leads us to the Lee DCT algorithm :
 828
 829 def unscaled_DCT_Lee (N, input, output):
 830     even_input = vector(N/2)
 831     odd_input = vector(N/2)
 832     even_output = vector(N/2)
 833     odd_output = vector(N/2)
 834
 835     for i in range(N/2):
 836         even_input[i] = input[i] + input[N-1-i]
 837         odd_input[i] = input[i] - input[N-1-i]
 838
 839     for i in range(N/2):
 840         odd_input[i] = odd_input[i] * (0.5 / C (2*i+1, N))
 841
 842     unscaled_DCT (N/2, even_input, even_output)
 843     unscaled_DCT (N/2, odd_input, odd_output)
 844
 845     for i in range(N/2-1):
 846         odd_output[i] = odd_output[i] + odd_output[i+1]
 847
 848     for i in range(N/2):
 849         output[2*i] = even_output[i]
 850         output[2*i+1] = odd_output[i];
 851
 852 # Notes about this algorithm :
 853
 854 # The algorithm can be easily inverted to calculate the IDCT instead :
 855 # each of the basic stages are separately inversible...
 856
 857 # This function does N adds, then N/2 muls, then 2 recursive calls with
 858 # size N/2, then N/2-1 adds again. If we apply it recursively, the total
 859 # number of operations will be N*log2(N)/2 multiplies and N*(3*log2(N)/2-1) + 1
 860 # additions. So this is much faster than the canonical algorithm.
 861
 862 # Some of the multiplication coefficients 0.5/cos(...) can get quite large.
 863 # This means that a small error in the input will give a large error on the
 864 # output... For a DCT of size N the biggest coefficient will be at i=N/2-1
 865 # and it will be slighly more than N/pi which can be large for large N's.
 866
 867 # In the IDCT however, the multiplication coefficients for the reverse
 868 # transformation are of the form 2*cos(...) so they can not get big and there
 869 # is no accuracy problem.
 870
 871 # You can find another description of this algorithm at
 872 # http://www.intel.com/drg/mmx/appnotes/ap533.htm
 873
 874
 875
 876 # Another idea is to observe that the DCT calculation can be made to look like
 877 # the DFT calculation : C (k, N) is the real part of W (k, 4*N) or W (-k, 4*N).
 878 # We can use this idea translate the DCT algorithm into a call to the DFT
 879 # algorithm :
 880
 881 def unscaled_DCT_DFT (N, input, output):
 882     DFT_input = vector (4*N)
 883     DFT_output = vector (4*N)
 884
 885     for i in range(N):
 886         DFT_input[2*i+1] = input[i]
 887         #DFT_input[4*N-2*i-1] = input[i]        # We could use this instead
 888
 889     unscaled_DFT (4*N, DFT_input, DFT_output)
 890
 891     for i in range(N):
 892         output[i] = DFT_output[i].real
 893
 894
 895 # We can then use our knowledge of the DFT calculation to optimize for this
 896 # particular case. For example using the radix-2 decimation-in-time method :
 897
 898 #def unscaled_DCT_DFT (N, input, output):
 899 #   DFT_input = vector (2*N)
 900 #   DFT_output = vector (2*N)
 901 #
 902 #   for i in range(N):
 903 #       DFT_input[i] = input[i]
 904 #       #DFT_input[2*N-1-i] = input[i]  # We could use this instead
 905 #
 906 #   unscaled_DFT (2*N, DFT_input, DFT_output)
 907 #
 908 #   for i in range(N):
 909 #       DFT_output[i] = DFT_output[i] * W (i, 4*N)
 910 #
 911 #   for i in range(N):
 912 #       output[i] = DFT_output[i].real
 913
 914 # This leads us to the AAN implementation of the DCT algorithm : if we set
 915 # both DFT_input[i] and DFT_input[2*N-1-i] to input[i], then the imaginary
 916 # parts of W(2*i+1) and W(-2*i-1) will compensate, and output_DFT[i] will
 917 # already be a real after the multiplication by W(i,4*N). Which means that
 918 # before the multiplication, it is the product of a real number and W(-i,4*N).
 919 # This leads to the following code, called the AAN algorithm :
 920
 921 def unscaled_DCT_AAN (N, input, output):
 922     DFT_input = vector (2*N)
 923     DFT_output = vector (2*N)
 924
 925     for i in range(N):
 926         DFT_input[i] = input[i]
 927         DFT_input[2*N-1-i] = input[i]
 928
 929     symetrical_unscaled_DFT (2*N, DFT_input, DFT_output)
 930
 931     for i in range(N):
 932         output[i] = DFT_output[i].real * (0.5 / C (i, N))
 933
 934 # Notes about the AAN algorithm :
 935
 936 # The cost of this function is N real multiplies and a DFT of size 2*N. The
 937 # DFT to calculate has special properties : the inputs are real and symmetric.
 938 # Also, we only need to calculate the real parts of the N first DFT outputs.
 939 # We can try to take advantage of all that.
 940
 941 # We can invert this algorithm to calculate the IDCT. The final multiply
 942 # stage is trivially invertible. The DFT stage is invertible too, but we have
 943 # to take into account the special properties of this particular DFT for that.
 944
 945 # Once again we have to take care of numerical precision for the DFT : the
 946 # output coefficients can get large, so that a small error in the input will
 947 # give a large error on the output... For a DCT of size N the biggest
 948 # coefficient will be at i=N/2-1 and it will be slightly more than N/pi
 949
 950 # You can find another description of this algorithm at this url :
 951 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fastdct.html
 952 # (It is the same server where we already found a description of the fast DFT)
 953
 954
 955 # To optimize the DFT calculation, we can take a lot of specific things into
 956 # account : the input is real and symetric, and we only care about the real
 957 # part of the output. Also, we only care about the N first output coefficients,
 958 # but that one does not save operations actually, because the other
 959 # coefficients are the conjugates of the ones we look anyway.
 960
 961 # One useful way to use the symetry of the input is to use the radix-2
 962 # decimation-in-frequency algorithm. We can write a version of
 963 # unscaled_DFT_radix2_freq for the case where the input is symetrical :
 964 # we have removed a few additions in the first stages because even_input
 965 # is symetrical and odd_input is antisymetrical. Also, we have modified the
 966 # odd_input vector so that the second half of it is set to zero and the real
 967 # part of the DFT output is not modified. After that modification, the second
 968 # part of the odd_input was null so we used the radix-2 decimation-in-frequency
 969 # again on the odd DFT. Also odd_output is symetrical because input is real...
 970
 971 def symetrical_unscaled_DFT (N, input, output):
 972     even_input = vector(N/2)
 973     odd_tmp = vector(N/2)
 974     odd_input = vector(N/2)
 975     even_output = vector(N/2)
 976     odd_output = vector(N/2)
 977
 978     for i in range(N/4):
 979         even_input[N/2-i-1] = even_input[i] = input[i] + input[N/2-1-i]
 980
 981     for i in range(N/4):
 982         odd_tmp[i] = input[i] - input[N/2-1-i]
 983
 984     odd_input[0] = odd_tmp[0]
 985     for i in range(N/4)[1:]:
 986         odd_input[i] = (odd_tmp[i] + odd_tmp[i-1]) * W (i, N)
 987
 988     unscaled_DFT (N/2, even_input, even_output)
 989     # symetrical real inputs, real outputs
 990
 991     unscaled_DFT (N/4, odd_input, odd_output)
 992     # complex inputs, real outputs
 993
 994     for i in range(N/2):
 995         output[2*i] = even_output[i]
 996
 997     for i in range(N/4):
 998         output[N-1-4*i] = output[4*i+1] = odd_output[i]
 999
1000 # This procedure takes 3*N/4-1 real additions and N/2-3 real multiplies,
1001 # followed by another symetrical real DFT of size N/2 and a "complex to real"
1002 # DFT of size N/4.
1003
1004 # We thus get the following performance results :
1005 #       N       real additions  real multiplies complex multiplies
1006 #       1               0               0               0
1007 #       2               0               0               0
1008 #       4               2               0               0
1009 #       8               9               1               0
1010 #      16              28               6               0
1011 #      32              76              21               0
1012 #      64             189              54               2
1013 #     128             451             125              10
1014 #     256            1042             270              36
1015 #     512            2358             565             108
1016 #    1024            5251            1158             294
1017 #    2048           11557            2349             750
1018 #    4096           25200            4734            1832
1019 #    8192           54544            9509            4336
1020 #   16384          117337           19062           10026
1021 #   32768          251127           38173           22770
1022 #   65536          535102           76398           50988
1023
1024
1025 # We thus get a better performance with the AAN DCT algorithm than with the
1026 # Lee DCT algorithm : we can do a DCT of size 32 with 189 additions, 54+32 real
1027 # multiplies, and 2 complex multiplies. The Lee algorithm would have used 209
1028 # additions and 80 multiplies. With the AAN algorithm, we also have the
1029 # advantage that a big number of the multiplies are actually grouped at the
1030 # output stage of the algorithm, so if we want to do a DCT followed by a
1031 # quantization stage, we will be able to group the multiply of the output with
1032 # the multiply of the quantization stage, thus saving 32 more operations. In
1033 # the mpeg audio layer 1 or 2 processing, we can also group the multiply of the
1034 # output with the multiply of the convolution stage...
1035
1036 # Another source code for the AAN algorithm (implemented on 8 points, and
1037 # without all of the explanations) can be found at this URL :
1038 # http://developer.intel.com/drg/pentiumII/appnotes/aan_org.c . This
1039 # implementation uses 28 adds and 6+8 muls instead of 29 adds and 5+8 muls -
1040 # the difference is that in the symetrical_unscaled_DFT procedure, they noticed
1041 # how odd_input[i] and odd_input[N/4-i] will be combined at the start of the
1042 # complex-to-real DFT and they took advantage of this to convert 2 real adds
1043 # and 4 real muls into one complex multiply.
1044
1045
1046 # TODO : write about multi-dimentional DCT
1047
1048
1049 # TEST CODE
1050
1051 def dump (vector):
1052     str = ""
1053     for i in range(len(vector)):
1054         if i:
1055             str = str + ", "
1056         vector[i] = vector[i] + 0j
1057         realstr = "%+.4f" % vector[i].real
1058         imagstr = "%+.4fj" % vector[i].imag
1059         if (realstr == "-0.0000"):
1060             realstr = "+0.0000"
1061         if (imagstr == "-0.0000j"):
1062             imagstr = "+0.0000j"
1063         str = str + realstr #+ imagstr
1064     return "[%s]" % str
1065
1066 def test(N):
1067     input = vector(N)
1068     output = vector(N)
1069     verify = vector(N)
1070
1071     for i in range(N):
1072         input[i] = random() + 1j * random()
1073
1074     unscaled_DFT (N, input, output)
1075     unscaled_DFT (N, input, verify)
1076
1077     if (dump(output) != dump(verify)):
1078         print dump(output)
1079         print dump(verify)
1080
1081 #test (64)
1082
1083
1084 # PERFORMANCE ANALYSIS CODE
1085
1086 def display (table):
1087     N = 1
1088     print "#\tN\treal additions\treal multiplies\tcomplex multiplies"
1089     while table.has_key(N):
1090         print "#%8d%16d%16d%16d" % (N, table[N][0], table[N][1], table[N][2])
1091         N = 2*N
1092     print
1093
1094 best_complex_DFT = {}
1095
1096 def complex_DFT (max_N):
1097     best_complex_DFT[1] = (0,0,0)
1098     best_complex_DFT[2] = (4,0,0)
1099     best_complex_DFT[4] = (16,0,0)
1100     N = 8
1101     while (N<=max_N):
1102         # best method = split radix
1103         best2 = best_complex_DFT[N/2]
1104         best4 = best_complex_DFT[N/4]
1105         best_complex_DFT[N] = (best2[0] + 2*best4[0] + 3*N + 4,
1106                                best2[1] + 2*best4[1] + 4,
1107                                best2[2] + 2*best4[2] + N/2 - 4)
1108         N = 2*N
1109
1110 best_real_DFT = {}
1111
1112 def real_DFT (max_N):
1113     best_real_DFT[1] = (0,0,0)
1114     best_real_DFT[2] = (2,0,0)
1115     best_real_DFT[4] = (6,0,0)
1116     N = 8
1117     while (N<=max_N):
1118         # best method = split radix decimate-in-frequency
1119         best2 = best_real_DFT[N/2]
1120         best4 = best_complex_DFT[N/4]
1121         best_real_DFT[N] = (best2[0] + best4[0] + N + 2,
1122                             best2[1] + best4[1] + 2,
1123                             best2[2] + best4[2] + N/4 - 2)
1124         N = 2*N
1125
1126 best_complex2real_DFT = {}
1127
1128 def complex2real_DFT (max_N):
1129     best_complex2real_DFT[1] = (0,0,0)
1130     best_complex2real_DFT[2] = (2,0,0)
1131     best_complex2real_DFT[4] = (8,0,0)
1132     N = 8
1133     while (N<=max_N):
1134         best2 = best_complex2real_DFT[N/2]
1135         best4 = best_complex_DFT[N/4]
1136         best_complex2real_DFT[N] = (best2[0] + best4[0] + 3*N/2 + 1,
1137                                     best2[1] + best4[1] + 2,
1138                                     best2[2] + best4[2] + N/4 - 2)
1139         N = 2*N
1140
1141 best_real_symetric_DFT = {}
1142
1143 def real_symetric_DFT (max_N):
1144     best_real_symetric_DFT[1] = (0,0,0)
1145     best_real_symetric_DFT[2] = (0,0,0)
1146     best_real_symetric_DFT[4] = (2,0,0)
1147     N = 8
1148     while (N<=max_N):
1149         best2 = best_real_symetric_DFT[N/2]
1150         best4 = best_complex2real_DFT[N/4]
1151         best_real_symetric_DFT[N] = (best2[0] + best4[0] + 3*N/4 - 1,
1152                                      best2[1] + best4[1] + N/2 - 3,
1153                                      best2[2] + best4[2])
1154         N = 2*N
1155
1156 complex_DFT (65536)
1157 real_DFT (65536)
1158 complex2real_DFT (65536)
1159 real_symetric_DFT (65536)
1160
1161
1162 print "complex DFT"
1163 display (best_complex_DFT)
1164
1165 print "real DFT"
1166 display (best_real_DFT)
1167
1168 print "complex2real DFT"
1169 display (best_complex2real_DFT)
1170
1171 print "real symetric DFT"
1172 display (best_real_symetric_DFT)