git.sesse.net Git - vlc/blob - doc/transforms.py

   1 # Lossy compression algorithms very often make use of DCT or DFT calculations,
   2 # or variations of these calculations. This file is intended to be a short
   3 # reference about classical DCT and DFT algorithms.
   4
   5
   6 import math
   7 import cmath
   8
   9 pi = math.pi
  10 sin = math.sin
  11 cos = math.cos
  12 sqrt = math.sqrt
  13
  14 def exp_j (alpha):
  15     return cmath.exp (alpha*1j)
  16
  17 def conjugate (c):
  18     c = c * (1+0j)
  19     return c.real-1j*c.imag
  20
  21 def vector (N):
  22     return [0.0] * N
  23
  24
  25 # Let us start with the canonical definition of the unscaled DCT algorithm :
  26 # (I can not draw sigmas in text mode so I'll use python code instead)  :)
  27
  28 def unscaled_DCT (N, input, output):
  29     for o in range(N):          # o is output index
  30         output[o] = 0
  31         for i in range(N):      # i is input index
  32             output[o] = output[o] + input[i] * cos (((2*i+1)*o*pi)/(2*N))
  33
  34 # This trivial algorithm uses N*N multiplications and N*(N-1) additions.
  35
  36
  37 # And the unscaled DFT algorithm :
  38
  39 def W (k, N):
  40     return exp_j ((-2*pi*k)/N)
  41
  42 def unscaled_DFT (N, input, output):
  43     for o in range(N):          # o is output index
  44         output[o] = 0
  45         for i in range(N):
  46             output[o] = output[o] + input[i] * W(i*o,N)
  47
  48 # This algorithm takes complex input and output. There are N*N complex
  49 # multiplications and N*(N-1) complex additions. One complex addition can be
  50 # implemented with 2 real additions, and one complex multiplication by a
  51 # constant can be implemented with either 4 real multiplications and 2 real
  52 # additions, or 3 real multiplications and 3 real additions.
  53
  54
  55 # Of course these algorithms are extremely naive implementations and there are
  56 # some ways to use the trigonometric properties of the coefficients to find
  57 # some decompositions that can accelerate the calculations by several orders
  58 # of magnitude...
  59
  60
  61 # The Lee algorithm splits a DCT calculation of size N into two DCT
  62 # calculations of size N/2
  63
  64 def unscaled_DCT_Lee (N, input, output):
  65     even_input = vector(N/2)
  66     odd_input = vector(N/2)
  67     even_output = vector(N/2)
  68     odd_output = vector(N/2)
  69
  70     for i in range(N/2):
  71         even_input[i] = input[i] + input[N-1-i]
  72         odd_input[i] = input[i] - input[N-1-i]
  73
  74     for i in range(N/2):
  75         odd_input[i] = odd_input[i] * (0.5 / cos (((2*i+1)*pi)/(2*N)))
  76
  77     unscaled_DCT (N/2, even_input, even_output)
  78     unscaled_DCT (N/2, odd_input, odd_output)
  79
  80     for i in range(N/2-1):
  81         odd_output[i] = odd_output[i] + odd_output[i+1]
  82
  83     for i in range(N/2):
  84         output[2*i] = even_output[i]
  85         output[2*i+1] = odd_output[i];
  86
  87 # Notes about this algorithm :
  88
  89 # The algorithm can be easily inverted to calculate the IDCT instead :
  90 # each of the basic stages are separately inversible...
  91
  92 # This function does N adds, then N/2 muls, then 2 recursive calls with
  93 # size N/2, then N/2-1 adds again. The total number of operations will be
  94 # N*log2(N)/2 multiplies and less than 3*N*log2(N)/2 additions.
  95 # (exactly N*(3*log2(N)/2-1) + 1 additions). So this is much
  96 # faster than the canonical algorithm.
  97
  98 # Some of the multiplication coefficient, 0.5/cos(...) can get quite large.
  99 # This means that a small error in the input will give a large error on the
 100 # output... For a DCT of size N the biggest coefficient will be at i=N/2-1
 101 # and it will be slighly more than N/pi which can be large for large N's.
 102
 103 # In the IDCT however, the multiplication coefficients for the reverse
 104 # transformation are of the form 2*cos(...) so they can not get big and there
 105 # is no accuracy problem.
 106
 107 # You can find another description of this algorithm at
 108 # http://www.intel.com/drg/mmx/appnotes/ap533.htm
 109
 110
 111 # The AAN algorithm uses another approach, transforming a DCT calculation into
 112 # a DFT calculation of size 2N:
 113
 114 def unscaled_DCT_AAN (N, input, output):
 115     DFT_input = vector (2*N)
 116     DFT_output = vector (2*N)
 117
 118     for i in range(N):
 119         DFT_input[i] = input[i]
 120         DFT_input[2*N-1-i] = input[i]
 121
 122     unscaled_DFT (2*N, DFT_input, DFT_output)
 123
 124     for i in range(N):
 125         output[i] = DFT_output[i].real * (0.5 / cos ((i*pi)/(2*N)))
 126
 127 # Notes about the AAN algorithm :
 128
 129 # The cost of this function is N real multiplies and a DFT of size 2*N. The
 130 # DFT to calculate has special properties : the inputs are real and symmetric.
 131 # Also, we only need to calculate the real parts of the N first DFT outputs.
 132 # We will see how we can take advantage of that later.
 133
 134 # We can invert this algorithm to calculate the IDCT. The final multiply
 135 # stage is trivially invertible. The DFT stage is invertible too, but we have
 136 # to take into account the special properties of this particular DFT for that.
 137
 138 # Once again we have to take care of numerical precision for the DFT : the
 139 # output coefficients can get large, so that a small error in the input will
 140 # give a large error on the output... For a DCT of size N the biggest
 141 # coefficient will be at i=N/2-1 and it will be slightly more than N/pi
 142
 143 # You can find another description of this algorithm at this url :
 144 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fastdct.html
 145
 146
 147 # The DFT calculation can be decomposed into smaller DFT calculations just like
 148 # the Lee algorithm does for DCT calculations. This is a well known and studied
 149 # problem. One of the available explanations of this process is at this url :
 150 # www.cmlab.csie.ntu.edu.tw/cml/dsp/training/coding/transform/fft.html
 151 # (This is on the same server as the AAN algorithm description !)
 152
 153
 154 # Let's start with the radix-2 decimation-in-time algorithm :
 155
 156 def unscaled_DFT_radix2_time (N, input, output):
 157     even_input = vector(N/2)
 158     odd_input = vector(N/2)
 159     even_output = vector(N/2)
 160     odd_output = vector(N/2)
 161
 162     for i in range(N/2):
 163         even_input[i] = input[2*i]
 164         odd_input[i] = input[2*i+1]
 165
 166     unscaled_DFT (N/2, even_input, even_output)
 167     unscaled_DFT (N/2, odd_input, odd_output)
 168
 169     for i in range(N/2):
 170         odd_output[i] = odd_output[i] * W(i,N)
 171
 172     for i in range(N/2):
 173         output[i] = even_output[i] + odd_output[i]
 174         output[i+N/2] = even_output[i] - odd_output[i]
 175
 176 # This algorithm takes complex input and output.
 177
 178 # We divide the DFT calculation into 2 DFT calculations of size N/2
 179 # We then do N/2 complex multiplies followed by N complex additions.
 180 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
 181 # multiplies... we will skip 1 for i=0 and 1 for i=N/4. Also for i=N/8 and for
 182 # i=3*N/8 the W(i,N) values can be special-cased to implement the complex
 183 # multiplication using only 2 real additions and 2 real multiplies)
 184
 185 # Also note that all the basic stages of this DFT algorithm are easily
 186 # reversible, so we can calculate the IDFT with the same complexity.
 187
 188
 189 # A varient of this is the radix-2 decimation-in-frequency algorithm :
 190
 191 def unscaled_DFT_radix2_freq (N, input, output):
 192     even_input = vector(N/2)
 193     odd_input = vector(N/2)
 194     even_output = vector(N/2)
 195     odd_output = vector(N/2)
 196
 197     for i in range(N/2):
 198         even_input[i] = input[i] + input[i+N/2]
 199         odd_input[i] = input[i] - input[i+N/2]
 200
 201     for i in range(N/2):
 202         odd_input[i] = odd_input[i] * W(i,N)
 203
 204     unscaled_DFT (N/2, even_input, even_output)
 205     unscaled_DFT (N/2, odd_input, odd_output)
 206
 207     for i in range(N/2):
 208         output[2*i] = even_output[i]
 209         output[2*i+1] = odd_output[i]
 210
 211 # Note that the decimation-in-time and the decimation-in-frequency varients
 212 # have exactly the same complexity, they only do the operations in a different
 213 # order.
 214
 215 # Actually, if you look at the decimation-in-time varient of the DFT, and
 216 # reverse it to calculate an IDFT, you get something that is extremely close
 217 # to the decimation-in-frequency DFT algorithm...
 218
 219
 220 # The radix-4 algorithms are slightly more efficient : they take into account
 221 # the fact that with complex numbers, multiplications by j and -j are also
 222 # free...
 223
 224 # Let's start with the radix-4 decimation-in-time algorithm :
 225
 226 def unscaled_DFT_radix4_time (N, input, output):
 227     input_0 = vector(N/4)
 228     input_1 = vector(N/4)
 229     input_2 = vector(N/4)
 230     input_3 = vector(N/4)
 231     output_0 = vector(N/4)
 232     output_1 = vector(N/4)
 233     output_2 = vector(N/4)
 234     output_3 = vector(N/4)
 235     tmp_0 = vector(N/4)
 236     tmp_1 = vector(N/4)
 237     tmp_2 = vector(N/4)
 238     tmp_3 = vector(N/4)
 239
 240     for i in range(N/4):
 241         input_0[i] = input[4*i]
 242         input_1[i] = input[4*i+1]
 243         input_2[i] = input[4*i+2]
 244         input_3[i] = input[4*i+3]
 245
 246     unscaled_DFT (N/4, input_0, output_0)
 247     unscaled_DFT (N/4, input_1, output_1)
 248     unscaled_DFT (N/4, input_2, output_2)
 249     unscaled_DFT (N/4, input_3, output_3)
 250
 251     for i in range(N/4):
 252         output_1[i] = output_1[i] * W(i,N)
 253         output_2[i] = output_2[i] * W(2*i,N)
 254         output_3[i] = output_3[i] * W(3*i,N)
 255
 256     for i in range(N/4):
 257         tmp_0[i] = output_0[i] + output_2[i]
 258         tmp_1[i] = output_0[i] - output_2[i]
 259         tmp_2[i] = output_1[i] + output_3[i]
 260         tmp_3[i] = output_1[i] - output_3[i]
 261
 262     for i in range(N/4):
 263         output[i] = tmp_0[i] + tmp_2[i]
 264         output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
 265         output[i+N/2] = tmp_0[i] - tmp_2[i]
 266         output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
 267
 268 # This algorithm takes complex input and output.
 269
 270 # We divide the DFT calculation into 4 DFT calculations of size N/4
 271 # We then do 3*N/4 complex multiplies followed by 2*N complex additions.
 272 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
 273 # multiplies... we will skip 3 for i=0 and 1 for i=N/8. Also for i=N/8
 274 # the remaining W(i,N) and W(3*i,N) multiplies can be implemented using only
 275 # 2 real additions and 2 real multiplies. For i=N/16 and i=3*N/16 we can also
 276 # optimise the W(2*i/N) multiply this way.
 277
 278 # If we wanted to do the same decomposition with one radix-2 decomposition
 279 # of size N and 2 radix-2 decompositions of size N/2, the total cost would be
 280 # N complex multiplies and 2*N complex additions. Thus we see that the
 281 # decomposition of one DFT calculation of size N into 4 calculations of size
 282 # N/4 using the radix-4 algorithm instead of the radix-2 algorithm saved N/4
 283 # complex multiplies...
 284
 285
 286 # The radix-4 decimation-in-frequency algorithm is similar :
 287
 288 def unscaled_DFT_radix4_freq (N, input, output):
 289     input_0 = vector(N/4)
 290     input_1 = vector(N/4)
 291     input_2 = vector(N/4)
 292     input_3 = vector(N/4)
 293     output_0 = vector(N/4)
 294     output_1 = vector(N/4)
 295     output_2 = vector(N/4)
 296     output_3 = vector(N/4)
 297     tmp_0 = vector(N/4)
 298     tmp_1 = vector(N/4)
 299     tmp_2 = vector(N/4)
 300     tmp_3 = vector(N/4)
 301
 302     for i in range(N/4):
 303         tmp_0[i] = input[i] + input[i+N/2]
 304         tmp_1[i] = input[i+N/4] + input[i+3*N/4]
 305         tmp_2[i] = input[i] - input[i+N/2]
 306         tmp_3[i] = input[i+N/4] - input[i+3*N/4]
 307
 308     for i in range(N/4):
 309         input_0[i] = tmp_0[i] + tmp_1[i]
 310         input_1[i] = tmp_2[i] - 1j * tmp_3[i]
 311         input_2[i] = tmp_0[i] - tmp_1[i]
 312         input_3[i] = tmp_2[i] + 1j * tmp_3[i]
 313
 314     for i in range(N/4):
 315         input_1[i] = input_1[i] * W(i,N)
 316         input_2[i] = input_2[i] * W(2*i,N)
 317         input_3[i] = input_3[i] * W(3*i,N)
 318
 319     unscaled_DFT (N/4, input_0, output_0)
 320     unscaled_DFT (N/4, input_1, output_1)
 321     unscaled_DFT (N/4, input_2, output_2)
 322     unscaled_DFT (N/4, input_3, output_3)
 323
 324     for i in range(N/4):
 325         output[4*i] = output_0[i]
 326         output[4*i+1] = output_1[i]
 327         output[4*i+2] = output_2[i]
 328         output[4*i+3] = output_3[i]
 329
 330 # Once again the complexity is exactly the same as for the radix-4
 331 # decimation-in-time DFT algorithm, only the order of the operations is
 332 # different.
 333
 334
 335 # Now let us reorder the radix-4 algorithms in a different way :
 336
 337 #def unscaled_DFT_radix4_time (N, input, output):
 338 #   input_0 = vector(N/4)
 339 #   input_1 = vector(N/4)
 340 #   input_2 = vector(N/4)
 341 #   input_3 = vector(N/4)
 342 #   output_0 = vector(N/4)
 343 #   output_1 = vector(N/4)
 344 #   output_2 = vector(N/4)
 345 #   output_3 = vector(N/4)
 346 #   tmp_0 = vector(N/4)
 347 #   tmp_1 = vector(N/4)
 348 #   tmp_2 = vector(N/4)
 349 #   tmp_3 = vector(N/4)
 350 #
 351 #   for i in range(N/4):
 352 #       input_0[i] = input[4*i]
 353 #       input_2[i] = input[4*i+2]
 354 #
 355 #   unscaled_DFT (N/4, input_0, output_0)
 356 #   unscaled_DFT (N/4, input_2, output_2)
 357 #
 358 #   for i in range(N/4):
 359 #       output_2[i] = output_2[i] * W(2*i,N)
 360 #
 361 #   for i in range(N/4):
 362 #       tmp_0[i] = output_0[i] + output_2[i]
 363 #       tmp_1[i] = output_0[i] - output_2[i]
 364 #
 365 #   for i in range(N/4):
 366 #       input_1[i] = input[4*i+1]
 367 #       input_3[i] = input[4*i+3]
 368 #
 369 #   unscaled_DFT (N/4, input_1, output_1)
 370 #   unscaled_DFT (N/4, input_3, output_3)
 371 #
 372 #   for i in range(N/4):
 373 #       output_1[i] = output_1[i] * W(i,N)
 374 #       output_3[i] = output_3[i] * W(3*i,N)
 375 #
 376 #   for i in range(N/4):
 377 #       tmp_2[i] = output_1[i] + output_3[i]
 378 #       tmp_3[i] = output_1[i] - output_3[i]
 379 #
 380 #   for i in range(N/4):
 381 #       output[i] = tmp_0[i] + tmp_2[i]
 382 #       output[i+N/4] = tmp_1[i] - 1j * tmp_3[i]
 383 #       output[i+N/2] = tmp_0[i] - tmp_2[i]
 384 #       output[i+3*N/4] = tmp_1[i] + 1j * tmp_3[i]
 385
 386 # We didnt do anything here, only reorder the operations. But now, look at the
 387 # first part of this function, up to the calculations of tmp0 and tmp1 : this
 388 # is extremely similar to the radix-2 decimation-in-time algorithm ! or more
 389 # precisely, it IS the radix-2 decimation-in-time algorithm, with size N/2,
 390 # applied on a vector representing the even input coefficients, and giving
 391 # an output vector that is the concatenation of tmp0 and tmp1.
 392 # This is important to notice, because this means we can now choose to
 393 # calculate tmp0 and tmp1 using any DFT algorithm that we want, and we know
 394 # that some of them are more efficient than radix-2...
 395
 396 # This leads us directly to the split-radix decimation-in-time algorithm :
 397
 398 def unscaled_DFT_split_radix_time (N, input, output):
 399     even_input = vector(N/2)
 400     input_1 = vector(N/4)
 401     input_3 = vector(N/4)
 402     even_output = vector(N/2)
 403     output_1 = vector(N/4)
 404     output_3 = vector(N/4)
 405     tmp_0 = vector(N/4)
 406     tmp_1 = vector(N/4)
 407
 408     for i in range(N/2):
 409         even_input[i] = input[2*i]
 410
 411     for i in range(N/4):
 412         input_1[i] = input[4*i+1]
 413         input_3[i] = input[4*i+3]
 414
 415     unscaled_DFT (N/2, even_input, even_output)
 416     unscaled_DFT (N/4, input_1, output_1)
 417     unscaled_DFT (N/4, input_3, output_3)
 418
 419     for i in range(N/4):
 420         output_1[i] = output_1[i] * W(i,N)
 421         output_3[i] = output_3[i] * W(3*i,N)
 422
 423     for i in range(N/4):
 424         tmp_0[i] = output_1[i] + output_3[i]
 425         tmp_1[i] = output_1[i] - output_3[i]
 426
 427     for i in range(N/4):
 428         output[i] = even_output[i] + tmp_0[i]
 429         output[i+N/4] = even_output[i+N/4] - 1j * tmp_1[i]
 430         output[i+N/2] = even_output[i] - tmp_0[i]
 431         output[i+3*N/4] = even_output[i+N/4] + 1j * tmp_1[i]
 432
 433 # This function performs one DFT of size N/2 and two of size N/4, followed by
 434 # N/2 complex multiplies and 3*N/2 complex additions.
 435 # (actually W(0,N) = 1 and W(N/4,N) = -j so we can skip a few of these complex
 436 # multiplies... we will skip 2 for i=0. Also for i=N/8 the W(i,N) and W(3*i,N)
 437 # multiplies can be implemented using only 2 real additions and 2 real
 438 # multiplies)
 439
 440
 441 # We can similarly define the split-radix decimation-in-frequency DFT :
 442
 443 def unscaled_DFT_split_radix_freq (N, input, output):
 444     even_input = vector(N/2)
 445     input_1 = vector(N/4)
 446     input_3 = vector(N/4)
 447     even_output = vector(N/2)
 448     output_1 = vector(N/4)
 449     output_3 = vector(N/4)
 450     tmp_0 = vector(N/4)
 451     tmp_1 = vector(N/4)
 452
 453     for i in range(N/2):
 454         even_input[i] = input[i] + input[i+N/2]
 455
 456     for i in range(N/4):
 457         tmp_0[i] = input[i] - input[i+N/2]
 458         tmp_1[i] = input[i+N/4] - input[i+3*N/4]
 459
 460     for i in range(N/4):
 461         input_1[i] = tmp_0[i] - 1j * tmp_1[i]
 462         input_3[i] = tmp_0[i] + 1j * tmp_1[i]
 463
 464     for i in range(N/4):
 465         input_1[i] = input_1[i] * W(i,N)
 466         input_3[i] = input_3[i] * W(3*i,N)
 467
 468     unscaled_DFT (N/2, even_input, even_output)
 469     unscaled_DFT (N/4, input_1, output_1)
 470     unscaled_DFT (N/4, input_3, output_3)
 471
 472     for i in range(N/2):
 473         output[2*i] = even_output[i]
 474
 475     for i in range(N/4):
 476         output[4*i+1] = output_1[i]
 477         output[4*i+3] = output_3[i]
 478
 479 # The complexity is again the same as for the decimation-in-time varient.
 480
 481
 482 # Now let us now summarize our various algorithms for DFT decomposition :
 483
 484 # radix-2 : DFT(N) -> 2*DFT(N/2) using N/2 multiplies and N additions
 485 # radix-4 : DFT(N) -> 4*DFT(N/2) using 3*N/4 multiplies and 2*N additions
 486 # split-radix : DFT(N) -> DFT(N/2) + 2*DFT(N/4) using N/2 muls and 3*N/2 adds
 487
 488 # (we are always speaking of complex multiplies and complex additions...
 489 # remember than a complex addition is implemented with 2 real additions, and
 490 # a complex multiply is implemented with)
 491
 492 # If we want to take into account the special values of W(i,N), we can remove
 493 # a few complex multiplies. Supposing N>=16 we can remove :
 494 # radix-2 : remove 2 complex multiplies, simplify 2 others
 495 # radix-4 : remove 4 complex multiplies, simplify 4 others
 496 # split-radix : remove 2 complex multiplies, simplify 2 others
 497
 498 # The best performance using these methods is thus :
 499 #       N       complex muls    simple muls     complex adds    method
 500 #       1               0               0               0       trivial!
 501 #       2               0               0               2       trivial!
 502 #       4               0               0               8       radix-4
 503 #       8               0               2              24       radix-4
 504 #      16               4               4              64       split radix
 505 #      32              16              10             160       split radix
 506 #      64              52              20             384       split radix
 507 #     128             144              42             896       split radix
 508 #     256             372              84            2048       split radix
 509 #     512             912             170            4608       split radix
 510 #    1024            2164             340           10240       split radix
 511 #    2048            5008             682           22528       split radix
 512 #    4096           11380            1364           49152       split radix
 513 #    8192           25488            2730          106496       split radix
 514 #   16384           56436            5460          229376       split radix
 515 #   32768          123792           10922          491520       split radix
 516 #   65536          269428           21844         1048576       split radix
 517
 518 # Now a complex addition is implemented with 2 real additions, a "simple"
 519 # complex multiply is implemented with 2 real multiplies and 2 real additions,
 520 # and complex multiplies can be implemented with either 2 real additions and
 521 # 4 real multiplies, or 3 real additions and 3 real multiplies, so we will
 522 # keep them in a separate column. Which gives...
 523
 524 #       N       real additions  real multiplies complex multiplies
 525 #       1               0               0               0
 526 #       2               4               0               0
 527 #       4              16               0               0
 528 #       8              52               4               0
 529 #      16             136               8               4
 530 #      32             340              20              16
 531 #      64             808              40              52
 532 #     128            1876              84             144
 533 #     256            4264             168             372
 534 #     512            9556             340             912
 535 #    1024           21160             680            2164
 536 #    2048           46420            1364            5008
 537 #    4096          101032            2728           11380
 538 #    8192          218452            5460           25488
 539 #   16384          469672           10920           56436
 540 #   32768         1004884           21844          123792
 541 #   65536         2140840           43688          269428
 542
 543 # If a complex multiply is implemented with 3 real muls + 3 real adds,
 544 # a complex "simple" multiply is implemented with 2 real muls + 2 real adds,
 545 # and a complex addition is implemented with 2 real adds, then these results
 546 # are consistent with the table at the end of the www.cmlab.csie.ntu.edu.tw
 547 # DFT tutorial that I mentionned earlier.
 548
 549
 550 # Now another important case for the DFT is the one where the inputs are
 551 # real numbers instead of complex ones. We have to find ways to optimize for
 552 # this important case.
 553
 554 # If the DFT inputs are real-valued, then the DFT outputs have nice properties
 555 # too : output[0] and output[N/2] will be real numbers, and output[N-i] will
 556 # be the conjugate of output[i] for i in 0...N/2-1
 557
 558 # Likewise, if the DFT inputs are purely imaginary numbers, then the DFT
 559 # outputs will have special properties too : output[0] and output[N/2] will be
 560 # purely imaginary, and output[N-i] will be the opposite of the conjugate of
 561 # output[i] for i in 0...N/2-1
 562
 563 # We can use these properties to calculate two real-valued DFT at once :
 564
 565 def two_real_unscaled_DFT (N, input1, input2, output1, output2):
 566     input = vector(N)
 567     output = vector(N)
 568
 569     for i in range(N):
 570         input[i] = input1[i] + 1j * input2[i]
 571
 572     unscaled_DFT (N, input, output)
 573
 574     output1[0] = output[0].real + 0j
 575     output2[0] = output[0].imag + 0j
 576
 577     for i in range(N/2)[1:]:
 578         output1[i] = 0.5 * (output[i] + conjugate(output[N-i]))
 579         output2[i] = -0.5j * (output[i] - conjugate(output[N-i]))
 580
 581         output1[N-i] = conjugate(output1[i])
 582         output2[N-i] = conjugate(output2[i])
 583
 584     output1[N/2] = output[N/2].real + 0j
 585     output2[N/2] = output[N/2].imag + 0j
 586
 587 # This routine does a total of N-2 complex additions and N-2 complex
 588 # multiplies by 0.5
 589
 590 # This routine can also be inverted to calculate the IDFT of two vectors at
 591 # once if we know that the outputs will be real-valued.
 592
 593
 594 # If we have only one real-valued DFT calculation to do, we can still cut this
 595 # calculation in several parts using one of the decimate-in-time methods
 596 # (so that the different parts are still real-valued)
 597
 598 # As with complex DFT calculations, the best method is to use a split radix.
 599 # There are a lot of symetries in the DFT outputs that we can exploit to
 600 # reduce the number of operations...
 601
 602 def real_unscaled_DFT_split_radix_1 (N, input, output):
 603     even_input = vector(N/2)
 604     even_output = vector(N/2)
 605     input_1 = vector(N/4)
 606     output_1 = vector(N/4)
 607     input_3 = vector(N/4)
 608     output_3 = vector(N/4)
 609     tmp_0 = vector(N/4)
 610     tmp_1 = vector(N/4)
 611
 612     for i in range(N/2):
 613         even_input[i] = input[2*i]
 614
 615     for i in range(N/4):
 616         input_1[i] = input[4*i+1]
 617         input_3[i] = input[4*i+3]
 618
 619     unscaled_DFT (N/2, even_input, even_output)
 620     # this is again a real DFT !
 621     # we will only use even_output[i] for i in 0 ... N/4 included. we know that
 622     # even_output[N/2-i] is the conjugate of even_output[i]... also we know
 623     # that even_output[0] and even_output[N/4] are purely real.
 624
 625     unscaled_DFT (N/4, input_1, output_1)
 626     unscaled_DFT (N/4, input_3, output_3)
 627     # these are real DFTs too... with symetries in the outputs... once again
 628
 629     tmp_0[0] = output_1[0] + output_3[0]        # real numbers
 630     tmp_1[0] = output_1[0] - output_3[0]        # real numbers
 631
 632     tmp__0 = (output_1[N/8] + output_3[N/8]) * sqrt(0.5)        # real numbers
 633     tmp__1 = (output_1[N/8] - output_3[N/8]) * sqrt(0.5)        # real numbers
 634     tmp_0[N/8] = tmp__1 - 1j * tmp__0           # real + 1j * real
 635     tmp_1[N/8] = tmp__0 - 1j * tmp__1           # real + 1j * real
 636
 637     for i in range(N/8)[1:]:
 638         output_1[i] = output_1[i] * W(i,N)
 639         output_3[i] = output_3[i] * W(3*i,N)
 640
 641         tmp_0[i] = output_1[i] + output_3[i]
 642         tmp_1[i] = output_1[i] - output_3[i]
 643
 644         tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
 645         tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])
 646
 647     output[0] = even_output[0] + tmp_0[0]               # real numbers
 648     output[N/4] = even_output[N/4] - 1j * tmp_1[0]      # real + 1j * real
 649     output[N/2] = even_output[0] - tmp_0[0]             # real numbers
 650     output[3*N/4] = even_output[N/4] + 1j * tmp_1[0]    # real + 1j * real
 651
 652     for i in range(N/4)[1:]:
 653         output[i] = even_output[i] + tmp_0[i]
 654         output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]
 655
 656         output[N-i] = conjugate(output[i])
 657         output[3*N/4-i] = conjugate(output[i+N/4])
 658
 659 # This function performs one real DFT of size N/2 and two real DFT of size
 660 # N/4, followed by 6 real additions, 2 real multiplies, 3*N/4-4 complex
 661 # additions and N/4-2 complex multiplies.
 662
 663
 664 # We can also try to combine the two real DFT of size N/4 into a single complex
 665 # DFT :
 666
 667 def real_unscaled_DFT_split_radix_2 (N, input, output):
 668     even_input = vector(N/2)
 669     even_output = vector(N/2)
 670     odd_input = vector(N/4)
 671     odd_output = vector(N/4)
 672     tmp_0 = vector(N/4)
 673     tmp_1 = vector(N/4)
 674
 675     for i in range(N/2):
 676         even_input[i] = input[2*i]
 677
 678     for i in range(N/4):
 679         odd_input[i] = input[4*i+1] + 1j * input[4*i+3]
 680
 681     unscaled_DFT (N/2, even_input, even_output)
 682     # this is again a real DFT !
 683     # we will only use even_output[i] for i in 0 ... N/4 included. we know that
 684     # even_output[N/2-i] is the conjugate of even_output[i]... also we know
 685     # that even_output[0] and even_output[N/4] are purely real.
 686
 687     unscaled_DFT (N/4, odd_input, odd_output)
 688     # but this one is a complex DFT so no special properties here
 689
 690     output_1 = odd_output[0].real
 691     output_3 = odd_output[0].imag
 692     tmp_0[0] = output_1 + output_3      # real numbers
 693     tmp_1[0] = output_1 - output_3      # real numbers
 694
 695     output_1 = odd_output[N/8].real
 696     output_3 = odd_output[N/8].imag
 697     tmp__0 = (output_1 + output_3) * sqrt(0.5)  # real numbers
 698     tmp__1 = (output_1 - output_3) * sqrt(0.5)  # real numbers
 699     tmp_0[N/8] = tmp__1 - 1j * tmp__0           # real + 1j * real
 700     tmp_1[N/8] = tmp__0 - 1j * tmp__1           # real + 1j * real
 701
 702     for i in range(N/8)[1:]:
 703         output_1 = odd_output[i] + conjugate(odd_output[N/4-i])
 704         output_3 = odd_output[i] - conjugate(odd_output[N/4-i])
 705
 706         output_1 = output_1 * 0.5 * W(i,N)
 707         output_3 = output_3 * -0.5j * W(3*i,N)
 708
 709         tmp_0[i] = output_1 + output_3
 710         tmp_1[i] = output_1 - output_3
 711
 712         tmp_0[N/4-i] = -1j * conjugate(tmp_1[i])
 713         tmp_1[N/4-i] = -1j * conjugate(tmp_0[i])
 714
 715     output[0] = even_output[0] + tmp_0[0]               # real numbers
 716     output[N/4] = even_output[N/4] - 1j * tmp_1[0]      # real + 1j * real
 717     output[N/2] = even_output[0] - tmp_0[0]             # real numbers
 718     output[3*N/4] = even_output[N/4] + 1j * tmp_1[0]    # real + 1j * real
 719
 720     for i in range(N/4)[1:]:
 721         output[i] = even_output[i] + tmp_0[i]
 722         output[i+N/4] = conjugate(even_output[N/4-i]) - 1j * tmp_1[i]
 723
 724         output[N-i] = conjugate(output[i])
 725         output[3*N/4-i] = conjugate(output[i+N/4])
 726
 727 # This function performs one real DFT of size N/2 and one complex DFT of size
 728 # N/4, followed by 6 real additions, 2 real multiplies, N-6 complex additions
 729 # and N/4-2 complex multiplies.
 730
 731
 732 # After comparing the performance, it turns out that for real-valued DFT, the
 733 # version of the algorithm that subdivides the calculation into one real
 734 # DFT of size N/2 and two real DFT of size N/4 is the most efficient one.
 735 # The other version gives exactly the same number of multiplies and a few more
 736 # real additions.
 737
 738 # The performance we get for real-valued DFT is as follows :
 739
 740 #       N       real additions  real multiplies complex multiplies
 741 #       2               2               0               0
 742 #       4               6               0               0
 743 #       8              20               2               0
 744 #      16              54               4               2
 745 #      32             140              10               8
 746 #      64             342              20              26
 747 #     128             812              42              72
 748 #     256            1878              84             186
 749 #     512            4268             170             456
 750 #    1024            9558             340            1082
 751 #    2048           21164             682            2504
 752 #    4096           46422            1364            5690
 753 #    8192          101036            2730           12744
 754 #   16384          218454            5460           28218
 755 #   32768          469676           10922           61896
 756 #   65536         1004886           21844          134714
 757
 758
 759 # As an example, this is an implementation of a real-valued DFT8, using the
 760 # above-mentionned algorithm :
 761
 762 def DFT8 (input, output):
 763     tmp_0 = input[0] + input[4]
 764     tmp_1 = input[0] - input[4]
 765     tmp_2 = input[2] + input[6]
 766     tmp_3 = input[2] - input[6]
 767
 768     even_0 = tmp_0 + tmp_2              # real + real
 769     even_1 = tmp_1 - 1j * tmp_3         # real + 1j * real
 770     even_2 = tmp_0 - tmp_2              # real + real
 771     even_3 = tmp_1 + 1j * tmp_3         # real + 1j * real
 772
 773     tmp__0 = input[1] + input[5]
 774     tmp__1 = input[1] - input[5]
 775     tmp__2 = input[3] + input[7]
 776     tmp__3 = input[3] - input[7]
 777
 778     tmp_0 = tmp__0 + tmp__2     # real numbers
 779     tmp_2 = tmp__0 - tmp__2     # real numbers
 780
 781     tmp__0 = (tmp__1 + tmp__3) * sqrt(0.5)      # real numbers
 782     tmp__1 = (tmp__1 - tmp__3) * sqrt(0.5)      # real numbers
 783     tmp_1 = tmp__1 - 1j * tmp__0                # real + 1j * real
 784     tmp_3 = tmp__0 - 1j * tmp__1                # real + 1j * real
 785
 786     output[0] = even_0 + tmp_0          # real numbers
 787     output[2] = even_2 - 1j * tmp_2     # real + 1j * real
 788     output[4] = even_0 - tmp_0          # real numbers
 789     output[6] = even_2 + 1j * tmp_2     # real + 1j * real
 790
 791     output[1] = even_1 + tmp_1                  # complex numbers
 792     output[3] = conjugate(even_1) - 1j * tmp_3  # complex numbers
 793     output[5] = conjugate(output[3])
 794     output[7] = conjugate(output[1])
 795
 796
 797 # Also a basic implementation of the real-valued DFT4 :
 798
 799 def DFT4 (input, output):
 800     tmp_0 = input[0] + input[2]
 801     tmp_1 = input[0] - input[2]
 802     tmp_2 = input[1] + input[3]
 803     tmp_3 = input[1] - input[3]
 804
 805     output[0] = tmp_0 + tmp_2           # real + real
 806     output[1] = tmp_1 - 1j * tmp_3      # real + 1j * real
 807     output[2] = tmp_0 - tmp_2           # real + real
 808     output[3] = tmp_1 + 1j * tmp_3      # real + 1j * real
 809
 810
 811 # Now the last piece of the puzzle is the implementation of real-valued DFT
 812 # with a symetrical input. If you remember about the AAN DCT algorithm, this
 813 # is useful there...
 814
 815 # The best method I have found is to use a modification of the radix2
 816 # decimate-in-time algorithm here. The trick is that odd_input will be the
 817 # symetric of even_input... so we can deduce the value of odd_output from
 818 # the value of even_output :
 819 # odd_output[i] = conjugate(even_output[i]) * W(-i,N/2)
 820 # if we then merge this multiply with the one that is just after it in the
 821 # radix-2 decimate-in-time algorithm, and then we take all the symetries into
 822 # account to remove the corresponding code, we get the following function :
 823
 824 def real_symetric_unscaled_DFT (N, input, output):
 825     even_input = vector(N/2)
 826     even_output = vector(N/2)
 827     odd_output = vector(N/2)
 828
 829     for i in range(N/2):
 830         even_input[i] = input[2*i]
 831
 832     unscaled_DFT (N/2, even_input, even_output)
 833     # This is once again a real-valued DFT
 834
 835     output[0] = 2 * even_output[0]      # real number
 836     output[N/2] = 0
 837
 838     output[N/4] = (1 + 1j) * even_output[N/4]   # complex * real
 839     output[3*N/4] = conjugate(output[N/4])
 840
 841     for i in range(N/4)[1:]:
 842         #odd_output = conjugate(even_output[i]) * W(-i,N)
 843         #output[i] = even_output[i] + odd_output
 844         #odd_output = even_output[i] * W(N/2+i,N)
 845         #output[N/2-i] = conjugate(even_output[i]) + odd_output
 846
 847         cr = W(-i,N).real
 848         ci = W(-i,N).imag
 849
 850         real = even_output[i].real * (1+cr) + even_output[i].imag * ci
 851         imag = even_output[i].real * ci + even_output[i].imag * (1-cr)
 852         output[i] = real + 1j * imag
 853
 854         real = even_output[i].real * (1-cr) - even_output[i].imag * ci
 855         imag = even_output[i].real * ci - even_output[i].imag * (1+cr)
 856         output[N/2-i] = real + 1j * imag
 857
 858         output[N-i] = conjugate(output[i])
 859         output[N/2+i] = conjugate(output[N/2-i])
 860
 861 # This function does one real unscaled DFT of size N/2, one multiply by 2, and
 862 # N/4-1 times something that can be written with either 6 real muls and 4 real
 863 # adds (as I did), or 1 complex mul and 2 complex adds (giving 4 real muls and
 864 # 6 adds, or 3 real muls and 7 adds).
 865
 866
 867 # Now we can use this new knowledge to write a new optimized version of the
 868 # AAN algorithm for the DCT calculation :
 869
 870 def unscaled_DCT_AAN_optim (N, input, output):
 871     DFT_input = vector (N)
 872     DFT_output = vector (N)
 873
 874     for i in range(N/2):
 875         DFT_input[i] = input[2*i]
 876         DFT_input[N-1-i] = input[2*i+1]
 877
 878     unscaled_DFT (N, DFT_input, DFT_output)
 879     # This is another real-valued DFT
 880
 881     output[0] = DFT_output[0]
 882     output[N/2] = DFT_output[N/2] * sqrt(0.5)
 883
 884     for i in range(N/2)[1:]:
 885         tmp = (conjugate(DFT_output[i]) *
 886                (1+W(-i,2*N)) * 0.5 / cos ((i*pi)/(2*N)))
 887         output[i] = tmp.real
 888         output[N-i] = tmp.imag
 889
 890 # Now the DCT calculation can be reduced to one real-valued DFT calculation of
 891 # size N, followed by 1 real multiply and N/2-1 complex multiplies
 892
 893 # One funny result is that if we calculate the number of real operations needed
 894 # to implement this AAN DCT algorithm, and supposing that we choose to
 895 # implement complex multiplies with 3 real adds and 3 real muls, then the
 896 # number of operations is *exactly* the same as for the original Lee DCT
 897 # algorithm...
 898
 899
 900 # THATS ALL FOLKS !
 901
 902
 903 def dump (vector):
 904     str = ""
 905     for i in range(len(vector)):
 906         if i:
 907             str = str + ", "
 908         vector[i] = vector[i] + 0j
 909         realstr = "%+.4f" % vector[i].real
 910         imagstr = "%+.4fj" % vector[i].imag
 911         if (realstr == "-0.0000"):
 912             realstr = "+0.0000"
 913         if (imagstr == "-0.0000j"):
 914             imagstr = "+0.0000j"
 915         str = str + realstr + imagstr
 916     return "[%s]" % str
 917
 918 import whrandom
 919
 920 def test(N):
 921     input = vector(N)
 922     output = vector(N)
 923     verify = vector(N)
 924
 925     for i in range(N):
 926         input[i] = whrandom.random()
 927
 928     unscaled_DCT_AAN_optim (N, input, output)
 929     unscaled_DCT (N, input, verify)
 930
 931     if (dump(verify) != dump(output)):
 932         print dump(verify)
 933         #print dump(output)
 934
 935 test (32)