git.sesse.net Git - ffmpeg/blob - doc/transforms.md

   1 The basis transforms used for FFT and various other derived functions are based
   2 on the following unrollings.
   3 The functions can be easily adapted to double precision floats as well.
   4
   5 # Parity permutation
   6 The basis transforms described here all use the following permutation:
   7
   8 ``` C
   9 void ff_tx_gen_split_radix_parity_revtab(int *revtab, int len, int inv,
  10                                          int basis, int dual_stride);
  11 ```
  12 Parity means even and odd complex numbers will be split, e.g. the even
  13 coefficients will come first, after which the odd coefficients will be
  14 placed. For example, a 4-point transform's coefficients after reordering:
  15 `z[0].re, z[0].im, z[2].re, z[2].im, z[1].re, z[1].im, z[3].re, z[3].im`
  16
  17 The basis argument is the length of the largest non-composite transform
  18 supported, and also implies that the basis/2 transform is supported as well,
  19 as the split-radix algorithm requires it to be.
  20
  21 The dual_stride argument indicates that both the basis, as well as the
  22 basis/2 transforms support doing two transforms at once, and the coefficients
  23 will be interleaved between each pair in a split-radix like so (stride == 2):
  24 `tx1[0], tx1[2], tx2[0], tx2[2], tx1[1], tx1[3], tx2[1], tx2[3]`
  25 A non-zero number switches this on, with the value indicating the stride
  26 (how many values of 1 transform to put first before switching to the other).
  27 Must be a power of two or 0. Must be less than the basis.
  28 Value will be clipped to the transform size, so for a basis of 16 and a
  29 dual_stride of 8, dual 8-point transforms will be laid out as if dual_stride
  30 was set to 4.
  31 Usually you'll set this to half the complex numbers that fit in a single
  32 register or 0. This allows to reuse SSE functions as dual-transform
  33 functions in AVX mode.
  34 If length is smaller than basis/2 this function will not do anything.
  35
  36 # 4-point FFT transform
  37 The only permutation this transform needs is to swap the `z[1]` and `z[2]`
  38 elements when performing an inverse transform, which in the assembly code is
  39 hardcoded with the function itself being templated and duplicated for each
  40 direction.
  41
  42 ``` C
  43 static void fft4(FFTComplex *z)
  44 {
  45     FFTSample r1 = z[0].re - z[2].re;
  46     FFTSample r2 = z[0].im - z[2].im;
  47     FFTSample r3 = z[1].re - z[3].re;
  48     FFTSample r4 = z[1].im - z[3].im;
  49     /* r5-r8 second transform */
  50
  51     FFTSample t1 = z[0].re + z[2].re;
  52     FFTSample t2 = z[0].im + z[2].im;
  53     FFTSample t3 = z[1].re + z[3].re;
  54     FFTSample t4 = z[1].im + z[3].im;
  55     /* t5-t8 second transform */
  56
  57     /* 1sub + 1add = 2 instructions */
  58
  59     /* 2 shufs */
  60     FFTSample a3 = t1 - t3;
  61     FFTSample a4 = t2 - t4;
  62     FFTSample b3 = r1 - r4;
  63     FFTSample b2 = r2 - r3;
  64
  65     FFTSample a1 = t1 + t3;
  66     FFTSample a2 = t2 + t4;
  67     FFTSample b1 = r1 + r4;
  68     FFTSample b4 = r2 + r3;
  69     /* 1 add 1 sub 3 shufs */
  70
  71     z[0].re = a1;
  72     z[0].im = a2;
  73     z[2].re = a3;
  74     z[2].im = a4;
  75
  76     z[1].re = b1;
  77     z[1].im = b2;
  78     z[3].re = b3;
  79     z[3].im = b4;
  80 }
  81 ```
  82
  83 # 8-point AVX FFT transform
  84 Input must be pre-permuted using the parity lookup table, generated via
  85 `ff_tx_gen_split_radix_parity_revtab`.
  86
  87 ``` C
  88 static void fft8(FFTComplex *z)
  89 {
  90     FFTSample r1 = z[0].re - z[4].re;
  91     FFTSample r2 = z[0].im - z[4].im;
  92     FFTSample r3 = z[1].re - z[5].re;
  93     FFTSample r4 = z[1].im - z[5].im;
  94
  95     FFTSample r5 = z[2].re - z[6].re;
  96     FFTSample r6 = z[2].im - z[6].im;
  97     FFTSample r7 = z[3].re - z[7].re;
  98     FFTSample r8 = z[3].im - z[7].im;
  99
 100     FFTSample q1 = z[0].re + z[4].re;
 101     FFTSample q2 = z[0].im + z[4].im;
 102     FFTSample q3 = z[1].re + z[5].re;
 103     FFTSample q4 = z[1].im + z[5].im;
 104
 105     FFTSample q5 = z[2].re + z[6].re;
 106     FFTSample q6 = z[2].im + z[6].im;
 107     FFTSample q7 = z[3].re + z[7].re;
 108     FFTSample q8 = z[3].im + z[7].im;
 109
 110     FFTSample s3 = q1 - q3;
 111     FFTSample s1 = q1 + q3;
 112     FFTSample s4 = q2 - q4;
 113     FFTSample s2 = q2 + q4;
 114
 115     FFTSample s7 = q5 - q7;
 116     FFTSample s5 = q5 + q7;
 117     FFTSample s8 = q6 - q8;
 118     FFTSample s6 = q6 + q8;
 119
 120     FFTSample e1 = s1 * -1;
 121     FFTSample e2 = s2 * -1;
 122     FFTSample e3 = s3 * -1;
 123     FFTSample e4 = s4 * -1;
 124
 125     FFTSample e5 = s5 *  1;
 126     FFTSample e6 = s6 *  1;
 127     FFTSample e7 = s7 * -1;
 128     FFTSample e8 = s8 *  1;
 129
 130     FFTSample w1 =  e5 - e1;
 131     FFTSample w2 =  e6 - e2;
 132     FFTSample w3 =  e8 - e3;
 133     FFTSample w4 =  e7 - e4;
 134
 135     FFTSample w5 =  s1 - e5;
 136     FFTSample w6 =  s2 - e6;
 137     FFTSample w7 =  s3 - e8;
 138     FFTSample w8 =  s4 - e7;
 139
 140     z[0].re = w1;
 141     z[0].im = w2;
 142     z[2].re = w3;
 143     z[2].im = w4;
 144     z[4].re = w5;
 145     z[4].im = w6;
 146     z[6].re = w7;
 147     z[6].im = w8;
 148
 149     FFTSample z1 = r1 - r4;
 150     FFTSample z2 = r1 + r4;
 151     FFTSample z3 = r3 - r2;
 152     FFTSample z4 = r3 + r2;
 153
 154     FFTSample z5 = r5 - r6;
 155     FFTSample z6 = r5 + r6;
 156     FFTSample z7 = r7 - r8;
 157     FFTSample z8 = r7 + r8;
 158
 159     z3 *= -1;
 160     z5 *= -M_SQRT1_2;
 161     z6 *= -M_SQRT1_2;
 162     z7 *=  M_SQRT1_2;
 163     z8 *=  M_SQRT1_2;
 164
 165     FFTSample t5 = z7 - z6;
 166     FFTSample t6 = z8 + z5;
 167     FFTSample t7 = z8 - z5;
 168     FFTSample t8 = z7 + z6;
 169
 170     FFTSample u1 =  z2 + t5;
 171     FFTSample u2 =  z3 + t6;
 172     FFTSample u3 =  z1 - t7;
 173     FFTSample u4 =  z4 + t8;
 174
 175     FFTSample u5 =  z2 - t5;
 176     FFTSample u6 =  z3 - t6;
 177     FFTSample u7 =  z1 + t7;
 178     FFTSample u8 =  z4 - t8;
 179
 180     z[1].re = u1;
 181     z[1].im = u2;
 182     z[3].re = u3;
 183     z[3].im = u4;
 184     z[5].re = u5;
 185     z[5].im = u6;
 186     z[7].re = u7;
 187     z[7].im = u8;
 188 }
 189 ```
 190
 191 As you can see, there are 2 independent paths, one for even and one for odd coefficients.
 192 This theme continues throughout the document. Note that in the actual assembly code,
 193 the paths are interleaved to improve unit saturation and CPU dependency tracking, so
 194 to more clearly see them, you'll need to deinterleave the instructions.
 195
 196 # 8-point SSE/ARM64 FFT transform
 197 Input must be pre-permuted using the parity lookup table, generated via
 198 `ff_tx_gen_split_radix_parity_revtab`.
 199
 200 ``` C
 201 static void fft8(FFTComplex *z)
 202 {
 203     FFTSample r1 = z[0].re - z[4].re;
 204     FFTSample r2 = z[0].im - z[4].im;
 205     FFTSample r3 = z[1].re - z[5].re;
 206     FFTSample r4 = z[1].im - z[5].im;
 207
 208     FFTSample j1 = z[2].re - z[6].re;
 209     FFTSample j2 = z[2].im - z[6].im;
 210     FFTSample j3 = z[3].re - z[7].re;
 211     FFTSample j4 = z[3].im - z[7].im;
 212
 213     FFTSample q1 = z[0].re + z[4].re;
 214     FFTSample q2 = z[0].im + z[4].im;
 215     FFTSample q3 = z[1].re + z[5].re;
 216     FFTSample q4 = z[1].im + z[5].im;
 217
 218     FFTSample k1 = z[2].re + z[6].re;
 219     FFTSample k2 = z[2].im + z[6].im;
 220     FFTSample k3 = z[3].re + z[7].re;
 221     FFTSample k4 = z[3].im + z[7].im;
 222     /* 2 add 2 sub = 4 */
 223
 224     /* 2 shufs, 1 add 1 sub = 4 */
 225     FFTSample s1 = q1 + q3;
 226     FFTSample s2 = q2 + q4;
 227     FFTSample g1 = k3 + k1;
 228     FFTSample g2 = k2 + k4;
 229
 230     FFTSample s3 = q1 - q3;
 231     FFTSample s4 = q2 - q4;
 232     FFTSample g4 = k3 - k1;
 233     FFTSample g3 = k2 - k4;
 234
 235     /* 1 unpack + 1 shuffle = 2 */
 236
 237     /* 1 add */
 238     FFTSample w1 =  s1 + g1;
 239     FFTSample w2 =  s2 + g2;
 240     FFTSample w3 =  s3 + g3;
 241     FFTSample w4 =  s4 + g4;
 242
 243     /* 1 sub */
 244     FFTSample h1 =  s1 - g1;
 245     FFTSample h2 =  s2 - g2;
 246     FFTSample h3 =  s3 - g3;
 247     FFTSample h4 =  s4 - g4;
 248
 249     z[0].re = w1;
 250     z[0].im = w2;
 251     z[2].re = w3;
 252     z[2].im = w4;
 253     z[4].re = h1;
 254     z[4].im = h2;
 255     z[6].re = h3;
 256     z[6].im = h4;
 257
 258     /* 1 shuf + 1 shuf + 1 xor + 1 addsub */
 259     FFTSample z1 = r1 + r4;
 260     FFTSample z2 = r2 - r3;
 261     FFTSample z3 = r1 - r4;
 262     FFTSample z4 = r2 + r3;
 263
 264     /* 1 mult */
 265     j1 *=  M_SQRT1_2;
 266     j2 *= -M_SQRT1_2;
 267     j3 *= -M_SQRT1_2;
 268     j4 *=  M_SQRT1_2;
 269
 270     /* 1 shuf + 1 addsub */
 271     FFTSample l2 = j1 - j2;
 272     FFTSample l1 = j2 + j1;
 273     FFTSample l4 = j3 - j4;
 274     FFTSample l3 = j4 + j3;
 275
 276     /* 1 shuf + 1 addsub */
 277     FFTSample t1 = l3 - l2;
 278     FFTSample t2 = l4 + l1;
 279     FFTSample t3 = l1 - l4;
 280     FFTSample t4 = l2 + l3;
 281
 282     /* 1 add */
 283     FFTSample u1 =  z1 - t1;
 284     FFTSample u2 =  z2 - t2;
 285     FFTSample u3 =  z3 - t3;
 286     FFTSample u4 =  z4 - t4;
 287
 288     /* 1 sub */
 289     FFTSample o1 =  z1 + t1;
 290     FFTSample o2 =  z2 + t2;
 291     FFTSample o3 =  z3 + t3;
 292     FFTSample o4 =  z4 + t4;
 293
 294     z[1].re = u1;
 295     z[1].im = u2;
 296     z[3].re = u3;
 297     z[3].im = u4;
 298     z[5].re = o1;
 299     z[5].im = o2;
 300     z[7].re = o3;
 301     z[7].im = o4;
 302 }
 303 ```
 304
 305 Most functions here are highly tuned to use x86's addsub instruction to save on
 306 external sign mask loading.
 307
 308 # 16-point AVX FFT transform
 309 This version expects the output of the 8 and 4-point transforms to follow the
 310 even/odd convention established above.
 311
 312 ``` C
 313 static void fft16(FFTComplex *z)
 314 {
 315     FFTSample cos_16_1 = 0.92387950420379638671875f;
 316     FFTSample cos_16_3 = 0.3826834261417388916015625f;
 317
 318     fft8(z);
 319     fft4(z+8);
 320     fft4(z+10);
 321
 322     FFTSample s[32];
 323
 324     /*
 325         xorps m1, m1 - free
 326         mulps m0
 327         shufps m1, m1, m0
 328         xorps
 329         addsub
 330         shufps
 331         mulps
 332         mulps
 333         addps
 334         or (fma3)
 335         shufps
 336         shufps
 337         mulps
 338         mulps
 339         fma
 340         fma
 341      */
 342
 343     s[0]  =  z[8].re*( 1) - z[8].im*( 0);
 344     s[1]  =  z[8].im*( 1) + z[8].re*( 0);
 345     s[2]  =  z[9].re*( 1) - z[9].im*(-1);
 346     s[3]  =  z[9].im*( 1) + z[9].re*(-1);
 347
 348     s[4]  = z[10].re*( 1) - z[10].im*( 0);
 349     s[5]  = z[10].im*( 1) + z[10].re*( 0);
 350     s[6]  = z[11].re*( 1) - z[11].im*( 1);
 351     s[7]  = z[11].im*( 1) + z[11].re*( 1);
 352
 353     s[8]  = z[12].re*(  cos_16_1) - z[12].im*( -cos_16_3);
 354     s[9]  = z[12].im*(  cos_16_1) + z[12].re*( -cos_16_3);
 355     s[10] = z[13].re*(  cos_16_3) - z[13].im*( -cos_16_1);
 356     s[11] = z[13].im*(  cos_16_3) + z[13].re*( -cos_16_1);
 357
 358     s[12] = z[14].re*(  cos_16_1) - z[14].im*(  cos_16_3);
 359     s[13] = z[14].im*( -cos_16_1) + z[14].re*( -cos_16_3);
 360     s[14] = z[15].re*(  cos_16_3) - z[15].im*(  cos_16_1);
 361     s[15] = z[15].im*( -cos_16_3) + z[15].re*( -cos_16_1);
 362
 363     s[2] *=  M_SQRT1_2;
 364     s[3] *=  M_SQRT1_2;
 365     s[5] *= -1;
 366     s[6] *=  M_SQRT1_2;
 367     s[7] *= -M_SQRT1_2;
 368
 369     FFTSample w5 =  s[0] + s[4];
 370     FFTSample w6 =  s[1] - s[5];
 371     FFTSample x5 =  s[2] + s[6];
 372     FFTSample x6 =  s[3] - s[7];
 373
 374     FFTSample w3 =  s[4] - s[0];
 375     FFTSample w4 =  s[5] + s[1];
 376     FFTSample x3 =  s[6] - s[2];
 377     FFTSample x4 =  s[7] + s[3];
 378
 379     FFTSample y5 =  s[8] + s[12];
 380     FFTSample y6 =  s[9] - s[13];
 381     FFTSample u5 = s[10] + s[14];
 382     FFTSample u6 = s[11] - s[15];
 383
 384     FFTSample y3 = s[12] - s[8];
 385     FFTSample y4 = s[13] + s[9];
 386     FFTSample u3 = s[14] - s[10];
 387     FFTSample u4 = s[15] + s[11];
 388
 389     /* 2xorps, 2vperm2fs, 2 adds, 2 vpermilps = 8 */
 390
 391     FFTSample o1  = z[0].re + w5;
 392     FFTSample o2  = z[0].im + w6;
 393     FFTSample o5  = z[1].re + x5;
 394     FFTSample o6  = z[1].im + x6;
 395     FFTSample o9  = z[2].re + w4; //h
 396     FFTSample o10 = z[2].im + w3;
 397     FFTSample o13 = z[3].re + x4;
 398     FFTSample o14 = z[3].im + x3;
 399
 400     FFTSample o17 = z[0].re - w5;
 401     FFTSample o18 = z[0].im - w6;
 402     FFTSample o21 = z[1].re - x5;
 403     FFTSample o22 = z[1].im - x6;
 404     FFTSample o25 = z[2].re - w4; //h
 405     FFTSample o26 = z[2].im - w3;
 406     FFTSample o29 = z[3].re - x4;
 407     FFTSample o30 = z[3].im - x3;
 408
 409     FFTSample o3  = z[4].re + y5;
 410     FFTSample o4  = z[4].im + y6;
 411     FFTSample o7  = z[5].re + u5;
 412     FFTSample o8  = z[5].im + u6;
 413     FFTSample o11 = z[6].re + y4; //h
 414     FFTSample o12 = z[6].im + y3;
 415     FFTSample o15 = z[7].re + u4;
 416     FFTSample o16 = z[7].im + u3;
 417
 418     FFTSample o19 = z[4].re - y5;
 419     FFTSample o20 = z[4].im - y6;
 420     FFTSample o23 = z[5].re - u5;
 421     FFTSample o24 = z[5].im - u6;
 422     FFTSample o27 = z[6].re - y4; //h
 423     FFTSample o28 = z[6].im - y3;
 424     FFTSample o31 = z[7].re - u4;
 425     FFTSample o32 = z[7].im - u3;
 426
 427     /* This is just deinterleaving, happens separately */
 428     z[0]  = (FFTComplex){  o1,  o2 };
 429     z[1]  = (FFTComplex){  o3,  o4 };
 430     z[2]  = (FFTComplex){  o5,  o6 };
 431     z[3]  = (FFTComplex){  o7,  o8 };
 432     z[4]  = (FFTComplex){  o9, o10 };
 433     z[5]  = (FFTComplex){ o11, o12 };
 434     z[6]  = (FFTComplex){ o13, o14 };
 435     z[7]  = (FFTComplex){ o15, o16 };
 436
 437     z[8]  = (FFTComplex){ o17, o18 };
 438     z[9]  = (FFTComplex){ o19, o20 };
 439     z[10] = (FFTComplex){ o21, o22 };
 440     z[11] = (FFTComplex){ o23, o24 };
 441     z[12] = (FFTComplex){ o25, o26 };
 442     z[13] = (FFTComplex){ o27, o28 };
 443     z[14] = (FFTComplex){ o29, o30 };
 444     z[15] = (FFTComplex){ o31, o32 };
 445 }
 446 ```
 447
 448 # AVX split-radix synthesis
 449 To create larger transforms, the following unrolling of the C split-radix
 450 function is used.
 451
 452 ``` C
 453 #define BF(x, y, a, b)                           \
 454     do {                                         \
 455         x = (a) - (b);                           \
 456         y = (a) + (b);                           \
 457     } while (0)
 458
 459 #define BUTTERFLIES(a0,a1,a2,a3)               \
 460     do {                                       \
 461         r0=a0.re;                              \
 462         i0=a0.im;                              \
 463         r1=a1.re;                              \
 464         i1=a1.im;                              \
 465         BF(q3, q5, q5, q1);                    \
 466         BF(a2.re, a0.re, r0, q5);              \
 467         BF(a3.im, a1.im, i1, q3);              \
 468         BF(q4, q6, q2, q6);                    \
 469         BF(a3.re, a1.re, r1, q4);              \
 470         BF(a2.im, a0.im, i0, q6);              \
 471     } while (0)
 472
 473 #undef TRANSFORM
 474 #define TRANSFORM(a0,a1,a2,a3,wre,wim)         \
 475     do {                                       \
 476         CMUL(q1, q2, a2.re, a2.im, wre, -wim); \
 477         CMUL(q5, q6, a3.re, a3.im, wre,  wim); \
 478         BUTTERFLIES(a0, a1, a2, a3);           \
 479     } while (0)
 480
 481 #define CMUL(dre, dim, are, aim, bre, bim)       \
 482     do {                                         \
 483         (dre) = (are) * (bre) - (aim) * (bim);   \
 484         (dim) = (are) * (bim) + (aim) * (bre);   \
 485     } while (0)
 486
 487 static void recombine(FFTComplex *z, const FFTSample *cos,
 488                       unsigned int n)
 489 {
 490     const int o1 = 2*n;
 491     const int o2 = 4*n;
 492     const int o3 = 6*n;
 493     const FFTSample *wim = cos + o1 - 7;
 494     FFTSample q1, q2, q3, q4, q5, q6, r0, i0, r1, i1;
 495
 496 #if 0
 497     for (int i = 0; i < n; i += 4) {
 498 #endif
 499
 500 #if 0
 501         TRANSFORM(z[ 0 + 0], z[ 0 + 4], z[o2 + 0], z[o2 + 2], cos[0], wim[7]);
 502         TRANSFORM(z[ 0 + 1], z[ 0 + 5], z[o2 + 1], z[o2 + 3], cos[2], wim[5]);
 503         TRANSFORM(z[ 0 + 2], z[ 0 + 6], z[o2 + 4], z[o2 + 6], cos[4], wim[3]);
 504         TRANSFORM(z[ 0 + 3], z[ 0 + 7], z[o2 + 5], z[o2 + 7], cos[6], wim[1]);
 505
 506         TRANSFORM(z[o1 + 0], z[o1 + 4], z[o3 + 0], z[o3 + 2], cos[1], wim[6]);
 507         TRANSFORM(z[o1 + 1], z[o1 + 5], z[o3 + 1], z[o3 + 3], cos[3], wim[4]);
 508         TRANSFORM(z[o1 + 2], z[o1 + 6], z[o3 + 4], z[o3 + 6], cos[5], wim[2]);
 509         TRANSFORM(z[o1 + 3], z[o1 + 7], z[o3 + 5], z[o3 + 7], cos[7], wim[0]);
 510 #else
 511         FFTSample h[8], j[8], r[8], w[8];
 512         FFTSample t[8];
 513         FFTComplex *m0 = &z[0];
 514         FFTComplex *m1 = &z[4];
 515         FFTComplex *m2 = &z[o2 + 0];
 516         FFTComplex *m3 = &z[o2 + 4];
 517
 518         const FFTSample *t1  = &cos[0];
 519         const FFTSample *t2  = &wim[0];
 520
 521         /* 2 loads (tabs) */
 522
 523         /* 2 vperm2fs, 2 shufs (im), 2 shufs (tabs) */
 524         /* 1 xor, 1 add, 1 sub, 4 mults OR 2 mults, 2 fmas */
 525         /* 13 OR 10ish (-2 each for second passovers!) */
 526
 527         w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
 528         w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
 529         w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
 530         w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
 531         w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
 532         w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
 533         w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
 534         w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];
 535
 536         j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
 537         j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
 538         j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
 539         j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
 540         j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
 541         j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
 542         j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
 543         j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];
 544
 545         /* 1 add + 1 shuf */
 546         t[1] = j[0] + w[0];
 547         t[0] = j[1] + w[1];
 548         t[3] = j[2] + w[2];
 549         t[2] = j[3] + w[3];
 550         t[5] = j[4] + w[4];
 551         t[4] = j[5] + w[5];
 552         t[7] = j[6] + w[6];
 553         t[6] = j[7] + w[7];
 554
 555         /* 1 sub + 1 xor */
 556         r[0] =  (w[0] - j[0]);
 557         r[1] = -(w[1] - j[1]);
 558         r[2] =  (w[2] - j[2]);
 559         r[3] = -(w[3] - j[3]);
 560         r[4] =  (w[4] - j[4]);
 561         r[5] = -(w[5] - j[5]);
 562         r[6] =  (w[6] - j[6]);
 563         r[7] = -(w[7] - j[7]);
 564
 565         /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
 566         m2[0].re = m0[0].re - t[0];
 567         m2[0].im = m0[0].im - t[1];
 568         m2[1].re = m0[1].re - t[2];
 569         m2[1].im = m0[1].im - t[3];
 570         m3[0].re = m0[2].re - t[4];
 571         m3[0].im = m0[2].im - t[5];
 572         m3[1].re = m0[3].re - t[6];
 573         m3[1].im = m0[3].im - t[7];
 574
 575         m2[2].re = m1[0].re - r[0];
 576         m2[2].im = m1[0].im - r[1];
 577         m2[3].re = m1[1].re - r[2];
 578         m2[3].im = m1[1].im - r[3];
 579         m3[2].re = m1[2].re - r[4];
 580         m3[2].im = m1[2].im - r[5];
 581         m3[3].re = m1[3].re - r[6];
 582         m3[3].im = m1[3].im - r[7];
 583
 584         m0[0].re = m0[0].re + t[0];
 585         m0[0].im = m0[0].im + t[1];
 586         m0[1].re = m0[1].re + t[2];
 587         m0[1].im = m0[1].im + t[3];
 588         m0[2].re = m0[2].re + t[4];
 589         m0[2].im = m0[2].im + t[5];
 590         m0[3].re = m0[3].re + t[6];
 591         m0[3].im = m0[3].im + t[7];
 592
 593         m1[0].re = m1[0].re + r[0];
 594         m1[0].im = m1[0].im + r[1];
 595         m1[1].re = m1[1].re + r[2];
 596         m1[1].im = m1[1].im + r[3];
 597         m1[2].re = m1[2].re + r[4];
 598         m1[2].im = m1[2].im + r[5];
 599         m1[3].re = m1[3].re + r[6];
 600         m1[3].im = m1[3].im + r[7];
 601
 602         /* Identical for below, but with the following parameters */
 603         m0 = &z[o1];
 604         m1 = &z[o1 + 4];
 605         m2 = &z[o3 + 0];
 606         m3 = &z[o3 + 4];
 607         t1  = &cos[1];
 608         t2  = &wim[-1];
 609
 610         w[0] = m2[0].im*t1[0] - m2[0].re*t2[7];
 611         w[1] = m2[0].re*t1[0] + m2[0].im*t2[7];
 612         w[2] = m2[1].im*t1[2] - m2[1].re*t2[5];
 613         w[3] = m2[1].re*t1[2] + m2[1].im*t2[5];
 614         w[4] = m3[0].im*t1[4] - m3[0].re*t2[3];
 615         w[5] = m3[0].re*t1[4] + m3[0].im*t2[3];
 616         w[6] = m3[1].im*t1[6] - m3[1].re*t2[1];
 617         w[7] = m3[1].re*t1[6] + m3[1].im*t2[1];
 618
 619         j[0] = m2[2].im*t1[0] + m2[2].re*t2[7];
 620         j[1] = m2[2].re*t1[0] - m2[2].im*t2[7];
 621         j[2] = m2[3].im*t1[2] + m2[3].re*t2[5];
 622         j[3] = m2[3].re*t1[2] - m2[3].im*t2[5];
 623         j[4] = m3[2].im*t1[4] + m3[2].re*t2[3];
 624         j[5] = m3[2].re*t1[4] - m3[2].im*t2[3];
 625         j[6] = m3[3].im*t1[6] + m3[3].re*t2[1];
 626         j[7] = m3[3].re*t1[6] - m3[3].im*t2[1];
 627
 628         /* 1 add + 1 shuf */
 629         t[1] = j[0] + w[0];
 630         t[0] = j[1] + w[1];
 631         t[3] = j[2] + w[2];
 632         t[2] = j[3] + w[3];
 633         t[5] = j[4] + w[4];
 634         t[4] = j[5] + w[5];
 635         t[7] = j[6] + w[6];
 636         t[6] = j[7] + w[7];
 637
 638         /* 1 sub + 1 xor */
 639         r[0] =  (w[0] - j[0]);
 640         r[1] = -(w[1] - j[1]);
 641         r[2] =  (w[2] - j[2]);
 642         r[3] = -(w[3] - j[3]);
 643         r[4] =  (w[4] - j[4]);
 644         r[5] = -(w[5] - j[5]);
 645         r[6] =  (w[6] - j[6]);
 646         r[7] = -(w[7] - j[7]);
 647
 648         /* Min: 2 subs, 2 adds, 2 vperm2fs (OPTIONAL) */
 649         m2[0].re = m0[0].re - t[0];
 650         m2[0].im = m0[0].im - t[1];
 651         m2[1].re = m0[1].re - t[2];
 652         m2[1].im = m0[1].im - t[3];
 653         m3[0].re = m0[2].re - t[4];
 654         m3[0].im = m0[2].im - t[5];
 655         m3[1].re = m0[3].re - t[6];
 656         m3[1].im = m0[3].im - t[7];
 657
 658         m2[2].re = m1[0].re - r[0];
 659         m2[2].im = m1[0].im - r[1];
 660         m2[3].re = m1[1].re - r[2];
 661         m2[3].im = m1[1].im - r[3];
 662         m3[2].re = m1[2].re - r[4];
 663         m3[2].im = m1[2].im - r[5];
 664         m3[3].re = m1[3].re - r[6];
 665         m3[3].im = m1[3].im - r[7];
 666
 667         m0[0].re = m0[0].re + t[0];
 668         m0[0].im = m0[0].im + t[1];
 669         m0[1].re = m0[1].re + t[2];
 670         m0[1].im = m0[1].im + t[3];
 671         m0[2].re = m0[2].re + t[4];
 672         m0[2].im = m0[2].im + t[5];
 673         m0[3].re = m0[3].re + t[6];
 674         m0[3].im = m0[3].im + t[7];
 675
 676         m1[0].re = m1[0].re + r[0];
 677         m1[0].im = m1[0].im + r[1];
 678         m1[1].re = m1[1].re + r[2];
 679         m1[1].im = m1[1].im + r[3];
 680         m1[2].re = m1[2].re + r[4];
 681         m1[2].im = m1[2].im + r[5];
 682         m1[3].re = m1[3].re + r[6];
 683         m1[3].im = m1[3].im + r[7];
 684 #endif
 685
 686 #if 0
 687         z   +=   4; // !!!
 688         cos += 2*4;
 689         wim -= 2*4;
 690     }
 691 #endif
 692 }
 693 ```
 694
 695 The macros used are identical to those in the generic C version, only with all
 696 variable declarations exported to the function body.
 697 An important point here is that the high frequency registers (m2 and m3) have
 698 their high and low halves swapped in the output. This is intentional, as the
 699 inputs must also have the same layout, and therefore, the input swapping is only
 700 performed once for the bottom-most basis transform, with all subsequent combinations
 701 using the already swapped halves.
 702
 703 Also note that this function requires a special iteration way, due to coefficients
 704 beginning to overlap, particularly `[o1]` with `[0]` after the second iteration.
 705 To iterate further, set `z = &z[16]` via `z += 8` for the second iteration. After
 706 the 4th iteration, the layout resets, so repeat the same.