+static void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
+{
+ int j, np;
+ const uint16_t *revtab = s->revtab;
+ np = 1 << s->nbits;
+ /* TODO: handle split-radix permute in a more optimal way, probably in-place */
+ for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j];
+ memcpy(z, s->tmp_buf, np * sizeof(FFTComplex));
+}
+
+av_cold void ff_fft_end(FFTContext *s)
+{
+ av_freep(&s->revtab);
+ av_freep(&s->tmp_buf);
+}
+
+#define BUTTERFLIES(a0,a1,a2,a3) {\
+ BF(t3, t5, t5, t1);\
+ BF(a2.re, a0.re, a0.re, t5);\
+ BF(a3.im, a1.im, a1.im, t3);\
+ BF(t4, t6, t2, t6);\
+ BF(a3.re, a1.re, a1.re, t4);\
+ BF(a2.im, a0.im, a0.im, t6);\
+}
+
+// force loading all the inputs before storing any.
+// this is slightly slower for small data, but avoids store->load aliasing
+// for addresses separated by large powers of 2.
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
+ FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
+ BF(t3, t5, t5, t1);\
+ BF(a2.re, a0.re, r0, t5);\
+ BF(a3.im, a1.im, i1, t3);\
+ BF(t4, t6, t2, t6);\
+ BF(a3.re, a1.re, r1, t4);\
+ BF(a2.im, a0.im, i0, t6);\
+}
+
+#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
+ CMUL(t1, t2, a2.re, a2.im, wre, -wim);\
+ CMUL(t5, t6, a3.re, a3.im, wre, wim);\
+ BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
+ t1 = a2.re;\
+ t2 = a2.im;\
+ t5 = a3.re;\
+ t6 = a3.im;\
+ BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+/* z[0...8n-1], w[1...2n-1] */
+#define PASS(name)\
+static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\