/* Extracted from: git.sesse.net Git - ffmpeg/blob - libavcodec/ppc/fft_vsx.h */

   1 #ifndef AVCODEC_PPC_FFT_VSX_H
   2 #define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  26
  27
  28 #include "config.h"
  29 #include "libavutil/cpu.h"
  30 #include "libavutil/ppc/types_altivec.h"
  31 #include "libavutil/ppc/util_altivec.h"
  32 #include "libavcodec/fft.h"
  33 #include "libavcodec/fft-internal.h"
  34
  35 #if HAVE_VSX
  36
  37 void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
  38 void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);
  39
  40
  41 #define byte_2complex (2*sizeof(FFTComplex))
  42 #define byte_4complex (4*sizeof(FFTComplex))
  43 #define byte_6complex (6*sizeof(FFTComplex))
  44 #define byte_8complex (8*sizeof(FFTComplex))
  45 #define byte_10complex (10*sizeof(FFTComplex))
  46 #define byte_12complex (12*sizeof(FFTComplex))
  47 #define byte_14complex (14*sizeof(FFTComplex))
  48
  49 inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
  50 {
  51     int o1 = n<<1;
  52     int o2 = n<<2;
  53     int o3 = o1+o2;
  54     int i1, i2, i3;
  55     FFTSample* out = (FFTSample*)z;
  56     const FFTSample *wim = wre+o1;
  57     vec_f vz0, vzo1, vzo2, vzo3;
  58     vec_f x0, x1, x2, x3;
  59     vec_f x4, x5, x6, x7;
  60     vec_f x8, x9, x10, x11;
  61     vec_f x12, x13, x14, x15;
  62     vec_f x16, x17, x18, x19;
  63     vec_f x20, x21, x22, x23;
  64     vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
  65     vec_f y0, y1, y2, y3;
  66     vec_f y4, y5, y8, y9;
  67     vec_f y10, y13, y14, y15;
  68     vec_f y16, y17, y18, y19;
  69     vec_f y20, y21, y22, y23;
  70     vec_f wr1, wi1, wr0, wi0;
  71     vec_f wr2, wi2, wr3, wi3;
  72     vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;
  73
  74     n = n-2;
  75     i1 = o1*sizeof(FFTComplex);
  76     i2 = o2*sizeof(FFTComplex);
  77     i3 = o3*sizeof(FFTComplex);
  78     vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
  79     vzo2plus1 = vec_ld(i2+16, &(out[0]));
  80     vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
  81     vzo3plus1 = vec_ld(i3+16, &(out[0]));
  82     vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
  83     vz0plus1 = vec_ld(16, &(out[0]));
  84     vzo1 = vec_ld(i1, &(out[0]));  // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
  85     vzo1plus1 = vec_ld(i1+16, &(out[0]));
  86
  87     x0 = vec_add(vzo2, vzo3);
  88     x1 = vec_sub(vzo2, vzo3);
  89     y0 = vec_add(vzo2plus1, vzo3plus1);
  90     y1 = vec_sub(vzo2plus1, vzo3plus1);
  91
  92     wr1 = vec_splats(wre[1]);
  93     wi1 = vec_splats(wim[-1]);
  94     wi2 = vec_splats(wim[-2]);
  95     wi3 = vec_splats(wim[-3]);
  96     wr2 = vec_splats(wre[2]);
  97     wr3 = vec_splats(wre[3]);
  98
  99     x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
 100     x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
 101
 102     y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
 103     y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
 104     y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
 105     y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
 106
 107     ymulwi2 = vec_mul(y4, wi2);
 108     ymulwi3 = vec_mul(y5, wi3);
 109     x4 = vec_mul(x2, wr1);
 110     x5 = vec_mul(x3, wi1);
 111     y8 = vec_madd(y2, wr2, ymulwi2);
 112     y9 = vec_msub(y2, wr2, ymulwi2);
 113     x6 = vec_add(x4, x5);
 114     x7 = vec_sub(x4, x5);
 115     y13 = vec_madd(y3, wr3, ymulwi3);
 116     y14 = vec_msub(y3, wr3, ymulwi3);
 117
 118     x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
 119     y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
 120     y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
 121
 122     x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
 123     x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));
 124
 125     y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
 126     y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
 127
 128     x11 = vec_add(vz0, x9);
 129     x12 = vec_sub(vz0, x9);
 130     x13 = vec_add(vzo1, x10);
 131     x14 = vec_sub(vzo1, x10);
 132
 133     y18 = vec_add(vz0plus1, y16);
 134     y19 = vec_sub(vz0plus1, y16);
 135     y20 = vec_add(vzo1plus1, y17);
 136     y21 = vec_sub(vzo1plus1, y17);
 137
 138     x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
 139     x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
 140     y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
 141     y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
 142
 143
 144     vec_st(x11, 0, &(out[0]));
 145     vec_st(y18, 16, &(out[0]));
 146     vec_st(x15, i1, &(out[0]));
 147     vec_st(y22, i1+16, &(out[0]));
 148     vec_st(x12, i2, &(out[0]));
 149     vec_st(y19, i2+16, &(out[0]));
 150     vec_st(x16, i3, &(out[0]));
 151     vec_st(y23, i3+16, &(out[0]));
 152
 153     do {
 154         out += 8;
 155         wre += 4;
 156         wim -= 4;
 157         wr0 = vec_splats(wre[0]);
 158         wr1 = vec_splats(wre[1]);
 159         wi0 = vec_splats(wim[0]);
 160         wi1 = vec_splats(wim[-1]);
 161
 162         wr2 = vec_splats(wre[2]);
 163         wr3 = vec_splats(wre[3]);
 164         wi2 = vec_splats(wim[-2]);
 165         wi3 = vec_splats(wim[-3]);
 166
 167         vzo2 = vec_ld(i2, &(out[0]));  // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
 168         vzo2plus1 = vec_ld(i2+16, &(out[0]));
 169         vzo3 = vec_ld(i3, &(out[0]));  // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
 170         vzo3plus1 = vec_ld(i3+16, &(out[0]));
 171         vz0 = vec_ld(0, &(out[0]));    // z0.r  z0.i  z1.r  z1.i
 172         vz0plus1 = vec_ld(16, &(out[0]));
 173         vzo1 = vec_ld(i1, &(out[0])); // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
 174         vzo1plus1 = vec_ld(i1+16, &(out[0]));
 175
 176         x0 = vec_add(vzo2, vzo3);
 177         x1 = vec_sub(vzo2, vzo3);
 178
 179         y0 = vec_add(vzo2plus1, vzo3plus1);
 180         y1 = vec_sub(vzo2plus1, vzo3plus1);
 181
 182         x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
 183         x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
 184         x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
 185         x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
 186
 187         y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
 188         y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
 189         xmulwi0 = vec_mul(x4, wi0);
 190         xmulwi1 = vec_mul(x5, wi1);
 191
 192         y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
 193         y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
 194
 195         x8 = vec_madd(x2, wr0, xmulwi0);
 196         x9 = vec_msub(x2, wr0, xmulwi0);
 197         ymulwi2 = vec_mul(y4, wi2);
 198         ymulwi3 = vec_mul(y5, wi3);
 199
 200         x13 = vec_madd(x3, wr1, xmulwi1);
 201         x14 = vec_msub(x3, wr1, xmulwi1);
 202
 203         y8 = vec_madd(y2, wr2, ymulwi2);
 204         y9 = vec_msub(y2, wr2, ymulwi2);
 205         y13 = vec_madd(y3, wr3, ymulwi3);
 206         y14 = vec_msub(y3, wr3, ymulwi3);
 207
 208         x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
 209         x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));
 210
 211         y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
 212         y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));
 213
 214         x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
 215         x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));
 216
 217         y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
 218         y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));
 219
 220         x18 = vec_add(vz0, x16);
 221         x19 = vec_sub(vz0, x16);
 222         x20 = vec_add(vzo1, x17);
 223         x21 = vec_sub(vzo1, x17);
 224
 225         y18 = vec_add(vz0plus1, y16);
 226         y19 = vec_sub(vz0plus1, y16);
 227         y20 = vec_add(vzo1plus1, y17);
 228         y21 = vec_sub(vzo1plus1, y17);
 229
 230         x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
 231         x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));
 232
 233         y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
 234         y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));
 235
 236         vec_st(x18, 0, &(out[0]));
 237         vec_st(y18, 16, &(out[0]));
 238         vec_st(x22, i1, &(out[0]));
 239         vec_st(y22, i1+16, &(out[0]));
 240         vec_st(x19, i2, &(out[0]));
 241         vec_st(y19, i2+16, &(out[0]));
 242         vec_st(x23, i3, &(out[0]));
 243         vec_st(y23, i3+16, &(out[0]));
 244     } while (n-=2);
 245 }
 246
 247 inline static void fft2_vsx_interleave(FFTComplex *z)
 248 {
 249     FFTSample r1, i1;
 250
 251     r1 = z[0].re - z[1].re;
 252     z[0].re += z[1].re;
 253     z[1].re = r1;
 254
 255     i1 = z[0].im - z[1].im;
 256     z[0].im += z[1].im;
 257     z[1].im = i1;
 258  }
 259
 260 inline static void fft4_vsx_interleave(FFTComplex *z)
 261 {
 262     vec_f a, b, c, d;
 263     float* out=  (float*)z;
 264     a = vec_ld(0, &(out[0]));
 265     b = vec_ld(byte_2complex, &(out[0]));
 266
 267     c = vec_perm(a, b, vcprm(0,1,s2,s1));
 268     d = vec_perm(a, b, vcprm(2,3,s0,s3));
 269     a = vec_add(c, d);
 270     b = vec_sub(c, d);
 271
 272     c = vec_perm(a, b, vcprm(0,1,s0,s1));
 273     d = vec_perm(a, b, vcprm(2,3,s3,s2));
 274
 275     a = vec_add(c, d);
 276     b = vec_sub(c, d);
 277     vec_st(a, 0, &(out[0]));
 278     vec_st(b, byte_2complex, &(out[0]));
 279 }
 280
 281 inline static void fft8_vsx_interleave(FFTComplex *z)
 282 {
 283     vec_f vz0, vz1, vz2, vz3;
 284     vec_f x0, x1, x2, x3;
 285     vec_f x4, x5, x6, x7;
 286     vec_f x8, x9, x10, x11;
 287     vec_f x12, x13, x14, x15;
 288     vec_f x16, x17, x18, x19;
 289     vec_f x20, x21, x22, x23;
 290     vec_f x24, x25, x26, x27;
 291     vec_f x28, x29, x30, x31;
 292     vec_f x32, x33, x34;
 293
 294     float* out=  (float*)z;
 295     vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
 296
 297     vz0 = vec_ld(0, &(out[0]));
 298     vz1 = vec_ld(byte_2complex, &(out[0]));
 299     vz2 = vec_ld(byte_4complex, &(out[0]));
 300     vz3 = vec_ld(byte_6complex, &(out[0]));
 301
 302     x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 303     x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 304     x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
 305     x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));
 306
 307     x4 = vec_add(x0, x1);
 308     x5 = vec_sub(x0, x1);
 309     x6 = vec_add(x2, x3);
 310     x7 = vec_sub(x2, x3);
 311
 312     x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
 313     x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
 314     x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
 315     x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));
 316
 317     x12 = vec_add(x8, x9);
 318     x13 = vec_sub(x8, x9);
 319     x14 = vec_add(x10, x11);
 320     x15 = vec_sub(x10, x11);
 321     x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
 322     x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
 323     x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
 324     x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
 325     x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i
 326
 327     x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
 328     x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
 329     x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
 330     x24 = vec_add(x22, x23);
 331     x25 = vec_sub(x22, x23);
 332     x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);
 333
 334     x27 = vec_add(x21, x26); // z1.r  z7.r z1.i z3.i
 335     x28 = vec_sub(x21, x26); //z5.r  z3.r z5.i z7.i
 336
 337     x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r  z0.i  z1.r  z1.i
 338     x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r  z2.i  z7.r  z3.i
 339     x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r  z4.i  z5.r  z5.i
 340     x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r  z6.i  z3.r  z7.i
 341     x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r  z2.i  z3.r  z3.i
 342     x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i
 343
 344     vec_st(x29, 0, &(out[0]));
 345     vec_st(x33, byte_2complex, &(out[0]));
 346     vec_st(x31, byte_4complex, &(out[0]));
 347     vec_st(x34, byte_6complex, &(out[0]));
 348 }
 349
 350 inline static void fft16_vsx_interleave(FFTComplex *z)
 351 {
 352     float* out=  (float*)z;
 353     vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
 354     vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
 355     vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
 356     vec_f vz0, vz1, vz2, vz3;
 357     vec_f vz4, vz5, vz6, vz7;
 358     vec_f x0, x1, x2, x3;
 359     vec_f x4, x5, x6, x7;
 360     vec_f x8, x9, x10, x11;
 361     vec_f x12, x13, x14, x15;
 362     vec_f x16, x17, x18, x19;
 363     vec_f x20, x21, x22, x23;
 364     vec_f x24, x25, x26, x27;
 365     vec_f x28, x29, x30, x31;
 366     vec_f x32, x33, x34, x35;
 367     vec_f x36, x37, x38, x39;
 368     vec_f x40, x41, x42, x43;
 369     vec_f x44, x45, x46, x47;
 370     vec_f x48, x49, x50, x51;
 371     vec_f x52, x53, x54, x55;
 372     vec_f x56, x57, x58, x59;
 373     vec_f x60, x61, x62, x63;
 374     vec_f x64, x65, x66, x67;
 375     vec_f x68, x69, x70, x71;
 376     vec_f x72, x73, x74, x75;
 377     vec_f x76, x77, x78, x79;
 378     vec_f x80, x81, x82, x83;
 379     vec_f x84, x85, x86;
 380
 381     vz0 = vec_ld(0, &(out[0]));
 382     vz1 = vec_ld(byte_2complex, &(out[0]));
 383     vz2 = vec_ld(byte_4complex, &(out[0]));
 384     vz3 = vec_ld(byte_6complex, &(out[0]));
 385     vz4 = vec_ld(byte_8complex, &(out[0]));
 386     vz5 = vec_ld(byte_10complex, &(out[0]));
 387     vz6 = vec_ld(byte_12complex, &(out[0]));
 388     vz7 = vec_ld(byte_14complex, &(out[0]));
 389
 390     x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 391     x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 392     x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
 393     x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
 394
 395     x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
 396     x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
 397     x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
 398     x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));
 399
 400     x8 = vec_add(x0, x1);
 401     x9 = vec_sub(x0, x1);
 402     x10 = vec_add(x2, x3);
 403     x11 = vec_sub(x2, x3);
 404
 405     x12 = vec_add(x4, x5);
 406     x13 = vec_sub(x4, x5);
 407     x14 = vec_add(x6, x7);
 408     x15 = vec_sub(x6, x7);
 409
 410     x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
 411     x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
 412     x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
 413     x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
 414     x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
 415     x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
 416     x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
 417     x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));
 418
 419     x24 = vec_add(x16, x17);
 420     x25 = vec_sub(x16, x17);
 421     x26 = vec_add(x18, x19);
 422     x27 = vec_sub(x18, x19);
 423     x28 = vec_add(x20, x21);
 424     x29 = vec_sub(x20, x21);
 425     x30 = vec_add(x22, x23);
 426     x31 = vec_sub(x22, x23);
 427
 428     x32 = vec_add(x24, x26);
 429     x33 = vec_sub(x24, x26);
 430     x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));
 431
 432     x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
 433     x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
 434     x37 = vec_add(x35, x36);
 435     x38 = vec_sub(x35, x36);
 436     x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));
 437
 438     x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
 439     x41 = vec_perm(x26,  x37, vcprm(2,3,s3,s2));
 440     x42 = vec_add(x40, x41);
 441     x43 = vec_sub(x40, x41);
 442     x44 = vec_mul(x42, vc0);
 443     x45 = vec_mul(x43, vc0);
 444
 445     x46 = vec_add(x34, x39);  // z0.r  z0.i  z4.r  z4.i
 446     x47 = vec_sub(x34, x39);  // z8.r  z8.i  z12.r  z12.i
 447
 448     x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
 449     x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
 450     x50 = vec_add(x48, x49);
 451     x51 = vec_sub(x48, x49);
 452     x52 = vec_mul(x50, vc1);
 453     x53 = vec_mul(x50, vc2);
 454     x54 = vec_mul(x51, vc1);
 455     x55 = vec_mul(x51, vc2);
 456
 457     x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
 458     x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
 459     x58 = vec_add(x56, x57);
 460     x59 = vec_sub(x56, x57);
 461
 462     x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
 463     x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
 464     x62 = vec_add(x52, x61);
 465     x63 = vec_sub(x52, x61);
 466     x64 = vec_add(x60, x53);
 467     x65 = vec_sub(x60, x53);
 468     x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
 469     x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));
 470
 471     x68 = vec_add(x58, x66); // z1.r    z1.i  z3.r    z3.i
 472     x69 = vec_sub(x58, x66); // z9.r    z9.i  z11.r  z11.i
 473     x70 = vec_add(x59, x67); // z5.r    z5.i  z15.r  z15.i
 474     x71 = vec_sub(x59, x67); // z13.r  z13.i z7.r   z7.i
 475
 476     x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
 477     x73 = vec_add(x25, x72);
 478     x74 = vec_sub(x25, x72);
 479     x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
 480     x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
 481     x77 = vec_add(x75, x76); // z2.r   z2.i    z6.r    z6.i
 482     x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i
 483
 484     x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
 485     x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
 486     x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
 487     x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
 488     vec_st(x79, 0, &(out[0]));
 489     vec_st(x80, byte_2complex, &(out[0]));
 490     vec_st(x81, byte_4complex, &(out[0]));
 491     vec_st(x82, byte_6complex, &(out[0]));
 492     x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
 493     x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
 494     x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
 495     x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
 496     vec_st(x83, byte_8complex, &(out[0]));
 497     vec_st(x84, byte_10complex, &(out[0]));
 498     vec_st(x85, byte_12complex, &(out[0]));
 499     vec_st(x86, byte_14complex, &(out[0]));
 500 }
 501
 502 inline static void fft4_vsx(FFTComplex *z)
 503 {
 504     vec_f a, b, c, d;
 505     float* out=  (float*)z;
 506     a = vec_ld(0, &(out[0]));
 507     b = vec_ld(byte_2complex, &(out[0]));
 508
 509     c = vec_perm(a, b, vcprm(0,1,s2,s1));
 510     d = vec_perm(a, b, vcprm(2,3,s0,s3));
 511     a = vec_add(c, d);
 512     b = vec_sub(c, d);
 513
 514     c = vec_perm(a,b, vcprm(0,s0,1,s1));
 515     d = vec_perm(a, b, vcprm(2,s3,3,s2));
 516
 517     a = vec_add(c, d);
 518     b = vec_sub(c, d);
 519
 520     c = vec_perm(a, b, vcprm(0,1,s0,s1));
 521     d = vec_perm(a, b, vcprm(2,3,s2,s3));
 522
 523     vec_st(c, 0, &(out[0]));
 524     vec_st(d, byte_2complex, &(out[0]));
 525     return;
 526 }
 527
 528 inline static void fft8_vsx(FFTComplex *z)
 529 {
 530     vec_f vz0, vz1, vz2, vz3;
 531     vec_f vz4, vz5, vz6, vz7, vz8;
 532
 533     float* out=  (float*)z;
 534     vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
 535     vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
 536     vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
 537
 538     vz0 = vec_ld(0, &(out[0]));
 539     vz1 = vec_ld(byte_2complex, &(out[0]));
 540     vz2 = vec_ld(byte_4complex, &(out[0]));
 541     vz3 = vec_ld(byte_6complex, &(out[0]));
 542
 543     vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
 544     vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
 545     vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 546     vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 547
 548     vz2 = vec_add(vz6, vz7);
 549     vz3 = vec_sub(vz6, vz7);
 550     vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
 551
 552     vz0 = vec_add(vz4, vz5);
 553     vz1 = vec_sub(vz4, vz5);
 554
 555     vz3 = vec_madd(vz3, vc1, vc0);
 556     vz3 = vec_madd(vz8, vc2, vz3);
 557
 558     vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
 559     vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
 560     vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
 561     vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
 562
 563     vz0 = vec_add(vz4, vz5);
 564     vz1 = vec_sub(vz4, vz5);
 565     vz2 = vec_add(vz6, vz7);
 566     vz3 = vec_sub(vz6, vz7);
 567
 568     vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
 569     vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
 570     vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
 571     vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
 572
 573
 574     vz2 = vec_sub(vz4, vz6);
 575     vz3 = vec_sub(vz5, vz7);
 576
 577     vz0 = vec_add(vz4, vz6);
 578     vz1 = vec_add(vz5, vz7);
 579
 580     vec_st(vz0, 0, &(out[0]));
 581     vec_st(vz1, byte_2complex, &(out[0]));
 582     vec_st(vz2, byte_4complex, &(out[0]));
 583     vec_st(vz3, byte_6complex, &(out[0]));
 584     return;
 585 }
 586
 587 inline static void fft16_vsx(FFTComplex *z)
 588 {
 589     float* out=  (float*)z;
 590     vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
 591     vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
 592     vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
 593     vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
 594     vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
 595     vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};
 596
 597     vec_f vz0, vz1, vz2, vz3;
 598     vec_f vz4, vz5, vz6, vz7;
 599     vec_f vz8, vz9, vz10, vz11;
 600     vec_f vz12, vz13;
 601
 602     vz0 = vec_ld(byte_8complex, &(out[0]));
 603     vz1 = vec_ld(byte_10complex, &(out[0]));
 604     vz2 = vec_ld(byte_12complex, &(out[0]));
 605     vz3 = vec_ld(byte_14complex, &(out[0]));
 606
 607     vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 608     vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 609     vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
 610     vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));
 611
 612     vz0 = vec_add(vz4, vz5);
 613     vz1= vec_sub(vz4, vz5);
 614     vz2 = vec_add(vz6, vz7);
 615     vz3 = vec_sub(vz6, vz7);
 616
 617     vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
 618     vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
 619     vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
 620     vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));
 621
 622     vz0 = vec_add(vz4, vz5);
 623     vz1 = vec_sub(vz4, vz5);
 624     vz2 = vec_add(vz6, vz7);
 625     vz3 = vec_sub(vz6, vz7);
 626
 627     vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
 628     vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
 629
 630     vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
 631     vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));
 632
 633     vz0 = vec_ld(0, &(out[0]));
 634     vz1 = vec_ld(byte_2complex, &(out[0]));
 635     vz2 = vec_ld(byte_4complex, &(out[0]));
 636     vz3 = vec_ld(byte_6complex, &(out[0]));
 637     vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
 638     vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
 639     vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
 640     vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
 641
 642     vz2 = vec_add(vz10, vz11);
 643     vz3 = vec_sub(vz10, vz11);
 644     vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
 645     vz0 = vec_add(vz8, vz9);
 646     vz1 = vec_sub(vz8, vz9);
 647
 648     vz3 = vec_madd(vz3, vc1, vc0);
 649     vz3 = vec_madd(vz12, vc2, vz3);
 650     vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
 651     vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
 652     vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
 653     vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));
 654
 655     vz0 = vec_add(vz8, vz9);
 656     vz1 = vec_sub(vz8, vz9);
 657     vz2 = vec_add(vz10, vz11);
 658     vz3 = vec_sub(vz10, vz11);
 659
 660     vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
 661     vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
 662     vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
 663     vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));
 664
 665     vz2 = vec_sub(vz8, vz10);
 666     vz3 = vec_sub(vz9, vz11);
 667     vz0 = vec_add(vz8, vz10);
 668     vz1 = vec_add(vz9, vz11);
 669
 670     vz8 = vec_madd(vz4, vc3, vc0);
 671     vz9 = vec_madd(vz5, vc3, vc0);
 672     vz10 = vec_madd(vz6, vc3, vc0);
 673     vz11 = vec_madd(vz7, vc3, vc0);
 674
 675     vz8 = vec_madd(vz5, vc4, vz8);
 676     vz9 = vec_madd(vz4, vc5, vz9);
 677     vz10 = vec_madd(vz7, vc5, vz10);
 678     vz11 = vec_madd(vz6, vc4, vz11);
 679
 680     vz12 = vec_sub(vz10, vz8);
 681     vz10 = vec_add(vz10, vz8);
 682
 683     vz13 = vec_sub(vz9, vz11);
 684     vz11 = vec_add(vz9, vz11);
 685
 686     vz4 = vec_sub(vz0, vz10);
 687     vz0 = vec_add(vz0, vz10);
 688
 689     vz7= vec_sub(vz3, vz12);
 690     vz3= vec_add(vz3, vz12);
 691
 692     vz5 = vec_sub(vz1, vz11);
 693     vz1 = vec_add(vz1, vz11);
 694
 695     vz6 = vec_sub(vz2, vz13);
 696     vz2 = vec_add(vz2, vz13);
 697
 698     vec_st(vz0, 0, &(out[0]));
 699     vec_st(vz1, byte_2complex, &(out[0]));
 700     vec_st(vz2, byte_4complex, &(out[0]));
 701     vec_st(vz3, byte_6complex, &(out[0]));
 702     vec_st(vz4, byte_8complex, &(out[0]));
 703     vec_st(vz5, byte_10complex, &(out[0]));
 704     vec_st(vz6, byte_12complex, &(out[0]));
 705     vec_st(vz7, byte_14complex, &(out[0]));
 706     return;
 707
 708 }
 709 inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
 710 {
 711     int o1 = n<<1;
 712     int o2 = n<<2;
 713     int o3 = o1+o2;
 714     int i1, i2, i3;
 715     FFTSample* out = (FFTSample*)z;
 716     const FFTSample *wim = wre+o1;
 717     vec_f v0, v1, v2, v3;
 718     vec_f v4, v5, v6, v7;
 719     vec_f v8, v9, v10, v11;
 720     vec_f v12, v13;
 721
 722     n = n-2;
 723     i1 = o1*sizeof(FFTComplex);
 724     i2 = o2*sizeof(FFTComplex);
 725     i3 = o3*sizeof(FFTComplex);
 726
 727     v8 = vec_ld(0, &(wre[0]));
 728     v10 = vec_ld(0, &(wim[0]));
 729     v9 = vec_ld(0, &(wim[-4]));
 730     v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
 731
 732     v4 = vec_ld(i2, &(out[0]));
 733     v5 = vec_ld(i2+16, &(out[0]));
 734     v6 = vec_ld(i3, &(out[0]));
 735     v7 = vec_ld(i3+16, &(out[0]));
 736     v10 = vec_mul(v4, v8); // r2*wre
 737     v11 = vec_mul(v5, v8); // i2*wre
 738     v12 = vec_mul(v6, v8); // r3*wre
 739     v13 = vec_mul(v7, v8); // i3*wre
 740
 741     v0 = vec_ld(0, &(out[0])); // r0
 742     v3 = vec_ld(i1+16, &(out[0])); // i1
 743     v10 = vec_madd(v5, v9, v10); // r2*wim
 744     v11 = vec_nmsub(v4, v9, v11); // i2*wim
 745     v12 = vec_nmsub(v7, v9, v12); // r3*wim
 746     v13 = vec_madd(v6, v9, v13); // i3*wim
 747
 748     v1 = vec_ld(16, &(out[0])); // i0
 749     v2 = vec_ld(i1, &(out[0])); // r1
 750     v8 = vec_sub(v12, v10);
 751     v12 = vec_add(v12, v10);
 752     v9 = vec_sub(v11, v13);
 753     v13 = vec_add(v11, v13);
 754     v4 = vec_sub(v0, v12);
 755     v0 = vec_add(v0, v12);
 756     v7 = vec_sub(v3, v8);
 757     v3 = vec_add(v3, v8);
 758
 759     vec_st(v0, 0, &(out[0])); // r0
 760     vec_st(v3, i1+16, &(out[0])); // i1
 761     vec_st(v4, i2, &(out[0])); // r2
 762     vec_st(v7, i3+16, &(out[0]));// i3
 763
 764     v5 = vec_sub(v1, v13);
 765     v1 = vec_add(v1, v13);
 766     v6 = vec_sub(v2, v9);
 767     v2 = vec_add(v2, v9);
 768
 769     vec_st(v1, 16, &(out[0])); // i0
 770     vec_st(v2, i1, &(out[0])); // r1
 771     vec_st(v5, i2+16, &(out[0])); // i2
 772     vec_st(v6, i3, &(out[0])); // r3
 773
 774     do {
 775         out += 8;
 776         wre += 4;
 777         wim -= 4;
 778
 779         v8 = vec_ld(0, &(wre[0]));
 780         v10 = vec_ld(0, &(wim[0]));
 781         v9 = vec_ld(0, &(wim[-4]));
 782         v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));
 783
 784         v4 = vec_ld(i2, &(out[0])); // r2
 785         v5 = vec_ld(i2+16, &(out[0])); // i2
 786         v6 = vec_ld(i3, &(out[0])); // r3
 787         v7 = vec_ld(i3+16, &(out[0]));// i3
 788         v10 = vec_mul(v4, v8); // r2*wre
 789         v11 = vec_mul(v5, v8); // i2*wre
 790         v12 = vec_mul(v6, v8); // r3*wre
 791         v13 = vec_mul(v7, v8); // i3*wre
 792
 793         v0 = vec_ld(0, &(out[0])); // r0
 794         v3 = vec_ld(i1+16, &(out[0])); // i1
 795         v10 = vec_madd(v5, v9, v10); // r2*wim
 796         v11 = vec_nmsub(v4, v9, v11); // i2*wim
 797         v12 = vec_nmsub(v7, v9, v12); // r3*wim
 798         v13 = vec_madd(v6, v9, v13); // i3*wim
 799
 800         v1 = vec_ld(16, &(out[0])); // i0
 801         v2 = vec_ld(i1, &(out[0])); // r1
 802         v8 = vec_sub(v12, v10);
 803         v12 = vec_add(v12, v10);
 804         v9 = vec_sub(v11, v13);
 805         v13 = vec_add(v11, v13);
 806         v4 = vec_sub(v0, v12);
 807         v0 = vec_add(v0, v12);
 808         v7 = vec_sub(v3, v8);
 809         v3 = vec_add(v3, v8);
 810
 811         vec_st(v0, 0, &(out[0])); // r0
 812         vec_st(v3, i1+16, &(out[0])); // i1
 813         vec_st(v4, i2, &(out[0])); // r2
 814         vec_st(v7, i3+16, &(out[0])); // i3
 815
 816         v5 = vec_sub(v1, v13);
 817         v1 = vec_add(v1, v13);
 818         v6 = vec_sub(v2, v9);
 819         v2 = vec_add(v2, v9);
 820
 821         vec_st(v1, 16, &(out[0])); // i0
 822         vec_st(v2, i1, &(out[0])); // r1
 823         vec_st(v5, i2+16, &(out[0])); // i2
 824         vec_st(v6, i3, &(out[0])); // r3
 825     } while (n-=2);
 826 }
 827
 828 #endif
 829
 830 #endif /* AVCODEC_PPC_FFT_VSX_H */