git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mdct15.asm

   1 ;******************************************************************************
   2 ;* SIMD optimized non-power-of-two MDCT functions
   3 ;*
   4 ;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
   5 ;*
   6 ;* This file is part of FFmpeg.
   7 ;*
   8 ;* FFmpeg is free software; you can redistribute it and/or
   9 ;* modify it under the terms of the GNU Lesser General Public
  10 ;* License as published by the Free Software Foundation; either
  11 ;* version 2.1 of the License, or (at your option) any later version.
  12 ;*
  13 ;* FFmpeg is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 ;* Lesser General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU Lesser General Public
  19 ;* License along with FFmpeg; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21 ;******************************************************************************
  22
  23 %include "libavutil/x86/x86util.asm"
  24
  25 SECTION_RODATA 32
  26
  27 perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
  28 perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
  29 sign_adjust_r: times 4 dd 0x80000000, 0x00000000
  30
  31 sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
  32
  33 SECTION .text
  34
  35 %if ARCH_X86_64
  36
  37 ;*****************************************************************************************
  38 ;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
  39 ;*****************************************************************************************
  40 %macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
  41     VBROADCASTSD m0, [inq + %1]         ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
  42     movsd   xm1, [inq + 1*16 +  8 + %1] ; in[ 3].re, in[ 3].im,         0,         0
  43     movsd   xm4, [inq + 6*16 +  0 + %1] ; in[12].re, in[12].im,         0,         0
  44     movhps  xm1, [inq + 3*16 +  0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
  45     movhps  xm4, [inq + 4*16 +  8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
  46
  47     subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
  48     addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im
  49
  50     movhlps     %2,   xm1               ; t[0].re, t[1].re, t[0].im, t[1].im
  51     addps       %2,   xm1
  52     addps       %2,   xm0               ; DC[0].re, DC[0].im, junk...
  53     movlhps     %2,   %2                ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
  54
  55     shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
  56     shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im
  57
  58     mulps       xm%3, xm1, xm5
  59     mulps       xm4,  xm3, xm6
  60     mulps       xm1,  xm6
  61
  62     xorps       xm1,  xm7
  63     mulps       xm3,  xm5
  64     addsubps    xm3,  xm1               ; t[0].re, t[0].im, t[2].re, t[2].im
  65     subps       xm%3, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im
  66
  67     movhlps     xm2, xm%3, xm3          ; t[2].re, t[2].im, t[5].re, t[5].im
  68     movlhps     xm3, xm%3               ; t[0].re, t[0].im, t[4].re, t[4].im
  69
  70     xorps       xm2,  xm7
  71     addps       xm%3, xm2, xm3
  72     subps       xm3,  xm2
  73
  74     shufps      xm3,  xm3, q1032
  75     vinsertf128 m%3,  m%3, xm3, 1       ; All ACs (tmp[1] through to tmp[4])
  76     addps       m%3,  m%3,  m0          ; Finally offset with DCs
  77 %endmacro
  78
  79 %macro BUTTERFLIES_DC 1 ; %1 - exptab_offset
  80     mulps xm0,  xm9, [exptabq + %1 + 16*0]
  81     mulps xm1, xm10, [exptabq + %1 + 16*1]
  82
  83     haddps  xm0,  xm1
  84     movhlps xm1,  xm0                   ; t[0].re, t[1].re, t[0].im, t[1].im
  85
  86     addps   xm0,  xm1
  87     addps   xm0,  xm8
  88
  89     movsd [outq], xm0
  90 %endmacro
  91
  92 %macro BUTTERFLIES_AC 1 ; %1 - exptab_offset
  93     mulps  m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
  94     mulps  m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
  95     mulps  m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
  96     mulps  m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
  97
  98     addps  m0, m0, m2
  99     addps  m1, m1, m3
 100     addps  m0, m0, m11
 101
 102     shufps m1, m1, m1, q2301
 103     addps  m0, m0, m1
 104
 105     vextractf128 xm1, m0, 1
 106
 107     movlps [outq + strideq*1], xm0
 108     movhps [outq + strideq*2], xm0
 109     movlps [outq +  stride3q], xm1
 110     movhps [outq + strideq*4], xm1
 111 %endmacro
 112
 113 INIT_YMM avx
 114 cglobal fft15, 4, 5, 14, out, in, exptab, stride, stride5
 115     shl strideq, 3
 116
 117     movaps xm5, [exptabq + 480 + 16*0]
 118     movaps xm6, [exptabq + 480 + 16*1]
 119     movaps xm7, [sign_adjust_5]
 120
 121     FFT5  0,  xm8, 11
 122     FFT5  8,  xm9, 12
 123     FFT5 16, xm10, 13
 124
 125 %define stride3q inq
 126     lea stride3q, [strideq + strideq*2]
 127     lea stride5q, [strideq + strideq*4]
 128
 129     BUTTERFLIES_DC (8*6 + 4*0)*2*4
 130     BUTTERFLIES_AC (8*0 + 0*0)*2*4
 131
 132     add outq, stride5q
 133     BUTTERFLIES_DC (8*6 + 4*1)*2*4
 134     BUTTERFLIES_AC (8*2 + 0*0)*2*4
 135
 136     add outq, stride5q
 137     BUTTERFLIES_DC (8*6 + 4*2)*2*4
 138     BUTTERFLIES_AC (8*4 + 0*0)*2*4
 139
 140     RET
 141
 142 %endif ; ARCH_X86_64
 143
 144 ;*******************************************************************************************************
 145 ;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
 146 ;*******************************************************************************************************
 147 %macro LUT_LOAD_4D 3
 148     mov      r4d, [lutq + %3q*4 +  0]
 149     movsd  xmm%1, [inq +  r4q*8]
 150     mov      r4d, [lutq + %3q*4 +  4]
 151     movhps xmm%1, [inq +  r4q*8]
 152 %if cpuflag(avx2)
 153     mov      r4d, [lutq + %3q*4 +  8]
 154     movsd     %2, [inq +  r4q*8]
 155     mov      r4d, [lutq + %3q*4 + 12]
 156     movhps    %2, [inq +  r4q*8]
 157     vinsertf128 %1, %1, %2, 1
 158 %endif
 159 %endmacro
 160
 161 %macro POSTROTATE_FN 1
 162 cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
 163
 164     xor offset_nq, offset_nq
 165     lea offset_pq, [len8q*2 - %1]
 166
 167     movaps m7,  [sign_adjust_r]
 168
 169 %if cpuflag(avx2)
 170     movaps   m8, [perm_pos]
 171     movaps   m9, [perm_neg]
 172 %endif
 173
 174 .loop:
 175     movups m0, [expq + offset_pq*8]     ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
 176     movups m1, [expq + offset_nq*8]     ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
 177
 178     LUT_LOAD_4D m3, xm4, offset_p       ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
 179     LUT_LOAD_4D m4, xm5, offset_n       ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
 180
 181     mulps  m5, m3, m0                   ; in[p].reim * exp[p].reim
 182     mulps  m6, m4, m1                   ; in[n].reim * exp[n].reim
 183
 184     xorps  m5, m7                       ; in[p].re *= -1, in[p].im *= 1
 185     xorps  m6, m7                       ; in[n].re *= -1, in[n].im *= 1
 186
 187     shufps m3, m3, m3, q2301            ; in[p].imre
 188     shufps m4, m4, m4, q2301            ; in[n].imre
 189
 190     mulps  m3, m0                       ; in[p].imre * exp[p].reim
 191     mulps  m4, m1                       ; in[n].imre * exp[n].reim
 192
 193     haddps m3, m6                       ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
 194     haddps m5, m4                       ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
 195
 196 %if cpuflag(avx2)
 197     vpermps m3, m9, m3                  ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
 198     vpermps m5, m8, m5                  ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
 199 %else
 200     shufps m3, m3, m3, q0312
 201     shufps m5, m5, m5, q2130
 202 %endif
 203
 204     movups [outq + offset_nq*8], m3
 205     movups [outq + offset_pq*8], m5
 206
 207     sub offset_pq, %1
 208     add offset_nq, %1
 209     cmp offset_nq, offset_pq
 210     jle .loop
 211
 212     REP_RET
 213 %endmacro
 214
 215 INIT_XMM sse3
 216 POSTROTATE_FN 2
 217
 218 %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
 219 INIT_YMM avx2
 220 POSTROTATE_FN 4
 221 %endif