git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mdct15.asm

   1 ;******************************************************************************
   2 ;* SIMD optimized non-power-of-two MDCT functions
   3 ;*
   4 ;* Copyright (C) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
   5 ;*
   6 ;* This file is part of FFmpeg.
   7 ;*
   8 ;* FFmpeg is free software; you can redistribute it and/or
   9 ;* modify it under the terms of the GNU Lesser General Public
  10 ;* License as published by the Free Software Foundation; either
  11 ;* version 2.1 of the License, or (at your option) any later version.
  12 ;*
  13 ;* FFmpeg is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 ;* Lesser General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU Lesser General Public
  19 ;* License along with FFmpeg; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21 ;******************************************************************************
  22
  23 %include "libavutil/x86/x86util.asm"
  24
  25 SECTION_RODATA 32                   ; the argument requests 32-byte alignment; perm_pos/perm_neg/sign_adjust_r are read with full-width movaps
  26     ; Dword index vectors consumed by vpermps in the AVX2 build of mdct15_postreindex
  27 perm_neg: dd 2, 5, 3, 4, 6, 1, 7, 0
  28 perm_pos: dd 0, 7, 1, 6, 4, 3, 5, 2
  29 sign_adjust_r: times 4 dd 0x80000000, 0x00000000 ; xorps mask: sign bit set for every even-indexed float
  30     ; xorps mask that fft15 loads into xm7 for FFT5: sign bit set for floats 1 and 2 only
  31 sign_adjust_5: dd 0x00000000, 0x80000000, 0x80000000, 0x00000000
  32
  33 SECTION .text
  34
  35 %if ARCH_X86_64
  36
  37 ;*****************************************************************************************
  38 ;void ff_fft15_avx(FFTComplex *out, FFTComplex *in, FFTComplex *exptab, ptrdiff_t stride);
  39 ;*****************************************************************************************
  40 %macro FFT5 3 ; %1 - in_offset, %2 - dst1 (64bit used), %3 - dst2
     ; 5-point transform over the five 8-byte complex values found at byte
     ; offsets %1 + {0, 24, 48, 72, 96} from inq. The in[] indices quoted in the
     ; comments below are the ones that apply when %1 is 0.
     ; %2 is a full register name; it ends up holding the DC term, duplicated into
     ; both 64-bit halves. %3 is a bare register number (used as xm%3 / m%3); m%3
     ; ends up holding the four AC terms.
     ; Implicit inputs: xm5 and xm6 (multiplier constants) and xm7 (sign mask) are
     ; read but never written here, so the caller must load them first.
     ; Clobbers m0-m4.
  41     VBROADCASTSD m0, [inq + %1]         ; in[ 0].re, in[ 0].im, in[ 0].re, in[ 0].im
  42     movsd   xm1, [inq + 1*16 +  8 + %1] ; in[ 3].re, in[ 3].im,         0,         0
  43     movsd   xm4, [inq + 6*16 +  0 + %1] ; in[12].re, in[12].im,         0,         0
  44     movhps  xm1, [inq + 3*16 +  0 + %1] ; in[ 3].re, in[ 3].im, in[ 6].re, in[ 6].im
  45     movhps  xm4, [inq + 4*16 +  8 + %1] ; in[12].re, in[12].im, in[ 9].re, in[ 9].im
  46     ; Differences and sums of the two input pairs loaded above
  47     subps       xm2,  xm1, xm4          ; t[2].im, t[2].re, t[3].im, t[3].re
  48     addps       xm1,  xm4               ; t[0].re, t[0].im, t[1].re, t[1].im
  49     ; DC output: both pair sums plus the broadcast first input
  50     movhlps     %2,   xm1               ; t[0].re, t[1].re, t[0].im, t[1].im
  51     addps       %2,   xm1
  52     addps       %2,   xm0               ; DC[0].re, DC[0].im, junk...
  53     movlhps     %2,   %2                ; DC[0].re, DC[0].im, DC[0].re, DC[0].im
  54     ; Regroup the sums and differences for the multiplies below
  55     shufps      xm3,  xm1, xm2, q0110   ; t[0].re, t[0].im, t[2].re, t[2].im
  56     shufps      xm1,  xm2, q2332        ; t[1].re, t[1].im, t[3].re, t[3].im
  57     ; Scale by the caller-provided constants in xm5 / xm6
  58     mulps       xm%3, xm1, xm5
  59     mulps       xm4,  xm3, xm6
  60     mulps       xm1,  xm6
  61
  62     xorps       xm1,  xm7               ; flip the signs selected by the mask in xm7
  63     mulps       xm3,  xm5
  64     addsubps    xm3,  xm1               ; t[0].re, t[0].im, t[2].re, t[2].im
  65     subps       xm%3, xm4               ; t[4].re, t[4].im, t[5].re, t[5].im
  66
  67     movhlps     xm2, xm%3, xm3          ; t[2].re, t[2].im, t[5].re, t[5].im
  68     movlhps     xm3, xm%3               ; t[0].re, t[0].im, t[4].re, t[4].im
  69
  70     xorps       xm2,  xm7               ; same sign mask as above
  71     addps       xm%3, xm2, xm3
  72     subps       xm3,  xm2
  73     ; Swap the 64-bit halves of xm3, then place it in the upper 128 bits of m%3
  74     shufps      xm3,  xm3, q1032
  75     vinsertf128 m%3,  m%3, xm3, 1       ; All ACs (tmp[1] through to tmp[4])
  76     addps       m%3,  m%3,  m0          ; Finally offset with DCs
  77 %endmacro
  78
  79 %macro BUTTERFLIES_DC 2 ; %1 - exptab_offset, %2 - out
     ; Multiplies xm9 and xm10 by the two 16-byte exptab entries at byte offset
     ; %1, reduces the products horizontally, adds xm8 and stores one 8-byte
     ; result at [%2q]. %2 is a register name without its size suffix.
     ; Implicit inputs: xm8, xm9, xm10. Clobbers xm0 and xm1.
  80     mulps xm0,  xm9, [exptabq + %1 + 16*0]
  81     mulps xm1, xm10, [exptabq + %1 + 16*1]
  82     ; Horizontal reduction of the two product vectors
  83     haddps  xm0,  xm1
  84     movhlps xm1,  xm0                   ; t[0].re, t[1].re, t[0].im, t[1].im
  85
  86     addps   xm0,  xm1
  87     addps   xm0,  xm8
  88     ; Only the low 64 bits of xm0 are written out
  89     movsd [%2q], xm0
  90 %endmacro
  91
  92 %macro BUTTERFLIES_AC 2 ; %1 - exptab_offset, %2 - out (uses m0-m3)
     ; Multiplies m12 and m13 by four mmsize-wide exptab entries starting at byte
     ; offset %1, combines the products with m11 and stores four 8-byte results
     ; at [%2q + stride*{1, 2, 3, 4}]. %2 is a register name without its size
     ; suffix.
     ; Implicit inputs: m11, m12, m13, strideq (in bytes) and stride3q.
     ; Clobbers m0-m3.
  93     mulps  m0, m12, [exptabq + 64*0 + 0*mmsize + %1]
  94     mulps  m1, m12, [exptabq + 64*0 + 1*mmsize + %1]
  95     mulps  m2, m13, [exptabq + 64*1 + 0*mmsize + %1]
  96     mulps  m3, m13, [exptabq + 64*1 + 1*mmsize + %1]
  97     ; Accumulate the matching products and add the m11 term
  98     addps  m0, m0, m2
  99     addps  m1, m1, m3
 100     addps  m0, m0, m11
 101     ; Swap the two floats of every 64-bit pair in m1 before the final add
 102     shufps m1, m1, m1, q2301
 103     addps  m0, m0, m1
 104     ; Split the result into two 128-bit halves for the four 64-bit stores
 105     vextractf128 xm1, m0, 1
 106
 107     movlps [%2q + strideq*1], xm0
 108     movhps [%2q + strideq*2], xm0
 109     movlps [%2q +  stride3q], xm1
 110     movhps [%2q + strideq*4], xm1
 111 %endmacro
 112
 113 INIT_YMM avx
     ; fft15: three FFT5 passes over the input, then three DC and three AC
     ; butterfly passes that together store 15 complex results, one every
     ; `stride` bytes (after the scaling below) starting at out.
     ; cglobal declares 4 arguments, 6 general-purpose registers and 14 vector
     ; registers; stride3 and stride5 are scratch registers, not arguments.
 114 cglobal fft15, 4, 6, 14, out, in, exptab, stride, stride3, stride5
 115 %define out0q inq                   ; out0 reuses the `in` register; `in` is not read again after the FFT5 passes
 116     shl strideq, 3                  ; stride *= 8, the size of one stored complex value
 117     ; Constants FFT5 expects: two 16-byte multiplier vectors and a sign mask
 118     movaps xm5, [exptabq + 480 + 16*0]
 119     movaps xm6, [exptabq + 480 + 16*1]
 120     movaps xm7, [sign_adjust_5]
 121     ; Each pass leaves its DC term in xm8..xm10 and its AC terms in m11..m13
 122     FFT5  0,  xm8, 11
 123     FFT5  8,  xm9, 12
 124     FFT5 16, xm10, 13
 125     ; stride3 = 3 * stride, stride5 = 5 * stride
 126     lea stride3q, [strideq + strideq*2]
 127     lea stride5q, [strideq + strideq*4]
 128
 129     mov out0q, outq                 ; remember the base; outq is re-pointed for each output group below
 130     ; DC results are stored at out0 + {0, 5, 10} * stride
 131     BUTTERFLIES_DC (8*6 + 4*0)*2*4, out0
 132     lea outq, [out0q + stride5q*1]
 133     BUTTERFLIES_DC (8*6 + 4*1)*2*4, out
 134     lea outq, [out0q + stride5q*2]
 135     BUTTERFLIES_DC (8*6 + 4*2)*2*4, out
 136     ; AC results are stored at the same three bases + {1, 2, 3, 4} * stride
 137     BUTTERFLIES_AC (8*0)*2*4, out0
 138     lea outq, [out0q + stride5q*1]
 139     BUTTERFLIES_AC (8*2)*2*4, out
 140     lea outq, [out0q + stride5q*2]
 141     BUTTERFLIES_AC (8*4)*2*4, out
 142
 143     RET
 144
 145 %endif ; ARCH_X86_64
 146
 147 ;*******************************************************************************************************
 148 ;void ff_mdct15_postreindex(FFTComplex *out, FFTComplex *in, FFTComplex *exp, int *lut, ptrdiff_t len8);
 149 ;*******************************************************************************************************
 150 %macro LUT_LOAD_4D 3
     ; Loads 8-byte values in[lut[%3 + i]] into %1 through the index table at
     ; lutq: i = 0..1 always, i = 0..3 when the avx2 cpuflag is set.
     ; %1 - destination register, %2 - xmm temporary for the upper 128 bits
     ; (AVX2 only), %3 - name of the register holding the lut position, without
     ; its size suffix.
     ; r4d is used as scratch for each looked-up index, so the previous
     ; contents of r4 are lost.
 151     mov      r4d, [lutq + %3q*4 +  0]
 152     movsd  xmm%1, [inq +  r4q*8]
 153     mov      r4d, [lutq + %3q*4 +  4]
 154     movhps xmm%1, [inq +  r4q*8]
 155 %if cpuflag(avx2)
 156     mov      r4d, [lutq + %3q*4 +  8]
 157     movsd     %2, [inq +  r4q*8]
 158     mov      r4d, [lutq + %3q*4 + 12]
 159     movhps    %2, [inq +  r4q*8]
 160     vinsertf128 %1, %1, %2, 1       ; merge the second pair into the upper 128 bits of %1
 161 %endif
 162 %endmacro
 163
 164 %macro POSTROTATE_FN 1
     ; Emits mdct15_postreindex for the instruction set selected by the
     ; preceding INIT_* line.
     ; %1 - number of 8-byte complex values per vector, which is also the loop step.
     ; offset_n counts up from 0 and offset_p counts down from len8*2 - %1. Each
     ; iteration processes %1 values at both positions, and the loop repeats
     ; while offset_n <= offset_p (signed compare).
     ; NOTE: LUT_LOAD_4D overwrites r4, the register that holds len8, so len8 is
     ; only valid before .loop is entered.
 165 cglobal mdct15_postreindex, 5, 7, 8 + cpuflag(avx2)*2, out, in, exp, lut, len8, offset_p, offset_n
 166
 167     xor offset_nq, offset_nq
 168     lea offset_pq, [len8q*2 - %1]
 169
 170     movaps m7,  [sign_adjust_r]
 171     ; Index vectors for the vpermps reordering at the end of the loop body
 172 %if cpuflag(avx2)
 173     movaps   m8, [perm_pos]
 174     movaps   m9, [perm_neg]
 175 %endif
 176
 177 .loop:
 178     movups m0, [expq + offset_pq*8]     ; exp[p0].re, exp[p0].im, exp[p1].re, exp[p1].im, exp[p2].re, exp[p2].im, exp[p3].re, exp[p3].im
 179     movups m1, [expq + offset_nq*8]     ; exp[n3].re, exp[n3].im, exp[n2].re, exp[n2].im, exp[n1].re, exp[n1].im, exp[n0].re, exp[n0].im
 180     ; Table-driven loads of the inputs (these clobber r4)
 181     LUT_LOAD_4D m3, xm4, offset_p       ; in[p0].re, in[p0].im, in[p1].re, in[p1].im, in[p2].re, in[p2].im, in[p3].re, in[p3].im
 182     LUT_LOAD_4D m4, xm5, offset_n       ; in[n3].re, in[n3].im, in[n2].re, in[n2].im, in[n1].re, in[n1].im, in[n0].re, in[n0].im
 183
 184     mulps  m5, m3, m0                   ; in[p].reim * exp[p].reim
 185     mulps  m6, m4, m1                   ; in[n].reim * exp[n].reim
 186
 187     xorps  m5, m7                       ; in[p].re *= -1, in[p].im *= 1
 188     xorps  m6, m7                       ; in[n].re *= -1, in[n].im *= 1
 189
 190     shufps m3, m3, m3, q2301            ; in[p].imre
 191     shufps m4, m4, m4, q2301            ; in[n].imre
 192
 193     mulps  m3, m0                       ; in[p].imre * exp[p].reim
 194     mulps  m4, m1                       ; in[n].imre * exp[n].reim
 195
 196     haddps m3, m6                       ; out[n0].im, out[n1].im, out[n3].re, out[n2].re, out[n2].im, out[n3].im, out[n1].re, out[n0].re
 197     haddps m5, m4                       ; out[p0].re, out[p1].re, out[p3].im, out[p2].im, out[p2].re, out[p3].re, out[p1].im, out[p0].im
 198     ; Reorder the haddps results: a full-width permute on AVX2, an in-register shuffle otherwise
 199 %if cpuflag(avx2)
 200     vpermps m3, m9, m3                  ; out[n3].im, out[n3].re, out[n2].im, out[n2].re, out[n1].im, out[n1].re, out[n0].im, out[n0].re
 201     vpermps m5, m8, m5                  ; out[p0].re, out[p0].im, out[p1].re, out[p1].im, out[p2].re, out[p2].im, out[p3].re, out[p3].im
 202 %else
 203     shufps m3, m3, m3, q0312
 204     shufps m5, m5, m5, q2130
 205 %endif
 206     ; Unaligned stores of both finished vectors
 207     movups [outq + offset_nq*8], m3
 208     movups [outq + offset_pq*8], m5
 209     ; Move both positions towards each other by one vector
 210     sub offset_pq, %1
 211     add offset_nq, %1
 212     cmp offset_nq, offset_pq
 213     jle .loop
 214
 215     REP_RET
 216 %endmacro
 217
 218 INIT_XMM sse3                       ; 16-byte vectors: 2 complex values per load
 219 POSTROTATE_FN 2
 220     ; The AVX2 build uses m8 and m9, which is presumably why it is limited to x86-64 (TODO confirm)
 221 %if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
 222 INIT_YMM avx2                       ; 32-byte vectors: 4 complex values per load
 223 POSTROTATE_FN 4
 224 %endif