git.sesse.net Git - ffmpeg/blob - libavcodec/x86/sbrdsp.asm

   1 ;******************************************************************************
   2 ;* AAC Spectral Band Replication decoding functions
   3 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
   4 ;*
   5 ;* This file is part of Libav.
   6 ;*
   7 ;* Libav is free software; you can redistribute it and/or
   8 ;* modify it under the terms of the GNU Lesser General Public
   9 ;* License as published by the Free Software Foundation; either
  10 ;* version 2.1 of the License, or (at your option) any later version.
  11 ;*
  12 ;* Libav is distributed in the hope that it will be useful,
  13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 ;* Lesser General Public License for more details.
  16 ;*
  17 ;* You should have received a copy of the GNU Lesser General Public
  18 ;* License along with Libav; if not, write to the Free Software
  19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20 ;******************************************************************************
  21
  22 %include "libavutil/x86/x86util.asm"
  23
  24 SECTION_RODATA
  25 ; mask equivalent for multiply by -1.0 1.0
  26 ps_mask         times 2 dd 1<<31, 0
  27
  28 SECTION_TEXT
  29
  30 INIT_XMM sse
  31 cglobal sbr_sum_square, 2, 3, 6
  32     mov         r2, r1
  33     xorps       m0, m0
  34     xorps       m1, m1
  35     sar         r2, 3
  36     jz          .prepare
  37 .loop:
  38     movu        m2, [r0 +  0]
  39     movu        m3, [r0 + 16]
  40     movu        m4, [r0 + 32]
  41     movu        m5, [r0 + 48]
  42     mulps       m2, m2
  43     mulps       m3, m3
  44     mulps       m4, m4
  45     mulps       m5, m5
  46     addps       m0, m2
  47     addps       m1, m3
  48     addps       m0, m4
  49     addps       m1, m5
  50     add         r0, 64
  51     dec         r2
  52     jnz         .loop
  53 .prepare:
  54     and         r1, 7
  55     sar         r1, 1
  56     jz          .end
  57 ; len is a multiple of 2, thus there are at least 4 elements to process
  58 .endloop:
  59     movu        m2, [r0]
  60     add         r0, 16
  61     mulps       m2, m2
  62     dec         r1
  63     addps       m0, m2
  64     jnz         .endloop
  65 .end:
  66     addps       m0, m1
  67     movhlps     m2, m0
  68     addps       m0, m2
  69     movss       m1, m0
  70     shufps      m0, m0, 1
  71     addss       m0, m1
  72 %if ARCH_X86_64 == 0
  73     movss       r0m,  m0
  74     fld         dword r0m
  75 %endif
  76     RET
  77
  78 %define STEP  40*4*2
  79 cglobal sbr_hf_g_filt, 5, 6, 5
  80     lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
  81     mov         r5, r3
  82     and         r3, 0xFC
  83     lea         r2, [r2 + r3*4]
  84     lea         r0, [r0 + r3*8]
  85     neg         r3
  86     jz          .loop1
  87 .loop4:
  88     movlps      m0, [r2 + 4*r3 + 0]
  89     movlps      m1, [r2 + 4*r3 + 8]
  90     movlps      m2, [r1 + 0*STEP]
  91     movlps      m3, [r1 + 2*STEP]
  92     movhps      m2, [r1 + 1*STEP]
  93     movhps      m3, [r1 + 3*STEP]
  94     unpcklps    m0, m0
  95     unpcklps    m1, m1
  96     mulps       m0, m2
  97     mulps       m1, m3
  98     movu        [r0 + 8*r3 +  0], m0
  99     movu        [r0 + 8*r3 + 16], m1
 100     add         r1, 4*STEP
 101     add         r3, 4
 102     jnz         .loop4
 103     and         r5, 3 ; number of single element loops
 104     jz          .end
 105 .loop1: ; element 0 and 1 can be computed at the same time
 106     movss       m0, [r2]
 107     movlps      m2, [r1]
 108     unpcklps    m0, m0
 109     mulps       m2, m0
 110     movlps    [r0], m2
 111     add         r0, 8
 112     add         r2, 4
 113     add         r1, STEP
 114     dec         r5
 115     jnz         .loop1
 116 .end:
 117     RET
 118
 119 ; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
 120 ;                          const float alpha0[2], const float alpha1[2],
 121 ;                          float bw, int start, int end)
 122 ;
 123 cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
 124     ; load alpha factors
 125 %define bw m0
 126 %if ARCH_X86_64 == 0 || WIN64
 127     movss      bw, BWm
 128 %endif
 129     movlps     m2, [alpha1q]
 130     movlps     m1, [alpha0q]
 131     shufps     bw, bw, 0
 132     mulps      m2, bw             ; (a1[0] a1[1])*bw
 133     mulps      m1, bw             ; (a0[0] a0[1])*bw    = (a2 a3)
 134     mulps      m2, bw             ; (a1[0] a1[1])*bw*bw = (a0 a1)
 135     mova       m3, m1
 136     mova       m4, m2
 137     mova       m7, [ps_mask]
 138
 139     ; Set pointers
 140 %if ARCH_X86_64 == 0 || WIN64
 141     ; start and end 6th and 7th args on stack
 142     mov        r2d, Sm
 143     mov        r3d, Em
 144 %define  start r2q
 145 %define  end   r3q
 146 %else
 147 ; BW does not actually occupy a register, so shift by 1
 148 %define  start BWq
 149 %define  end   Sq
 150 %endif
 151     sub      start, end          ; neg num of loops
 152     lea    X_highq, [X_highq + end*2*4]
 153     lea     X_lowq, [X_lowq  + end*2*4 - 2*2*4]
 154     shl      start, 3            ; offset from num loops
 155
 156     mova        m0, [X_lowq + start]
 157     movlhps     m1, m1           ; (a2 a3 a2 a3)
 158     movlhps     m2, m2           ; (a0 a1 a0 a1)
 159     shufps      m3, m3, q0101    ; (a3 a2 a3 a2)
 160     shufps      m4, m4, q0101    ; (a1 a0 a1 a0)
 161     xorps       m3, m7           ; (-a3 a2 -a3 a2)
 162     xorps       m4, m7           ; (-a1 a0 -a1 a0)
 163 .loop2:
 164     mova        m5, m0
 165     mova        m6, m0
 166     shufps      m0, m0, q2200    ; {Xl[-2][0],",Xl[-1][0],"}
 167     shufps      m5, m5, q3311    ; {Xl[-2][1],",Xl[-1][1],"}
 168     mulps       m0, m2
 169     mulps       m5, m4
 170     mova        m7, m6
 171     addps       m5, m0
 172     mova        m0, [X_lowq + start + 2*2*4]
 173     shufps      m6, m0, q0022    ; {Xl[-1][0],",Xl[0][0],"}
 174     shufps      m7, m0, q1133    ; {Xl[-1][1],",Xl[1][1],"}
 175     mulps       m6, m1
 176     mulps       m7, m3
 177     addps       m5, m6
 178     addps       m7, m0
 179     addps       m5, m7
 180     mova  [X_highq + start], m5
 181     add     start, 16
 182     jnz         .loop2
 183     RET