git.sesse.net Git - ffmpeg/blob - libavcodec/x86/sbrdsp.asm

   1 ;******************************************************************************
   2 ;* AAC Spectral Band Replication decoding functions
   3 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
   4 ;*
   5 ;* This file is part of Libav.
   6 ;*
   7 ;* Libav is free software; you can redistribute it and/or
   8 ;* modify it under the terms of the GNU Lesser General Public
   9 ;* License as published by the Free Software Foundation; either
  10 ;* version 2.1 of the License, or (at your option) any later version.
  11 ;*
  12 ;* Libav is distributed in the hope that it will be useful,
  13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 ;* Lesser General Public License for more details.
  16 ;*
  17 ;* You should have received a copy of the GNU Lesser General Public
  18 ;* License along with Libav; if not, write to the Free Software
  19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20 ;******************************************************************************
  21
  22 %include "x86inc.asm"
  23 %include "x86util.asm"
  24
  25 ;SECTION_RODATA
  26 SECTION .text
  27
  28 INIT_XMM sse
  29 cglobal sbr_sum_square, 2, 3, 6
  30     mov         r2, r1
  31     xorps       m0, m0
  32     xorps       m1, m1
  33     sar         r2, 3
  34     jz          .prepare
  35 .loop:
  36     movu        m2, [r0 +  0]
  37     movu        m3, [r0 + 16]
  38     movu        m4, [r0 + 32]
  39     movu        m5, [r0 + 48]
  40     mulps       m2, m2
  41     mulps       m3, m3
  42     mulps       m4, m4
  43     mulps       m5, m5
  44     addps       m0, m2
  45     addps       m1, m3
  46     addps       m0, m4
  47     addps       m1, m5
  48     add         r0, 64
  49     dec         r2
  50     jnz         .loop
  51 .prepare:
  52     and         r1, 7
  53     sar         r1, 1
  54     jz          .end
  55 ; len is a multiple of 2, thus there are at least 4 elements to process
  56 .endloop:
  57     movu        m2, [r0]
  58     add         r0, 16
  59     mulps       m2, m2
  60     dec         r1
  61     addps       m0, m2
  62     jnz         .endloop
  63 .end:
  64     addps       m0, m1
  65     movhlps     m2, m0
  66     addps       m0, m2
  67     movss       m1, m0
  68     shufps      m0, m0, 1
  69     addss       m0, m1
  70 %if ARCH_X86_64 == 0
  71     movss       r0m,  m0
  72     fld         dword r0m
  73 %endif
  74     RET
  75
  76 %define STEP  40*4*2
  77 cglobal sbr_hf_g_filt, 5, 6, 5
  78     lea         r1, [r1 + 8*r4] ; offset by ixh elements into X_high
  79     mov         r5, r3
  80     and         r3, 0xFC
  81     lea         r2, [r2 + r3*4]
  82     lea         r0, [r0 + r3*8]
  83     neg         r3
  84     jz          .loop1
  85 .loop4:
  86     movlps      m0, [r2 + 4*r3 + 0]
  87     movlps      m1, [r2 + 4*r3 + 8]
  88     movlps      m2, [r1 + 0*STEP]
  89     movlps      m3, [r1 + 2*STEP]
  90     movhps      m2, [r1 + 1*STEP]
  91     movhps      m3, [r1 + 3*STEP]
  92     unpcklps    m0, m0
  93     unpcklps    m1, m1
  94     mulps       m0, m2
  95     mulps       m1, m3
  96     movu        [r0 + 8*r3 +  0], m0
  97     movu        [r0 + 8*r3 + 16], m1
  98     add         r1, 4*STEP
  99     add         r3, 4
 100     jnz         .loop4
 101     and         r5, 3 ; number of single element loops
 102     jz          .end
 103 .loop1: ; element 0 and 1 can be computed at the same time
 104     movss       m0, [r2]
 105     movlps      m2, [r1]
 106     unpcklps    m0, m0
 107     mulps       m2, m0
 108     movlps    [r0], m2
 109     add         r0, 8
 110     add         r2, 4
 111     add         r1, STEP
 112     dec         r5
 113     jnz         .loop1
 114 .end:
 115     RET