git.sesse.net Git - ffmpeg/blob - libavcodec/x86/jpeg2000dsp.asm

   1 ;******************************************************************************
   2 ;* SIMD-optimized JPEG2000 DSP functions
   3 ;* Copyright (c) 2014 Nicolas Bertrand
   4 ;* Copyright (c) 2015 James Almer
   5 ;*
   6 ;* This file is part of FFmpeg.
   7 ;*
   8 ;* FFmpeg is free software; you can redistribute it and/or
   9 ;* modify it under the terms of the GNU Lesser General Public
  10 ;* License as published by the Free Software Foundation; either
  11 ;* version 2.1 of the License, or (at your option) any later version.
  12 ;*
  13 ;* FFmpeg is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 ;* Lesser General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU Lesser General Public
  19 ;* License along with FFmpeg; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21 ;******************************************************************************
  22
  23 %include "libavutil/x86/x86util.asm"
  24
  25 SECTION_RODATA 32
  26
  27 pf_ict0: times 8 dd 1.402
  28 pf_ict1: times 8 dd 0.34413
  29 pf_ict2: times 8 dd 0.71414
  30 pf_ict3: times 8 dd 1.772
  31
  32 SECTION .text
  33
  34 ;***********************************************************************
  35 ; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
  36 ;***********************************************************************
  37 %macro ICT_FLOAT 1
  38 cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
  39     shl  csized, 2
  40     add   src0q, csizeq
  41     add   src1q, csizeq
  42     add   src2q, csizeq
  43     neg  csizeq
  44     movaps   m6, [pf_ict0]
  45     movaps   m7, [pf_ict1]
  46     %define ICT0 m6
  47     %define ICT1 m7
  48
  49 %if ARCH_X86_64
  50     movaps   m8, [pf_ict2]
  51     %define ICT2 m8
  52 %if cpuflag(avx)
  53     movaps   m3, [pf_ict3]
  54     %define ICT3 m3
  55 %else
  56     movaps   m9, [pf_ict3]
  57     %define ICT3 m9
  58 %endif
  59
  60 %else ; ARCH_X86_32
  61     %define ICT2 [pf_ict2]
  62 %if cpuflag(avx)
  63     movaps   m3, [pf_ict3]
  64     %define ICT3 m3
  65 %else
  66     %define ICT3 [pf_ict3]
  67 %endif
  68
  69 %endif ; ARCH
  70
  71 align 16
  72 .loop:
  73     movaps   m0, [src0q+csizeq]
  74     movaps   m1, [src1q+csizeq]
  75     movaps   m2, [src2q+csizeq]
  76
  77 %if cpuflag(fma4) || cpuflag(fma3)
  78 %if cpuflag(fma4)
  79     fnmaddps  m5, m1, ICT1, m0
  80     fmaddps   m4, m2, ICT0, m0
  81 %else ; fma3
  82     movaps    m5, m1
  83     movaps    m4, m2
  84     fnmaddps  m5, m5, ICT1, m0
  85     fmaddps   m4, m4, ICT0, m0
  86 %endif
  87     fmaddps   m0, m1, ICT3, m0
  88     fnmaddps  m5, m2, ICT2, m5
  89 %else ; non FMA
  90 %if cpuflag(avx)
  91     mulps    m5, m1, ICT1
  92     mulps    m4, m2, ICT0
  93     mulps    m1, m1, ICT3
  94     mulps    m2, m2, ICT2
  95     subps    m5, m0, m5
  96 %else ; sse
  97     movaps   m3, m1
  98     movaps   m4, m2
  99     movaps   m5, m0
 100     mulps    m3, ICT1
 101     mulps    m4, ICT0
 102     mulps    m1, ICT3
 103     mulps    m2, ICT2
 104     subps    m5, m3
 105 %endif
 106     addps    m4, m4, m0
 107     addps    m0, m0, m1
 108     subps    m5, m5, m2
 109 %endif
 110
 111     movaps   [src0q+csizeq], m4
 112     movaps   [src2q+csizeq], m0
 113     movaps   [src1q+csizeq], m5
 114     add  csizeq, mmsize
 115     jl .loop
 116     REP_RET
 117 %endmacro
 118
 119 INIT_XMM sse
 120 ICT_FLOAT 10
 121 INIT_YMM avx
 122 ICT_FLOAT 9
 123 %if HAVE_FMA4_EXTERNAL
 124 INIT_XMM fma4
 125 ICT_FLOAT 9
 126 %endif
 127 INIT_YMM fma3
 128 ICT_FLOAT 9
 129
 130 ;***************************************************************************
 131 ; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
 132 ;***************************************************************************
 133 %macro RCT_INT 0
 134 cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
 135     shl  csized, 2
 136     add   src0q, csizeq
 137     add   src1q, csizeq
 138     add   src2q, csizeq
 139     neg  csizeq
 140
 141 align 16
 142 .loop:
 143     mova   m1, [src1q+csizeq]
 144     mova   m2, [src2q+csizeq]
 145     mova   m0, [src0q+csizeq]
 146     paddd  m3, m1, m2
 147     psrad  m3, 2
 148     psubd  m0, m3
 149     paddd  m1, m0
 150     paddd  m2, m0
 151     mova   [src1q+csizeq], m0
 152     mova   [src2q+csizeq], m1
 153     mova   [src0q+csizeq], m2
 154     add  csizeq, mmsize
 155     jl .loop
 156     REP_RET
 157 %endmacro
 158
 159 INIT_XMM sse2
 160 RCT_INT
 161 %if HAVE_AVX2_EXTERNAL
 162 INIT_YMM avx2
 163 RCT_INT
 164 %endif