git.sesse.net Git - ffmpeg/blob - libavcodec/x86/fmtconvert.asm

   1 ;******************************************************************************
   2 ;* x86 optimized Format Conversion Utils
   3 ;* Copyright (c) 2008 Loren Merritt
   4 ;*
   5 ;* This file is part of FFmpeg.
   6 ;*
   7 ;* FFmpeg is free software; you can redistribute it and/or
   8 ;* modify it under the terms of the GNU Lesser General Public
   9 ;* License as published by the Free Software Foundation; either
  10 ;* version 2.1 of the License, or (at your option) any later version.
  11 ;*
  12 ;* FFmpeg is distributed in the hope that it will be useful,
  13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 ;* Lesser General Public License for more details.
  16 ;*
  17 ;* You should have received a copy of the GNU Lesser General Public
  18 ;* License along with FFmpeg; if not, write to the Free Software
  19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20 ;******************************************************************************
  21
  22 %include "libavutil/x86/x86util.asm"
  23
  24 SECTION .text
  25
  26 ;------------------------------------------------------------------------------
  27 ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
  28 ;                                    int len);
     ;
     ; Converts 32-bit integers from src to float, multiplies each one by the scalar
     ; mul and stores the products to dst.
     ;
     ; The loop body runs before its exit test and consumes 32 bytes (8 elements) per
     ; pass, so at least 8 elements are always processed and len is effectively
     ; rounded up to a multiple of 8.
     ; Assumes callers pass a positive multiple of 8 -- TODO confirm against callers.
     ;
     ; In the SSE2 path src is read as a 128-bit memory operand of cvtdq2ps, which in
     ; its legacy-SSE encoding requires 16-byte alignment.
     ;
     ; NOTE(review): cglobal, SWAP, SPLATD, mova, RET and the <arg>d/<arg>q/<arg>m
     ; operand names come from the included x86util.asm (x86inc) and are not visible
     ; in this file; the comments below describe them as they are conventionally
     ; defined there (e.g. mova = aligned move, so dst must be 16-byte aligned) --
     ; confirm against that file.
  29 ;------------------------------------------------------------------------------
  30 %macro INT32_TO_FLOAT_FMUL_SCALAR 1
     ; %1 is forwarded as the fourth cglobal argument (by x86inc convention the
     ; number of XMM registers used). The instantiations below pass 5 for SSE, whose
     ; path touches m0-m4, and 3 for SSE2, whose path touches m0-m2.
     ;
     ; On UNIX64 only dst, src and len are declared: m0 is used below without being
     ; loaded, i.e. the float argument mul is expected to already be in xmm0 (the
     ; System V ABI passes FP arguments in xmm0-xmm7, separately from integer ones).
  31 %if UNIX64
  32 cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
  33 %else
  34 cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
  35 %endif
     ; Make m0 hold mul on the remaining ABIs:
     ;  - WIN64 argument slots are positional, so the third argument arrives in xmm2;
     ;    SWAP presumably just exchanges the m0/m2 register names.
     ;  - x86-32 loads it from its argument slot (mulm).
  36 %if WIN64
  37     SWAP 0, 2
  38 %elif ARCH_X86_32
  39     movss   m0, mulm
  40 %endif
     ; Presumably replicates the low dword (mul) into every lane of m0; the packed
     ; multiplies in the loop rely on all four lanes holding mul.
  41     SPLATD  m0
  42     shl     lend, 2             ; len *= 4 (bytes); the 32-bit write also clears the upper half on x86-64
  43     add     srcq, lenq          ; src -> one past the last input element
  44     add     dstq, lenq          ; dst -> one past the last output element
  45     neg     lenq                ; lenq = negative byte offset from the end, counts up towards 0
  46 .loop:
  47 %if cpuflag(sse2)
  48     cvtdq2ps  m1, [srcq+lenq   ]    ; elements 0-3 -> float
  49     cvtdq2ps  m2, [srcq+lenq+16]    ; elements 4-7 -> float
  50 %else
     ; Without SSE2 only two integers can be converted at a time (cvtpi2ps writes the
     ; low half of the destination); the halves are then merged with movlhps.
  51     cvtpi2ps  m1, [srcq+lenq   ]    ; elements 0-1
  52     cvtpi2ps  m3, [srcq+lenq+ 8]    ; elements 2-3
  53     cvtpi2ps  m2, [srcq+lenq+16]    ; elements 4-5
  54     cvtpi2ps  m4, [srcq+lenq+24]    ; elements 6-7
  55     movlhps   m1, m3                ; m1 = elements 0-3
  56     movlhps   m2, m4                ; m2 = elements 4-7
  57 %endif
  58     mulps     m1, m0
  59     mulps     m2, m0
  60     mova  [dstq+lenq   ], m1
  61     mova  [dstq+lenq+16], m2
  62     add     lenq, 32            ; advance by 8 elements; sets the flags tested by jl
  63     jl .loop                    ; keep going while the offset is still negative
  64 %if notcpuflag(sse2)
  65     ;; cvtpi2ps is documented to switch to MMX state even if the source is a
  66     ;; memory location. This is possibly an error in the documentation, since
  67     ;; every tested CPU disagrees with it. Use emms anyway, since the vast
  68     ;; majority of machines will use the SSE2 variant.
  69     emms
  70 %endif
  71     RET
  72 %endmacro
  73
     ; Emit one function per instruction set; the cpuflag()/notcpuflag() tests inside
     ; the macro select the conversion path for each instantiation.
  74 INIT_XMM sse
  75 INT32_TO_FLOAT_FMUL_SCALAR 5
  76 INIT_XMM sse2
  77 INT32_TO_FLOAT_FMUL_SCALAR 3
  78
  79 ;------------------------------------------------------------------------------
  80 ; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
  81 ;                                    const float *mul, int len);
     ;
     ; Converts 32-bit integers from src to float and stores them to dst, scaling
     ; each group of 8 consecutive elements by its own multiplier: group i uses
     ; mul[i]. The context pointer c is declared as an argument but never read.
     ;
     ; The loop body runs before its exit test and consumes 32 bytes (8 elements) and
     ; one multiplier per pass, so at least one group is always processed.
     ; Assumes callers pass a positive multiple of 8 for len and at least len/8
     ; multipliers -- TODO confirm against callers.
     ;
     ; In the SSE2 path src is read as a 128-bit memory operand of cvtdq2ps, which in
     ; its legacy-SSE encoding requires 16-byte alignment.
     ;
     ; NOTE(review): cglobal, SPLATD, mova, RET and the <arg>d/<arg>q operand names
     ; come from the included x86util.asm (x86inc) and are not visible in this file;
     ; the comments below describe them as they are conventionally defined there
     ; (e.g. mova = aligned move, so dst must be 16-byte aligned) -- confirm against
     ; that file.
  82 ;------------------------------------------------------------------------------
  83 %macro INT32_TO_FLOAT_FMUL_ARRAY8 0
     ; Five arguments, five general-purpose registers and (by x86inc convention) five
     ; XMM registers are declared for both variants; the SSE2 path only touches
     ; m0-m2, the SSE path m0-m4.
  84 cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
  85     shl     lend, 2             ; len *= 4 (bytes); the 32-bit write also clears the upper half on x86-64
  86     add     srcq, lenq          ; src -> one past the last input element
  87     add     dstq, lenq          ; dst -> one past the last output element
  88     neg     lenq                ; lenq = negative byte offset from the end, counts up towards 0
  89 .loop:
  90     movss     m0, [mulq]            ; multiplier for this group of 8 elements
     ; Presumably replicates the low dword (the multiplier) into every lane of m0;
     ; the packed multiplies below rely on all four lanes holding it.
  91     SPLATD    m0
  92 %if cpuflag(sse2)
  93     cvtdq2ps  m1, [srcq+lenq   ]    ; elements 0-3 -> float
  94     cvtdq2ps  m2, [srcq+lenq+16]    ; elements 4-7 -> float
  95 %else
     ; Without SSE2 only two integers can be converted at a time (cvtpi2ps writes the
     ; low half of the destination); the halves are then merged with movlhps.
  96     cvtpi2ps  m1, [srcq+lenq   ]    ; elements 0-1
  97     cvtpi2ps  m3, [srcq+lenq+ 8]    ; elements 2-3
  98     cvtpi2ps  m2, [srcq+lenq+16]    ; elements 4-5
  99     cvtpi2ps  m4, [srcq+lenq+24]    ; elements 6-7
 100     movlhps   m1, m3                ; m1 = elements 0-3
 101     movlhps   m2, m4                ; m2 = elements 4-7
 102 %endif
 103     mulps     m1, m0
 104     mulps     m2, m0
 105     mova  [dstq+lenq   ], m1
 106     mova  [dstq+lenq+16], m2
 107     add     mulq, 4             ; next multiplier (one float per group)
 108     add     lenq, 32            ; advance by 8 elements; sets the flags tested by jl
 109     jl .loop                    ; keep going while the offset is still negative
 110 %if notcpuflag(sse2)
 111     ;; cvtpi2ps is documented to switch to MMX state even if the source is a
 112     ;; memory location. This is possibly an error in the documentation, since
 113     ;; every tested CPU disagrees with it. Use emms anyway, since the vast
 114     ;; majority of machines will use the SSE2 variant.
 115     emms
 116 %endif
 117     RET
 118 %endmacro
 119
     ; Emit one function per instruction set; the cpuflag()/notcpuflag() tests inside
     ; the macro select the conversion path for each instantiation.
 120 INIT_XMM sse
 121 INT32_TO_FLOAT_FMUL_ARRAY8
 122 INIT_XMM sse2
 123 INT32_TO_FLOAT_FMUL_ARRAY8
 124