git.sesse.net Git - ffmpeg/blob - libavcodec/x86/mpegaudiodec_mmx.c

   1 /*
   2  * MMX optimized MP3 decoding functions
   3  * Copyright (c) 2010 Vitor Sessak
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/cpu.h"
  23 #include "libavutil/x86_cpu.h"
  24 #include "libavcodec/dsputil.h"
  25 #include "libavcodec/mpegaudiodsp.h"
  26
/*
 * imdct36 variants defined outside this file. ff_mpadsp_init_mmx() below
 * installs at most one of them into MPADSPContext.imdct36_float, and only
 * when HAVE_YASM is set.
 * NOTE(review): presumably implemented in a separate yasm source file --
 * confirm against the build system.
 */
void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
  32
  33 #define MACS(rt, ra, rb) rt+=(ra)*(rb)
  34 #define MLSS(rt, ra, rb) rt-=(ra)*(rb)
  35
  36 #define SUM8(op, sum, w, p)               \
  37 {                                         \
  38     op(sum, (w)[0 * 64], (p)[0 * 64]);    \
  39     op(sum, (w)[1 * 64], (p)[1 * 64]);    \
  40     op(sum, (w)[2 * 64], (p)[2 * 64]);    \
  41     op(sum, (w)[3 * 64], (p)[3 * 64]);    \
  42     op(sum, (w)[4 * 64], (p)[4 * 64]);    \
  43     op(sum, (w)[5 * 64], (p)[5 * 64]);    \
  44     op(sum, (w)[6 * 64], (p)[6 * 64]);    \
  45     op(sum, (w)[7 * 64], (p)[7 * 64]);    \
  46 }
  47
  48 static void apply_window(const float *buf, const float *win1,
  49                          const float *win2, float *sum1, float *sum2, int len)
  50 {
  51     x86_reg count = - 4*len;
  52     const float *win1a = win1+len;
  53     const float *win2a = win2+len;
  54     const float *bufa  = buf+len;
  55     float *sum1a = sum1+len;
  56     float *sum2a = sum2+len;
  57
  58
  59 #define MULT(a, b)                                 \
  60     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
  61     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
  62     "mulps         %%xmm2, %%xmm1           \n\t"  \
  63     "subps         %%xmm1, %%xmm0           \n\t"  \
  64     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
  65     "subps         %%xmm2, %%xmm4           \n\t"  \
  66
  67     __asm__ volatile(
  68             "1:                                   \n\t"
  69             "xorps       %%xmm0, %%xmm0           \n\t"
  70             "xorps       %%xmm4, %%xmm4           \n\t"
  71
  72             MULT(   0,   0)
  73             MULT( 256,  64)
  74             MULT( 512, 128)
  75             MULT( 768, 192)
  76             MULT(1024, 256)
  77             MULT(1280, 320)
  78             MULT(1536, 384)
  79             MULT(1792, 448)
  80
  81             "movaps      %%xmm0, (%4,%0)          \n\t"
  82             "movaps      %%xmm4, (%5,%0)          \n\t"
  83             "add            $16,  %0              \n\t"
  84             "jl              1b                   \n\t"
  85             :"+&r"(count)
  86             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
  87             );
  88
  89 #undef MULT
  90 }
  91
  92 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
  93                              int incr)
  94 {
  95     LOCAL_ALIGNED_16(float, suma, [17]);
  96     LOCAL_ALIGNED_16(float, sumb, [17]);
  97     LOCAL_ALIGNED_16(float, sumc, [17]);
  98     LOCAL_ALIGNED_16(float, sumd, [17]);
  99
 100     float sum;
 101
 102     /* copy to avoid wrap */
 103     memcpy(in + 512, in, 32 * sizeof(*in));
 104
 105     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
 106     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
 107
 108     SUM8(MACS, suma[0], win + 32, in + 48);
 109
 110     sumc[ 0] = 0;
 111     sumb[16] = 0;
 112     sumd[16] = 0;
 113
 114 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
 115             "movups " #sumd "(%4),       %%xmm0          \n\t" \
 116             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 117             "subps  " #suma "(%1),       %%xmm0          \n\t" \
 118             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
 119 \
 120             "movups " #sumc "(%3),       %%xmm0          \n\t" \
 121             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
 122             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
 123             "movaps        %%xmm0," #out2 "(%0)          \n\t"
 124
 125     if (incr == 1) {
 126         __asm__ volatile(
 127             SUMS( 0, 48,  4, 52,  0, 112)
 128             SUMS(16, 32, 20, 36, 16,  96)
 129             SUMS(32, 16, 36, 20, 32,  80)
 130             SUMS(48,  0, 52,  4, 48,  64)
 131
 132             :"+&r"(out)
 133             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
 134             :"memory"
 135             );
 136         out += 16*incr;
 137     } else {
 138         int j;
 139         float *out2 = out + 32 * incr;
 140         out[0  ]  = -suma[   0];
 141         out += incr;
 142         out2 -= incr;
 143         for(j=1;j<16;j++) {
 144             *out  = -suma[   j] + sumd[16-j];
 145             *out2 =  sumb[16-j] + sumc[   j];
 146             out  += incr;
 147             out2 -= incr;
 148         }
 149     }
 150
 151     sum = 0;
 152     SUM8(MLSS, sum, win + 16 + 32, in + 32);
 153     *out = sum;
 154 }
 155
 156 void ff_mpadsp_init_mmx(MPADSPContext *s)
 157 {
 158     int mm_flags = av_get_cpu_flags();
 159
 160     if (mm_flags & AV_CPU_FLAG_SSE2) {
 161         s->apply_window_float = apply_window_mp3;
 162     }
 163     if (HAVE_YASM && mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
 164         s->imdct36_float = ff_imdct36_float_avx;
 165     }
 166     else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) {
 167         s->imdct36_float = ff_imdct36_float_ssse3;
 168     }
 169     else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE3 && HAVE_SSE) {
 170         s->imdct36_float = ff_imdct36_float_sse3;
 171     }
 172     else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
 173         s->imdct36_float = ff_imdct36_float_sse2;
 174     }
 175     else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
 176         s->imdct36_float = ff_imdct36_float_sse;
 177     }
 178 }