git.sesse.net Git - x264/blob - common/x86/util.h

   1 /*****************************************************************************
   2  * util.h: x86 inline asm
   3  *****************************************************************************
   4  * Copyright (C) 2008-2015 x264 project
   5  *
   6  * Authors: Fiona Glaser <fiona@x264.com>
   7  *          Loren Merritt <lorenm@u.washington.edu>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *
  23  * This program is also available under a commercial proprietary license.
  24  * For more information, contact us at licensing@x264.com.
  25  *****************************************************************************/
  26
  27 #ifndef X264_X86_UTIL_H
  28 #define X264_X86_UTIL_H
  29
  30 #ifdef __SSE__
  31 #include <xmmintrin.h>
  32
  33 #undef M128_ZERO
  34 #define M128_ZERO ((__m128){0,0,0,0})
  35 #define x264_union128_t x264_union128_sse_t
  36 typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
  37 #if HAVE_VECTOREXT
  38 typedef uint32_t v4si __attribute__((vector_size (16)));
  39 #endif
  40 #endif // __SSE__
  41
  42 #if HAVE_X86_INLINE_ASM && HAVE_MMX
  43
  44 #define x264_median_mv x264_median_mv_mmx2
  45 static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
  46 {
  47     asm(
  48         "movd   %1,    %%mm0 \n"
  49         "movd   %2,    %%mm1 \n"
  50         "movq   %%mm0, %%mm3 \n"
  51         "movd   %3,    %%mm2 \n"
  52         "pmaxsw %%mm1, %%mm0 \n"
  53         "pminsw %%mm3, %%mm1 \n"
  54         "pminsw %%mm2, %%mm0 \n"
  55         "pmaxsw %%mm1, %%mm0 \n"
  56         "movd   %%mm0, %0    \n"
  57         :"=m"(*(x264_union32_t*)dst)
  58         :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
  59     );
  60 }
  61
  62 #define x264_predictor_difference x264_predictor_difference_mmx2
  63 static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
  64 {
  65     int sum;
  66     static const uint64_t pw_1 = 0x0001000100010001ULL;
  67
  68     asm(
  69         "pxor    %%mm4, %%mm4 \n"
  70         "test    $1, %1       \n"
  71         "jnz 3f               \n"
  72         "movd    -8(%2,%1,4), %%mm0 \n"
  73         "movd    -4(%2,%1,4), %%mm3 \n"
  74         "psubw   %%mm3, %%mm0 \n"
  75         "jmp 2f               \n"
  76         "3:                   \n"
  77         "dec     %1           \n"
  78         "1:                   \n"
  79         "movq    -8(%2,%1,4), %%mm0 \n"
  80         "psubw   -4(%2,%1,4), %%mm0 \n"
  81         "2:                   \n"
  82         "sub     $2,    %1    \n"
  83         "pxor    %%mm2, %%mm2 \n"
  84         "psubw   %%mm0, %%mm2 \n"
  85         "pmaxsw  %%mm2, %%mm0 \n"
  86         "paddusw %%mm0, %%mm4 \n"
  87         "jg 1b                \n"
  88         "pmaddwd %4, %%mm4    \n"
  89         "pshufw $14, %%mm4, %%mm0 \n"
  90         "paddd   %%mm0, %%mm4 \n"
  91         "movd    %%mm4, %0    \n"
  92         :"=r"(sum), "+r"(i_mvc)
  93         :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
  94     );
  95     return sum;
  96 }
  97
  98 #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
  99 static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
 100 {
 101     static const uint64_t pb_2    = 0x0202020202020202ULL;
 102     static const uint64_t pb_32   = 0x2020202020202020ULL;
 103     static const uint64_t pb_33   = 0x2121212121212121ULL;
 104     int amvd;
 105     asm(
 106         "movd         %1, %%mm0 \n"
 107         "movd         %2, %%mm1 \n"
 108         "paddusb   %%mm1, %%mm0 \n"
 109         "pminub       %5, %%mm0 \n"
 110         "pxor      %%mm2, %%mm2 \n"
 111         "movq      %%mm0, %%mm1 \n"
 112         "pcmpgtb      %3, %%mm0 \n"
 113         "pcmpgtb      %4, %%mm1 \n"
 114         "psubb     %%mm0, %%mm2 \n"
 115         "psubb     %%mm1, %%mm2 \n"
 116         "movd      %%mm2, %0    \n"
 117         :"=r"(amvd)
 118         :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
 119          "m"(pb_2),"m"(pb_32),"m"(pb_33)
 120     );
 121     return amvd;
 122 }
 123
 124 #define x264_predictor_clip x264_predictor_clip_mmx2
 125 static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
 126 {
 127     static const uint32_t pd_32 = 0x20;
 128     intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
 129
 130     asm(
 131         "movq       (%2), %%mm5 \n"
 132         "movd         %6, %%mm3 \n"
 133         "psllw        $2, %%mm5 \n" // Convert to subpel
 134         "pshufw $0xEE, %%mm5, %%mm6 \n"
 135         "dec         %k3        \n"
 136         "jz 2f                  \n" // if( i_mvc == 1 ) {do the last iteration}
 137         "punpckldq %%mm3, %%mm3 \n"
 138         "punpckldq %%mm5, %%mm5 \n"
 139         "movd         %7, %%mm4 \n"
 140         "lea   (%0,%3,4), %3    \n"
 141         "1:                     \n"
 142         "movq       (%0), %%mm0 \n"
 143         "add          $8, %0    \n"
 144         "movq      %%mm3, %%mm1 \n"
 145         "pxor      %%mm2, %%mm2 \n"
 146         "pcmpeqd   %%mm0, %%mm1 \n" // mv == pmv
 147         "pcmpeqd   %%mm0, %%mm2 \n" // mv == 0
 148         "por       %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
 149         "pmovmskb  %%mm2, %k2   \n" // (mv == pmv || mv == 0) * 0xf
 150         "pmaxsw    %%mm5, %%mm0 \n"
 151         "pminsw    %%mm6, %%mm0 \n"
 152         "pand      %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
 153         "psrlq     %%mm2, %%mm0 \n" // drop mv0 if it's skipped
 154         "movq      %%mm0, (%5,%4,4) \n"
 155         "and         $24, %k2   \n"
 156         "add          $2, %4    \n"
 157         "add          $8, %k2   \n"
 158         "shr          $4, %k2   \n" // (4-val)>>1
 159         "sub          %2, %4    \n" // +1 for each valid motion vector
 160         "cmp          %3, %0    \n"
 161         "jl 1b                  \n"
 162         "jg 3f                  \n" // if( i == i_mvc - 1 ) {do the last iteration}
 163
 164         /* Do the last iteration */
 165         "2:                     \n"
 166         "movd       (%0), %%mm0 \n"
 167         "pxor      %%mm2, %%mm2 \n"
 168         "pcmpeqd   %%mm0, %%mm3 \n"
 169         "pcmpeqd   %%mm0, %%mm2 \n"
 170         "por       %%mm3, %%mm2 \n"
 171         "pmovmskb  %%mm2, %k2   \n"
 172         "pmaxsw    %%mm5, %%mm0 \n"
 173         "pminsw    %%mm6, %%mm0 \n"
 174         "movd      %%mm0, (%5,%4,4) \n"
 175         "inc          %4        \n"
 176         "and          $1, %k2   \n"
 177         "sub          %2, %4    \n" // output += !(mv == pmv || mv == 0)
 178         "3:                     \n"
 179         :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
 180         :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
 181     );
 182     return i;
 183 }
 184
 185 /* Same as the above, except we do (mv + 2) >> 2 on the input. */
 186 #define x264_predictor_roundclip x264_predictor_roundclip_mmx2
 187 static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
 188 {
 189     static const uint64_t pw_2 = 0x0002000200020002ULL;
 190     static const uint32_t pd_32 = 0x20;
 191     intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
 192
 193     asm(
 194         "movq       (%2), %%mm5 \n"
 195         "movq         %6, %%mm7 \n"
 196         "movd         %7, %%mm3 \n"
 197         "pshufw $0xEE, %%mm5, %%mm6 \n"
 198         "dec         %k3        \n"
 199         "jz 2f                  \n"
 200         "punpckldq %%mm3, %%mm3 \n"
 201         "punpckldq %%mm5, %%mm5 \n"
 202         "movd         %8, %%mm4 \n"
 203         "lea   (%0,%3,4), %3    \n"
 204         "1:                     \n"
 205         "movq       (%0), %%mm0 \n"
 206         "add          $8, %0    \n"
 207         "paddw     %%mm7, %%mm0 \n"
 208         "psraw        $2, %%mm0 \n"
 209         "movq      %%mm3, %%mm1 \n"
 210         "pxor      %%mm2, %%mm2 \n"
 211         "pcmpeqd   %%mm0, %%mm1 \n"
 212         "pcmpeqd   %%mm0, %%mm2 \n"
 213         "por       %%mm1, %%mm2 \n"
 214         "pmovmskb  %%mm2, %k2   \n"
 215         "pmaxsw    %%mm5, %%mm0 \n"
 216         "pminsw    %%mm6, %%mm0 \n"
 217         "pand      %%mm4, %%mm2 \n"
 218         "psrlq     %%mm2, %%mm0 \n"
 219         "movq      %%mm0, (%5,%4,4) \n"
 220         "and         $24, %k2   \n"
 221         "add          $2, %4    \n"
 222         "add          $8, %k2   \n"
 223         "shr          $4, %k2   \n"
 224         "sub          %2, %4    \n"
 225         "cmp          %3, %0    \n"
 226         "jl 1b                  \n"
 227         "jg 3f                  \n"
 228
 229         /* Do the last iteration */
 230         "2:                     \n"
 231         "movd       (%0), %%mm0 \n"
 232         "paddw     %%mm7, %%mm0 \n"
 233         "psraw        $2, %%mm0 \n"
 234         "pxor      %%mm2, %%mm2 \n"
 235         "pcmpeqd   %%mm0, %%mm3 \n"
 236         "pcmpeqd   %%mm0, %%mm2 \n"
 237         "por       %%mm3, %%mm2 \n"
 238         "pmovmskb  %%mm2, %k2   \n"
 239         "pmaxsw    %%mm5, %%mm0 \n"
 240         "pminsw    %%mm6, %%mm0 \n"
 241         "movd      %%mm0, (%5,%4,4) \n"
 242         "inc          %4        \n"
 243         "and          $1, %k2   \n"
 244         "sub          %2, %4    \n"
 245         "3:                     \n"
 246         :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
 247         :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
 248     );
 249     return i;
 250 }
 251
 252 #endif
 253
 254 #endif