/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

//#undef NDEBUG
//#include <assert.h>

/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };


#if HAVE_YASM
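/* Prototypes for functions implemented in external (yasm) assembly. */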
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                     uint8_t *src2, int dstStride,
                                     int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                      int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride,
                                                 int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride,
                                                int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
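/* ".p2align 3" pads to an 8-byte (2^3) boundary, so the loop label that
 * follows a JUMPALIGN() starts on an aligned address. */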

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// For shared libraries (PIC) it is better to synthesize these constants
// in registers than to access them from memory.
// pcmpeqd -> -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t"::)

#endif

// using regr as the temporary and for the output result
// the first argument is unmodified and the second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
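// The averagers below rely on the byte-wise identities
//     (a + b)     >> 1 == (a & b) + (((a ^ b) & 0xfe) >> 1)    (round down)
//     (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)    (round up)
// Masking with 0xfe clears the low bit of each byte before the 64-bit psrlq
// so that no bit can leak into the neighbouring byte lane.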
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
    "movq   "#rega", "#regr"            \n\t"                    \
    "pand   "#regb", "#regr"            \n\t"                    \
    "pxor   "#rega", "#regb"            \n\t"                    \
    "pand  "#regfe", "#regb"            \n\t"                    \
    "psrlq       $1, "#regb"            \n\t"                    \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)                       \
    "movq   "#rega", "#regr"            \n\t"                    \
    "por    "#regb", "#regr"            \n\t"                    \
    "pxor   "#rega", "#regb"            \n\t"                    \
    "pand  "#regfe", "#regb"            \n\t"                    \
    "psrlq       $1, "#regb"            \n\t"                    \
    "psubb  "#regb", "#regr"            \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                    \
    "movq  "#regc", "#regp"             \n\t"                    \
    "pand  "#regb", "#regr"             \n\t"                    \
    "pand  "#regd", "#regp"             \n\t"                    \
    "pxor  "#rega", "#regb"             \n\t"                    \
    "pxor  "#regc", "#regd"             \n\t"                    \
    "pand    %%mm6, "#regb"             \n\t"                    \
    "pand    %%mm6, "#regd"             \n\t"                    \
    "psrlq      $1, "#regb"             \n\t"                    \
    "psrlq      $1, "#regd"             \n\t"                    \
    "paddb "#regb", "#regr"             \n\t"                    \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
    "movq  "#rega", "#regr"             \n\t"                    \
    "movq  "#regc", "#regp"             \n\t"                    \
    "por   "#regb", "#regr"             \n\t"                    \
    "por   "#regd", "#regp"             \n\t"                    \
    "pxor  "#rega", "#regb"             \n\t"                    \
    "pxor  "#regc", "#regd"             \n\t"                    \
    "pand    %%mm6, "#regb"             \n\t"                    \
    "pand    %%mm6, "#regd"             \n\t"                    \
    "psrlq      $1, "#regd"             \n\t"                    \
    "psrlq      $1, "#regb"             \n\t"                    \
    "psubb "#regb", "#regr"             \n\t"                    \
    "psubb "#regd", "#regp"             \n\t"

/***********************************/
/* MMX no rounding */
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow

#include "dsputil_avg_template.c"

#undef DEF

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

#include "dsputil_avg_template.c"

#undef DEF

#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
#define put_pixels8_mmxext put_pixels8_mmx
#define put_pixels4_mmxext put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
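    /* packuswb saturates each signed 16-bit coefficient to the unsigned
     * 8-bit range [0, 255], which implements the clamping. */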
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, hence the "r" constraint here.
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1        \n\t"           \
    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
    "paddb              %%mm0, %%mm1        \n\t"           \
    "paddb              %%mm0, %%mm2        \n\t"           \
    "paddb              %%mm0, %%mm3        \n\t"           \
    "paddb              %%mm0, %%mm4        \n\t"           \
    "movq               %%mm1, (%0)         \n\t"           \
    "movq               %%mm2, (%0, %3)     \n\t"           \
    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
    "movq               %%mm4, (%0, %1)     \n\t"

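/* packsswb clamps the coefficients to the signed range [-128, 127]; adding
 * the ff_pb_80 bias byte-wise (paddb wraps modulo 256) then shifts that
 * range to the unsigned pixel range [0, 255]. */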
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels),  "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels),  "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

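/* Zero n consecutive DCT blocks of 64 int16_t coefficients (128 bytes each).
 * The index register counts up from -128 * n to zero, clearing 32 bytes per
 * iteration. */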
#define CLEAR_BLOCKS(name, n)                           \
static void name(int16_t *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
    );
}

static void clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1,         %%"REG_a"   \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128,         %%"REG_a"   \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
    );
}

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
    );
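    /* scalar tail: add the remaining w % 16 bytes (and all of w < 16) */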
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

#if HAVE_7REGS
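/* HuffYUV median prediction: for each byte,
 *     dst[i] = diff[i] + median(left, top[i], left + top[i] - top_left),
 * computed branchlessly with cmov. */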
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
    );
    *left     = l;
    *left_top = tl;
}
#endif
#endif /* HAVE_INLINE_ASM */

void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);

#if HAVE_INLINE_ASM
/* Draw edges of width 'w' around an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
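    /* The punpckl{bw,wd,dq} sequence replicates the first byte of each line
     * across 8 bytes for the left edge; the punpckh variants replicate the
     * last byte for the right edge. */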
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                                 \n\t"
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jb                1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM
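/* OPNAME ## qpelN_mcXY interpolates an N x N block at quarter-pel offset
 * (X/4, Y/4): mc20 is the horizontal half-pel case, mc02 the vertical one
 * and mc22 both. The quarter-pel phases are built by averaging (the *_l2
 * helpers) the source with a half-pel lowpass plane, or two such planes. */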
#define QPEL_OP(OPNAME, ROUNDER, RND, MMX)                              \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                   stride, 8);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
                                        stride, stride, 8);             \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,    \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                   stride, 8);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,     \
                                        stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                   8, stride);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
                                        stride, stride, 8);             \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src,            \
                                                   stride, stride);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                   8, stride);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
                                        stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,              \
                                        8, stride, 9);                  \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                    stride, 16);        \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
                                         stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,           \
                                                    stride, stride, 16);\
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                    stride, 16);        \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,            \
                                         stride, stride, 16);           \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                    stride);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
                                         stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src,           \
                                                    stride, stride);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                    stride);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,       \
                                         stride, stride, 16);           \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
1064     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
1065                                          stride, 17);                   \
1066     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
1067                                                     16, 16);            \
1068     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
1069                                          stride, 16, 16);               \
1070 }                                                                       \
1071                                                                         \
1072 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
1073                                           int stride)                   \
1074 {                                                                       \
1075     uint64_t half[16 * 2 + 17 * 2];                                     \
1076     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1077     uint8_t * const halfHV = ((uint8_t*)half);                          \
1078     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1079                                                     stride, 17);        \
1080     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
1081                                                     16, 16);            \
1082     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
1083                                          stride, 16, 16);               \
1084 }                                                                       \
1085                                                                         \
1086 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
1087                                           int stride)                   \
1088 {                                                                       \
1089     uint64_t half[16 * 2 + 17 * 2];                                     \
1090     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1091     uint8_t * const halfHV = ((uint8_t*)half);                          \
1092     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1093                                                     stride, 17);        \
1094     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
1095                                                     16, 16);            \
1096     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
1097                                          stride, 16, 16);               \
1098 }                                                                       \
1099                                                                         \
1100 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
1101                                           int stride)                   \
1102 {                                                                       \
1103     uint64_t half[17 * 2];                                              \
1104     uint8_t * const halfH = ((uint8_t*)half);                           \
1105     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1106                                                     stride, 17);        \
1107     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
1108                                          stride, 17);                   \
1109     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1110                                                     stride, 16);        \
1111 }                                                                       \
1112                                                                         \
1113 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
1114                                           int stride)                   \
1115 {                                                                       \
1116     uint64_t half[17 * 2];                                              \
1117     uint8_t * const halfH = ((uint8_t*)half);                           \
1118     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1119                                                     stride, 17);        \
1120     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
1121                                          stride, 17);                   \
1122     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1123                                                     stride, 16);        \
1124 }                                                                       \
1125                                                                         \
1126 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
1127                                           int stride)                   \
1128 {                                                                       \
1129     uint64_t half[17 * 2];                                              \
1130     uint8_t * const halfH = ((uint8_t*)half);                           \
1131     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1132                                                     stride, 17);        \
1133     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1134                                                     stride, 16);        \
1135 }
1136
1137 QPEL_OP(put_,          ff_pw_16, _,        mmxext)
1138 QPEL_OP(avg_,          ff_pw_16, _,        mmxext)
1139 QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, mmxext)
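
/* Each QPEL_OP(OPNAME, ROUNDER, RND, MMX) instantiation above expands to the
 * complete set of 16 quarter-pel motion compensation functions per block
 * size (mc00 ... mc33, as wired up by SET_QPEL_FUNCS() below); e.g.
 * QPEL_OP(put_, ff_pw_16, _, mmxext) defines put_qpel16_mc01_mmxext and
 * friends. In the mcXY suffix, X is the horizontal and Y the vertical
 * quarter-sample offset. */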
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
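
/* RV40's mc33 case (quarter-pel offset (3, 3)) is served by the plain
 * half-pel xy2 kernel above. As a rough scalar sketch (assuming the usual
 * +2 rounding bias of the rounding variants), the xy2 average computes
 *
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + stride] + src[x + stride + 1] + 2) >> 2;
 *
 * for every pixel of the block. */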

static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
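
    /* Broadcast the scale factor s = 1 << shift into all four words of mm6
     * (two punpcklwd passes replicate the low word), and clear mm7, which
     * serves as the zero register for the byte -> word unpacking in the
     * per-row blend below. */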
    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1<<shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };
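
        /* Per row, the first asm block below advances the per-column subpel
         * coordinates dx4/dy4 by one row step and shifts out the integer
         * part, leaving the fractional weights dx and dy; the second block
         * then gathers the four neighbouring source pixels and blends them
         * bilinearly, as annotated on the pmullw instructions. */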
        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
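
/* For reference, each output pixel computed above corresponds to the
 * bilinear formula also used by ff_gmc_c() (with s = 1 << shift):
 *
 *     dst = (src[0, 0] * (s - dx) * (s - dy) +
 *            src[1, 0] * dx       * (s - dy) +
 *            src[0, 1] * (s - dx) * dy       +
 *            src[1, 1] * dx       * dy       + r) >> (2 * shift);
 */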
#endif /* HAVE_INLINE_ASM */

void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);

#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
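
/* mc00 is the zero-offset (fullpel) case for CAVS, so a plain block copy or
 * average suffices and no interpolation filter is involved. */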

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}
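
/* The rnd parameter is unused here: rounding only affects the interpolating
 * mspel cases, not this fullpel copy. */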

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
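
/* Scalar equivalent of vector_clipf_sse() (the loop above works backwards
 * from the end of the buffer, 16 floats per iteration, so len is assumed to
 * be a multiple of 16 and src/dst 16-byte aligned):
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = av_clipf(src[i], min, max);
 */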

#endif /* HAVE_INLINE_ASM */

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
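
/* For example, SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ) fills all 16 slots
 * of c->avg_qpel_pixels_tab[0] with avg_qpel16_mc00_mmxext through
 * avg_qpel16_mc33_mmxext, one function per quarter-pel position. */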

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
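
/* The four half-pel slots per table row are: [0] fullpel copy, [1]
 * horizontal half-pel (x2), [2] vertical half-pel (y2) and [3] diagonal
 * half-pel (xy2). IDX is passed including the brackets, or left empty for
 * avg_no_rnd, whose table has a single row. */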

static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        [0], 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
        SET_HPEL_FUNCS(avg,        [0], 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
        SET_HPEL_FUNCS(put,        [1],  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
        SET_HPEL_FUNCS(avg,        [1],  8, mmx);

        switch (avctx->idct_algo) {
        case FF_IDCT_AUTO:
        case FF_IDCT_SIMPLEMMX:
            c->idct_put              = ff_simple_idct_put_mmx;
            c->idct_add              = ff_simple_idct_add_mmx;
            c->idct                  = ff_simple_idct_mmx;
            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            break;
        case FF_IDCT_XVIDMMX:
            c->idct_put              = ff_idct_xvid_mmx_put;
            c->idct_add              = ff_idct_xvid_mmx_add;
            c->idct                  = ff_idct_xvid_mmx;
            break;
        }
    }
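
    /* Note on the IDCT selection above: the simple MMX IDCT outputs
     * coefficients in its own order, so idct_permutation_type is set to
     * FF_SIMPLE_IDCT_PERM and callers permute their scan/quant tables to
     * match; the Xvid case leaves the default permutation in place. */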

    c->gmc = gmc_mmx;

    c->add_bytes = add_bytes_mmx;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                        int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_YASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
        }
    }
#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put = ff_idct_xvid_mmxext_put;
        c->idct_add = ff_idct_xvid_mmxext_add;
        c->idct     = ff_idct_xvid_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_YASM
    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */
}

static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
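
    /* Each init function below may override function pointers installed by
     * an earlier one, so for every operation the last applicable (and
     * typically most specialized) implementation for this CPU wins. */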
    if (mm_flags & AV_CPU_FLAG_MMX)
        dsputil_init_mmx(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}