git.sesse.net Git - ffmpeg/blob - libavcodec/x86/dsputil_mmx.c

   1 /*
   2  * MMX optimized DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  *
  22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  23  */
  24
  25 #include "libavutil/cpu.h"
  26 #include "libavutil/x86/asm.h"
  27 #include "libavcodec/dsputil.h"
  28 #include "libavcodec/h264dsp.h"
  29 #include "libavcodec/mpegvideo.h"
  30 #include "libavcodec/simple_idct.h"
  31 #include "dsputil_mmx.h"
  32 #include "idct_xvid.h"
  33 #include "diracdsp_mmx.h"
  34
  35 //#undef NDEBUG
  36 //#include <assert.h>
  37
  38 /* pixel operations */
     /* 64-bit constants read by the non-PIC MOVQ_BONE / MOVQ_WTWO macros:
      * ff_bone has every byte equal to 1, ff_wtwo every 16-bit word equal to 2. */
  39 DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
  40 DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
  41
     /* Four 32-bit lanes, each 0x80000000 (only the top bit of the lane set). */
  42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
  43     { 0x8000000080000000ULL, 0x8000000080000000ULL };
  44
     /* ff_pw_N: every 16-bit word equals decimal N.
      * The 8-byte-aligned uint64_t entries hold four words; the 16-byte-aligned
      * xmm_reg entries are initialized with two 64-bit halves (eight words).
      * xmm_reg itself is declared outside this file. */
  45 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
  46 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
  47 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
  48 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
  49 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
  50 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
  51 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
  52 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
  53 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
  54 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
  55 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
  56 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
  57 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
  58 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
  59 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
  60 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
  61 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
  62 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
  63 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
  64 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
  65 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
  66 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
  67 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
  68 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
  69
     /* ff_pb_N: every byte equals N. For the values defined here (0, 1, 3, 4, 7)
      * decimal and hexadecimal coincide; suffixes such as 1F, 80 or FE are
      * hexadecimal. */
  70 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
  71 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
  72 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
  73 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
  74 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
  75 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
  76 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
  77 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
  78 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
  79 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
  80 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
  81 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
  82 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
  83
     /* ff_pd_N: a pair of doubles, both equal to N. */
  84 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
  85 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
  86
  87 #if HAVE_INLINE_ASM
  88
     /* Emit a ".p2align 3" directive, i.e. align the following code to 8 bytes. */
  89 #define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
     /* Zero regd by XORing it with itself. Like the other MOVQ_* helpers below,
      * the statement declares no outputs or clobbers for regd. */
  90 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
  91
     /* Set every byte of regd to 0xFE: pcmpeqd of a register with itself gives
      * all-one bits (0xFF per byte), and the wrapping byte add 0xFF + 0xFF
      * leaves 0xFE. */
  92 #define MOVQ_BFE(regd)                                  \
  93     __asm__ volatile (                                  \
  94         "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
  95         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
  96
     /* MOVQ_BONE(regd): every byte of regd = 1.
      * MOVQ_WTWO(regd): every 16-bit word of regd = 2.
      * Without PIC the values are loaded from ff_bone / ff_wtwo in memory. */
  97 #ifndef PIC
  98 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
  99 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
 100 #else
 101 // for shared library it's better to use this way for accessing constants
 102 // pcmpeqd -> -1
     // psrlw $15 turns each 0xFFFF word into 0x0001; packuswb then narrows the
     // words to bytes, giving 0x01 in every byte.
 103 #define MOVQ_BONE(regd)                                 \
 104     __asm__ volatile (                                  \
 105         "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
 106         "psrlw          $15, %%"#regd"  \n\t"           \
 107         "packuswb %%"#regd", %%"#regd"  \n\t" ::)
 108
     // Same start as above (each word becomes 0x0001), then psllw $1 doubles
     // each word to 0x0002.
 109 #define MOVQ_WTWO(regd)                                 \
 110     __asm__ volatile (                                  \
 111         "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
 112         "psrlw         $15, %%"#regd"   \n\t"           \
 113         "psllw          $1, %%"#regd"   \n\t"::)
 114
 115 #endif
 116
 117 // regr is used as a temporary and receives the result
 118 // the first argument is unmodified and the second is trashed
 119 // regfe is supposed to contain 0xfefefefefefefefe
     // Per-byte average rounded down: (a & b) + (((a ^ b) & 0xfe) >> 1).
     // Masking with 0xfe before the 64-bit psrlq keeps bits from leaking
     // between neighbouring bytes.
 120 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
 121     "movq   "#rega", "#regr"            \n\t"                    \
 122     "pand   "#regb", "#regr"            \n\t"                    \
 123     "pxor   "#rega", "#regb"            \n\t"                    \
 124     "pand  "#regfe", "#regb"            \n\t"                    \
 125     "psrlq       $1, "#regb"            \n\t"                    \
 126     "paddb  "#regb", "#regr"            \n\t"
 127
     // Per-byte average rounded up: (a | b) - (((a ^ b) & 0xfe) >> 1).
     // Same operand roles as PAVGB_MMX_NO_RND.
 128 #define PAVGB_MMX(rega, regb, regr, regfe)                       \
 129     "movq   "#rega", "#regr"            \n\t"                    \
 130     "por    "#regb", "#regr"            \n\t"                    \
 131     "pxor   "#rega", "#regb"            \n\t"                    \
 132     "pand  "#regfe", "#regb"            \n\t"                    \
 133     "psrlq       $1, "#regb"            \n\t"                    \
 134     "psubb  "#regb", "#regr"            \n\t"
 135
 136 // mm6 is supposed to contain 0xfefefefefefefefe
     // Two interleaved rounded-down averages:
     //   regr = avg(rega, regb) and regp = avg(regc, regd).
     // rega and regc are left unmodified; regb and regd are trashed.
 137 #define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
 138     "movq  "#rega", "#regr"             \n\t"                    \
 139     "movq  "#regc", "#regp"             \n\t"                    \
 140     "pand  "#regb", "#regr"             \n\t"                    \
 141     "pand  "#regd", "#regp"             \n\t"                    \
 142     "pxor  "#rega", "#regb"             \n\t"                    \
 143     "pxor  "#regc", "#regd"             \n\t"                    \
 144     "pand    %%mm6, "#regb"             \n\t"                    \
 145     "pand    %%mm6, "#regd"             \n\t"                    \
 146     "psrlq      $1, "#regb"             \n\t"                    \
 147     "psrlq      $1, "#regd"             \n\t"                    \
 148     "paddb "#regb", "#regr"             \n\t"                    \
 149     "paddb "#regd", "#regp"             \n\t"
 150
     // Two interleaved rounded-up averages, same operand roles as
     // PAVGBP_MMX_NO_RND; mm6 must again hold 0xfefefefefefefefe.
 151 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
 152     "movq  "#rega", "#regr"             \n\t"                    \
 153     "movq  "#regc", "#regp"             \n\t"                    \
 154     "por   "#regb", "#regr"             \n\t"                    \
 155     "por   "#regd", "#regp"             \n\t"                    \
 156     "pxor  "#rega", "#regb"             \n\t"                    \
 157     "pxor  "#regc", "#regd"             \n\t"                    \
 158     "pand    %%mm6, "#regb"             \n\t"                    \
 159     "pand    %%mm6, "#regd"             \n\t"                    \
 160     "psrlq      $1, "#regd"             \n\t"                    \
 161     "psrlq      $1, "#regb"             \n\t"                    \
 162     "psubb "#regb", "#regr"             \n\t"                    \
 163     "psubb "#regd", "#regp"             \n\t"
 164
 165 /***********************************/
 166 /* MMX no rounding */
     /* Instantiate dsputil_rnd_template.c with the rounded-down averaging
      * macros. DEF(x, y) pastes names of the form x_no_rnd_y_mmx.
      * MOVQ_WONE is not defined in this file; presumably it comes from
      * dsputil_mmx.h (TODO confirm). */
 167 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
 168 #define SET_RND  MOVQ_WONE
 169 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
 170 #define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
     /* OP_AVG always uses the rounded-up average. It is not #undef'd at the end
      * of this section, so the rounding instantiation below sees the same
      * definition. */
 171 #define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
 172
 173 #include "dsputil_rnd_template.c"
 174
 175 #undef DEF
 176 #undef SET_RND
 177 #undef PAVGBP
 178 #undef PAVGB
 179 /***********************************/
 180 /* MMX rounding */
 181
     /* Second instantiation of the same template: names of the form x_y_mmx,
      * built on the rounded-up averaging macros. */
 182 #define DEF(x, y) x ## _ ## y ## _mmx
 183 #define SET_RND  MOVQ_WTWO
 184 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
 185 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
 186
 187 #include "dsputil_rnd_template.c"
 188
 189 #undef DEF
 190 #undef SET_RND
 191 #undef PAVGBP
 192 #undef PAVGB
 193 #undef OP_AVG
 194
 195 /***********************************/
 196 /* 3Dnow specific */
 197
     /* Here PAVGB / OP_AVG are instruction mnemonics (string literals), not
      * macro sequences; DEF(x) appends the _3dnow suffix. */
 198 #define DEF(x) x ## _3dnow
 199 #define PAVGB "pavgusb"
 200 #define OP_AVG PAVGB
 201
 202 #include "dsputil_avg_template.c"
 203
 204 #undef DEF
 205 #undef PAVGB
 206 #undef OP_AVG
 207
 208 /***********************************/
 209 /* MMX2 specific */
 210
 211 #define DEF(x) x ## _mmx2
 212
 213 /* Introduced only in MMX2 set */
 214 #define PAVGB "pavgb"
 215 #define OP_AVG PAVGB
 216
 217 #include "dsputil_avg_template.c"
 218
 219 #undef DEF
 220 #undef PAVGB
 221 #undef OP_AVG
 222
     /* Name aliases: the no_rnd, MMX2 and 3DNow! spellings of the plain put
      * functions all resolve to the MMX copy routines defined further down in
      * this file (put_pixels4_mmx / put_pixels8_mmx / put_pixels16_mmx). */
 223 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
 224 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
 225 #define put_pixels16_mmx2 put_pixels16_mmx
 226 #define put_pixels8_mmx2 put_pixels8_mmx
 227 #define put_pixels4_mmx2 put_pixels4_mmx
 228 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
 229 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
 230 #define put_pixels16_3dnow put_pixels16_mmx
 231 #define put_pixels8_3dnow put_pixels8_mmx
 232 #define put_pixels4_3dnow put_pixels4_mmx
 233 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
 234 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
 235
 236 /***********************************/
 237 /* standard MMX */
 238
     /**
      * Write coefficients to pixels, saturating each value to 0..255.
      *
      * Each of the two asm statements reads 64 bytes from the coefficient
      * pointer, narrows signed 16-bit words to unsigned bytes with packuswb and
      * stores four 8-byte rows, so eight rows are written in total.
      *
      * @param block     source coefficients; advanced by 32 elements between
      *                  the two halves, which matches the 64 bytes read if
      *                  DCTELEM is 16 bits wide (declared elsewhere)
      * @param pixels    destination of the first row
      * @param line_size byte distance between consecutive destination rows
      */
 239 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
 240                                int line_size)
 241 {
 242     const DCTELEM *p;
 243     uint8_t *pix;
 244
 245     /* read the pixels */
 246     p   = block;
 247     pix = pixels;
 248     /* unrolled loop */
         /* operands: %0 = pix, %1 = line_size, %2 = 3 * line_size, %3 = p */
 249     __asm__ volatile (
 250         "movq      (%3), %%mm0          \n\t"
 251         "movq     8(%3), %%mm1          \n\t"
 252         "movq    16(%3), %%mm2          \n\t"
 253         "movq    24(%3), %%mm3          \n\t"
 254         "movq    32(%3), %%mm4          \n\t"
 255         "movq    40(%3), %%mm5          \n\t"
 256         "movq    48(%3), %%mm6          \n\t"
 257         "movq    56(%3), %%mm7          \n\t"
 258         "packuswb %%mm1, %%mm0          \n\t"
 259         "packuswb %%mm3, %%mm2          \n\t"
 260         "packuswb %%mm5, %%mm4          \n\t"
 261         "packuswb %%mm7, %%mm6          \n\t"
 262         "movq     %%mm0, (%0)           \n\t"
 263         "movq     %%mm2, (%0, %1)       \n\t"
 264         "movq     %%mm4, (%0, %1, 2)    \n\t"
 265         "movq     %%mm6, (%0, %2)       \n\t"
 266         :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
 267            "r"(p)
 268         : "memory");
         /* move on to rows 4..7 */
 269     pix += line_size * 4;
 270     p   += 32;
 271
 272     // if this were an exact copy of the code above, the
 273     // compiler would generate some very strange code,
 274     // thus using "r"
 275     __asm__ volatile (
 276         "movq       (%3), %%mm0         \n\t"
 277         "movq      8(%3), %%mm1         \n\t"
 278         "movq     16(%3), %%mm2         \n\t"
 279         "movq     24(%3), %%mm3         \n\t"
 280         "movq     32(%3), %%mm4         \n\t"
 281         "movq     40(%3), %%mm5         \n\t"
 282         "movq     48(%3), %%mm6         \n\t"
 283         "movq     56(%3), %%mm7         \n\t"
 284         "packuswb  %%mm1, %%mm0         \n\t"
 285         "packuswb  %%mm3, %%mm2         \n\t"
 286         "packuswb  %%mm5, %%mm4         \n\t"
 287         "packuswb  %%mm7, %%mm6         \n\t"
 288         "movq      %%mm0, (%0)          \n\t"
 289         "movq      %%mm2, (%0, %1)      \n\t"
 290         "movq      %%mm4, (%0, %1, 2)   \n\t"
 291         "movq      %%mm6, (%0, %2)      \n\t"
 292         :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
 293         : "memory");
 294 }
 295
     /* Asm fragment handling four rows, starting "off" bytes into the block.
      * It expects the operand numbering used by the function below:
      *   %0 = destination, %1 = 3 * stride, %2 = block, %3 = stride,
      * and mm0 preloaded with the per-byte bias. Each row's two 8-byte halves
      * are narrowed to signed bytes with saturation (packsswb), the bias in
      * mm0 is added with paddb, and the 8-byte row is stored. */
 296 #define put_signed_pixels_clamped_mmx_half(off)             \
 297     "movq          "#off"(%2), %%mm1        \n\t"           \
 298     "movq     16 + "#off"(%2), %%mm2        \n\t"           \
 299     "movq     32 + "#off"(%2), %%mm3        \n\t"           \
 300     "movq     48 + "#off"(%2), %%mm4        \n\t"           \
 301     "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
 302     "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
 303     "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
 304     "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
 305     "paddb              %%mm0, %%mm1        \n\t"           \
 306     "paddb              %%mm0, %%mm2        \n\t"           \
 307     "paddb              %%mm0, %%mm3        \n\t"           \
 308     "paddb              %%mm0, %%mm4        \n\t"           \
 309     "movq               %%mm1, (%0)         \n\t"           \
 310     "movq               %%mm2, (%0, %3)     \n\t"           \
 311     "movq               %%mm3, (%0, %3, 2)  \n\t"           \
 312     "movq               %%mm4, (%0, %1)     \n\t"
 313
     /**
      * Write signed coefficients to pixels: each value is saturated to the
      * signed-byte range and then offset by 0x80 (ff_pb_80), mapping it into
      * 0..255.
      *
      * Eight rows of 8 bytes are written: the first half reads block bytes
      * 0..63, the destination is advanced by four rows, and the second half
      * reads block bytes 64..127.
      *
      * @param block     source coefficients
      * @param pixels    destination of the first row
      * @param line_size byte distance between consecutive destination rows
      */
 314 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
 315                                       int line_size)
 316 {
 317     x86_reg line_skip = line_size;
 318     x86_reg line_skip3;
 319
         /* line_skip3 is computed inside the asm (lea) as 3 * line_skip */
 320     __asm__ volatile (
 321         "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
 322         "lea         (%3, %3, 2), %1        \n\t"
 323         put_signed_pixels_clamped_mmx_half(0)
 324         "lea         (%0, %3, 4), %0        \n\t"
 325         put_signed_pixels_clamped_mmx_half(64)
 326         : "+&r"(pixels), "=&r"(line_skip3)
 327         : "r"(block), "r"(line_skip)
 328         : "memory");
 329 }
 330
     /**
      * Add coefficients to the existing pixels, saturating the sums to 0..255.
      *
      * The loop runs four times; each pass handles two rows of 8 pixels:
      * the pixels are widened to 16-bit words (interleaving with the zero in
      * mm7), added to the coefficients with signed saturation (paddsw),
      * narrowed back with unsigned saturation (packuswb) and stored in place.
      *
      * @param block     coefficients to add; 32 bytes are consumed per pass
      * @param pixels    first row of the pixels to update
      * @param line_size byte distance between consecutive rows
      */
 331 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
 332                                int line_size)
 333 {
 334     const DCTELEM *p;
 335     uint8_t *pix;
 336     int i;
 337
 338     /* read the pixels */
 339     p   = block;
 340     pix = pixels;
         /* mm7 = 0, set up once in its own asm statement and used by every pass */
 341     MOVQ_ZERO(mm7);
 342     i = 4;
 343     do {
             /* operands: %0 = current row, %1 = next row, %2 = p */
 344         __asm__ volatile (
 345             "movq        (%2), %%mm0    \n\t"
 346             "movq       8(%2), %%mm1    \n\t"
 347             "movq      16(%2), %%mm2    \n\t"
 348             "movq      24(%2), %%mm3    \n\t"
 349             "movq          %0, %%mm4    \n\t"
 350             "movq          %1, %%mm6    \n\t"
 351             "movq       %%mm4, %%mm5    \n\t"
 352             "punpcklbw  %%mm7, %%mm4    \n\t"
 353             "punpckhbw  %%mm7, %%mm5    \n\t"
 354             "paddsw     %%mm4, %%mm0    \n\t"
 355             "paddsw     %%mm5, %%mm1    \n\t"
 356             "movq       %%mm6, %%mm5    \n\t"
 357             "punpcklbw  %%mm7, %%mm6    \n\t"
 358             "punpckhbw  %%mm7, %%mm5    \n\t"
 359             "paddsw     %%mm6, %%mm2    \n\t"
 360             "paddsw     %%mm5, %%mm3    \n\t"
 361             "packuswb   %%mm1, %%mm0    \n\t"
 362             "packuswb   %%mm3, %%mm2    \n\t"
 363             "movq       %%mm0, %0       \n\t"
 364             "movq       %%mm2, %1       \n\t"
 365             : "+m"(*pix), "+m"(*(pix + line_size))
 366             : "r"(p)
 367             : "memory");
 368         pix += line_size * 2;
 369         p   += 16;
 370     } while (--i);
 371 }
 372
     /**
      * Copy a block 4 bytes wide and h rows high from pixels to block.
      *
      * Four rows are copied per loop pass and the loop ends when the counter
      * reaches exactly zero, so h has to be a positive multiple of 4.
      *
      * @param block     destination
      * @param pixels    source
      * @param line_size byte distance between rows, used for both buffers
      * @param h         number of rows
      */
 373 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
 374                             int line_size, int h)
 375 {
         /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
          * REG_a holds 2 * line_size */
 376     __asm__ volatile (
 377         "lea   (%3, %3), %%"REG_a"      \n\t"
 378         ".p2align     3                 \n\t"
 379         "1:                             \n\t"
 380         "movd  (%1    ), %%mm0          \n\t"
 381         "movd  (%1, %3), %%mm1          \n\t"
 382         "movd     %%mm0, (%2)           \n\t"
 383         "movd     %%mm1, (%2, %3)       \n\t"
 384         "add  %%"REG_a", %1             \n\t"
 385         "add  %%"REG_a", %2             \n\t"
 386         "movd  (%1    ), %%mm0          \n\t"
 387         "movd  (%1, %3), %%mm1          \n\t"
 388         "movd     %%mm0, (%2)           \n\t"
 389         "movd     %%mm1, (%2, %3)       \n\t"
 390         "add  %%"REG_a", %1             \n\t"
 391         "add  %%"REG_a", %2             \n\t"
 392         "subl        $4, %0             \n\t"
 393         "jnz         1b                 \n\t"
 394         : "+g"(h), "+r"(pixels),  "+r"(block)
 395         : "r"((x86_reg)line_size)
 396         : "%"REG_a, "memory"
 397         );
 398 }
 399
     /**
      * Copy a block 8 bytes wide and h rows high from pixels to block.
      *
      * Four rows are copied per loop pass and the loop ends when the counter
      * reaches exactly zero, so h has to be a positive multiple of 4.
      *
      * @param block     destination
      * @param pixels    source
      * @param line_size byte distance between rows, used for both buffers
      * @param h         number of rows
      */
 400 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
 401                             int line_size, int h)
 402 {
         /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
          * REG_a holds 2 * line_size */
 403     __asm__ volatile (
 404         "lea   (%3, %3), %%"REG_a"      \n\t"
 405         ".p2align     3                 \n\t"
 406         "1:                             \n\t"
 407         "movq  (%1    ), %%mm0          \n\t"
 408         "movq  (%1, %3), %%mm1          \n\t"
 409         "movq     %%mm0, (%2)           \n\t"
 410         "movq     %%mm1, (%2, %3)       \n\t"
 411         "add  %%"REG_a", %1             \n\t"
 412         "add  %%"REG_a", %2             \n\t"
 413         "movq  (%1    ), %%mm0          \n\t"
 414         "movq  (%1, %3), %%mm1          \n\t"
 415         "movq     %%mm0, (%2)           \n\t"
 416         "movq     %%mm1, (%2, %3)       \n\t"
 417         "add  %%"REG_a", %1             \n\t"
 418         "add  %%"REG_a", %2             \n\t"
 419         "subl        $4, %0             \n\t"
 420         "jnz         1b                 \n\t"
 421         : "+g"(h), "+r"(pixels),  "+r"(block)
 422         : "r"((x86_reg)line_size)
 423         : "%"REG_a, "memory"
 424         );
 425 }
 426
     /**
      * Copy a block 16 bytes wide and h rows high from pixels to block, using
      * two 8-byte MMX moves per row.
      *
      * Four rows are copied per loop pass and the loop ends when the counter
      * reaches exactly zero, so h has to be a positive multiple of 4.
      *
      * @param block     destination
      * @param pixels    source
      * @param line_size byte distance between rows, used for both buffers
      * @param h         number of rows
      */
 427 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
 428                              int line_size, int h)
 429 {
         /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
          * REG_a holds 2 * line_size */
 430     __asm__ volatile (
 431         "lea   (%3, %3), %%"REG_a"      \n\t"
 432         ".p2align     3                 \n\t"
 433         "1:                             \n\t"
 434         "movq  (%1    ), %%mm0          \n\t"
 435         "movq 8(%1    ), %%mm4          \n\t"
 436         "movq  (%1, %3), %%mm1          \n\t"
 437         "movq 8(%1, %3), %%mm5          \n\t"
 438         "movq     %%mm0,  (%2)          \n\t"
 439         "movq     %%mm4, 8(%2)          \n\t"
 440         "movq     %%mm1,  (%2, %3)      \n\t"
 441         "movq     %%mm5, 8(%2, %3)      \n\t"
 442         "add  %%"REG_a", %1             \n\t"
 443         "add  %%"REG_a", %2             \n\t"
 444         "movq  (%1    ), %%mm0          \n\t"
 445         "movq 8(%1    ), %%mm4          \n\t"
 446         "movq  (%1, %3), %%mm1          \n\t"
 447         "movq 8(%1, %3), %%mm5          \n\t"
 448         "movq     %%mm0,  (%2)          \n\t"
 449         "movq     %%mm4, 8(%2)          \n\t"
 450         "movq     %%mm1,  (%2, %3)      \n\t"
 451         "movq     %%mm5, 8(%2, %3)      \n\t"
 452         "add  %%"REG_a", %1             \n\t"
 453         "add  %%"REG_a", %2             \n\t"
 454         "subl        $4, %0             \n\t"
 455         "jnz         1b                 \n\t"
 456         : "+g"(h), "+r"(pixels),  "+r"(block)
 457         : "r"((x86_reg)line_size)
 458         : "%"REG_a, "memory"
 459         );
 460 }
 461
     /**
      * Copy a block 16 bytes wide and h rows high from pixels to block, one
      * SSE2 register per row.
      *
      * The source is read with unaligned loads (movdqu); the destination is
      * written with movdqa, which requires every destination row to be
      * 16-byte aligned. Four rows are copied per pass and the loop ends when
      * the counter reaches exactly zero, so h has to be a positive multiple
      * of 4.
      *
      * @param block     destination (16-byte aligned rows)
      * @param pixels    source
      * @param line_size byte distance between rows, used for both buffers
      * @param h         number of rows
      */
 462 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
 463                               int line_size, int h)
 464 {
         /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
          * %4 = 3 * line_size */
 465     __asm__ volatile (
 466         "1:                              \n\t"
 467         "movdqu (%1       ), %%xmm0      \n\t"
 468         "movdqu (%1, %3   ), %%xmm1      \n\t"
 469         "movdqu (%1, %3, 2), %%xmm2      \n\t"
 470         "movdqu (%1, %4   ), %%xmm3      \n\t"
 471         "lea    (%1, %3, 4), %1          \n\t"
 472         "movdqa      %%xmm0, (%2)        \n\t"
 473         "movdqa      %%xmm1, (%2, %3)    \n\t"
 474         "movdqa      %%xmm2, (%2, %3, 2) \n\t"
 475         "movdqa      %%xmm3, (%2, %4)    \n\t"
 476         "subl            $4, %0          \n\t"
             /* lea does not modify the flags, so jnz still tests the subl result */
 477         "lea    (%2, %3, 4), %2          \n\t"
 478         "jnz             1b              \n\t"
 479         : "+g"(h), "+r"(pixels),  "+r"(block)
 480         : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
 481         : "memory"
 482         );
 483 }
 484
     /**
      * Average a block 16 bytes wide and h rows high from pixels into block:
      * each destination byte becomes the rounded-up average (pavgb) of the
      * source byte and the previous destination byte.
      *
      * The source is read with unaligned loads; the destination is used as a
      * pavgb memory operand and written with movdqa, both of which require
      * 16-byte aligned rows. Four rows are handled per pass and the loop ends
      * when the counter reaches exactly zero, so h has to be a positive
      * multiple of 4.
      *
      * @param block     destination, read and written (16-byte aligned rows)
      * @param pixels    source
      * @param line_size byte distance between rows, used for both buffers
      * @param h         number of rows
      */
 485 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
 486                               int line_size, int h)
 487 {
         /* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size,
          * %4 = 3 * line_size */
 488     __asm__ volatile (
 489         "1:                                 \n\t"
 490         "movdqu (%1       ), %%xmm0         \n\t"
 491         "movdqu (%1, %3   ), %%xmm1         \n\t"
 492         "movdqu (%1, %3, 2), %%xmm2         \n\t"
 493         "movdqu (%1, %4   ), %%xmm3         \n\t"
 494         "lea    (%1, %3, 4), %1             \n\t"
 495         "pavgb  (%2       ), %%xmm0         \n\t"
 496         "pavgb  (%2, %3   ), %%xmm1         \n\t"
 497         "pavgb  (%2, %3, 2), %%xmm2         \n\t"
 498         "pavgb     (%2, %4), %%xmm3         \n\t"
 499         "movdqa      %%xmm0, (%2)           \n\t"
 500         "movdqa      %%xmm1, (%2, %3)       \n\t"
 501         "movdqa      %%xmm2, (%2, %3, 2)    \n\t"
 502         "movdqa      %%xmm3, (%2, %4)       \n\t"
 503         "subl            $4, %0             \n\t"
             /* lea does not modify the flags, so jnz still tests the subl result */
 504         "lea    (%2, %3, 4), %2             \n\t"
 505         "jnz             1b                 \n\t"
 506         : "+g"(h), "+r"(pixels),  "+r"(block)
 507         : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
 508         : "memory"
 509         );
 510 }
 511
 512 #define CLEAR_BLOCKS(name, n)                           \
 513 static void name(DCTELEM *blocks)                       \
 514 {                                                       \
 515     __asm__ volatile (                                  \
 516         "pxor %%mm7, %%mm7              \n\t"           \
 517         "mov     %1,        %%"REG_a"   \n\t"           \
 518         "1:                             \n\t"           \
 519         "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
 520         "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
 521         "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
 522         "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
 523         "add    $32, %%"REG_a"          \n\t"           \
 524         "js      1b                     \n\t"           \
 525         :: "r"(((uint8_t *)blocks) + 128 * n),          \
 526            "i"(-128 * n)                                \
 527         : "%"REG_a                                      \
 528         );                                              \
 529 }
 530 CLEAR_BLOCKS(clear_blocks_mmx, 6)
 531 CLEAR_BLOCKS(clear_block_mmx, 1)
 532
     /**
      * Zero-fill the 128 bytes starting at block with eight 16-byte SSE stores.
      *
      * movaps requires its memory operand to be 16-byte aligned, so block must
      * be aligned accordingly.
      *
      * @param block start of the 128-byte region to clear
      */
 533 static void clear_block_sse(DCTELEM *block)
 534 {
 535     __asm__ volatile (
 536         "xorps  %%xmm0, %%xmm0          \n"
 537         "movaps %%xmm0,    (%0)         \n"
 538         "movaps %%xmm0,  16(%0)         \n"
 539         "movaps %%xmm0,  32(%0)         \n"
 540         "movaps %%xmm0,  48(%0)         \n"
 541         "movaps %%xmm0,  64(%0)         \n"
 542         "movaps %%xmm0,  80(%0)         \n"
 543         "movaps %%xmm0,  96(%0)         \n"
 544         "movaps %%xmm0, 112(%0)         \n"
 545         :: "r"(block)
 546         : "memory"
 547     );
 548 }
 549
 550 static void clear_blocks_sse(DCTELEM *blocks)
 551 {
 552     __asm__ volatile (
 553         "xorps  %%xmm0, %%xmm0              \n"
 554         "mov        %1,         %%"REG_a"   \n"
 555         "1:                                 \n"
 556         "movaps %%xmm0,    (%0, %%"REG_a")  \n"
 557         "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
 558         "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
 559         "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
 560         "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
 561         "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
 562         "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
 563         "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
 564         "add      $128,         %%"REG_a"   \n"
 565         "js         1b                      \n"
 566         :: "r"(((uint8_t *)blocks) + 128 * 6),
 567            "i"(-128 * 6)
 568         : "%"REG_a
 569     );
 570 }
 571
 572 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
 573 {
 574     x86_reg i = 0;
 575     __asm__ volatile (
 576         "jmp          2f                \n\t"
 577         "1:                             \n\t"
 578         "movq   (%1, %0), %%mm0         \n\t"
 579         "movq   (%2, %0), %%mm1         \n\t"
 580         "paddb     %%mm0, %%mm1         \n\t"
 581         "movq      %%mm1, (%2, %0)      \n\t"
 582         "movq  8(%1, %0), %%mm0         \n\t"
 583         "movq  8(%2, %0), %%mm1         \n\t"
 584         "paddb     %%mm0, %%mm1         \n\t"
 585         "movq      %%mm1, 8(%2, %0)     \n\t"
 586         "add         $16, %0            \n\t"
 587         "2:                             \n\t"
 588         "cmp          %3, %0            \n\t"
 589         "js           1b                \n\t"
 590         : "+r"(i)
 591         : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
 592     );
 593     for ( ; i < w; i++)
 594         dst[i + 0] += src[i + 0];
 595 }
 596
 597 #if HAVE_7REGS
     /**
      * Median-predicted reconstruction using cmov:
      * for each position, dst = diff + median(left, top, left + top - topleft),
      * all arithmetic on bytes; the value just written becomes the next "left"
      * and the current top byte becomes the next "topleft".
      *
      * The loop is do-while shaped (the counter is tested after the first
      * byte has been processed), so w is assumed to be positive.
      *
      * The asm reads top/diff and stores into dst through pointers passed as
      * plain register/memory inputs, so a "memory" clobber is declared to make
      * those accesses visible to the compiler.
      *
      * @param dst      output bytes
      * @param top      bytes of the row above
      * @param diff     residual bytes to add to the prediction
      * @param w        number of bytes to process
      * @param left     in: initial left value (low 8 bits used); out: last output byte
      * @param left_top in: initial top-left value (low 8 bits used); out: last top byte
      */
 598 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
 599                                             const uint8_t *diff, int w,
 600                                             int *left, int *left_top)
 601 {
         /* the index runs from -w up to 0 against pointers to the row ends */
 602     x86_reg w2 = -w;
 603     x86_reg x;
 604     int l  = *left     & 0xff;
 605     int tl = *left_top & 0xff;
 606     int t;
         /* operands: %0 = l, %1 = tl, %2 = t, %3 = x, %4 = w2,
          *           %5 = dst + w, %6 = diff + w, %7 = top + w
          * x doubles as the top pointer (reloaded from %7) and as scratch for
          * the gradient l + t - tl. */
 607     __asm__ volatile (
 608         "mov          %7, %3            \n"
 609         "1:                             \n"
 610         "movzbl (%3, %4), %2            \n"
 611         "mov          %2, %k3           \n"
 612         "sub         %b1, %b3           \n"
 613         "add         %b0, %b3           \n"
 614         "mov          %2, %1            \n"
 615         "cmp          %0, %2            \n"
 616         "cmovg        %0, %2            \n"
 617         "cmovg        %1, %0            \n"
 618         "cmp         %k3, %0            \n"
 619         "cmovg       %k3, %0            \n"
 620         "mov          %7, %3            \n"
 621         "cmp          %2, %0            \n"
 622         "cmovl        %2, %0            \n"
 623         "add    (%6, %4), %b0           \n"
 624         "mov         %b0, (%5, %4)      \n"
 625         "inc          %4                \n"
 626         "jl           1b                \n"
 627         : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
 628         : "r"(dst + w), "r"(diff + w), "rm"(top + w)
             : "memory"
 629     );
 630     *left     = l;
 631     *left_top = tl;
 632 }
 633 #endif
 634
     /**
      * Transpose a 4x4 block of bytes.
      *
      * Four 4-byte rows are read from src (rows src_stride bytes apart),
      * interleaved first byte-wise and then word-wise so that each 32-bit lane
      * holds one column of the input, and written as four 4-byte rows to dst
      * (rows dst_stride bytes apart).
      *
      * @param dst        destination of the first output row
      * @param src        first input row
      * @param dst_stride byte distance between output rows
      * @param src_stride byte distance between input rows
      */
 635 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
         /* operands: %0 = dst, %1 = src, %2 = dst_stride, %3 = src_stride.
          * Both pointers are advanced by one row inside the asm so that rows
          * 1..3 can be addressed as base, base + stride, base + 2 * stride. */
 636     __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
 637         "movd  (%1), %%mm0              \n\t"
 638         "add   %3, %1                   \n\t"
 639         "movd  (%1), %%mm1              \n\t"
 640         "movd  (%1,%3,1), %%mm2         \n\t"
 641         "movd  (%1,%3,2), %%mm3         \n\t"
 642         "punpcklbw %%mm1, %%mm0         \n\t"
 643         "punpcklbw %%mm3, %%mm2         \n\t"
 644         "movq %%mm0, %%mm1              \n\t"
 645         "punpcklwd %%mm2, %%mm0         \n\t"
 646         "punpckhwd %%mm2, %%mm1         \n\t"
 647         "movd  %%mm0, (%0)              \n\t"
 648         "add   %2, %0                   \n\t"
 649         "punpckhdq %%mm0, %%mm0         \n\t"
 650         "movd  %%mm0, (%0)              \n\t"
 651         "movd  %%mm1, (%0,%2,1)         \n\t"
 652         "punpckhdq %%mm1, %%mm1         \n\t"
 653         "movd  %%mm1, (%0,%2,2)         \n\t"
 654
 655         :  "+&r" (dst),
 656            "+&r" (src)
 657         :  "r" (dst_stride),
 658            "r" (src_stride)
 659         :  "memory"
 660     );
 661 }
 662
 663 #define H263_LOOP_FILTER                        \
         /* Asm body shared by the H.263 loop filters below.               */ \
         /* Operands (as supplied by the users in this file):              */ \
         /*   %0..%3  four 8-byte memory operands, called A, B, C, D here, */ \
         /*           lying across the edge being filtered                 */ \
         /*   %4      2 * filter strength                                  */ \
         /*   %5      byte mask used before the divide by 4 (ff_pb_FC)     */ \
         /* Results are left in registers and NOT stored by the macro:     */ \
         /*   mm5 = new A, mm3 = new B, mm4 = new C, mm6 = new D.          */ \
         /* All of mm0..mm7 are overwritten.                               */ \
                                                     \
         /* mm7 = 0; mm0:mm1 = A - D as 16-bit words (low:high half) */ \
 664     "pxor      %%mm7, %%mm7             \n\t"   \
 665     "movq         %0, %%mm0             \n\t"   \
 666     "movq         %0, %%mm1             \n\t"   \
 667     "movq         %3, %%mm2             \n\t"   \
 668     "movq         %3, %%mm3             \n\t"   \
 669     "punpcklbw %%mm7, %%mm0             \n\t"   \
 670     "punpckhbw %%mm7, %%mm1             \n\t"   \
 671     "punpcklbw %%mm7, %%mm2             \n\t"   \
 672     "punpckhbw %%mm7, %%mm3             \n\t"   \
 673     "psubw     %%mm2, %%mm0             \n\t"   \
 674     "psubw     %%mm3, %%mm1             \n\t"   \
         /* mm4:mm5 = d = 4 * (C - B) + (A - D) as 16-bit words */ \
 675     "movq         %1, %%mm2             \n\t"   \
 676     "movq         %1, %%mm3             \n\t"   \
 677     "movq         %2, %%mm4             \n\t"   \
 678     "movq         %2, %%mm5             \n\t"   \
 679     "punpcklbw %%mm7, %%mm2             \n\t"   \
 680     "punpckhbw %%mm7, %%mm3             \n\t"   \
 681     "punpcklbw %%mm7, %%mm4             \n\t"   \
 682     "punpckhbw %%mm7, %%mm5             \n\t"   \
 683     "psubw     %%mm2, %%mm4             \n\t"   \
 684     "psubw     %%mm3, %%mm5             \n\t"   \
 685     "psllw        $2, %%mm4             \n\t"   \
 686     "psllw        $2, %%mm5             \n\t"   \
 687     "paddw     %%mm0, %%mm4             \n\t"   \
 688     "paddw     %%mm1, %%mm5             \n\t"   \
         /* mm6:mm7 = sign masks of d (all ones where d < 0);      */ \
         /* (x ^ mask) - mask gives |d|, then t = |d| >> 3         */ \
 689     "pxor      %%mm6, %%mm6             \n\t"   \
 690     "pcmpgtw   %%mm4, %%mm6             \n\t"   \
 691     "pcmpgtw   %%mm5, %%mm7             \n\t"   \
 692     "pxor      %%mm6, %%mm4             \n\t"   \
 693     "pxor      %%mm7, %%mm5             \n\t"   \
 694     "psubw     %%mm6, %%mm4             \n\t"   \
 695     "psubw     %%mm7, %%mm5             \n\t"   \
 696     "psrlw        $3, %%mm4             \n\t"   \
 697     "psrlw        $3, %%mm5             \n\t"   \
         /* pack to bytes: mm4 = t (unsigned saturation),          */ \
         /* mm6 = per-byte sign mask of d; mm7 is cleared again    */ \
 698     "packuswb  %%mm5, %%mm4             \n\t"   \
 699     "packsswb  %%mm7, %%mm6             \n\t"   \
 700     "pxor      %%mm7, %%mm7             \n\t"   \
         /* broadcast the low byte of %4 into all 8 bytes of mm2   */ \
 701     "movd         %4, %%mm2             \n\t"   \
 702     "punpcklbw %%mm2, %%mm2             \n\t"   \
 703     "punpcklbw %%mm2, %%mm2             \n\t"   \
 704     "punpcklbw %%mm2, %%mm2             \n\t"   \
         /* mm2 = d1 = min(t, max(0, %4 - t)), built from          */ \
         /* unsigned saturating subtractions                       */ \
 705     "psubusb   %%mm4, %%mm2             \n\t"   \
 706     "movq      %%mm2, %%mm3             \n\t"   \
 707     "psubusb   %%mm4, %%mm3             \n\t"   \
 708     "psubb     %%mm3, %%mm2             \n\t"   \
         /* mm3 = new B, mm4 = new C: where d >= 0, B += d1 and    */ \
         /* C -= d1; where d < 0 the XOR with the sign mask swaps  */ \
         /* the roles, i.e. B -= d1 and C += d1.  Both are         */ \
         /* saturated to 0..255.                                   */ \
 709     "movq         %1, %%mm3             \n\t"   \
 710     "movq         %2, %%mm4             \n\t"   \
 711     "pxor      %%mm6, %%mm3             \n\t"   \
 712     "pxor      %%mm6, %%mm4             \n\t"   \
 713     "paddusb   %%mm2, %%mm3             \n\t"   \
 714     "psubusb   %%mm2, %%mm4             \n\t"   \
 715     "pxor      %%mm6, %%mm3             \n\t"   \
 716     "pxor      %%mm6, %%mm4             \n\t"   \
         /* mm2 = 2 * d1 (saturating)                              */ \
 717     "paddusb   %%mm2, %%mm2             \n\t"   \
         /* mm0 = A - D saturated to signed bytes, mm7 = its sign  */ \
         /* mask, then mm0 = |A - D|                               */ \
 718     "packsswb  %%mm1, %%mm0             \n\t"   \
 719     "pcmpgtb   %%mm0, %%mm7             \n\t"   \
 720     "pxor      %%mm7, %%mm0             \n\t"   \
 721     "psubb     %%mm7, %%mm0             \n\t"   \
         /* mm1 = min(|A - D|, 2 * d1)                             */ \
 722     "movq      %%mm0, %%mm1             \n\t"   \
 723     "psubusb   %%mm2, %%mm0             \n\t"   \
 724     "psubb     %%mm0, %%mm1             \n\t"   \
         /* per-byte divide by 4 using a word shift; the AND with  */ \
         /* %5 first drops the bits that would otherwise cross a   */ \
         /* byte boundary (presumably %5 is 0xFC in every byte, as */ \
         /* the name ff_pb_FC suggests -- confirm)                 */ \
 725     "pand         %5, %%mm1             \n\t"   \
 726     "psrlw        $2, %%mm1             \n\t"   \
         /* give the result the sign of (A - D): mm1 = d2          */ \
 727     "pxor      %%mm7, %%mm1             \n\t"   \
 728     "psubb     %%mm7, %%mm1             \n\t"   \
         /* mm5 = new A = A - d2, mm6 = new D = D + d2             */ \
 729     "movq         %0, %%mm5             \n\t"   \
 730     "movq         %3, %%mm6             \n\t"   \
 731     "psubb     %%mm1, %%mm5             \n\t"   \
 732     "paddb     %%mm1, %%mm6             \n\t"
 733
 734 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
 735 {
         /* Applies H263_LOOP_FILTER to 8 bytes from each of the four rows
          * src - 2 * stride, src - stride, src and src + stride, and writes
          * the filtered rows back in place.
          *
          * src    - points at the first of the 8 bytes in the third of the
          *          four rows
          * stride - distance in bytes between consecutive rows
          * qscale - index into ff_h263_loop_filter_strength[]
          *
          * Does nothing unless CONFIG_H263_DECODER or CONFIG_H263_ENCODER is
          * non-zero. */
 736     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
 737         const int strength = ff_h263_loop_filter_strength[qscale];
 738
 739         __asm__ volatile (
 740             H263_LOOP_FILTER
 741
                 /* the macro leaves the results in mm5/mm3/mm4/mm6 for
                  * operands %0/%1/%2/%3; store them back */
 742             "movq %%mm3, %1             \n\t"
 743             "movq %%mm4, %2             \n\t"
 744             "movq %%mm5, %0             \n\t"
 745             "movq %%mm6, %3             \n\t"
                 /* read-write memory operands: the four rows, 8 bytes each */
 746             : "+m"(*(uint64_t*)(src - 2 * stride)),
 747               "+m"(*(uint64_t*)(src - 1 * stride)),
 748               "+m"(*(uint64_t*)(src + 0 * stride)),
 749               "+m"(*(uint64_t*)(src + 1 * stride))
                 /* %4 = 2 * strength, %5 = byte mask used by the macro */
 750             : "g"(2 * strength), "m"(ff_pb_FC)
 751             );
 752     }
 753 }
 754
 755 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
 756 {
         /* Applies H263_LOOP_FILTER across the four byte columns
          * src - 2 .. src + 1 for 8 consecutive rows.
          *
          * The 8x4 block is transposed into an aligned temporary so that each
          * column becomes one 8-byte operand of the filter macro; the filtered
          * result is transposed back in registers and stored 4 bytes per row.
          *
          * src    - points at the third of the four columns, in the first row
          * stride - distance in bytes between consecutive rows
          * qscale - index into ff_h263_loop_filter_strength[]
          *
          * Does nothing unless CONFIG_H263_DECODER or CONFIG_H263_ENCODER is
          * non-zero. */
 757     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
 758         const int strength = ff_h263_loop_filter_strength[qscale];
 759         DECLARE_ALIGNED(8, uint64_t, temp)[4];
 760         uint8_t *btemp = (uint8_t*)temp;
 761
             /* step back to the first of the four columns */
 762         src -= 2;
 763
             /* rows 0..3 go to bytes 0..3 and rows 4..7 to bytes 4..7 of each
              * temp[] entry (destination stride is 8 bytes) */
 764         transpose4x4(btemp,     src,              8, stride);
 765         transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
 766         __asm__ volatile (
 767             H263_LOOP_FILTER // 5 3 4 6
 768
 769             : "+m"(temp[0]),
 770               "+m"(temp[1]),
 771               "+m"(temp[2]),
 772               "+m"(temp[3])
 773             : "g"(2 * strength), "m"(ff_pb_FC)
 774             );
 775
             /* This statement consumes mm5, mm3, mm4 and mm6 exactly as the
              * statement above left them (filtered columns 0..3); nothing that
              * touches the MMX registers may be placed between the two. */
 776         __asm__ volatile (
                 /* interleave the four columns back into rows of 4 bytes */
 777             "movq      %%mm5, %%mm1         \n\t"
 778             "movq      %%mm4, %%mm0         \n\t"
 779             "punpcklbw %%mm3, %%mm5         \n\t"
 780             "punpcklbw %%mm6, %%mm4         \n\t"
 781             "punpckhbw %%mm3, %%mm1         \n\t"
 782             "punpckhbw %%mm6, %%mm0         \n\t"
 783             "movq      %%mm5, %%mm3         \n\t"
 784             "movq      %%mm1, %%mm6         \n\t"
 785             "punpcklwd %%mm4, %%mm5         \n\t"
 786             "punpcklwd %%mm0, %%mm1         \n\t"
 787             "punpckhwd %%mm4, %%mm3         \n\t"
 788             "punpckhwd %%mm0, %%mm6         \n\t"
                 /* rows 0..3, addressed from %0 = src */
 789             "movd      %%mm5, (%0)          \n\t"
 790             "punpckhdq %%mm5, %%mm5         \n\t"
 791             "movd      %%mm5, (%0, %2)      \n\t"
 792             "movd      %%mm3, (%0, %2, 2)   \n\t"
 793             "punpckhdq %%mm3, %%mm3         \n\t"
 794             "movd      %%mm3, (%0, %3)      \n\t"
                 /* rows 4..7, addressed from %1 = src + 4 * stride */
 795             "movd      %%mm1, (%1)          \n\t"
 796             "punpckhdq %%mm1, %%mm1         \n\t"
 797             "movd      %%mm1, (%1, %2)      \n\t"
 798             "movd      %%mm6, (%1, %2, 2)   \n\t"
 799             "punpckhdq %%mm6, %%mm6         \n\t"
 800             "movd      %%mm6, (%1, %3)      \n\t"
 801             :: "r"(src),
 802                "r"(src + 4 * stride),
 803                "r"((x86_reg)stride),
 804                "r"((x86_reg)(3 * stride))
                 /* the stores above go through plain register operands, so
                  * the compiler has to be told that memory is modified */
                 : "memory"
 805             );
 806     }
 807 }
 808
 809 /* Draw the edges of width 'w' of an image of size width, height.
 810  * This MMX version handles w == 8 and w == 16; any other value takes the
      * 4-pixel path, which asserts w == 4.
      *
      * buf    - first pixel of the first image row
      * wrap   - distance in bytes between consecutive rows
      * width  - number of image bytes per row
      * height - number of image rows
      * w      - number of bytes to replicate left and right of every row
      * h      - number of rows to replicate above / below the image; rows are
      *          written four at a time, so the count is effectively rounded up
      *          to a multiple of 4
      * sides  - EDGE_TOP and/or EDGE_BOTTOM select the vertical edges; the
      *          left and right edges are always drawn
      */
 811 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
 812                            int w, int h, int sides)
 813 {
 814     uint8_t *ptr, *last_line;
 815     int i;
 816
 817     last_line = buf + (height - 1) * wrap;
 818     /* left and right */
         /* Each loop below walks ptr over the rows from buf up to
          * buf + wrap * height.  Per row, the first byte is broadcast into the
          * w bytes before the row and the last byte into the w bytes after it.
          * Every asm statement stores through ptr, hence the "memory"
          * clobbers. */
 819     ptr = buf;
 820     if (w == 8) {
 821         __asm__ volatile (
 822             "1:                             \n\t"
 823             "movd            (%0), %%mm0    \n\t"
 824             "punpcklbw      %%mm0, %%mm0    \n\t"
 825             "punpcklwd      %%mm0, %%mm0    \n\t"
 826             "punpckldq      %%mm0, %%mm0    \n\t"
 827             "movq           %%mm0, -8(%0)   \n\t"
 828             "movq      -8(%0, %2), %%mm1    \n\t"
 829             "punpckhbw      %%mm1, %%mm1    \n\t"
 830             "punpckhwd      %%mm1, %%mm1    \n\t"
 831             "punpckhdq      %%mm1, %%mm1    \n\t"
 832             "movq           %%mm1, (%0, %2) \n\t"
 833             "add               %1, %0       \n\t"
 834             "cmp               %3, %0       \n\t"
 835             "jb                1b           \n\t"
 836             : "+r"(ptr)
 837             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
                 : "memory"
 838             );
 839     } else if(w==16){
 840         __asm__ volatile (
 841             "1:                                 \n\t"
 842             "movd            (%0), %%mm0        \n\t"
 843             "punpcklbw      %%mm0, %%mm0        \n\t"
 844             "punpcklwd      %%mm0, %%mm0        \n\t"
 845             "punpckldq      %%mm0, %%mm0        \n\t"
 846             "movq           %%mm0, -8(%0)       \n\t"
 847             "movq           %%mm0, -16(%0)      \n\t"
 848             "movq      -8(%0, %2), %%mm1        \n\t"
 849             "punpckhbw      %%mm1, %%mm1        \n\t"
 850             "punpckhwd      %%mm1, %%mm1        \n\t"
 851             "punpckhdq      %%mm1, %%mm1        \n\t"
 852             "movq           %%mm1,  (%0, %2)    \n\t"
 853             "movq           %%mm1, 8(%0, %2)    \n\t"
 854             "add               %1, %0           \n\t"
 855             "cmp               %3, %0           \n\t"
 856             "jb                1b               \n\t"
 857             : "+r"(ptr)
 858             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
                 : "memory"
 859             );
 860     } else {
 861         av_assert1(w == 4);
 862         __asm__ volatile (
 863             "1:                             \n\t"
 864             "movd            (%0), %%mm0    \n\t"
 865             "punpcklbw      %%mm0, %%mm0    \n\t"
 866             "punpcklwd      %%mm0, %%mm0    \n\t"
 867             "movd           %%mm0, -4(%0)   \n\t"
 868             "movd      -4(%0, %2), %%mm1    \n\t"
 869             "punpcklbw      %%mm1, %%mm1    \n\t"
 870             "punpckhwd      %%mm1, %%mm1    \n\t"
 871             "punpckhdq      %%mm1, %%mm1    \n\t"
 872             "movd           %%mm1, (%0, %2) \n\t"
 873             "add               %1, %0       \n\t"
 874             "cmp               %3, %0       \n\t"
 875             "jb                1b           \n\t"
 876             : "+r"(ptr)
 877             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
                 : "memory"
 878             );
 879     }
 880
 881     /* top and bottom (and hopefully also the corners) */
         /* Both loops copy a whole padded row (starting w bytes left of the
          * image and covering width + 2 * w bytes, 8 bytes per step) into four
          * edge rows per iteration.  %1 is the offset from the destination
          * pointer to the source row, %2 / %3 are one and three row strides in
          * the direction away from the image. */
 882     if (sides & EDGE_TOP) {
 883         for (i = 0; i < h; i += 4) {
                 /* source is the first image row; destinations are rows
                  * -(i + 1) .. -(i + 4) */
 884             ptr = buf - (i + 1) * wrap - w;
 885             __asm__ volatile (
 886                 "1:                             \n\t"
 887                 "movq (%1, %0), %%mm0           \n\t"
 888                 "movq    %%mm0, (%0)            \n\t"
 889                 "movq    %%mm0, (%0, %2)        \n\t"
 890                 "movq    %%mm0, (%0, %2, 2)     \n\t"
 891                 "movq    %%mm0, (%0, %3)        \n\t"
 892                 "add        $8, %0              \n\t"
 893                 "cmp        %4, %0              \n\t"
 894                 "jb         1b                  \n\t"
 895                 : "+r"(ptr)
 896                 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
 897                   "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                     : "memory"
 898                 );
 899         }
 900     }
 901
 902     if (sides & EDGE_BOTTOM) {
 903         for (i = 0; i < h; i += 4) {
                 /* source is the last image row; destinations are the rows
                  * (i + 1) .. (i + 4) below it */
 904             ptr = last_line + (i + 1) * wrap - w;
 905             __asm__ volatile (
 906                 "1:                             \n\t"
 907                 "movq (%1, %0), %%mm0           \n\t"
 908                 "movq    %%mm0, (%0)            \n\t"
 909                 "movq    %%mm0, (%0, %2)        \n\t"
 910                 "movq    %%mm0, (%0, %2, 2)     \n\t"
 911                 "movq    %%mm0, (%0, %3)        \n\t"
 912                 "add        $8, %0              \n\t"
 913                 "cmp        %4, %0              \n\t"
 914                 "jb         1b                  \n\t"
 915                 : "+r"(ptr)
 916                 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
 917                   "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
 918                   "r"(ptr + width + 2 * w)
                     : "memory"
 919                 );
 920         }
 921     }
 922 }
 923
 924 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
 925                    in0, in1, in2, in7, out, OP)                           \
         /* Emits the asm for one output of the lowpass filter:           */   \
         /*   (20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5                 */   \
         /* with x1 = m3 + m4, x2 = m5 + in2, x3 = m6 + in1 and           */   \
         /* x4 = in0 + in7, all as packed 16-bit words.  The result is    */   \
         /* packed to bytes with unsigned saturation and handed to        */   \
         /* OP(%%mm5, out, %%mm7, d).                                     */   \
         /* Side effects: m3 ends up holding in7; %%mm4, %%mm5 and %%mm6  */   \
         /* are hard-coded scratch registers, so m3..m6 presumably have   */   \
         /* to name other registers -- confirm against the users.         */   \
         /* The pw_20 and pw_3 parameters are not referenced; the         */   \
         /* constants are taken from ff_pw_20 / ff_pw_3 via MANGLE().     */   \
 926     "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
 927     "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
 928     "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
 929     "movq               "#in7", "#m3"   \n\t" /* d */                     \
 930     "movq               "#in0", %%mm5   \n\t" /* D */                     \
 931     "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
 932     "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
 933     "movq               "#in1", %%mm5   \n\t" /* C */                     \
 934     "movq               "#in2", %%mm6   \n\t" /* B */                     \
 935     "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
 936     "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
 937     "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
 938     "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
 939     "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
 940     "paddw              "#rnd", %%mm4   \n\t" /* + rnd */                 \
 941     "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
 942     "psraw                  $5, %%mm5   \n\t"                             \
 943     "packuswb            %%mm5, %%mm5   \n\t"                             \
 944     OP(%%mm5, out, %%mm7, d)
 945
 946 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
 947 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
 948                                                   uint8_t *src,           \
 949                                                   int dstStride,          \
 950                                                   int srcStride,          \
 951                                                   int h)                  \
 952 {                                                                         \
 953     uint64_t temp;                                                        \
 954                                                                           \
 955     __asm__ volatile (                                                    \
 956         "pxor      %%mm7, %%mm7             \n\t"                         \
 957         "1:                                 \n\t"                         \
 958         "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
 959         "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
 960         "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
 961         "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
 962         "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
 963         "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
 964         "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
 965         "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
 966         "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
 967         "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
 968         "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
 969         "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
 970         "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
 971         "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
 972         "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
 973         "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
 974         "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
 975         "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
 976         "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
 977         "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
 978         "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
 979         "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
 980         "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
 981         "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
 982         "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
 983         "paddw        %6, %%mm6             \n\t"                         \
 984         "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
 985         "psraw        $5, %%mm0             \n\t"                         \
 986         "movq      %%mm0, %5                \n\t"                         \
 987         /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
 988                                                                           \
 989         "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
 990         "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
 991         "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
 992         "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
 993         "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
 994         "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
 995         "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
 996         "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
 997         "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
 998         "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
 999         "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
1000         "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
1001         "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
1002         "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
1003         "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
1004         "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
1005         "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
1006         "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
1007         "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
1008         "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
1009         "paddw        %6, %%mm1             \n\t"                         \
1010         "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
1011         "psraw        $5, %%mm3             \n\t"                         \
1012         "movq         %5, %%mm1             \n\t"                         \
1013         "packuswb  %%mm3, %%mm1             \n\t"                         \
1014         OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
1015         /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
1016                                                                           \
1017         "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
1018         "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
1019         "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
1020         "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
1021         "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
1022         "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
1023         "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
1024         "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
1025         "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
1026         "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
1027         "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
1028         "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
1029         "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
1030         "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
1031         "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
1032         "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
1033         "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
1034         "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
1035         "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
1036         "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
1037         "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
1038         "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
1039         "paddw        %6, %%mm0             \n\t"                         \
1040         "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
1041         "psraw        $5, %%mm0             \n\t"                         \
1042         /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
1043         /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
1044                                                                           \
1045         "paddw    %%mm5, %%mm3              \n\t" /* a */                 \
1046         "pshufw   $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */          \
1047         "paddw    %%mm4, %%mm6              \n\t" /* b */                 \
1048         "pshufw   $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */          \
1049         "pshufw   $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */          \
1050         "paddw    %%mm1, %%mm4              \n\t" /* c */                 \
1051         "paddw    %%mm2, %%mm5              \n\t" /* d */                 \
1052         "paddw    %%mm6, %%mm6              \n\t" /* 2b */                \
1053         "psubw    %%mm6, %%mm4              \n\t" /* c - 2b */            \
1054         "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
1055         "pmullw  "MANGLE(ff_pw_3)", %%mm4   \n\t" /* 3c - 6b */           \
1056         "psubw    %%mm5, %%mm3              \n\t" /* -6b + 3c - d */      \
1057         "paddw       %6, %%mm4              \n\t"                         \
1058         "paddw    %%mm3, %%mm4              \n\t" /* 20a - 6b + 3c - d */ \
1059         "psraw       $5, %%mm4              \n\t"                         \
1060         "packuswb %%mm4, %%mm0              \n\t"                         \
1061         OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
1062                                                                           \
1063         "add         %3, %0                 \n\t"                         \
1064         "add         %4, %1                 \n\t"                         \
1065         "decl        %2                     \n\t"                         \
1066         "jnz         1b                     \n\t"                         \
1067         : "+a"(src), "+c"(dst), "+D"(h)                                   \
1068         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
1069           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
1070         : "memory"                                                        \
1071         );                                                                \
1072 }                                                                         \
1073                                                                           \
1074 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,          \
1075                                                    uint8_t *src,          \
1076                                                    int dstStride,         \
1077                                                    int srcStride,         \
1078                                                    int h)                 \
1079 {                                                                         \
1080     int i;                                                                \
1081     int16_t temp[16];                                                     \
1082     /* quick HACK, XXX FIXME MUST be optimized */                         \
1083     for (i = 0; i < h; i++) {                                             \
1084         temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
1085                    (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);        \
1086         temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
1087                    (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);        \
1088         temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
1089                    (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);        \
1090         temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
1091                    (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);        \
1092         temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
1093                    (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);        \
1094         temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
1095                    (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);        \
1096         temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
1097                    (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);        \
1098         temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 +   \
1099                    (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);        \
1100         temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 +   \
1101                    (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);        \
1102         temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 +   \
1103                    (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);        \
1104         temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 +   \
1105                    (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);        \
1106         temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 +   \
1107                    (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);        \
1108         temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 +   \
1109                    (src[10] + src[15]) *  3 - (src[ 9] + src[16]);        \
1110         temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 +   \
1111                    (src[11] + src[16]) *  3 - (src[10] + src[16]);        \
1112         temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 +   \
1113                    (src[12] + src[16]) *  3 - (src[11] + src[15]);        \
1114         temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 +   \
1115                    (src[13] + src[15]) *  3 - (src[12] + src[14]);        \
1116         __asm__ volatile (                                                \
1117             "movq      (%0), %%mm0          \n\t"                         \
1118             "movq     8(%0), %%mm1          \n\t"                         \
1119             "paddw       %2, %%mm0          \n\t"                         \
1120             "paddw       %2, %%mm1          \n\t"                         \
1121             "psraw       $5, %%mm0          \n\t"                         \
1122             "psraw       $5, %%mm1          \n\t"                         \
1123             "packuswb %%mm1, %%mm0          \n\t"                         \
1124             OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
1125             "movq    16(%0), %%mm0          \n\t"                         \
1126             "movq    24(%0), %%mm1          \n\t"                         \
1127             "paddw       %2, %%mm0          \n\t"                         \
1128             "paddw       %2, %%mm1          \n\t"                         \
1129             "psraw       $5, %%mm0          \n\t"                         \
1130             "psraw       $5, %%mm1          \n\t"                         \
1131             "packuswb %%mm1, %%mm0          \n\t"                         \
1132             OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                              \
1133             :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
1134             : "memory"                                                    \
1135             );                                                            \
1136         dst += dstStride;                                                 \
1137         src += srcStride;                                                 \
1138     }                                                                     \
1139 }                                                                         \
1140                                                                           \
1141 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
1142                                                  uint8_t *src,            \
1143                                                  int dstStride,           \
1144                                                  int srcStride,           \
1145                                                  int h)                   \
1146 {                                                                         \
1147     __asm__ volatile (                                                    \
1148         "pxor      %%mm7, %%mm7             \n\t"                         \
1149         "1:                                 \n\t"                         \
1150         "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
1151         "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
1152         "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
1153         "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
1154         "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
1155         "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
1156         "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
1157         "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
1158         "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
1159         "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
1160         "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
1161         "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
1162         "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
1163         "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
1164         "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
1165         "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
1166         "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
1167         "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
1168         "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
1169         "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
1170         "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
1171         "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
1172         "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
1173         "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
1174         "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
1175         "paddw        %5, %%mm6             \n\t"                         \
1176         "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
1177         "psraw        $5, %%mm0             \n\t"                         \
1178         /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
1179                                                                           \
1180         "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
1181         "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
1182         "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
1183         "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
1184         "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
1185         "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
1186         "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
1187         "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
1188         "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
1189         "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
1190         "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
1191         "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
1192         "pmullw  "MANGLE(ff_pw_3)", %%mm3   \n\t" /* 3c - 6b */           \
1193         "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
1194         "paddw        %5, %%mm1             \n\t"                         \
1195         "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
1196         "psraw        $5, %%mm3             \n\t"                         \
1197         "packuswb  %%mm3, %%mm0             \n\t"                         \
1198         OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
1199                                                                           \
1200         "add          %3, %0                \n\t"                         \
1201         "add          %4, %1                \n\t"                         \
1202         "decl         %2                    \n\t"                         \
1203         "jnz          1b                    \n\t"                         \
1204         : "+a"(src), "+c"(dst), "+d"(h)                                   \
1205         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
1206           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
1207         : "memory"                                                        \
1208         );                                                                \
1209 }                                                                         \
1210                                                                           \
1211 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,           \
1212                                                   uint8_t *src,           \
1213                                                   int dstStride,          \
1214                                                   int srcStride,          \
1215                                                   int h)                  \
1216 { /* 8-wide horizontal qpel lowpass; sums in C, round/pack in asm */      \
1217     int i;                                                                \
1218     int16_t temp[8]; /* unrounded filter sums for one row */             \
1219     /* quick HACK, XXX FIXME MUST be optimized */                         \
1220     for (i = 0; i < h; i++) { /* taps 20,-6,3,-1; src edges mirrored */   \
1221         temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +        \
1222                   (src[1] + src[3]) *  3 - (src[2] + src[4]);             \
1223         temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +        \
1224                   (src[0] + src[4]) *  3 - (src[1] + src[5]);             \
1225         temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +        \
1226                   (src[0] + src[5]) *  3 - (src[0] + src[6]);             \
1227         temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +        \
1228                   (src[1] + src[6]) *  3 - (src[0] + src[7]);             \
1229         temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +        \
1230                   (src[2] + src[7]) *  3 - (src[1] + src[8]);             \
1231         temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +        \
1232                   (src[3] + src[8]) *  3 - (src[2] + src[8]);             \
1233         temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +        \
1234                   (src[4] + src[8]) *  3 - (src[3] + src[7]);             \
1235         temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +        \
1236                   (src[5] + src[7]) *  3 - (src[4] + src[6]);             \
1237         __asm__ volatile ( /* (temp + ROUNDER) >> 5, pack, store */      \
1238             "movq      (%0), %%mm0      \n\t"                             \
1239             "movq     8(%0), %%mm1      \n\t"                             \
1240             "paddw       %2, %%mm0      \n\t"                             \
1241             "paddw       %2, %%mm1      \n\t"                             \
1242             "psraw       $5, %%mm0      \n\t"                             \
1243             "psraw       $5, %%mm1      \n\t"                             \
1244             "packuswb %%mm1, %%mm0      \n\t" /* saturate to 8 bytes */   \
1245             OP_3DNOW(%%mm0, (%1), %%mm1, q) /* presumably writes dst */   \
1246             :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
1247             : "memory"                                                    \
1248             );                                                            \
1249         dst += dstStride;                                                 \
1250         src += srcStride;                                                 \
1251     }                                                                     \
1252 }
1253
1254 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
1255 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
1256                                                      uint8_t *src,      \
1257                                                      int dstStride,     \
1258                                                      int srcStride)     \
1259 { /* 16-wide vertical qpel lowpass over 17 source rows */               \
1260     uint64_t temp[17 * 4]; /* 4 column groups x 17 rows of 4 words */   \
1261     uint64_t *temp_ptr = temp;                                          \
1262     int count = 17; /* rows to unpack */                                \
1263                                                                         \
1264     /* FIXME unroll */                                                  \
1265     __asm__ volatile (                                                  \
1266         "pxor      %%mm7, %%mm7             \n\t"                       \
1267         "1:                                 \n\t"                       \
1268         "movq       (%0), %%mm0             \n\t"                       \
1269         "movq       (%0), %%mm1             \n\t"                       \
1270         "movq      8(%0), %%mm2             \n\t"                       \
1271         "movq      8(%0), %%mm3             \n\t"                       \
1272         "punpcklbw %%mm7, %%mm0             \n\t" /* bytes -> words */  \
1273         "punpckhbw %%mm7, %%mm1             \n\t"                       \
1274         "punpcklbw %%mm7, %%mm2             \n\t"                       \
1275         "punpckhbw %%mm7, %%mm3             \n\t"                       \
1276         "movq      %%mm0, (%1)              \n\t" /* cols 0-3 */        \
1277         "movq      %%mm1, 17 * 8(%1)        \n\t" /* cols 4-7 */        \
1278         "movq      %%mm2, 2 * 17 * 8(%1)    \n\t" /* cols 8-11 */       \
1279         "movq      %%mm3, 3 * 17 * 8(%1)    \n\t" /* cols 12-15 */      \
1280         "add          $8, %1                \n\t" /* next temp row */   \
1281         "add          %3, %0                \n\t" /* next src row */    \
1282         "decl         %2                    \n\t"                       \
1283         "jnz          1b                    \n\t"                       \
1284         : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
1285         : "r"((x86_reg)srcStride)                                       \
1286         : "memory"                                                      \
1287         );                                                              \
1288                                                                         \
1289     temp_ptr = temp;                                                    \
1290     count    = 4; /* one pass per 4-pixel column group */               \
1291                                                                         \
1292     /* FIXME reorder for speed */                                       \
1293     __asm__ volatile (                                                  \
1294         /* "pxor  %%mm7, %%mm7            \n\t" */                      \
1295         "1:                             \n\t"                           \
1296         "movq    (%0), %%mm0            \n\t"                           \
1297         "movq   8(%0), %%mm1            \n\t"                           \
1298         "movq  16(%0), %%mm2            \n\t"                           \
1299         "movq  24(%0), %%mm3            \n\t"                           \
1300         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1),     OP) \
1301         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
1302         "add       %4, %1               \n\t" /* dst += 2 rows */       \
1303         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1),     OP) \
1304                                                                         \
1305         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
1306         "add       %4, %1               \n\t"                           \
1307         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1),     OP) \
1308         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
1309         "add       %4, %1               \n\t"                           \
1310         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1),     OP) \
1311         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
1312         "add       %4, %1               \n\t"                           \
1313         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1),     OP) \
1314         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
1315         "add       %4, %1               \n\t"                           \
1316         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1),     OP) \
1317         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
1318         "add       %4, %1               \n\t"                           \
1319         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1),     OP) \
1320                                                                         \
1321         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
1322         "add       %4, %1               \n\t"                           \
1323         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1),     OP) \
1324         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1325                                                                         \
1326         "add     $136, %0               \n\t" /* next group: 17 * 8 */  \
1327         "add       %6, %1               \n\t" /* back 14 rows, +4 px */ \
1328         "decl      %2                   \n\t"                           \
1329         "jnz       1b                   \n\t"                           \
1330                                                                         \
1331         : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
1332         : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
1333           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
1334           "g"(4 - 14 * (x86_reg)dstStride)                              \
1335         : "memory"                                                      \
1336         );                                                              \
1337 }                                                                       \
1338                                                                         \
1339 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
1340                                                     uint8_t *src,       \
1341                                                     int dstStride,      \
1342                                                     int srcStride)      \
1343 { /* 8-wide vertical qpel lowpass over 9 source rows */                 \
1344     uint64_t temp[9 * 2]; /* 2 column groups x 9 rows of 4 words */     \
1345     uint64_t *temp_ptr = temp;                                          \
1346     int count = 9; /* rows to unpack */                                 \
1347                                                                         \
1348     /* FIXME unroll */                                                  \
1349     __asm__ volatile (                                                  \
1350         "pxor      %%mm7, %%mm7         \n\t"                           \
1351         "1:                             \n\t"                           \
1352         "movq       (%0), %%mm0         \n\t"                           \
1353         "movq       (%0), %%mm1         \n\t"                           \
1354         "punpcklbw %%mm7, %%mm0         \n\t" /* bytes -> words */      \
1355         "punpckhbw %%mm7, %%mm1         \n\t"                           \
1356         "movq      %%mm0, (%1)          \n\t" /* cols 0-3 */            \
1357         "movq      %%mm1, 9*8(%1)       \n\t" /* cols 4-7 */            \
1358         "add          $8, %1            \n\t" /* next temp row */       \
1359         "add          %3, %0            \n\t" /* next src row */        \
1360         "decl         %2                \n\t"                           \
1361         "jnz          1b                \n\t"                           \
1362         : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
1363         : "r"((x86_reg)srcStride)                                       \
1364         : "memory"                                                      \
1365         );                                                              \
1366                                                                         \
1367     temp_ptr = temp;                                                    \
1368     count    = 2; /* one pass per 4-pixel column group */               \
1369                                                                         \
1370     /* FIXME reorder for speed */                                       \
1371     __asm__ volatile (                                                  \
1372         /* "pxor  %%mm7, %%mm7            \n\t" */                      \
1373         "1:                             \n\t"                           \
1374         "movq    (%0), %%mm0            \n\t"                           \
1375         "movq   8(%0), %%mm1            \n\t"                           \
1376         "movq  16(%0), %%mm2            \n\t"                           \
1377         "movq  24(%0), %%mm3            \n\t"                           \
1378         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
1379         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
1380         "add       %4, %1               \n\t" /* dst += 2 rows */       \
1381         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
1382                                                                         \
1383         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1384         "add       %4, %1               \n\t"                           \
1385         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
1386                                                                         \
1387         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1388         "add       %4, %1               \n\t"                           \
1389         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
1390         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1391                                                                         \
1392         "add      $72, %0               \n\t" /* next group: 9 * 8 */   \
1393         "add       %6, %1               \n\t" /* back 6 rows, +4 px */  \
1394         "decl      %2                   \n\t"                           \
1395         "jnz       1b                   \n\t"                           \
1396                                                                         \
1397         : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
1398         : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
1399           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
1400           "g"(4 - 6 * (x86_reg)dstStride)                               \
1401         : "memory"                                                      \
1402         );                                                              \
1403 }                                                                       \
1404                                                                         \
1405 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
1406                                           int stride)                   \
1407 { /* mc00: plain pixels8 call, no lowpass filtering */                  \
1408     OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
1409 }                                                                       \
1410                                                                         \
1411 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
1412                                          int stride)                    \
1413 { /* mc10: pixels8_l2 of src and its horizontal lowpass */              \
1414     uint64_t temp[8]; /* 64 bytes: 8 rows, stride 8 */                  \
1415     uint8_t * const half = (uint8_t*)temp;                              \
1416     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
1417                                                 stride, 8);             \
1418     OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
1419 }                                                                       \
1420                                                                         \
1421 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
1422                                          int stride)                    \
1423 { /* mc20: horizontal lowpass written straight to dst */                \
1424     OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
1425                                             stride, 8);                 \
1426 }                                                                       \
1427                                                                         \
1428 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
1429                                          int stride)                    \
1430 { /* mc30: pixels8_l2 of src + 1 and the horizontal lowpass */          \
1431     uint64_t temp[8]; /* 64 bytes: 8 rows, stride 8 */                  \
1432     uint8_t * const half = (uint8_t*)temp;                              \
1433     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
1434                                                 stride, 8);             \
1435     OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
1436                                  stride, 8);                            \
1437 }                                                                       \
1438                                                                         \
1439 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
1440                                          int stride)                    \
1441 { /* mc01: pixels8_l2 of src and its vertical lowpass */                \
1442     uint64_t temp[8]; /* 64 bytes: 8 rows, stride 8 */                  \
1443     uint8_t * const half = (uint8_t*)temp;                              \
1444     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
1445     OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
1446 }                                                                       \
1447                                                                         \
1448 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
1449                                          int stride)                    \
1450 { /* mc02: vertical lowpass written straight to dst */                  \
1451     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
1452 }                                                                       \
1453                                                                         \
1454 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
1455                                          int stride)                    \
1456 { /* mc03: pixels8_l2 of src + stride and the vertical lowpass */       \
1457     uint64_t temp[8]; /* 64 bytes: 8 rows, stride 8 */                  \
1458     uint8_t * const half = (uint8_t*)temp;                              \
1459     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
1460     OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
1461                                  stride, 8);                            \
1462 }                                                                       \
1463                                                                         \
1464 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
1465                                          int stride)                    \
1466 { /* mc11: halfH = l2(src, H lowpass); dst = l2(halfH, V(halfH)) */     \
1467     uint64_t half[8 + 9]; /* 64-byte halfHV + 72-byte halfH */          \
1468     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1469     uint8_t * const halfHV = ((uint8_t*)half);                          \
1470     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1471                                                 stride, 9);             \
1472     put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1473     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1474     OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1475 }                                                                       \
1476                                                                         \
1477 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
1478                                          int stride)                    \
1479 { /* mc31: halfH = l2(src+1, H lowpass); dst = l2(halfH, V(halfH)) */   \
1480     uint64_t half[8 + 9]; /* 64-byte halfHV + 72-byte halfH */          \
1481     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1482     uint8_t * const halfHV = ((uint8_t*)half);                          \
1483     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1484                                                 stride, 9);             \
1485     put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1486                                      stride, 9);                        \
1487     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1488     OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1489 }                                                                       \
1490                                                                         \
1491 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
1492                                          int stride)                    \
1493 { /* mc13: halfH = l2(src, H lowpass); dst = l2(halfH+8, V(halfH)) */   \
1494     uint64_t half[8 + 9]; /* 64-byte halfHV + 72-byte halfH */          \
1495     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1496     uint8_t * const halfHV = ((uint8_t*)half);                          \
1497     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1498                                                 stride, 9);             \
1499     put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1500     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1501     OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1502 }                                                                       \
1503                                                                         \
1504 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
1505                                          int stride)                    \
1506 { /* mc33: halfH = l2(src+1, H lowpass); dst = l2(halfH+8, V(halfH)) */ \
1507     uint64_t half[8 + 9]; /* 64-byte halfHV + 72-byte halfH */          \
1508     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1509     uint8_t * const halfHV = ((uint8_t*)half);                          \
1510     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1511                                                 stride, 9);             \
1512     put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1513                                      stride, 9);                        \
1514     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1515     OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1516 }                                                                       \
1517                                                                         \
1518 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
1519                                          int stride)                    \
1520 { /* mc21: halfH = H lowpass (9 rows); dst = l2(halfH, V(halfH)) */     \
1521     uint64_t half[8 + 9]; /* 64-byte halfHV + 72-byte halfH */          \
1522     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1523     uint8_t * const halfHV = ((uint8_t*)half);                          \
1524     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1525                                                 stride, 9);             \
1526     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1527     OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1528 }                                                                       \
1529                                                                         \
1530 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
1531                                          int stride)                    \
1532 { /* mc23: halfH = H lowpass (9 rows); dst = l2(halfH+8, V(halfH)) */   \
1533     uint64_t half[8 + 9]; /* 64-byte halfHV + 72-byte halfH */          \
1534     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1535     uint8_t * const halfHV = ((uint8_t*)half);                          \
1536     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1537                                                 stride, 9);             \
1538     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1539     OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1540 }                                                                       \
1541                                                                         \
1542 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
1543                                          int stride)                    \
1544 { /* mc12: halfH = l2(src, H lowpass); dst = V lowpass of halfH */      \
1545     uint64_t half[8 + 9]; /* 9 rows x 8 bytes used (72 of 136) */       \
1546     uint8_t * const halfH = ((uint8_t*)half);                           \
1547     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1548                                                 stride, 9);             \
1549     put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1550     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1551 }                                                                       \
1552                                                                         \
1553 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
1554                                          int stride)                    \
1555 { /* mc32: halfH = l2(src+1, H lowpass); dst = V lowpass of halfH */    \
1556     uint64_t half[8 + 9]; /* 9 rows x 8 bytes used (72 of 136) */       \
1557     uint8_t * const halfH = ((uint8_t*)half);                           \
1558     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1559                                                 stride, 9);             \
1560     put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1561                                      stride, 9);                        \
1562     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1563 }                                                                       \
1564                                                                         \
1565 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
1566                                          int stride)                    \
1567 { /* mc22: H lowpass (9 rows) into halfH, then V lowpass to dst */      \
1568     uint64_t half[9]; /* 9 rows x 8 bytes */                            \
1569     uint8_t * const halfH = ((uint8_t*)half);                           \
1570     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1571                                                 stride, 9);             \
1572     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1573 }                                                                       \
1574                                                                         \
1575 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
1576                                            int stride)                  \
1577 { /* mc00: plain pixels16 call, no lowpass filtering */                 \
1578     OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
1579 }                                                                       \
1580                                                                         \
1581 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
1582                                           int stride)                   \
1583 {                                                                       \
1584     uint64_t temp[32];                                                  \
1585     uint8_t * const half = (uint8_t*)temp;                              \
1586     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
1587                                                  stride, 16);           \
1588     OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
1589 }                                                                       \
1590                                                                         \
1591 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
1592                                           int stride)                   \
1593 {                                                                       \
1594     OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
1595                                              stride, stride, 16);       \
1596 }                                                                       \
1597                                                                         \
1598 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
1599                                           int stride)                   \
1600 {                                                                       \
1601     uint64_t temp[32];                                                  \
1602     uint8_t * const half = (uint8_t*)temp;                              \
1603     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
1604                                                  stride, 16);           \
1605     OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
1606                                   stride, stride, 16);                  \
1607 }                                                                       \
1608                                                                         \
1609 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
1610                                           int stride)                   \
1611 {                                                                       \
1612     uint64_t temp[32];                                                  \
1613     uint8_t * const half = (uint8_t*)temp;                              \
1614     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
1615                                                  stride);               \
1616     OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
1617 }                                                                       \
1618                                                                         \
1619 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
1620                                           int stride)                   \
1621 {                                                                       \
1622     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1623 }                                                                       \
1624                                                                         \
1625 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
1626                                           int stride)                   \
1627 {                                                                       \
1628     uint64_t temp[32];                                                  \
1629     uint8_t * const half = (uint8_t*)temp;                              \
1630     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
1631                                                  stride);               \
1632     OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,                \
1633                                   stride, stride, 16);                  \
1634 }                                                                       \
1635                                                                         \
1636 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
1637                                           int stride)                   \
1638 {                                                                       \
1639     uint64_t half[16 * 2 + 17 * 2];                                     \
1640     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1641     uint8_t * const halfHV = ((uint8_t*)half);                          \
1642     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1643                                                  stride, 17);           \
1644     put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1645                                       stride, 17);                      \
1646     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1647                                                  16, 16);               \
1648     OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1649 }                                                                       \
1650                                                                         \
1651 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
1652                                           int stride)                   \
1653 {                                                                       \
1654     uint64_t half[16 * 2 + 17 * 2];                                     \
1655     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1656     uint8_t * const halfHV = ((uint8_t*)half);                          \
1657     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1658                                                  stride, 17);           \
1659     put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1660                                       stride, 17);                      \
1661     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1662                                                  16, 16);               \
1663     OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1664 }                                                                       \
1665                                                                         \
1666 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
1667                                           int stride)                   \
1668 {                                                                       \
1669     uint64_t half[16 * 2 + 17 * 2];                                     \
1670     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1671     uint8_t * const halfHV = ((uint8_t*)half);                          \
1672     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1673                                                  stride, 17);           \
1674     put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1675                                       stride, 17);                      \
1676     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1677                                                  16, 16);               \
1678     OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1679                                   16, 16);                              \
1680 }                                                                       \
1681                                                                         \
1682 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
1683                                           int stride)                   \
1684 {                                                                       \
1685     uint64_t half[16 * 2 + 17 * 2];                                     \
1686     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1687     uint8_t * const halfHV = ((uint8_t*)half);                          \
1688     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1689                                                  stride, 17);           \
1690     put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1691                                       stride, 17);                      \
1692     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1693                                                  16, 16);               \
1694     OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1695                                   16, 16);                              \
1696 }                                                                       \
1697                                                                         \
1698 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
1699                                           int stride)                   \
1700 {                                                                       \
1701     uint64_t half[16 * 2 + 17 * 2];                                     \
1702     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1703     uint8_t * const halfHV = ((uint8_t*)half);                          \
1704     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1705                                                  stride, 17);           \
1706     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1707                                                  16, 16);               \
1708     OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1709 }                                                                       \
1710                                                                         \
1711 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
1712                                           int stride)                   \
1713 {                                                                       \
1714     uint64_t half[16 * 2 + 17 * 2];                                     \
1715     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1716     uint8_t * const halfHV = ((uint8_t*)half);                          \
1717     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1718                                                  stride, 17);           \
1719     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1720                                                  16, 16);               \
1721     OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1722                                   16, 16);                              \
1723 }                                                                       \
1724                                                                         \
1725 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
1726                                           int stride)                   \
1727 {                                                                       \
1728     uint64_t half[17 * 2];                                              \
1729     uint8_t * const halfH = ((uint8_t*)half);                           \
1730     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1731                                                  stride, 17);           \
1732     put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1733                                       stride, 17);                      \
1734     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1735 }                                                                       \
1736                                                                         \
1737 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
1738                                           int stride)                   \
1739 {                                                                       \
1740     uint64_t half[17 * 2];                                              \
1741     uint8_t * const halfH = ((uint8_t*)half);                           \
1742     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1743                                                  stride, 17);           \
1744     put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1745                                       stride, 17);                      \
1746     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1747 }                                                                       \
1748                                                                         \
1749 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
1750                                           int stride)                   \
1751 {                                                                       \
1752     uint64_t half[17 * 2];                                              \
1753     uint8_t * const halfH = ((uint8_t*)half);                           \
1754     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1755                                                  stride, 17);           \
1756     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1757 }
1758
/* Store step for the "put" variants: expands to a single
 * "mov<size> a, b" instruction string. The temp argument is accepted only so
 * that the macro has the same shape as the AVG_*_OP macros; it is unused. */
1759 #define PUT_OP(a, b, temp, size)                \
1760     "mov"#size"        "#a", "#b"       \n\t"
1761
/* Store step for the "avg" variants, 3DNow! flavour: loads b into temp,
 * averages it into a with pavgusb, then writes a back to b.
 * Expands to three instruction strings. */
1762 #define AVG_3DNOW_OP(a, b, temp, size)          \
1763     "mov"#size"        "#b", "#temp"    \n\t"   \
1764     "pavgusb        "#temp", "#a"       \n\t"   \
1765     "mov"#size"        "#a", "#b"       \n\t"
1766
/* Store step for the "avg" variants, MMX2 flavour: same sequence as
 * AVG_3DNOW_OP but the averaging instruction is pavgb instead of pavgusb. */
1767 #define AVG_MMX2_OP(a, b, temp, size)           \
1768     "mov"#size"        "#b", "#temp"    \n\t"   \
1769     "pavgb          "#temp", "#a"       \n\t"   \
1770     "mov"#size"        "#a", "#b"       \n\t"
1771
/* Instantiate the qpel code for the put_, avg_ and put_no_rnd_ families.
 * The put_/avg_ families pass ff_pw_16 and the put_no_rnd_ family passes
 * ff_pw_15 as the second argument. QPEL_OP is expanded once per family for
 * the 3dnow back end and once for the mmx2 back end, with the store macro
 * (PUT_OP / AVG_3DNOW_OP / AVG_MMX2_OP) chosen accordingly.
 * NOTE(review): QPEL_BASE is defined earlier in the file; its two trailing
 * arguments look like the MMX2 and 3DNow! store macros -- confirm there. */
1772 QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,       PUT_OP)
1773 QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP,  AVG_3DNOW_OP)
1774 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,       PUT_OP)
1775 QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
1776 QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
1777 QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
1778 QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
1779 QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
1780 QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
1781
1782 /***********************************/
1783 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
1784
/* Defines OPNAME##2tap_qpel##SIZE##_mc##XY##_##MMX(dst, src, stride) as a
 * plain forward to OPNAME##pixels##SIZE##HPEL(dst, src, stride, SIZE).
 * HPEL is the full suffix of the pixel routine to call (for example
 * _x2_mmx2), so the caller of this macro picks the back end. */
1785 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
1786 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1787                                                                  uint8_t *src, \
1788                                                                  int stride)   \
1789 {                                                                              \
1790     OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
1791 }
1792
/* Defines OPNAME##2tap_qpel##SIZE##_mc##XY##_##MMX(dst, src, stride) as a
 * forward to OPNAME##2tap_qpel##SIZE##_l3_##MMX(dst, src + S0, stride, SIZE,
 * S1, S2). S0, S1 and S2 are pasted as expressions into the generated
 * function body, so they may refer to its `stride` parameter. */
1793 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
1794 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1795                                                                  uint8_t *src, \
1796                                                                  int stride)   \
1797 {                                                                              \
1798     OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
1799                                                S1, S2);                        \
1800 }
1801
/* Generates the set of 2-tap qpel entry points for one (OPNAME, SIZE, MMX)
 * combination:
 *   - mc20 / mc02 forward to the _x2_ / _y2_ pixel routines of the MMX back
 *     end; mc22 always forwards to the _xy2_mmx routine, whatever MMX is;
 *   - mc00 is a function-pointer alias of the regular qpel mc00;
 *   - mc21 and mc12 are function-pointer aliases of mc20 and mc02;
 *   - mc32 is the _y2_ routine applied at src + 1;
 *   - mc23 is the _x2_ routine applied at src + stride;
 *   - the remaining eight positions (10, 30, 01, 03, 11, 31, 13, 33) go
 *     through the _l3_ helper via QPEL_2TAP_L3 with the offsets listed below.
 * The last line of the definition ends with a continuation backslash, so the
 * line after it must stay empty. */
1802 #define QPEL_2TAP(OPNAME, SIZE, MMX)                                        \
1803 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                            \
1804 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                            \
1805 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                               \
1806 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =    \
1807     OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                \
1808 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =    \
1809     OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                           \
1810 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =    \
1811     OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                           \
1812 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,      \
1813                                                          uint8_t *src,      \
1814                                                          int stride)        \
1815 {                                                                           \
1816     OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);    \
1817 }                                                                           \
1818 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,      \
1819                                                          uint8_t *src,      \
1820                                                          int stride)        \
1821 {                                                                           \
1822     OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,              \
1823                                             stride, SIZE);                  \
1824 }                                                                           \
1825 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)                \
1826 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)                \
1827 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)                \
1828 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)                \
1829 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)                \
1830 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)                \
1831 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)                \
1832 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                \
1833
/* Emit the 2-tap qpel functions for put_/avg_, block sizes 16 and 8,
 * for both the mmx2 and the 3dnow back ends. */
1834 QPEL_2TAP(put_, 16, mmx2)
1835 QPEL_2TAP(avg_, 16, mmx2)
1836 QPEL_2TAP(put_,  8, mmx2)
1837 QPEL_2TAP(avg_,  8, mmx2)
1838 QPEL_2TAP(put_, 16, 3dnow)
1839 QPEL_2TAP(avg_, 16, 3dnow)
1840 QPEL_2TAP(put_,  8, 3dnow)
1841 QPEL_2TAP(avg_,  8, 3dnow)
1842
/**
 * RV40 qpel position mc33, 8-pixel block, "put" variant.
 *
 * Thin wrapper: hands the block to put_pixels8_xy2_mmx() with a height of 8.
 */
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}
/**
 * RV40 qpel position mc33, 16-pixel block, "put" variant.
 *
 * Thin wrapper: hands the block to put_pixels16_xy2_mmx() with a height of 16.
 */
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}
/**
 * RV40 qpel position mc33, 8-pixel block, "avg" variant.
 *
 * Thin wrapper: hands the block to avg_pixels8_xy2_mmx() with a height of 8.
 */
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}
/**
 * RV40 qpel position mc33, 16-pixel block, "avg" variant.
 *
 * Thin wrapper: hands the block to avg_pixels16_xy2_mmx() with a height of 16.
 */
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
1859
1860 #endif /* HAVE_INLINE_ASM */
1861
1862 #if HAVE_YASM
/* Signature shared by the edge-emulation core routines. emulated_edge_mc()
 * passes: the destination buffer, the (already offset) source pointer, the
 * line size, the first and one-past-last in-picture row, the block height,
 * the first and one-past-last in-picture column, and the block width.
 * The two cores are only declared here.
 * NOTE(review): presumably they live in the external assembly built when
 * HAVE_YASM is set -- confirm. */
1863 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
1864                                 x86_reg linesize, x86_reg start_y,
1865                                 x86_reg end_y, x86_reg block_h,
1866                                 x86_reg start_x, x86_reg end_x,
1867                                 x86_reg block_w);
1868 extern emu_edge_core_func ff_emu_edge_core_mmx;
1869 extern emu_edge_core_func ff_emu_edge_core_sse;
1870
1871 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
1872                                               int linesize,
1873                                               int block_w, int block_h,
1874                                               int src_x, int src_y,
1875                                               int w, int h,
1876                                               emu_edge_core_func *core_fn)
1877 {
1878     int start_y, start_x, end_y, end_x, src_y_add = 0;
1879
1880     if (src_y >= h) {
1881         src_y_add = h - 1 - src_y;
1882         src_y     = h - 1;
1883     } else if (src_y <= -block_h) {
1884         src_y_add = 1 - block_h - src_y;
1885         src_y     = 1 - block_h;
1886     }
1887     if (src_x >= w) {
1888         src   += w - 1 - src_x;
1889         src_x  = w - 1;
1890     } else if (src_x <= -block_w) {
1891         src   += 1 - block_w - src_x;
1892         src_x  = 1 - block_w;
1893     }
1894
1895     start_y = FFMAX(0, -src_y);
1896     start_x = FFMAX(0, -src_x);
1897     end_y   = FFMIN(block_h, h-src_y);
1898     end_x   = FFMIN(block_w, w-src_x);
1899     assert(start_x < end_x && block_w > 0);
1900     assert(start_y < end_y && block_h > 0);
1901
1902     // fill in the to-be-copied part plus all above/below
1903     src += (src_y_add + start_y) * linesize + start_x;
1904     buf += start_x;
1905     core_fn(buf, src, linesize, start_y, end_y,
1906             block_h, start_x, end_x, block_w);
1907 }
1908
1909 #if ARCH_X86_32
1910 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1911                                              int linesize,
1912                                              int block_w, int block_h,
1913                                              int src_x, int src_y, int w, int h)
1914 {
1915     emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1916                      w, h, &ff_emu_edge_core_mmx);
1917 }
1918 #endif
1919
1920 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1921                                              int linesize,
1922                                              int block_w, int block_h,
1923                                              int src_x, int src_y, int w, int h)
1924 {
1925     emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1926                      w, h, &ff_emu_edge_core_sse);
1927 }
1928 #endif /* HAVE_YASM */
1929
1930 #if HAVE_INLINE_ASM
1931
/* Signature of an edge-emulation routine as consumed by gmc(): destination,
 * source, line size, block width/height, block position (src_x, src_y) and
 * picture width/height. */
1932 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1933                                    int linesize, int block_w, int block_h,
1934                                    int src_x, int src_y, int w, int h);
1935
1936 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1937                                  int stride, int h, int ox, int oy,
1938                                  int dxx, int dxy, int dyx, int dyy,
1939                                  int shift, int r, int width, int height,
1940                                  emulated_edge_mc_func *emu_edge_fn)
1941 {
1942     const int w    = 8;
1943     const int ix   = ox  >> (16 + shift);
1944     const int iy   = oy  >> (16 + shift);
1945     const int oxs  = ox  >> 4;
1946     const int oys  = oy  >> 4;
1947     const int dxxs = dxx >> 4;
1948     const int dxys = dxy >> 4;
1949     const int dyxs = dyx >> 4;
1950     const int dyys = dyy >> 4;
1951     const uint16_t r4[4]   = { r, r, r, r };
1952     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1953     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1954     const uint64_t shift2 = 2 * shift;
1955     uint8_t edge_buf[(h + 1) * stride];
1956     int x, y;
1957
1958     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1959     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1960     const int dxh = dxy * (h - 1);
1961     const int dyw = dyx * (w - 1);
1962     if ( // non-constant fullpel offset (3% of blocks)
1963         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1964          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1965         // uses more than 16 bits of subpel mv (only at huge resolution)
1966         || (dxx | dxy | dyx | dyy) & 15) {
1967         // FIXME could still use mmx for some of the rows
1968         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1969                  shift, r, width, height);
1970         return;
1971     }
1972
1973     src += ix + iy * stride;
1974     if ((unsigned)ix >= width  - w ||
1975         (unsigned)iy >= height - h) {
1976         emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1977         src = edge_buf;
1978     }
1979
1980     __asm__ volatile (
1981         "movd         %0, %%mm6         \n\t"
1982         "pxor      %%mm7, %%mm7         \n\t"
1983         "punpcklwd %%mm6, %%mm6         \n\t"
1984         "punpcklwd %%mm6, %%mm6         \n\t"
1985         :: "r"(1<<shift)
1986     );
1987
1988     for (x = 0; x < w; x += 4) {
1989         uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1990                             oxs - dxys + dxxs * (x + 1),
1991                             oxs - dxys + dxxs * (x + 2),
1992                             oxs - dxys + dxxs * (x + 3) };
1993         uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1994                             oys - dyys + dyxs * (x + 1),
1995                             oys - dyys + dyxs * (x + 2),
1996                             oys - dyys + dyxs * (x + 3) };
1997
1998         for (y = 0; y < h; y++) {
1999             __asm__ volatile (
2000                 "movq      %0, %%mm4    \n\t"
2001                 "movq      %1, %%mm5    \n\t"
2002                 "paddw     %2, %%mm4    \n\t"
2003                 "paddw     %3, %%mm5    \n\t"
2004                 "movq   %%mm4, %0       \n\t"
2005                 "movq   %%mm5, %1       \n\t"
2006                 "psrlw    $12, %%mm4    \n\t"
2007                 "psrlw    $12, %%mm5    \n\t"
2008                 : "+m"(*dx4), "+m"(*dy4)
2009                 : "m"(*dxy4), "m"(*dyy4)
2010             );
2011
2012             __asm__ volatile (
2013                 "movq      %%mm6, %%mm2 \n\t"
2014                 "movq      %%mm6, %%mm1 \n\t"
2015                 "psubw     %%mm4, %%mm2 \n\t"
2016                 "psubw     %%mm5, %%mm1 \n\t"
2017                 "movq      %%mm2, %%mm0 \n\t"
2018                 "movq      %%mm4, %%mm3 \n\t"
2019                 "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
2020                 "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
2021                 "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
2022                 "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)
2023
2024                 "movd         %4, %%mm5 \n\t"
2025                 "movd         %3, %%mm4 \n\t"
2026                 "punpcklbw %%mm7, %%mm5 \n\t"
2027                 "punpcklbw %%mm7, %%mm4 \n\t"
2028                 "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
2029                 "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
2030
2031                 "movd         %2, %%mm5 \n\t"
2032                 "movd         %1, %%mm4 \n\t"
2033                 "punpcklbw %%mm7, %%mm5 \n\t"
2034                 "punpcklbw %%mm7, %%mm4 \n\t"
2035                 "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
2036                 "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
2037                 "paddw        %5, %%mm1 \n\t"
2038                 "paddw     %%mm3, %%mm2 \n\t"
2039                 "paddw     %%mm1, %%mm0 \n\t"
2040                 "paddw     %%mm2, %%mm0 \n\t"
2041
2042                 "psrlw        %6, %%mm0 \n\t"
2043                 "packuswb  %%mm0, %%mm0 \n\t"
2044                 "movd      %%mm0, %0    \n\t"
2045
2046                 : "=m"(dst[x + y * stride])
2047                 : "m"(src[0]), "m"(src[1]),
2048                   "m"(src[stride]), "m"(src[stride + 1]),
2049                   "m"(*r4), "m"(shift2)
2050             );
2051             src += stride;
2052         }
2053         src += 4 - h * stride;
2054     }
2055 }
2056
2057 #if HAVE_YASM
2058 #if ARCH_X86_32
2059 static void gmc_mmx(uint8_t *dst, uint8_t *src,
2060                     int stride, int h, int ox, int oy,
2061                     int dxx, int dxy, int dyx, int dyy,
2062                     int shift, int r, int width, int height)
2063 {
2064     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2065         width, height, &emulated_edge_mc_mmx);
2066 }
2067 #endif
2068 static void gmc_sse(uint8_t *dst, uint8_t *src,
2069                     int stride, int h, int ox, int oy,
2070                     int dxx, int dxy, int dyx, int dyy,
2071                     int shift, int r, int width, int height)
2072 {
2073     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2074         width, height, &emulated_edge_mc_sse);
2075 }
2076 #else
2077 static void gmc_mmx(uint8_t *dst, uint8_t *src,
2078                     int stride, int h, int ox, int oy,
2079                     int dxx, int dxy, int dyx, int dyy,
2080                     int shift, int r, int width, int height)
2081 {
2082     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2083         width, height, &ff_emulated_edge_mc_8);
2084 }
2085 #endif
2086
/* Generates a function `name(mem, stride, h)` that issues the prefetch
 * instruction `op` on h addresses: mem, mem + stride, mem + 2 * stride, ...
 * The loop is a do/while, so the instruction is issued before h is tested:
 * callers must pass h >= 1 (with h == 0 the counter would go negative and the
 * loop would not stop after the first iteration). */
2087 #define PREFETCH(name, op)                      \
2088 static void name(void *mem, int stride, int h)  \
2089 {                                               \
2090     const uint8_t *p = mem;                     \
2091     do {                                        \
2092         __asm__ volatile (#op" %0" :: "m"(*p)); \
2093         p += stride;                            \
2094     } while (--h);                              \
2095 }
2096
/* prefetch_mmx2 uses prefetcht0, prefetch_3dnow uses prefetch. */
2097 PREFETCH(prefetch_mmx2,  prefetcht0)
2098 PREFETCH(prefetch_3dnow, prefetch)
2099 #undef PREFETCH
2100
2101 #endif /* HAVE_INLINE_ASM */
2102
2103 #include "h264_qpel.c"
2104
/* Prototypes of the 8-bit H.264 chroma motion-compensation routines used by
 * this file. They are only declared here; all share the signature
 * (dst, src, stride, h, x, y).
 * NOTE(review): presumably implemented in external assembly -- confirm. */
2105 void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
2106                                       int stride, int h, int x, int y);
2107 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
2108                                       int stride, int h, int x, int y);
2109 void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
2110                                       int stride, int h, int x, int y);
2111
2112 void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
2113                                       int stride, int h, int x, int y);
2114 void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
2115                                       int stride, int h, int x, int y);
2116 void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
2117                                       int stride, int h, int x, int y);
2118
2119 void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
2120                                       int stride, int h, int x, int y);
2121 void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
2122                                       int stride, int h, int x, int y);
2123
2124 void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2125                                       int stride, int h, int x, int y);
2126 void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
2127                                       int stride, int h, int x, int y);
2128
2129 void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2130                                       int stride, int h, int x, int y);
2131 void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
2132                                       int stride, int h, int x, int y);
2133
/*
 * Declare the prototype of one high-bit-depth H.264 chroma MC routine.
 * The function name is pasted together as
 *     ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>
 * e.g. CHROMA_MC(put, 2, 10, mmx2) declares ff_put_h264_chroma_mc2_10_mmx2().
 * The expansion already ends in ';', so uses need no trailing semicolon.
 */
#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

/* 10-bit variants referenced by the init functions in this file. */
CHROMA_MC(put, 2, 10, mmx2)
CHROMA_MC(avg, 2, 10, mmx2)
CHROMA_MC(put, 4, 10, mmx2)
CHROMA_MC(avg, 4, 10, mmx2)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
2147
2148 #if HAVE_INLINE_ASM
2149
2150 /* CAVS-specific */
/**
 * CAVS qpel8 mc00 "put": forwards to put_pixels8_mmx() with a fixed
 * height of 8 rows.
 */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_h = 8;

    put_pixels8_mmx(dst, src, stride, block_h);
}
2155
/**
 * CAVS qpel8 mc00 "avg": forwards to avg_pixels8_mmx() with a fixed
 * height of 8 rows.
 */
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_h = 8;

    avg_pixels8_mmx(dst, src, stride, block_h);
}
2160
/**
 * CAVS qpel16 mc00 "put": forwards to put_pixels16_mmx() with a fixed
 * height of 16 rows.
 */
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_h = 16;

    put_pixels16_mmx(dst, src, stride, block_h);
}
2165
/**
 * CAVS qpel16 mc00 "avg": forwards to avg_pixels16_mmx() with a fixed
 * height of 16 rows.
 */
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    const int block_h = 16;

    avg_pixels16_mmx(dst, src, stride, block_h);
}
2170
2171 /* VC-1-specific */
/**
 * VC-1 mspel mc00 "put": forwards to put_pixels8_mmx() for 8 rows.
 * The rnd argument is accepted but not used by this function.
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    const int block_h = 8;

    put_pixels8_mmx(dst, src, stride, block_h);
}
2177
/**
 * VC-1 mspel mc00 "avg": forwards to avg_pixels8_mmx2() for 8 rows.
 * The rnd argument is accepted but not used by this function.
 */
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    const int block_h = 8;

    avg_pixels8_mmx2(dst, src, stride, block_h);
}
2183
2184 /* only used in VP3/5/6 */
/**
 * Combine two 8-byte-wide sources a and b row by row with
 * PAVGBP_MMX_NO_RND and store the result to dst (MMX).
 *
 * All three pointers advance by the same stride. Each loop iteration
 * handles four rows and subtracts 4 from h, looping until h reaches zero,
 * so h has to be a positive multiple of 4.
 * NOTE(review): MOVQ_BFE(mm6) presumably loads the constant that
 * PAVGBP_MMX_NO_RND expects in mm6 -- confirm against the macro definitions.
 */
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
//    START_TIMER
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "1:                             \n\t"
        /* rows 0 and 1 of a (%1) and b (%2) */
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "movq   (%1,%4), %%mm2          \n\t"
        "movq   (%2,%4), %%mm3          \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, (%3,%4)          \n\t"

        /* rows 2 and 3; %5 holds 3 * stride */
        "movq   (%1,%4,2), %%mm0        \n\t"
        "movq   (%2,%4,2), %%mm1        \n\t"
        "movq   (%1,%5), %%mm2          \n\t"
        "movq   (%2,%5), %%mm3          \n\t"
        /* advance both sources by four rows before the stores below */
        "lea    (%1,%4,4), %1           \n\t"
        "lea    (%2,%4,4), %2           \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3,%4,2)        \n\t"
        "movq   %%mm5, (%3,%5)          \n\t"
        "lea    (%3,%4,4), %3           \n\t"
        /* four rows done; loop while h != 0 */
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
        :"memory");
//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}
/**
 * 16-byte-wide version of put_vp_no_rnd_pixels8_l2_mmx(): the block is
 * processed as two 8-byte-wide columns, left one first.
 */
static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    int col;

    for (col = 0; col < 16; col += 8)
        put_vp_no_rnd_pixels8_l2_mmx(dst + col, a + col, b + col, stride, h);
}
2221
2222 #if CONFIG_DIRAC_DECODER
/*
 * Define the Dirac pixel-op entry points for one operation/CPU pair:
 *     ff_<OPNAME>_dirac_pixels8_<EXT>()
 *     ff_<OPNAME>_dirac_pixels16_<EXT>()
 *     ff_<OPNAME>_dirac_pixels32_<EXT>()
 * Each takes an array of source pointers but only uses src[0], which it
 * forwards to <OPNAME>_pixels8_<EXT>() / <OPNAME>_pixels16_<EXT>().
 * The 32-wide variant is two 16-wide calls, at byte offsets 0 and 16.
 */
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

/* Instantiate the put/avg MMX and avg MMX2 wrappers. */
DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmx2)
2241
/**
 * Dirac 16-wide "put" (SSE2): forwards src[0] to put_pixels16_sse2().
 * The other entries of src are not used.
 */
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5],
                                int stride, int h)
{
    put_pixels16_sse2(dst, *src, stride, h);
}
/**
 * Dirac 16-wide "avg" (SSE2): forwards src[0] to avg_pixels16_sse2().
 * The other entries of src are not used.
 */
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5],
                                int stride, int h)
{
    avg_pixels16_sse2(dst, *src, stride, h);
}
/**
 * Dirac 32-wide "put" (SSE2): two put_pixels16_sse2() calls on src[0],
 * at byte offsets 0 and 16, left half first.
 */
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5],
                                int stride, int h)
{
    int col;

    for (col = 0; col < 32; col += 16)
        put_pixels16_sse2(dst + col, src[0] + col, stride, h);
}
/**
 * Dirac 32-wide "avg" (SSE2): two avg_pixels16_sse2() calls on src[0],
 * at byte offsets 0 and 16, left half first.
 */
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5],
                                int stride, int h)
{
    int col;

    for (col = 0; col < 32; col += 16)
        avg_pixels16_sse2(dst + col, src[0] + col, stride, h);
}
2260 #endif
2261
2262 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
2263  * converted. */
2264 #if CONFIG_GPL
2265 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
2266                                     DCTELEM *block)
2267 {
2268     ff_mmx_idct(block);
2269     ff_put_pixels_clamped_mmx(block, dest, line_size);
2270 }
2271
2272 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
2273                                     DCTELEM *block)
2274 {
2275     ff_mmx_idct(block);
2276     ff_add_pixels_clamped_mmx(block, dest, line_size);
2277 }
2278
2279 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
2280                                      DCTELEM *block)
2281 {
2282     ff_mmxext_idct(block);
2283     ff_put_pixels_clamped_mmx(block, dest, line_size);
2284 }
2285
2286 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
2287                                      DCTELEM *block)
2288 {
2289     ff_mmxext_idct(block);
2290     ff_add_pixels_clamped_mmx(block, dest, line_size);
2291 }
2292 #endif
2293
/**
 * In-place Vorbis inverse channel coupling on mag[] and ang[] (3DNow!).
 *
 * Two floats of each array are handled per iteration (movq), for
 * i = 0, 2, 4, ... < blocksize. Per element, with m = mag[i], a = ang[i]
 * and t = (m >= 0) ? -a : a, the stored results are
 *     ang[i] = (a >= 0) ? m + t : m
 *     mag[i] = (a >= 0) ? m     : m - t
 * mm7 is zeroed once up front and used as the 0.0 comparison operand.
 * The function ends with femms.
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0   \n\t"
            "movq       %1, %%mm1   \n\t"
            "movq    %%mm0, %%mm2   \n\t"
            "movq    %%mm1, %%mm3   \n\t"
            "pfcmpge %%mm7, %%mm2   \n\t" // mask: m >= 0.0 (AT&T order: mm2 = mm2 >= mm7)
            "pfcmpge %%mm7, %%mm3   \n\t" // mask: a >= 0.0
            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1   \n\t" // t = a with its sign flipped where m >= 0
            "movq    %%mm3, %%mm4   \n\t"
            "pand    %%mm1, %%mm3   \n\t" // t where a >= 0, else 0
            "pandn   %%mm1, %%mm4   \n\t" // t where a <  0, else 0
            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a >= 0) & t)
            "pfsub   %%mm4, %%mm0   \n\t" // m = m - ((a <  0) & t)
            "movq    %%mm3, %1      \n\t"
            "movq    %%mm0, %0      \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}
2321
/**
 * In-place Vorbis inverse channel coupling on mag[] and ang[] (SSE).
 *
 * Four floats of each array are handled per iteration, for
 * i = 0, 4, 8, ... < blocksize. The loads and stores use movaps, so
 * mag + i and ang + i must be 16-byte aligned. Per element, with
 * m = mag[i], a = ang[i] and t = (0 <= m) ? -a : a, the stored results are
 *     ang[i] = (0 <= a) ? m + t : m
 *     mag[i] = (0 <= a) ? m     : m - t
 * xmm5 is loaded once with the sign-bit constant ff_pdw_80000000.
 */
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps  %0, %%xmm5 \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // mask: 0.0 <= m (AT&T order: xmm2 = xmm2 <= xmm0)
            "cmpleps %%xmm1, %%xmm3 \n\t" // mask: 0.0 <= a
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t" // t = a with its sign flipped where 0 <= m
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t" // t where 0 <= a, else 0
            "andnps  %%xmm1, %%xmm4 \n\t" // t where a <  0, else 0
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((0 <= a) & t)
            "subps   %%xmm4, %%xmm0 \n\t" // m = m - ((a <  0) & t)
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}
2352
2353 #if HAVE_6REGS
2354 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
2355                                         const float *src1, const float *win,
2356                                         int len)
2357 {
2358     x86_reg i = -len * 4;
2359     x86_reg j =  len * 4 - 8;
2360     __asm__ volatile (
2361         "1:                             \n"
2362         "pswapd (%5, %1), %%mm1         \n"
2363         "movq   (%5, %0), %%mm0         \n"
2364         "pswapd (%4, %1), %%mm5         \n"
2365         "movq   (%3, %0), %%mm4         \n"
2366         "movq      %%mm0, %%mm2         \n"
2367         "movq      %%mm1, %%mm3         \n"
2368         "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
2369         "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
2370         "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
2371         "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
2372         "pfadd     %%mm3, %%mm2         \n"
2373         "pfsub     %%mm0, %%mm1         \n"
2374         "pswapd    %%mm2, %%mm2         \n"
2375         "movq      %%mm1, (%2, %0)      \n"
2376         "movq      %%mm2, (%2, %1)      \n"
2377         "sub          $8, %1            \n"
2378         "add          $8, %0            \n"
2379         "jl           1b                \n"
2380         "femms                          \n"
2381         : "+r"(i), "+r"(j)
2382         : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2383     );
2384 }
2385
2386 static void vector_fmul_window_sse(float *dst, const float *src0,
2387                                    const float *src1, const float *win, int len)
2388 {
2389     x86_reg i = -len * 4;
2390     x86_reg j =  len * 4 - 16;
2391     __asm__ volatile (
2392         "1:                             \n"
2393         "movaps      (%5, %1), %%xmm1   \n"
2394         "movaps      (%5, %0), %%xmm0   \n"
2395         "movaps      (%4, %1), %%xmm5   \n"
2396         "movaps      (%3, %0), %%xmm4   \n"
2397         "shufps $0x1b, %%xmm1, %%xmm1   \n"
2398         "shufps $0x1b, %%xmm5, %%xmm5   \n"
2399         "movaps        %%xmm0, %%xmm2   \n"
2400         "movaps        %%xmm1, %%xmm3   \n"
2401         "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
2402         "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
2403         "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
2404         "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
2405         "addps         %%xmm3, %%xmm2   \n"
2406         "subps         %%xmm0, %%xmm1   \n"
2407         "shufps $0x1b, %%xmm2, %%xmm2   \n"
2408         "movaps        %%xmm1, (%2, %0) \n"
2409         "movaps        %%xmm2, (%2, %1) \n"
2410         "sub              $16, %1       \n"
2411         "add              $16, %0       \n"
2412         "jl                1b           \n"
2413         : "+r"(i), "+r"(j)
2414         : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2415     );
2416 }
2417 #endif /* HAVE_6REGS */
2418
/**
 * Clamp len floats from src into [min, max] and store them to dst (SSE).
 *
 * min and max are each broadcast to all four lanes; every element is then
 * passed through maxps against min followed by minps against max.
 * The loop handles 16 floats (64 bytes) per iteration, starting at the
 * last 16 elements and moving toward the start. The body always runs at
 * least once, so len must be a positive multiple of 16. All vector
 * accesses use movaps, so src and dst must be 16-byte aligned.
 */
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    /* byte offset of the last group of 16 floats */
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t" // broadcast min
        "shufps $0, %%xmm5, %%xmm5      \n\t" // broadcast max
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t" // loop while the offset is still >= 0
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
2452
2453 #endif /* HAVE_INLINE_ASM */
2454
/*
 * Prototypes of routines that are declared but not defined in this part of
 * the file; the init functions store them into DSPContext.
 */

/* int16 scalar products */
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

/* int16 windowing; dsputil_init_mmx2() picks the _ba variant when
 * CODEC_FLAG_BITEXACT is set */
void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

/* 32-bit byte swapping of a buffer */
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

/* HuffYUV prediction */
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
                                        const uint8_t *diff, int w,
                                        int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

/* float vector helpers */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

/* int32 clipping */
void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

/* float butterflies with interleaved output */
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);
2518
/*
 * Fill all 16 entries of c-><PFX>_pixels_tab[IDX] with the functions named
 *     <PREFIX><PFX><SIZE>_mcXY_<CPU>
 * where entry X + 4 * Y receives the mcXY variant (X, Y in 0..3).
 * PREFIX may be empty. A variable named c must be in scope at the point of
 * use. The expansion is a single do { } while (0) statement.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
2538
/*
 * Fill the four entries of c-><PFX>_pixels_tab[IDX] with
 *     [0] <PFX>_pixels<SIZE>_<CPU>
 *     [1] <PFX>_pixels<SIZE>_x2_<CPU>
 *     [2] <PFX>_pixels<SIZE>_y2_<CPU>
 *     [3] <PFX>_pixels<SIZE>_xy2_<CPU>
 * A variable named c must be in scope at the point of use.
 */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
2546
/*
 * Install the put/avg H.264 qpel functions for sub-position mc<x><y> into
 * entry x + y * 4 of the 16-wide (table index 0) and 8-wide (table index 1)
 * tables, using {put,avg}_h264_qpel{16,8}_mc<x><y>_<CPU>.
 * x and y are pasted into the function names, so they must be literal
 * digits. A variable named c must be in scope at the point of use.
 */
#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)
2554
/*
 * Same table layout as H264_QPEL_FUNCS, but installs the 10-bit functions
 * ff_{put,avg}_h264_qpel{16,8}_mc<x><y>_10_<CPU> into entry x + y * 4.
 * x and y are pasted into the function names, so they must be literal
 * digits. A variable named c must be in scope at the point of use.
 */
#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    } while (0)
2562
2563 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2564 {
2565     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2566
2567 #if HAVE_INLINE_ASM
2568     c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
2569     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2570     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
2571
2572     if (!high_bit_depth) {
2573         c->clear_block  = clear_block_mmx;
2574         c->clear_blocks = clear_blocks_mmx;
2575         c->draw_edges   = draw_edges_mmx;
2576
2577         SET_HPEL_FUNCS(put,        0, 16, mmx);
2578         SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2579         SET_HPEL_FUNCS(avg,        0, 16, mmx);
2580         SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2581         SET_HPEL_FUNCS(put,        1,  8, mmx);
2582         SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
2583         SET_HPEL_FUNCS(avg,        1,  8, mmx);
2584         SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
2585     }
2586
2587 #if ARCH_X86_32 || !HAVE_YASM
2588     c->gmc = gmc_mmx;
2589 #endif
2590
2591     c->add_bytes = add_bytes_mmx;
2592
2593     c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
2594     c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
2595
2596     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2597         c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2598         c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2599     }
2600 #endif /* HAVE_INLINE_ASM */
2601
2602 #if HAVE_YASM
2603 #if ARCH_X86_32
2604     if (!high_bit_depth)
2605         c->emulated_edge_mc = emulated_edge_mc_mmx;
2606 #endif
2607
2608     if (!high_bit_depth && CONFIG_H264CHROMA) {
2609         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
2610         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2611     }
2612
2613     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2614 #endif
2615
2616 }
2617
/**
 * Install the MMX2 (MMXEXT) routines into the DSP context.
 *
 * Several assignments below replace entries written earlier in this same
 * function (for example the VP3/Theora "exact" variants and the 10-bit
 * chroma entries), so the statement order is significant.
 *
 * @param c        DSP context whose function pointers are filled in
 * @param avctx    codec context; bits_per_raw_sample, flags and codec_id
 *                 are read
 * @param mm_flags CPU flags; only tested for AV_CPU_FLAG_3DNOW
 */
static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_mmx2;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    }

    /* these entries are only installed when bit-exact output is not requested */
    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        }
    }

    /* VP3/Theora: replace the 8-wide no_rnd x2/y2 entries with the
     * "exact" variants, overriding whatever was set above */
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
    }
#endif /* HAVE_INLINE_ASM */

    if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */

        if (!high_bit_depth) {
#if HAVE_INLINE_ASM
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
#endif /* HAVE_INLINE_ASM */
        } else if (bit_depth == 10) {
#if HAVE_YASM
            /* the 16- and 8-wide 10-bit MMXEXT tables are only installed
             * on non-x86-64 builds; the 4-wide ones always are */
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
#endif /* HAVE_YASM */
        }

#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */
    }

#if HAVE_YASM
    /* 8-bit H.264 chroma MC */
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
    }
    /* 10-bit H.264 chroma MC (2- and 4-wide only) */
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

    /* the _ba variant is used when bit-exact output is requested */
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    }
#endif /* HAVE_YASM */
}
2730
/**
 * Install the 3DNow! routines into the DSP context.
 *
 * The VP3/Theora block replaces entries that may have been written just
 * before it in this same function, so the statement order is significant.
 *
 * @param c        DSP context whose function pointers are filled in
 * @param avctx    codec context; bits_per_raw_sample, flags and codec_id
 *                 are read
 * @param mm_flags CPU flags; not used by this function
 */
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        /* these entries are only installed when bit-exact output is not requested */
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    /* VP3/Theora: replace the 8-wide no_rnd x2/y2 entries with the
     * "exact" variants, overriding whatever was set above */
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    /* 8-bit H.264 chroma MC, "avg" only */
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}
2804
2805 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
2806                                   int mm_flags)
2807 {
2808 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
2809     c->vector_fmul_window  = vector_fmul_window_3dnowext;
2810 #endif
2811 }
2812
2813 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2814 {
2815     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2816
2817 #if HAVE_INLINE_ASM
2818     if (!high_bit_depth) {
2819         if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2820             /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2821             c->clear_block  = clear_block_sse;
2822             c->clear_blocks = clear_blocks_sse;
2823         }
2824     }
2825
2826     c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2827
2828 #if HAVE_6REGS
2829     c->vector_fmul_window = vector_fmul_window_sse;
2830 #endif
2831
2832     c->vector_clipf = vector_clipf_sse;
2833 #endif /* HAVE_INLINE_ASM */
2834
2835 #if HAVE_YASM
2836     c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
2837     c->vector_fmul_add     = ff_vector_fmul_add_sse;
2838
2839     c->scalarproduct_float          = ff_scalarproduct_float_sse;
2840     c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2841
2842     if (!high_bit_depth)
2843         c->emulated_edge_mc = emulated_edge_mc_sse;
2844 #if HAVE_INLINE_ASM
2845     c->gmc = gmc_sse;
2846 #endif
2847 #endif /* HAVE_YASM */
2848 }
2849
2850 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2851                               int mm_flags)
2852 {
2853     const int bit_depth      = avctx->bits_per_raw_sample;
2854
2855 #if HAVE_INLINE_ASM
2856     const int high_bit_depth = bit_depth > 8;
2857
2858     if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2859         // these functions are slower than mmx on AMD, but faster on Intel
2860         if (!high_bit_depth) {
2861             c->put_pixels_tab[0][0]        = put_pixels16_sse2;
2862             c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2863             c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
2864             if (CONFIG_H264QPEL)
2865                 H264_QPEL_FUNCS(0, 0, sse2);
2866         }
2867     }
2868
2869     if (!high_bit_depth && CONFIG_H264QPEL) {
2870         H264_QPEL_FUNCS(0, 1, sse2);
2871         H264_QPEL_FUNCS(0, 2, sse2);
2872         H264_QPEL_FUNCS(0, 3, sse2);
2873         H264_QPEL_FUNCS(1, 1, sse2);
2874         H264_QPEL_FUNCS(1, 2, sse2);
2875         H264_QPEL_FUNCS(1, 3, sse2);
2876         H264_QPEL_FUNCS(2, 1, sse2);
2877         H264_QPEL_FUNCS(2, 2, sse2);
2878         H264_QPEL_FUNCS(2, 3, sse2);
2879         H264_QPEL_FUNCS(3, 1, sse2);
2880         H264_QPEL_FUNCS(3, 2, sse2);
2881         H264_QPEL_FUNCS(3, 3, sse2);
2882     }
2883 #endif /* HAVE_INLINE_ASM */
2884
2885 #if HAVE_YASM
2886     if (bit_depth == 10) {
2887         if (CONFIG_H264QPEL) {
2888             SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2889             SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
2890             SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2891             SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
2892             H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2893             H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2894             H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2895         }
2896         if (CONFIG_H264CHROMA) {
2897             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2898             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2899         }
2900     }
2901
2902     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
2903     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2904     if (mm_flags & AV_CPU_FLAG_ATOM) {
2905         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2906     } else {
2907         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2908     }
2909     if (avctx->flags & CODEC_FLAG_BITEXACT) {
2910         c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2911     } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2912         c->apply_window_int16 = ff_apply_window_int16_sse2;
2913     }
2914     c->bswap_buf = ff_bswap32_buf_sse2;
2915 #endif /* HAVE_YASM */
2916 }
2917
2918 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2919                                int mm_flags)
2920 {
2921     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2922     const int bit_depth      = avctx->bits_per_raw_sample;
2923
2924 #if HAVE_SSSE3_INLINE
2925     if (!high_bit_depth && CONFIG_H264QPEL) {
2926         H264_QPEL_FUNCS(1, 0, ssse3);
2927         H264_QPEL_FUNCS(1, 1, ssse3);
2928         H264_QPEL_FUNCS(1, 2, ssse3);
2929         H264_QPEL_FUNCS(1, 3, ssse3);
2930         H264_QPEL_FUNCS(2, 0, ssse3);
2931         H264_QPEL_FUNCS(2, 1, ssse3);
2932         H264_QPEL_FUNCS(2, 2, ssse3);
2933         H264_QPEL_FUNCS(2, 3, ssse3);
2934         H264_QPEL_FUNCS(3, 0, ssse3);
2935         H264_QPEL_FUNCS(3, 1, ssse3);
2936         H264_QPEL_FUNCS(3, 2, ssse3);
2937         H264_QPEL_FUNCS(3, 3, ssse3);
2938     }
2939 #endif /* HAVE_SSSE3_INLINE */
2940
2941 #if HAVE_SSSE3_EXTERNAL
2942     if (bit_depth == 10 && CONFIG_H264QPEL) {
2943         H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2944         H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2945         H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2946     }
2947     if (!high_bit_depth && CONFIG_H264CHROMA) {
2948         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
2949         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
2950         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2951         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2952     }
2953     c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2954     if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2955         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2956
2957     if (mm_flags & AV_CPU_FLAG_ATOM)
2958         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2959     else
2960         c->apply_window_int16 = ff_apply_window_int16_ssse3;
2961     if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2962         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2963     c->bswap_buf = ff_bswap32_buf_ssse3;
2964 #endif /* HAVE_SSSE3_EXTERNAL */
2965 }
2966
2967 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2968                               int mm_flags)
2969 {
2970 #if HAVE_SSE4_EXTERNAL
2971     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2972 #endif /* HAVE_SSE4_EXTERNAL */
2973 }
2974
2975 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2976 {
2977 #if HAVE_AVX_EXTERNAL
2978     const int bit_depth = avctx->bits_per_raw_sample;
2979
2980     if (bit_depth == 10) {
2981         // AVX implies !cache64.
2982         // TODO: Port cache(32|64) detection from x264.
2983         if (CONFIG_H264QPEL) {
2984             H264_QPEL_FUNCS_10(1, 0, sse2);
2985             H264_QPEL_FUNCS_10(2, 0, sse2);
2986             H264_QPEL_FUNCS_10(3, 0, sse2);
2987         }
2988
2989         if (CONFIG_H264CHROMA) {
2990             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2991             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2992         }
2993     }
2994     c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2995     c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
2996     c->vector_fmul_add = ff_vector_fmul_add_avx;
2997 #endif /* HAVE_AVX_EXTERNAL */
2998 }
2999
3000 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
3001 {
3002     int mm_flags = av_get_cpu_flags();
3003
3004 #if HAVE_7REGS && HAVE_INLINE_ASM
3005     if (mm_flags & AV_CPU_FLAG_CMOV)
3006         c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
3007 #endif
3008
3009     if (mm_flags & AV_CPU_FLAG_MMX) {
3010 #if HAVE_INLINE_ASM
3011         const int idct_algo = avctx->idct_algo;
3012
3013         if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
3014             if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
3015                 c->idct_put              = ff_simple_idct_put_mmx;
3016                 c->idct_add              = ff_simple_idct_add_mmx;
3017                 c->idct                  = ff_simple_idct_mmx;
3018                 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
3019 #if CONFIG_GPL
3020             } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
3021                 if (mm_flags & AV_CPU_FLAG_MMX2) {
3022                     c->idct_put = ff_libmpeg2mmx2_idct_put;
3023                     c->idct_add = ff_libmpeg2mmx2_idct_add;
3024                     c->idct     = ff_mmxext_idct;
3025                 } else {
3026                     c->idct_put = ff_libmpeg2mmx_idct_put;
3027                     c->idct_add = ff_libmpeg2mmx_idct_add;
3028                     c->idct     = ff_mmx_idct;
3029                 }
3030                 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
3031 #endif
3032             } else if (idct_algo == FF_IDCT_XVIDMMX) {
3033                 if (mm_flags & AV_CPU_FLAG_SSE2) {
3034                     c->idct_put              = ff_idct_xvid_sse2_put;
3035                     c->idct_add              = ff_idct_xvid_sse2_add;
3036                     c->idct                  = ff_idct_xvid_sse2;
3037                     c->idct_permutation_type = FF_SSE2_IDCT_PERM;
3038                 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
3039                     c->idct_put              = ff_idct_xvid_mmx2_put;
3040                     c->idct_add              = ff_idct_xvid_mmx2_add;
3041                     c->idct                  = ff_idct_xvid_mmx2;
3042                 } else {
3043                     c->idct_put              = ff_idct_xvid_mmx_put;
3044                     c->idct_add              = ff_idct_xvid_mmx_add;
3045                     c->idct                  = ff_idct_xvid_mmx;
3046                 }
3047             }
3048         }
3049 #endif /* HAVE_INLINE_ASM */
3050
3051         dsputil_init_mmx(c, avctx, mm_flags);
3052     }
3053
3054     if (mm_flags & AV_CPU_FLAG_MMXEXT)
3055         dsputil_init_mmx2(c, avctx, mm_flags);
3056
3057     if (mm_flags & AV_CPU_FLAG_3DNOW)
3058         dsputil_init_3dnow(c, avctx, mm_flags);
3059
3060     if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
3061         dsputil_init_3dnowext(c, avctx, mm_flags);
3062
3063     if (mm_flags & AV_CPU_FLAG_SSE)
3064         dsputil_init_sse(c, avctx, mm_flags);
3065
3066     if (mm_flags & AV_CPU_FLAG_SSE2)
3067         dsputil_init_sse2(c, avctx, mm_flags);
3068
3069     if (mm_flags & AV_CPU_FLAG_SSSE3)
3070         dsputil_init_ssse3(c, avctx, mm_flags);
3071
3072     if (mm_flags & AV_CPU_FLAG_SSE4)
3073         dsputil_init_sse4(c, avctx, mm_flags);
3074
3075     if (mm_flags & AV_CPU_FLAG_AVX)
3076         dsputil_init_avx(c, avctx, mm_flags);
3077
3078     if (CONFIG_ENCODERS)
3079         ff_dsputilenc_init_mmx(c, avctx);
3080 }