git.sesse.net Git - ffmpeg/blob - libavcodec/x86/dsputil_mmx.c

   1 /*
   2  * MMX optimized DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  *
  22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  23  */
  24
  25 #include "libavutil/cpu.h"
  26 #include "libavutil/x86/asm.h"
  27 #include "libavcodec/dsputil.h"
  28 #include "libavcodec/h264dsp.h"
  29 #include "libavcodec/mpegvideo.h"
  30 #include "libavcodec/simple_idct.h"
  31 #include "dsputil_mmx.h"
  32 #include "idct_xvid.h"
  33 #include "diracdsp_mmx.h"
  34
  35 //#undef NDEBUG
  36 //#include <assert.h>
  37
  38 /* pixel operations */
  39 DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
  40 DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
  41
  42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
  43     { 0x8000000080000000ULL, 0x8000000080000000ULL };
  44
  45 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
  46 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
  47 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
  48 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
  49 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
  50 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
  51 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
  52 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
  53 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
  54 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
  55 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
  56 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
  57 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
  58 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
  59 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
  60 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
  61 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
  62 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
  63 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
  64 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
  65 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
  66 DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
  67 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
  68 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
  69
  70 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
  71 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
  72 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
  73 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
  74 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
  75 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
  76 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
  77 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
  78 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
  79 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
  80 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
  81 DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
  82 DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
  83
  84 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
  85 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
  86
  87 #if HAVE_INLINE_ASM
  88
  89 #define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
  90 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
  91
  92 #define MOVQ_BFE(regd)                                  \
  93     __asm__ volatile (                                  \
  94         "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
  95         "paddb   %%"#regd", %%"#regd"   \n\t" ::)
  96
  97 #ifndef PIC
  98 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
  99 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
 100 #else
 101 // for shared library it's better to use this way for accessing constants
 102 // pcmpeqd -> -1
 103 #define MOVQ_BONE(regd)                                 \
 104     __asm__ volatile (                                  \
 105         "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
 106         "psrlw          $15, %%"#regd"  \n\t"           \
 107         "packuswb %%"#regd", %%"#regd"  \n\t" ::)
 108
 109 #define MOVQ_WTWO(regd)                                 \
 110     __asm__ volatile (                                  \
 111         "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
 112         "psrlw         $15, %%"#regd"   \n\t"           \
 113         "psllw          $1, %%"#regd"   \n\t"::)
 114
 115 #endif
 116
 117 // using regr as temporary and for the output result
 118 // first argument is unmodifed and second is trashed
 119 // regfe is supposed to contain 0xfefefefefefefefe
 120 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
 121     "movq   "#rega", "#regr"            \n\t"                    \
 122     "pand   "#regb", "#regr"            \n\t"                    \
 123     "pxor   "#rega", "#regb"            \n\t"                    \
 124     "pand  "#regfe", "#regb"            \n\t"                    \
 125     "psrlq       $1, "#regb"            \n\t"                    \
 126     "paddb  "#regb", "#regr"            \n\t"
 127
 128 #define PAVGB_MMX(rega, regb, regr, regfe)                       \
 129     "movq   "#rega", "#regr"            \n\t"                    \
 130     "por    "#regb", "#regr"            \n\t"                    \
 131     "pxor   "#rega", "#regb"            \n\t"                    \
 132     "pand  "#regfe", "#regb"            \n\t"                    \
 133     "psrlq       $1, "#regb"            \n\t"                    \
 134     "psubb  "#regb", "#regr"            \n\t"
 135
 136 // mm6 is supposed to contain 0xfefefefefefefefe
 137 #define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
 138     "movq  "#rega", "#regr"             \n\t"                    \
 139     "movq  "#regc", "#regp"             \n\t"                    \
 140     "pand  "#regb", "#regr"             \n\t"                    \
 141     "pand  "#regd", "#regp"             \n\t"                    \
 142     "pxor  "#rega", "#regb"             \n\t"                    \
 143     "pxor  "#regc", "#regd"             \n\t"                    \
 144     "pand    %%mm6, "#regb"             \n\t"                    \
 145     "pand    %%mm6, "#regd"             \n\t"                    \
 146     "psrlq      $1, "#regb"             \n\t"                    \
 147     "psrlq      $1, "#regd"             \n\t"                    \
 148     "paddb "#regb", "#regr"             \n\t"                    \
 149     "paddb "#regd", "#regp"             \n\t"
 150
 151 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
 152     "movq  "#rega", "#regr"             \n\t"                    \
 153     "movq  "#regc", "#regp"             \n\t"                    \
 154     "por   "#regb", "#regr"             \n\t"                    \
 155     "por   "#regd", "#regp"             \n\t"                    \
 156     "pxor  "#rega", "#regb"             \n\t"                    \
 157     "pxor  "#regc", "#regd"             \n\t"                    \
 158     "pand    %%mm6, "#regb"             \n\t"                    \
 159     "pand    %%mm6, "#regd"             \n\t"                    \
 160     "psrlq      $1, "#regd"             \n\t"                    \
 161     "psrlq      $1, "#regb"             \n\t"                    \
 162     "psubb "#regb", "#regr"             \n\t"                    \
 163     "psubb "#regd", "#regp"             \n\t"
 164
 165 /***********************************/
 166 /* MMX no rounding */
 167 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
 168 #define SET_RND  MOVQ_WONE
 169 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
 170 #define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
 171 #define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)
 172
 173 #include "dsputil_rnd_template.c"
 174
 175 #undef DEF
 176 #undef SET_RND
 177 #undef PAVGBP
 178 #undef PAVGB
 179 /***********************************/
 180 /* MMX rounding */
 181
 182 #define DEF(x, y) x ## _ ## y ## _mmx
 183 #define SET_RND  MOVQ_WTWO
 184 #define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
 185 #define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)
 186
 187 #include "dsputil_rnd_template.c"
 188
 189 #undef DEF
 190 #undef SET_RND
 191 #undef PAVGBP
 192 #undef PAVGB
 193 #undef OP_AVG
 194
 195 /***********************************/
 196 /* 3Dnow specific */
 197
 198 #define DEF(x) x ## _3dnow
 199 #define PAVGB "pavgusb"
 200 #define OP_AVG PAVGB
 201 #define SKIP_FOR_3DNOW
 202
 203 #include "dsputil_avg_template.c"
 204
 205 #undef DEF
 206 #undef PAVGB
 207 #undef OP_AVG
 208 #undef SKIP_FOR_3DNOW
 209
 210 /***********************************/
 211 /* MMXEXT specific */
 212
 213 #define DEF(x) x ## _mmxext
 214
 215 /* Introduced only in MMXEXT set */
 216 #define PAVGB "pavgb"
 217 #define OP_AVG PAVGB
 218
 219 #include "dsputil_avg_template.c"
 220
 221 #undef DEF
 222 #undef PAVGB
 223 #undef OP_AVG
 224
 225 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
 226 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
 227 #define put_pixels16_mmxext put_pixels16_mmx
 228 #define put_pixels8_mmxext put_pixels8_mmx
 229 #define put_pixels4_mmxext put_pixels4_mmx
 230 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
 231 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
 232
 233 /***********************************/
 234 /* standard MMX */
 235
 236 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
 237                                int line_size)
 238 {
 239     const DCTELEM *p;
 240     uint8_t *pix;
 241
 242     /* read the pixels */
 243     p   = block;
 244     pix = pixels;
 245     /* unrolled loop */
 246     __asm__ volatile (
 247         "movq      (%3), %%mm0          \n\t"
 248         "movq     8(%3), %%mm1          \n\t"
 249         "movq    16(%3), %%mm2          \n\t"
 250         "movq    24(%3), %%mm3          \n\t"
 251         "movq    32(%3), %%mm4          \n\t"
 252         "movq    40(%3), %%mm5          \n\t"
 253         "movq    48(%3), %%mm6          \n\t"
 254         "movq    56(%3), %%mm7          \n\t"
 255         "packuswb %%mm1, %%mm0          \n\t"
 256         "packuswb %%mm3, %%mm2          \n\t"
 257         "packuswb %%mm5, %%mm4          \n\t"
 258         "packuswb %%mm7, %%mm6          \n\t"
 259         "movq     %%mm0, (%0)           \n\t"
 260         "movq     %%mm2, (%0, %1)       \n\t"
 261         "movq     %%mm4, (%0, %1, 2)    \n\t"
 262         "movq     %%mm6, (%0, %2)       \n\t"
 263         :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
 264            "r"(p)
 265         : "memory");
 266     pix += line_size * 4;
 267     p   += 32;
 268
 269     // if here would be an exact copy of the code above
 270     // compiler would generate some very strange code
 271     // thus using "r"
 272     __asm__ volatile (
 273         "movq       (%3), %%mm0         \n\t"
 274         "movq      8(%3), %%mm1         \n\t"
 275         "movq     16(%3), %%mm2         \n\t"
 276         "movq     24(%3), %%mm3         \n\t"
 277         "movq     32(%3), %%mm4         \n\t"
 278         "movq     40(%3), %%mm5         \n\t"
 279         "movq     48(%3), %%mm6         \n\t"
 280         "movq     56(%3), %%mm7         \n\t"
 281         "packuswb  %%mm1, %%mm0         \n\t"
 282         "packuswb  %%mm3, %%mm2         \n\t"
 283         "packuswb  %%mm5, %%mm4         \n\t"
 284         "packuswb  %%mm7, %%mm6         \n\t"
 285         "movq      %%mm0, (%0)          \n\t"
 286         "movq      %%mm2, (%0, %1)      \n\t"
 287         "movq      %%mm4, (%0, %1, 2)   \n\t"
 288         "movq      %%mm6, (%0, %2)      \n\t"
 289         :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
 290         : "memory");
 291 }
 292
 293 #define put_signed_pixels_clamped_mmx_half(off)             \
 294     "movq          "#off"(%2), %%mm1        \n\t"           \
 295     "movq     16 + "#off"(%2), %%mm2        \n\t"           \
 296     "movq     32 + "#off"(%2), %%mm3        \n\t"           \
 297     "movq     48 + "#off"(%2), %%mm4        \n\t"           \
 298     "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
 299     "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
 300     "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
 301     "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
 302     "paddb              %%mm0, %%mm1        \n\t"           \
 303     "paddb              %%mm0, %%mm2        \n\t"           \
 304     "paddb              %%mm0, %%mm3        \n\t"           \
 305     "paddb              %%mm0, %%mm4        \n\t"           \
 306     "movq               %%mm1, (%0)         \n\t"           \
 307     "movq               %%mm2, (%0, %3)     \n\t"           \
 308     "movq               %%mm3, (%0, %3, 2)  \n\t"           \
 309     "movq               %%mm4, (%0, %1)     \n\t"
 310
 311 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
 312                                       int line_size)
 313 {
 314     x86_reg line_skip = line_size;
 315     x86_reg line_skip3;
 316
 317     __asm__ volatile (
 318         "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
 319         "lea         (%3, %3, 2), %1        \n\t"
 320         put_signed_pixels_clamped_mmx_half(0)
 321         "lea         (%0, %3, 4), %0        \n\t"
 322         put_signed_pixels_clamped_mmx_half(64)
 323         : "+&r"(pixels), "=&r"(line_skip3)
 324         : "r"(block), "r"(line_skip)
 325         : "memory");
 326 }
 327
 328 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
 329                                int line_size)
 330 {
 331     const DCTELEM *p;
 332     uint8_t *pix;
 333     int i;
 334
 335     /* read the pixels */
 336     p   = block;
 337     pix = pixels;
 338     MOVQ_ZERO(mm7);
 339     i = 4;
 340     do {
 341         __asm__ volatile (
 342             "movq        (%2), %%mm0    \n\t"
 343             "movq       8(%2), %%mm1    \n\t"
 344             "movq      16(%2), %%mm2    \n\t"
 345             "movq      24(%2), %%mm3    \n\t"
 346             "movq          %0, %%mm4    \n\t"
 347             "movq          %1, %%mm6    \n\t"
 348             "movq       %%mm4, %%mm5    \n\t"
 349             "punpcklbw  %%mm7, %%mm4    \n\t"
 350             "punpckhbw  %%mm7, %%mm5    \n\t"
 351             "paddsw     %%mm4, %%mm0    \n\t"
 352             "paddsw     %%mm5, %%mm1    \n\t"
 353             "movq       %%mm6, %%mm5    \n\t"
 354             "punpcklbw  %%mm7, %%mm6    \n\t"
 355             "punpckhbw  %%mm7, %%mm5    \n\t"
 356             "paddsw     %%mm6, %%mm2    \n\t"
 357             "paddsw     %%mm5, %%mm3    \n\t"
 358             "packuswb   %%mm1, %%mm0    \n\t"
 359             "packuswb   %%mm3, %%mm2    \n\t"
 360             "movq       %%mm0, %0       \n\t"
 361             "movq       %%mm2, %1       \n\t"
 362             : "+m"(*pix), "+m"(*(pix + line_size))
 363             : "r"(p)
 364             : "memory");
 365         pix += line_size * 2;
 366         p   += 16;
 367     } while (--i);
 368 }
 369
 370 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
 371                             int line_size, int h)
 372 {
 373     __asm__ volatile (
 374         "lea   (%3, %3), %%"REG_a"      \n\t"
 375         ".p2align     3                 \n\t"
 376         "1:                             \n\t"
 377         "movq  (%1    ), %%mm0          \n\t"
 378         "movq  (%1, %3), %%mm1          \n\t"
 379         "movq     %%mm0, (%2)           \n\t"
 380         "movq     %%mm1, (%2, %3)       \n\t"
 381         "add  %%"REG_a", %1             \n\t"
 382         "add  %%"REG_a", %2             \n\t"
 383         "movq  (%1    ), %%mm0          \n\t"
 384         "movq  (%1, %3), %%mm1          \n\t"
 385         "movq     %%mm0, (%2)           \n\t"
 386         "movq     %%mm1, (%2, %3)       \n\t"
 387         "add  %%"REG_a", %1             \n\t"
 388         "add  %%"REG_a", %2             \n\t"
 389         "subl        $4, %0             \n\t"
 390         "jnz         1b                 \n\t"
 391         : "+g"(h), "+r"(pixels),  "+r"(block)
 392         : "r"((x86_reg)line_size)
 393         : "%"REG_a, "memory"
 394         );
 395 }
 396
 397 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
 398                              int line_size, int h)
 399 {
 400     __asm__ volatile (
 401         "lea   (%3, %3), %%"REG_a"      \n\t"
 402         ".p2align     3                 \n\t"
 403         "1:                             \n\t"
 404         "movq  (%1    ), %%mm0          \n\t"
 405         "movq 8(%1    ), %%mm4          \n\t"
 406         "movq  (%1, %3), %%mm1          \n\t"
 407         "movq 8(%1, %3), %%mm5          \n\t"
 408         "movq     %%mm0,  (%2)          \n\t"
 409         "movq     %%mm4, 8(%2)          \n\t"
 410         "movq     %%mm1,  (%2, %3)      \n\t"
 411         "movq     %%mm5, 8(%2, %3)      \n\t"
 412         "add  %%"REG_a", %1             \n\t"
 413         "add  %%"REG_a", %2             \n\t"
 414         "movq  (%1    ), %%mm0          \n\t"
 415         "movq 8(%1    ), %%mm4          \n\t"
 416         "movq  (%1, %3), %%mm1          \n\t"
 417         "movq 8(%1, %3), %%mm5          \n\t"
 418         "movq     %%mm0,  (%2)          \n\t"
 419         "movq     %%mm4, 8(%2)          \n\t"
 420         "movq     %%mm1,  (%2, %3)      \n\t"
 421         "movq     %%mm5, 8(%2, %3)      \n\t"
 422         "add  %%"REG_a", %1             \n\t"
 423         "add  %%"REG_a", %2             \n\t"
 424         "subl        $4, %0             \n\t"
 425         "jnz         1b                 \n\t"
 426         : "+g"(h), "+r"(pixels),  "+r"(block)
 427         : "r"((x86_reg)line_size)
 428         : "%"REG_a, "memory"
 429         );
 430 }
 431
 432 #define CLEAR_BLOCKS(name, n)                           \
 433 static void name(DCTELEM *blocks)                       \
 434 {                                                       \
 435     __asm__ volatile (                                  \
 436         "pxor %%mm7, %%mm7              \n\t"           \
 437         "mov     %1,        %%"REG_a"   \n\t"           \
 438         "1:                             \n\t"           \
 439         "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
 440         "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
 441         "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
 442         "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
 443         "add    $32, %%"REG_a"          \n\t"           \
 444         "js      1b                     \n\t"           \
 445         :: "r"(((uint8_t *)blocks) + 128 * n),          \
 446            "i"(-128 * n)                                \
 447         : "%"REG_a                                      \
 448         );                                              \
 449 }
 450 CLEAR_BLOCKS(clear_blocks_mmx, 6)
 451 CLEAR_BLOCKS(clear_block_mmx, 1)
 452
 453 static void clear_block_sse(DCTELEM *block)
 454 {
 455     __asm__ volatile (
 456         "xorps  %%xmm0, %%xmm0          \n"
 457         "movaps %%xmm0,    (%0)         \n"
 458         "movaps %%xmm0,  16(%0)         \n"
 459         "movaps %%xmm0,  32(%0)         \n"
 460         "movaps %%xmm0,  48(%0)         \n"
 461         "movaps %%xmm0,  64(%0)         \n"
 462         "movaps %%xmm0,  80(%0)         \n"
 463         "movaps %%xmm0,  96(%0)         \n"
 464         "movaps %%xmm0, 112(%0)         \n"
 465         :: "r"(block)
 466         : "memory"
 467     );
 468 }
 469
 470 static void clear_blocks_sse(DCTELEM *blocks)
 471 {
 472     __asm__ volatile (
 473         "xorps  %%xmm0, %%xmm0              \n"
 474         "mov        %1,         %%"REG_a"   \n"
 475         "1:                                 \n"
 476         "movaps %%xmm0,    (%0, %%"REG_a")  \n"
 477         "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
 478         "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
 479         "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
 480         "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
 481         "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
 482         "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
 483         "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
 484         "add      $128,         %%"REG_a"   \n"
 485         "js         1b                      \n"
 486         :: "r"(((uint8_t *)blocks) + 128 * 6),
 487            "i"(-128 * 6)
 488         : "%"REG_a
 489     );
 490 }
 491
 492 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
 493 {
 494     x86_reg i = 0;
 495     __asm__ volatile (
 496         "jmp          2f                \n\t"
 497         "1:                             \n\t"
 498         "movq   (%1, %0), %%mm0         \n\t"
 499         "movq   (%2, %0), %%mm1         \n\t"
 500         "paddb     %%mm0, %%mm1         \n\t"
 501         "movq      %%mm1, (%2, %0)      \n\t"
 502         "movq  8(%1, %0), %%mm0         \n\t"
 503         "movq  8(%2, %0), %%mm1         \n\t"
 504         "paddb     %%mm0, %%mm1         \n\t"
 505         "movq      %%mm1, 8(%2, %0)     \n\t"
 506         "add         $16, %0            \n\t"
 507         "2:                             \n\t"
 508         "cmp          %3, %0            \n\t"
 509         "js           1b                \n\t"
 510         : "+r"(i)
 511         : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
 512     );
 513     for ( ; i < w; i++)
 514         dst[i + 0] += src[i + 0];
 515 }
 516
 517 #if HAVE_7REGS
 518 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
 519                                             const uint8_t *diff, int w,
 520                                             int *left, int *left_top)
 521 {
 522     x86_reg w2 = -w;
 523     x86_reg x;
 524     int l  = *left     & 0xff;
 525     int tl = *left_top & 0xff;
 526     int t;
 527     __asm__ volatile (
 528         "mov          %7, %3            \n"
 529         "1:                             \n"
 530         "movzbl (%3, %4), %2            \n"
 531         "mov          %2, %k3           \n"
 532         "sub         %b1, %b3           \n"
 533         "add         %b0, %b3           \n"
 534         "mov          %2, %1            \n"
 535         "cmp          %0, %2            \n"
 536         "cmovg        %0, %2            \n"
 537         "cmovg        %1, %0            \n"
 538         "cmp         %k3, %0            \n"
 539         "cmovg       %k3, %0            \n"
 540         "mov          %7, %3            \n"
 541         "cmp          %2, %0            \n"
 542         "cmovl        %2, %0            \n"
 543         "add    (%6, %4), %b0           \n"
 544         "mov         %b0, (%5, %4)      \n"
 545         "inc          %4                \n"
 546         "jl           1b                \n"
 547         : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
 548         : "r"(dst + w), "r"(diff + w), "rm"(top + w)
 549     );
 550     *left     = l;
 551     *left_top = tl;
 552 }
 553 #endif
 554
 555 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
 556     __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
 557         "movd  (%1), %%mm0              \n\t"
 558         "add   %3, %1                   \n\t"
 559         "movd  (%1), %%mm1              \n\t"
 560         "movd  (%1,%3,1), %%mm2         \n\t"
 561         "movd  (%1,%3,2), %%mm3         \n\t"
 562         "punpcklbw %%mm1, %%mm0         \n\t"
 563         "punpcklbw %%mm3, %%mm2         \n\t"
 564         "movq %%mm0, %%mm1              \n\t"
 565         "punpcklwd %%mm2, %%mm0         \n\t"
 566         "punpckhwd %%mm2, %%mm1         \n\t"
 567         "movd  %%mm0, (%0)              \n\t"
 568         "add   %2, %0                   \n\t"
 569         "punpckhdq %%mm0, %%mm0         \n\t"
 570         "movd  %%mm0, (%0)              \n\t"
 571         "movd  %%mm1, (%0,%2,1)         \n\t"
 572         "punpckhdq %%mm1, %%mm1         \n\t"
 573         "movd  %%mm1, (%0,%2,2)         \n\t"
 574
 575         :  "+&r" (dst),
 576            "+&r" (src)
 577         :  "r" (dst_stride),
 578            "r" (src_stride)
 579         :  "memory"
 580     );
 581 }
 582
 583 #define H263_LOOP_FILTER                        \
 584     "pxor      %%mm7, %%mm7             \n\t"   \
 585     "movq         %0, %%mm0             \n\t"   \
 586     "movq         %0, %%mm1             \n\t"   \
 587     "movq         %3, %%mm2             \n\t"   \
 588     "movq         %3, %%mm3             \n\t"   \
 589     "punpcklbw %%mm7, %%mm0             \n\t"   \
 590     "punpckhbw %%mm7, %%mm1             \n\t"   \
 591     "punpcklbw %%mm7, %%mm2             \n\t"   \
 592     "punpckhbw %%mm7, %%mm3             \n\t"   \
 593     "psubw     %%mm2, %%mm0             \n\t"   \
 594     "psubw     %%mm3, %%mm1             \n\t"   \
 595     "movq         %1, %%mm2             \n\t"   \
 596     "movq         %1, %%mm3             \n\t"   \
 597     "movq         %2, %%mm4             \n\t"   \
 598     "movq         %2, %%mm5             \n\t"   \
 599     "punpcklbw %%mm7, %%mm2             \n\t"   \
 600     "punpckhbw %%mm7, %%mm3             \n\t"   \
 601     "punpcklbw %%mm7, %%mm4             \n\t"   \
 602     "punpckhbw %%mm7, %%mm5             \n\t"   \
 603     "psubw     %%mm2, %%mm4             \n\t"   \
 604     "psubw     %%mm3, %%mm5             \n\t"   \
 605     "psllw        $2, %%mm4             \n\t"   \
 606     "psllw        $2, %%mm5             \n\t"   \
 607     "paddw     %%mm0, %%mm4             \n\t"   \
 608     "paddw     %%mm1, %%mm5             \n\t"   \
 609     "pxor      %%mm6, %%mm6             \n\t"   \
 610     "pcmpgtw   %%mm4, %%mm6             \n\t"   \
 611     "pcmpgtw   %%mm5, %%mm7             \n\t"   \
 612     "pxor      %%mm6, %%mm4             \n\t"   \
 613     "pxor      %%mm7, %%mm5             \n\t"   \
 614     "psubw     %%mm6, %%mm4             \n\t"   \
 615     "psubw     %%mm7, %%mm5             \n\t"   \
 616     "psrlw        $3, %%mm4             \n\t"   \
 617     "psrlw        $3, %%mm5             \n\t"   \
 618     "packuswb  %%mm5, %%mm4             \n\t"   \
 619     "packsswb  %%mm7, %%mm6             \n\t"   \
 620     "pxor      %%mm7, %%mm7             \n\t"   \
 621     "movd         %4, %%mm2             \n\t"   \
 622     "punpcklbw %%mm2, %%mm2             \n\t"   \
 623     "punpcklbw %%mm2, %%mm2             \n\t"   \
 624     "punpcklbw %%mm2, %%mm2             \n\t"   \
 625     "psubusb   %%mm4, %%mm2             \n\t"   \
 626     "movq      %%mm2, %%mm3             \n\t"   \
 627     "psubusb   %%mm4, %%mm3             \n\t"   \
 628     "psubb     %%mm3, %%mm2             \n\t"   \
 629     "movq         %1, %%mm3             \n\t"   \
 630     "movq         %2, %%mm4             \n\t"   \
 631     "pxor      %%mm6, %%mm3             \n\t"   \
 632     "pxor      %%mm6, %%mm4             \n\t"   \
 633     "paddusb   %%mm2, %%mm3             \n\t"   \
 634     "psubusb   %%mm2, %%mm4             \n\t"   \
 635     "pxor      %%mm6, %%mm3             \n\t"   \
 636     "pxor      %%mm6, %%mm4             \n\t"   \
 637     "paddusb   %%mm2, %%mm2             \n\t"   \
 638     "packsswb  %%mm1, %%mm0             \n\t"   \
 639     "pcmpgtb   %%mm0, %%mm7             \n\t"   \
 640     "pxor      %%mm7, %%mm0             \n\t"   \
 641     "psubb     %%mm7, %%mm0             \n\t"   \
 642     "movq      %%mm0, %%mm1             \n\t"   \
 643     "psubusb   %%mm2, %%mm0             \n\t"   \
 644     "psubb     %%mm0, %%mm1             \n\t"   \
 645     "pand         %5, %%mm1             \n\t"   \
 646     "psrlw        $2, %%mm1             \n\t"   \
 647     "pxor      %%mm7, %%mm1             \n\t"   \
 648     "psubb     %%mm7, %%mm1             \n\t"   \
 649     "movq         %0, %%mm5             \n\t"   \
 650     "movq         %3, %%mm6             \n\t"   \
 651     "psubb     %%mm1, %%mm5             \n\t"   \
 652     "paddb     %%mm1, %%mm6             \n\t"
 653
 654 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
 655 {
 656     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
 657         const int strength = ff_h263_loop_filter_strength[qscale];
 658
 659         __asm__ volatile (
 660             H263_LOOP_FILTER
 661
 662             "movq %%mm3, %1             \n\t"
 663             "movq %%mm4, %2             \n\t"
 664             "movq %%mm5, %0             \n\t"
 665             "movq %%mm6, %3             \n\t"
 666             : "+m"(*(uint64_t*)(src - 2 * stride)),
 667               "+m"(*(uint64_t*)(src - 1 * stride)),
 668               "+m"(*(uint64_t*)(src + 0 * stride)),
 669               "+m"(*(uint64_t*)(src + 1 * stride))
 670             : "g"(2 * strength), "m"(ff_pb_FC)
 671             );
 672     }
 673 }
 674
 675 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
 676 {
 677     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
 678         const int strength = ff_h263_loop_filter_strength[qscale];
 679         DECLARE_ALIGNED(8, uint64_t, temp)[4];
 680         uint8_t *btemp = (uint8_t*)temp;
 681
 682         src -= 2;
 683
 684         transpose4x4(btemp,     src,              8, stride);
 685         transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
 686         __asm__ volatile (
 687             H263_LOOP_FILTER // 5 3 4 6
 688
 689             : "+m"(temp[0]),
 690               "+m"(temp[1]),
 691               "+m"(temp[2]),
 692               "+m"(temp[3])
 693             : "g"(2 * strength), "m"(ff_pb_FC)
 694             );
 695
 696         __asm__ volatile (
 697             "movq      %%mm5, %%mm1         \n\t"
 698             "movq      %%mm4, %%mm0         \n\t"
 699             "punpcklbw %%mm3, %%mm5         \n\t"
 700             "punpcklbw %%mm6, %%mm4         \n\t"
 701             "punpckhbw %%mm3, %%mm1         \n\t"
 702             "punpckhbw %%mm6, %%mm0         \n\t"
 703             "movq      %%mm5, %%mm3         \n\t"
 704             "movq      %%mm1, %%mm6         \n\t"
 705             "punpcklwd %%mm4, %%mm5         \n\t"
 706             "punpcklwd %%mm0, %%mm1         \n\t"
 707             "punpckhwd %%mm4, %%mm3         \n\t"
 708             "punpckhwd %%mm0, %%mm6         \n\t"
 709             "movd      %%mm5, (%0)          \n\t"
 710             "punpckhdq %%mm5, %%mm5         \n\t"
 711             "movd      %%mm5, (%0, %2)      \n\t"
 712             "movd      %%mm3, (%0, %2, 2)   \n\t"
 713             "punpckhdq %%mm3, %%mm3         \n\t"
 714             "movd      %%mm3, (%0, %3)      \n\t"
 715             "movd      %%mm1, (%1)          \n\t"
 716             "punpckhdq %%mm1, %%mm1         \n\t"
 717             "movd      %%mm1, (%1, %2)      \n\t"
 718             "movd      %%mm6, (%1, %2, 2)   \n\t"
 719             "punpckhdq %%mm6, %%mm6         \n\t"
 720             "movd      %%mm6, (%1, %3)      \n\t"
 721             :: "r"(src),
 722                "r"(src + 4 * stride),
 723                "r"((x86_reg)stride),
 724                "r"((x86_reg)(3 * stride))
 725             );
 726     }
 727 }
 728
 729 /* Draw the edges of width 'w' of an image of size width, height
 730  * this MMX version can only handle w == 8 || w == 16. */
 731 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
 732                            int w, int h, int sides)
 733 {
 734     uint8_t *ptr, *last_line;
 735     int i;
 736
 737     last_line = buf + (height - 1) * wrap;
 738     /* left and right */
 739     ptr = buf;
 740     if (w == 8) {
 741         __asm__ volatile (
 742             "1:                             \n\t"
 743             "movd            (%0), %%mm0    \n\t"
 744             "punpcklbw      %%mm0, %%mm0    \n\t"
 745             "punpcklwd      %%mm0, %%mm0    \n\t"
 746             "punpckldq      %%mm0, %%mm0    \n\t"
 747             "movq           %%mm0, -8(%0)   \n\t"
 748             "movq      -8(%0, %2), %%mm1    \n\t"
 749             "punpckhbw      %%mm1, %%mm1    \n\t"
 750             "punpckhwd      %%mm1, %%mm1    \n\t"
 751             "punpckhdq      %%mm1, %%mm1    \n\t"
 752             "movq           %%mm1, (%0, %2) \n\t"
 753             "add               %1, %0       \n\t"
 754             "cmp               %3, %0       \n\t"
 755             "jb                1b           \n\t"
 756             : "+r"(ptr)
 757             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
 758             );
 759     } else if(w==16){
 760         __asm__ volatile (
 761             "1:                                 \n\t"
 762             "movd            (%0), %%mm0        \n\t"
 763             "punpcklbw      %%mm0, %%mm0        \n\t"
 764             "punpcklwd      %%mm0, %%mm0        \n\t"
 765             "punpckldq      %%mm0, %%mm0        \n\t"
 766             "movq           %%mm0, -8(%0)       \n\t"
 767             "movq           %%mm0, -16(%0)      \n\t"
 768             "movq      -8(%0, %2), %%mm1        \n\t"
 769             "punpckhbw      %%mm1, %%mm1        \n\t"
 770             "punpckhwd      %%mm1, %%mm1        \n\t"
 771             "punpckhdq      %%mm1, %%mm1        \n\t"
 772             "movq           %%mm1,  (%0, %2)    \n\t"
 773             "movq           %%mm1, 8(%0, %2)    \n\t"
 774             "add               %1, %0           \n\t"
 775             "cmp               %3, %0           \n\t"
 776             "jb                1b               \n\t"
 777             : "+r"(ptr)
 778             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
 779             );
 780     } else {
 781         av_assert1(w == 4);
 782         __asm__ volatile (
 783             "1:                             \n\t"
 784             "movd            (%0), %%mm0    \n\t"
 785             "punpcklbw      %%mm0, %%mm0    \n\t"
 786             "punpcklwd      %%mm0, %%mm0    \n\t"
 787             "movd           %%mm0, -4(%0)   \n\t"
 788             "movd      -4(%0, %2), %%mm1    \n\t"
 789             "punpcklbw      %%mm1, %%mm1    \n\t"
 790             "punpckhwd      %%mm1, %%mm1    \n\t"
 791             "punpckhdq      %%mm1, %%mm1    \n\t"
 792             "movd           %%mm1, (%0, %2) \n\t"
 793             "add               %1, %0       \n\t"
 794             "cmp               %3, %0       \n\t"
 795             "jb                1b           \n\t"
 796             : "+r"(ptr)
 797             : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
 798             );
 799     }
 800
 801     /* top and bottom (and hopefully also the corners) */
 802     if (sides & EDGE_TOP) {
 803         for (i = 0; i < h; i += 4) {
 804             ptr = buf - (i + 1) * wrap - w;
 805             __asm__ volatile (
 806                 "1:                             \n\t"
 807                 "movq (%1, %0), %%mm0           \n\t"
 808                 "movq    %%mm0, (%0)            \n\t"
 809                 "movq    %%mm0, (%0, %2)        \n\t"
 810                 "movq    %%mm0, (%0, %2, 2)     \n\t"
 811                 "movq    %%mm0, (%0, %3)        \n\t"
 812                 "add        $8, %0              \n\t"
 813                 "cmp        %4, %0              \n\t"
 814                 "jb         1b                  \n\t"
 815                 : "+r"(ptr)
 816                 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
 817                   "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
 818                 );
 819         }
 820     }
 821
 822     if (sides & EDGE_BOTTOM) {
 823         for (i = 0; i < h; i += 4) {
 824             ptr = last_line + (i + 1) * wrap - w;
 825             __asm__ volatile (
 826                 "1:                             \n\t"
 827                 "movq (%1, %0), %%mm0           \n\t"
 828                 "movq    %%mm0, (%0)            \n\t"
 829                 "movq    %%mm0, (%0, %2)        \n\t"
 830                 "movq    %%mm0, (%0, %2, 2)     \n\t"
 831                 "movq    %%mm0, (%0, %3)        \n\t"
 832                 "add        $8, %0              \n\t"
 833                 "cmp        %4, %0              \n\t"
 834                 "jb         1b                  \n\t"
 835                 : "+r"(ptr)
 836                 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
 837                   "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
 838                   "r"(ptr + width + 2 * w)
 839                 );
 840         }
 841     }
 842 }
 843
 844 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
 845                    in0, in1, in2, in7, out, OP)                           \
 846     "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
 847     "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
 848     "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
 849     "movq               "#in7", "#m3"   \n\t" /* d */                     \
 850     "movq               "#in0", %%mm5   \n\t" /* D */                     \
 851     "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
 852     "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
 853     "movq               "#in1", %%mm5   \n\t" /* C */                     \
 854     "movq               "#in2", %%mm6   \n\t" /* B */                     \
 855     "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
 856     "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
 857     "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
 858     "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
 859     "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
 860     "paddw              "#rnd", %%mm4   \n\t" /* x2 */                    \
 861     "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
 862     "psraw                  $5, %%mm5   \n\t"                             \
 863     "packuswb            %%mm5, %%mm5   \n\t"                             \
 864     OP(%%mm5, out, %%mm7, d)
 865
 866 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                        \
 867 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,         \
 868                                                     uint8_t *src,         \
 869                                                     int dstStride,        \
 870                                                     int srcStride,        \
 871                                                     int h)                \
 872 {                                                                         \
 873     uint64_t temp;                                                        \
 874                                                                           \
 875     __asm__ volatile (                                                    \
 876         "pxor      %%mm7, %%mm7             \n\t"                         \
 877         "1:                                 \n\t"                         \
 878         "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
 879         "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
 880         "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
 881         "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
 882         "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
 883         "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
 884         "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
 885         "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
 886         "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
 887         "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
 888         "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
 889         "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
 890         "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
 891         "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
 892         "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
 893         "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
 894         "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
 895         "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
 896         "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
 897         "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
 898         "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
 899         "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
 900         "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
 901         "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
 902         "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
 903         "paddw        %6, %%mm6             \n\t"                         \
 904         "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
 905         "psraw        $5, %%mm0             \n\t"                         \
 906         "movq      %%mm0, %5                \n\t"                         \
 907         /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
 908                                                                           \
 909         "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
 910         "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
 911         "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
 912         "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
 913         "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
 914         "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
 915         "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
 916         "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
 917         "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
 918         "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
 919         "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
 920         "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
 921         "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
 922         "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
 923         "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
 924         "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
 925         "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
 926         "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
 927         "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
 928         "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
 929         "paddw        %6, %%mm1             \n\t"                         \
 930         "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
 931         "psraw        $5, %%mm3             \n\t"                         \
 932         "movq         %5, %%mm1             \n\t"                         \
 933         "packuswb  %%mm3, %%mm1             \n\t"                         \
 934         OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                  \
 935         /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
 936                                                                           \
 937         "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
 938         "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
 939         "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
 940         "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
 941         "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
 942         "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
 943         "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
 944         "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
 945         "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
 946         "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
 947         "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
 948         "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
 949         "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
 950         "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
 951         "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
 952         "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
 953         "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
 954         "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
 955         "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
 956         "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
 957         "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
 958         "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
 959         "paddw        %6, %%mm0             \n\t"                         \
 960         "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
 961         "psraw        $5, %%mm0             \n\t"                         \
 962         /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
 963         /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
 964                                                                           \
 965         "paddw    %%mm5, %%mm3              \n\t" /* a */                 \
 966         "pshufw   $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */          \
 967         "paddw    %%mm4, %%mm6              \n\t" /* b */                 \
 968         "pshufw   $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */          \
 969         "pshufw   $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */          \
 970         "paddw    %%mm1, %%mm4              \n\t" /* c */                 \
 971         "paddw    %%mm2, %%mm5              \n\t" /* d */                 \
 972         "paddw    %%mm6, %%mm6              \n\t" /* 2b */                \
 973         "psubw    %%mm6, %%mm4              \n\t" /* c - 2b */            \
 974         "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
 975         "pmullw  "MANGLE(ff_pw_3)", %%mm4   \n\t" /* 3c - 6b */           \
 976         "psubw    %%mm5, %%mm3              \n\t" /* -6b + 3c - d */      \
 977         "paddw       %6, %%mm4              \n\t"                         \
 978         "paddw    %%mm3, %%mm4              \n\t" /* 20a - 6b + 3c - d */ \
 979         "psraw       $5, %%mm4              \n\t"                         \
 980         "packuswb %%mm4, %%mm0              \n\t"                         \
 981         OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                 \
 982                                                                           \
 983         "add         %3, %0                 \n\t"                         \
 984         "add         %4, %1                 \n\t"                         \
 985         "decl        %2                     \n\t"                         \
 986         "jnz         1b                     \n\t"                         \
 987         : "+a"(src), "+c"(dst), "+D"(h)                                   \
 988         : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
 989           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
 990         : "memory"                                                        \
 991         );                                                                \
 992 }                                                                         \
 993                                                                           \
 994 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,          \
 995                                                    uint8_t *src,          \
 996                                                    int dstStride,         \
 997                                                    int srcStride,         \
 998                                                    int h)                 \
 999 {                                                                         \
1000     __asm__ volatile (                                                    \
1001         "pxor      %%mm7, %%mm7             \n\t"                         \
1002         "1:                                 \n\t"                         \
1003         "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
1004         "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
1005         "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
1006         "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
1007         "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
1008         "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
1009         "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
1010         "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
1011         "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
1012         "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
1013         "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
1014         "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
1015         "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
1016         "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
1017         "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
1018         "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
1019         "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
1020         "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
1021         "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
1022         "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
1023         "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
1024         "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
1025         "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
1026         "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
1027         "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
1028         "paddw        %5, %%mm6             \n\t"                         \
1029         "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
1030         "psraw        $5, %%mm0             \n\t"                         \
1031         /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
1032                                                                           \
1033         "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
1034         "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
1035         "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
1036         "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
1037         "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
1038         "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
1039         "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
1040         "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
1041         "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
1042         "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
1043         "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
1044         "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
1045         "pmullw  "MANGLE(ff_pw_3)", %%mm3   \n\t" /* 3c - 6b */           \
1046         "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
1047         "paddw        %5, %%mm1             \n\t"                         \
1048         "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
1049         "psraw        $5, %%mm3             \n\t"                         \
1050         "packuswb  %%mm3, %%mm0             \n\t"                         \
1051         OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                  \
1052                                                                           \
1053         "add          %3, %0                \n\t"                         \
1054         "add          %4, %1                \n\t"                         \
1055         "decl         %2                    \n\t"                         \
1056         "jnz          1b                    \n\t"                         \
1057         : "+a"(src), "+c"(dst), "+d"(h)                                   \
1058         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
1059           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
1060         : "memory"                                                        \
1061         );                                                                \
1062 }
1063
1064 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
1065 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
1066                                                      uint8_t *src,      \
1067                                                      int dstStride,     \
1068                                                      int srcStride)     \
1069 {                                                                       \
1070     uint64_t temp[17 * 4];                                              \
1071     uint64_t *temp_ptr = temp;                                          \
1072     int count = 17;                                                     \
1073                                                                         \
1074     /* FIXME unroll */                                                  \
1075     __asm__ volatile (                                                  \
1076         "pxor      %%mm7, %%mm7             \n\t"                       \
1077         "1:                                 \n\t"                       \
1078         "movq       (%0), %%mm0             \n\t"                       \
1079         "movq       (%0), %%mm1             \n\t"                       \
1080         "movq      8(%0), %%mm2             \n\t"                       \
1081         "movq      8(%0), %%mm3             \n\t"                       \
1082         "punpcklbw %%mm7, %%mm0             \n\t"                       \
1083         "punpckhbw %%mm7, %%mm1             \n\t"                       \
1084         "punpcklbw %%mm7, %%mm2             \n\t"                       \
1085         "punpckhbw %%mm7, %%mm3             \n\t"                       \
1086         "movq      %%mm0, (%1)              \n\t"                       \
1087         "movq      %%mm1, 17 * 8(%1)        \n\t"                       \
1088         "movq      %%mm2, 2 * 17 * 8(%1)    \n\t"                       \
1089         "movq      %%mm3, 3 * 17 * 8(%1)    \n\t"                       \
1090         "add          $8, %1                \n\t"                       \
1091         "add          %3, %0                \n\t"                       \
1092         "decl         %2                    \n\t"                       \
1093         "jnz          1b                    \n\t"                       \
1094         : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
1095         : "r"((x86_reg)srcStride)                                       \
1096         : "memory"                                                      \
1097         );                                                              \
1098                                                                         \
1099     temp_ptr = temp;                                                    \
1100     count    = 4;                                                       \
1101                                                                         \
1102     /* FIXME reorder for speed */                                       \
1103     __asm__ volatile (                                                  \
1104         /* "pxor  %%mm7, %%mm7            \n\t" */                      \
1105         "1:                             \n\t"                           \
1106         "movq    (%0), %%mm0            \n\t"                           \
1107         "movq   8(%0), %%mm1            \n\t"                           \
1108         "movq  16(%0), %%mm2            \n\t"                           \
1109         "movq  24(%0), %%mm3            \n\t"                           \
1110         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1),     OP) \
1111         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
1112         "add       %4, %1               \n\t"                           \
1113         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1),     OP) \
1114                                                                         \
1115         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
1116         "add       %4, %1               \n\t"                           \
1117         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1),     OP) \
1118         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
1119         "add       %4, %1               \n\t"                           \
1120         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1),     OP) \
1121         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
1122         "add       %4, %1               \n\t"                           \
1123         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1),     OP) \
1124         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
1125         "add       %4, %1               \n\t"                           \
1126         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1),     OP) \
1127         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
1128         "add       %4, %1               \n\t"                           \
1129         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1),     OP) \
1130                                                                         \
1131         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
1132         "add       %4, %1               \n\t"                           \
1133         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1),     OP) \
1134         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1135                                                                         \
1136         "add     $136, %0               \n\t"                           \
1137         "add       %6, %1               \n\t"                           \
1138         "decl      %2                   \n\t"                           \
1139         "jnz       1b                   \n\t"                           \
1140                                                                         \
1141         : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
1142         : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
1143           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
1144           "g"(4 - 14 * (x86_reg)dstStride)                              \
1145         : "memory"                                                      \
1146         );                                                              \
1147 }                                                                       \
1148                                                                         \
1149 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
1150                                                     uint8_t *src,       \
1151                                                     int dstStride,      \
1152                                                     int srcStride)      \
1153 {                                                                       \
1154     uint64_t temp[9 * 2];                                               \
1155     uint64_t *temp_ptr = temp;                                          \
1156     int count = 9;                                                      \
1157                                                                         \
1158     /* FIXME unroll */                                                  \
1159     __asm__ volatile (                                                  \
1160         "pxor      %%mm7, %%mm7         \n\t"                           \
1161         "1:                             \n\t"                           \
1162         "movq       (%0), %%mm0         \n\t"                           \
1163         "movq       (%0), %%mm1         \n\t"                           \
1164         "punpcklbw %%mm7, %%mm0         \n\t"                           \
1165         "punpckhbw %%mm7, %%mm1         \n\t"                           \
1166         "movq      %%mm0, (%1)          \n\t"                           \
1167         "movq      %%mm1, 9*8(%1)       \n\t"                           \
1168         "add          $8, %1            \n\t"                           \
1169         "add          %3, %0            \n\t"                           \
1170         "decl         %2                \n\t"                           \
1171         "jnz          1b                \n\t"                           \
1172         : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
1173         : "r"((x86_reg)srcStride)                                       \
1174         : "memory"                                                      \
1175         );                                                              \
1176                                                                         \
1177     temp_ptr = temp;                                                    \
1178     count    = 2;                                                       \
1179                                                                         \
1180     /* FIXME reorder for speed */                                       \
1181     __asm__ volatile (                                                  \
1182         /* "pxor  %%mm7, %%mm7            \n\t" */                      \
1183         "1:                             \n\t"                           \
1184         "movq    (%0), %%mm0            \n\t"                           \
1185         "movq   8(%0), %%mm1            \n\t"                           \
1186         "movq  16(%0), %%mm2            \n\t"                           \
1187         "movq  24(%0), %%mm3            \n\t"                           \
1188         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
1189         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
1190         "add       %4, %1               \n\t"                           \
1191         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
1192                                                                         \
1193         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1194         "add       %4, %1               \n\t"                           \
1195         QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
1196                                                                         \
1197         QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1198         "add       %4, %1               \n\t"                           \
1199         QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
1200         QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1201                                                                         \
1202         "add      $72, %0               \n\t"                           \
1203         "add       %6, %1               \n\t"                           \
1204         "decl      %2                   \n\t"                           \
1205         "jnz       1b                   \n\t"                           \
1206                                                                         \
1207         : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
1208         : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
1209           /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
1210           "g"(4 - 6 * (x86_reg)dstStride)                               \
1211         : "memory"                                                      \
1212         );                                                              \
1213 }                                                                       \
1214                                                                         \
1215 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
1216                                           int stride)                   \
1217 {                                                                       \
1218     OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
1219 }                                                                       \
1220                                                                         \
1221 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
1222                                          int stride)                    \
1223 {                                                                       \
1224     uint64_t temp[8];                                                   \
1225     uint8_t * const half = (uint8_t*)temp;                              \
1226     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
1227                                                 stride, 8);             \
1228     OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
1229 }                                                                       \
1230                                                                         \
1231 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
1232                                          int stride)                    \
1233 {                                                                       \
1234     OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
1235                                             stride, 8);                 \
1236 }                                                                       \
1237                                                                         \
1238 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
1239                                          int stride)                    \
1240 {                                                                       \
1241     uint64_t temp[8];                                                   \
1242     uint8_t * const half = (uint8_t*)temp;                              \
1243     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
1244                                                 stride, 8);             \
1245     OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
1246                                  stride, 8);                            \
1247 }                                                                       \
1248                                                                         \
1249 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
1250                                          int stride)                    \
1251 {                                                                       \
1252     uint64_t temp[8];                                                   \
1253     uint8_t * const half = (uint8_t*)temp;                              \
1254     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
1255     OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
1256 }                                                                       \
1257                                                                         \
1258 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
1259                                          int stride)                    \
1260 {                                                                       \
1261     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
1262 }                                                                       \
1263                                                                         \
1264 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
1265                                          int stride)                    \
1266 {                                                                       \
1267     uint64_t temp[8];                                                   \
1268     uint8_t * const half = (uint8_t*)temp;                              \
1269     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
1270     OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
1271                                  stride, 8);                            \
1272 }                                                                       \
1273                                                                         \
1274 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
1275                                          int stride)                    \
1276 {                                                                       \
1277     uint64_t half[8 + 9];                                               \
1278     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1279     uint8_t * const halfHV = ((uint8_t*)half);                          \
1280     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1281                                                 stride, 9);             \
1282     put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1283     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1284     OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1285 }                                                                       \
1286                                                                         \
1287 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
1288                                          int stride)                    \
1289 {                                                                       \
1290     uint64_t half[8 + 9];                                               \
1291     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1292     uint8_t * const halfHV = ((uint8_t*)half);                          \
1293     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1294                                                 stride, 9);             \
1295     put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1296                                      stride, 9);                        \
1297     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1298     OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1299 }                                                                       \
1300                                                                         \
1301 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
1302                                          int stride)                    \
1303 {                                                                       \
1304     uint64_t half[8 + 9];                                               \
1305     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1306     uint8_t * const halfHV = ((uint8_t*)half);                          \
1307     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1308                                                 stride, 9);             \
1309     put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1310     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1311     OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1312 }                                                                       \
1313                                                                         \
1314 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
1315                                          int stride)                    \
1316 {                                                                       \
1317     uint64_t half[8 + 9];                                               \
1318     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1319     uint8_t * const halfHV = ((uint8_t*)half);                          \
1320     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1321                                                 stride, 9);             \
1322     put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1323                                      stride, 9);                        \
1324     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1325     OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1326 }                                                                       \
1327                                                                         \
1328 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
1329                                          int stride)                    \
1330 {                                                                       \
1331     uint64_t half[8 + 9];                                               \
1332     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1333     uint8_t * const halfHV = ((uint8_t*)half);                          \
1334     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1335                                                 stride, 9);             \
1336     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1337     OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
1338 }                                                                       \
1339                                                                         \
1340 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
1341                                          int stride)                    \
1342 {                                                                       \
1343     uint64_t half[8 + 9];                                               \
1344     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
1345     uint8_t * const halfHV = ((uint8_t*)half);                          \
1346     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1347                                                 stride, 9);             \
1348     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
1349     OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1350 }                                                                       \
1351                                                                         \
1352 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
1353                                          int stride)                    \
1354 {                                                                       \
1355     uint64_t half[8 + 9];                                               \
1356     uint8_t * const halfH = ((uint8_t*)half);                           \
1357     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1358                                                 stride, 9);             \
1359     put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
1360     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1361 }                                                                       \
1362                                                                         \
1363 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
1364                                          int stride)                    \
1365 {                                                                       \
1366     uint64_t half[8 + 9];                                               \
1367     uint8_t * const halfH = ((uint8_t*)half);                           \
1368     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1369                                                 stride, 9);             \
1370     put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
1371                                      stride, 9);                        \
1372     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1373 }                                                                       \
1374                                                                         \
1375 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
1376                                          int stride)                    \
1377 {                                                                       \
1378     uint64_t half[9];                                                   \
1379     uint8_t * const halfH = ((uint8_t*)half);                           \
1380     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
1381                                                 stride, 9);             \
1382     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
1383 }                                                                       \
1384                                                                         \
1385 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
1386                                            int stride)                  \
1387 {                                                                       \
1388     OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
1389 }                                                                       \
1390                                                                         \
1391 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
1392                                           int stride)                   \
1393 {                                                                       \
1394     uint64_t temp[32];                                                  \
1395     uint8_t * const half = (uint8_t*)temp;                              \
1396     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
1397                                                  stride, 16);           \
1398     OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
1399 }                                                                       \
1400                                                                         \
1401 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
1402                                           int stride)                   \
1403 {                                                                       \
1404     OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
1405                                              stride, stride, 16);       \
1406 }                                                                       \
1407                                                                         \
1408 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
1409                                           int stride)                   \
1410 {                                                                       \
1411     uint64_t temp[32];                                                  \
1412     uint8_t * const half = (uint8_t*)temp;                              \
1413     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
1414                                                  stride, 16);           \
1415     OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
1416                                   stride, stride, 16);                  \
1417 }                                                                       \
1418                                                                         \
1419 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
1420                                           int stride)                   \
1421 {                                                                       \
1422     uint64_t temp[32];                                                  \
1423     uint8_t * const half = (uint8_t*)temp;                              \
1424     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
1425                                                  stride);               \
1426     OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
1427 }                                                                       \
1428                                                                         \
1429 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
1430                                           int stride)                   \
1431 {                                                                       \
1432     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1433 }                                                                       \
1434                                                                         \
1435 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
1436                                           int stride)                   \
1437 {                                                                       \
1438     uint64_t temp[32];                                                  \
1439     uint8_t * const half = (uint8_t*)temp;                              \
1440     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
1441                                                  stride);               \
1442     OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,                \
1443                                   stride, stride, 16);                  \
1444 }                                                                       \
1445                                                                         \
1446 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
1447                                           int stride)                   \
1448 {                                                                       \
1449     uint64_t half[16 * 2 + 17 * 2];                                     \
1450     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1451     uint8_t * const halfHV = ((uint8_t*)half);                          \
1452     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1453                                                  stride, 17);           \
1454     put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1455                                       stride, 17);                      \
1456     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1457                                                  16, 16);               \
1458     OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1459 }                                                                       \
1460                                                                         \
1461 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
1462                                           int stride)                   \
1463 {                                                                       \
1464     uint64_t half[16 * 2 + 17 * 2];                                     \
1465     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1466     uint8_t * const halfHV = ((uint8_t*)half);                          \
1467     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1468                                                  stride, 17);           \
1469     put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1470                                       stride, 17);                      \
1471     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1472                                                  16, 16);               \
1473     OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1474 }                                                                       \
1475                                                                         \
1476 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
1477                                           int stride)                   \
1478 {                                                                       \
1479     uint64_t half[16 * 2 + 17 * 2];                                     \
1480     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1481     uint8_t * const halfHV = ((uint8_t*)half);                          \
1482     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1483                                                  stride, 17);           \
1484     put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1485                                       stride, 17);                      \
1486     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1487                                                  16, 16);               \
1488     OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1489                                   16, 16);                              \
1490 }                                                                       \
1491                                                                         \
1492 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
1493                                           int stride)                   \
1494 {                                                                       \
1495     uint64_t half[16 * 2 + 17 * 2];                                     \
1496     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1497     uint8_t * const halfHV = ((uint8_t*)half);                          \
1498     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1499                                                  stride, 17);           \
1500     put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1501                                       stride, 17);                      \
1502     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1503                                                  16, 16);               \
1504     OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1505                                   16, 16);                              \
1506 }                                                                       \
1507                                                                         \
1508 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
1509                                           int stride)                   \
1510 {                                                                       \
1511     uint64_t half[16 * 2 + 17 * 2];                                     \
1512     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1513     uint8_t * const halfHV = ((uint8_t*)half);                          \
1514     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1515                                                  stride, 17);           \
1516     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1517                                                  16, 16);               \
1518     OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
1519 }                                                                       \
1520                                                                         \
1521 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
1522                                           int stride)                   \
1523 {                                                                       \
1524     uint64_t half[16 * 2 + 17 * 2];                                     \
1525     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1526     uint8_t * const halfHV = ((uint8_t*)half);                          \
1527     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1528                                                  stride, 17);           \
1529     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
1530                                                  16, 16);               \
1531     OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
1532                                   16, 16);                              \
1533 }                                                                       \
1534                                                                         \
1535 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
1536                                           int stride)                   \
1537 {                                                                       \
1538     uint64_t half[17 * 2];                                              \
1539     uint8_t * const halfH = ((uint8_t*)half);                           \
1540     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1541                                                  stride, 17);           \
1542     put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
1543                                       stride, 17);                      \
1544     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1545 }                                                                       \
1546                                                                         \
1547 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
1548                                           int stride)                   \
1549 {                                                                       \
1550     uint64_t half[17 * 2];                                              \
1551     uint8_t * const halfH = ((uint8_t*)half);                           \
1552     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1553                                                  stride, 17);           \
1554     put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
1555                                       stride, 17);                      \
1556     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1557 }                                                                       \
1558                                                                         \
1559 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
1560                                           int stride)                   \
1561 {                                                                       \
1562     uint64_t half[17 * 2];                                              \
1563     uint8_t * const halfH = ((uint8_t*)half);                           \
1564     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
1565                                                  stride, 17);           \
1566     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
1567 }
1568
1569 #define PUT_OP(a, b, temp, size)                \
1570     "mov"#size"        "#a", "#b"       \n\t"
1571
1572 #define AVG_MMXEXT_OP(a, b, temp, size)         \
1573     "mov"#size"        "#b", "#temp"    \n\t"   \
1574     "pavgb          "#temp", "#a"       \n\t"   \
1575     "mov"#size"        "#a", "#b"       \n\t"
1576
1577 QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
1578 QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
1579 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1580 QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
1581 QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
1582 QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
1583
1584 /***********************************/
1585 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
1586
1587 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
1588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1589                                                                  uint8_t *src, \
1590                                                                  int stride)   \
1591 {                                                                              \
1592     OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
1593 }
1594
1595 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
1596 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1597                                                                  uint8_t *src, \
1598                                                                  int stride)   \
1599 {                                                                              \
1600     OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
1601                                                S1, S2);                        \
1602 }
1603
1604 #define QPEL_2TAP(OPNAME, SIZE, MMX)                                        \
1605 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                            \
1606 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                            \
1607 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                               \
1608 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =    \
1609     OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                \
1610 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =    \
1611     OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                           \
1612 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =    \
1613     OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                           \
1614 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,      \
1615                                                          uint8_t *src,      \
1616                                                          int stride)        \
1617 {                                                                           \
1618     OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);    \
1619 }                                                                           \
1620 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,      \
1621                                                          uint8_t *src,      \
1622                                                          int stride)        \
1623 {                                                                           \
1624     OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,              \
1625                                             stride, SIZE);                  \
1626 }                                                                           \
1627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)                \
1628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)                \
1629 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)                \
1630 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)                \
1631 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)                \
1632 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)                \
1633 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)                \
1634 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                \
1635
1636 QPEL_2TAP(put_, 16, mmxext)
1637 QPEL_2TAP(avg_, 16, mmxext)
1638 QPEL_2TAP(put_,  8, mmxext)
1639 QPEL_2TAP(avg_,  8, mmxext)
1640
1641 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1642 {
1643   put_pixels8_xy2_mmx(dst, src, stride, 8);
1644 }
1645 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1646 {
1647   put_pixels16_xy2_mmx(dst, src, stride, 16);
1648 }
1649 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1650 {
1651   avg_pixels8_xy2_mmx(dst, src, stride, 8);
1652 }
1653 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1654 {
1655   avg_pixels16_xy2_mmx(dst, src, stride, 16);
1656 }
1657
1658 #endif /* HAVE_INLINE_ASM */
1659
1660 #if HAVE_YASM
1661 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
1662                                 x86_reg linesize, x86_reg start_y,
1663                                 x86_reg end_y, x86_reg block_h,
1664                                 x86_reg start_x, x86_reg end_x,
1665                                 x86_reg block_w);
1666 extern emu_edge_core_func ff_emu_edge_core_mmx;
1667 extern emu_edge_core_func ff_emu_edge_core_sse;
1668
1669 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
1670                                               int linesize,
1671                                               int block_w, int block_h,
1672                                               int src_x, int src_y,
1673                                               int w, int h,
1674                                               emu_edge_core_func *core_fn)
1675 {
1676     int start_y, start_x, end_y, end_x, src_y_add = 0;
1677
1678     if (src_y >= h) {
1679         src -= src_y*linesize;
1680         src_y_add = h - 1;
1681         src_y     = h - 1;
1682     } else if (src_y <= -block_h) {
1683         src -= src_y*linesize;
1684         src_y_add = 1 - block_h;
1685         src_y     = 1 - block_h;
1686     }
1687     if (src_x >= w) {
1688         src   += w - 1 - src_x;
1689         src_x  = w - 1;
1690     } else if (src_x <= -block_w) {
1691         src   += 1 - block_w - src_x;
1692         src_x  = 1 - block_w;
1693     }
1694
1695     start_y = FFMAX(0, -src_y);
1696     start_x = FFMAX(0, -src_x);
1697     end_y   = FFMIN(block_h, h-src_y);
1698     end_x   = FFMIN(block_w, w-src_x);
1699     av_assert2(start_x < end_x && block_w > 0);
1700     av_assert2(start_y < end_y && block_h > 0);
1701
1702     // fill in the to-be-copied part plus all above/below
1703     src += (src_y_add + start_y) * linesize + start_x;
1704     buf += start_x;
1705     core_fn(buf, src, linesize, start_y, end_y,
1706             block_h, start_x, end_x, block_w);
1707 }
1708
1709 #if ARCH_X86_32
1710 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1711                                              int linesize,
1712                                              int block_w, int block_h,
1713                                              int src_x, int src_y, int w, int h)
1714 {
1715     emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1716                      w, h, &ff_emu_edge_core_mmx);
1717 }
1718 #endif
1719
1720 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1721                                              int linesize,
1722                                              int block_w, int block_h,
1723                                              int src_x, int src_y, int w, int h)
1724 {
1725     emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1726                      w, h, &ff_emu_edge_core_sse);
1727 }
1728 #endif /* HAVE_YASM */
1729
1730 #if HAVE_INLINE_ASM
1731
1732 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1733                                    int linesize, int block_w, int block_h,
1734                                    int src_x, int src_y, int w, int h);
1735
1736 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1737                                  int stride, int h, int ox, int oy,
1738                                  int dxx, int dxy, int dyx, int dyy,
1739                                  int shift, int r, int width, int height,
1740                                  emulated_edge_mc_func *emu_edge_fn)
1741 {
1742     const int w    = 8;
1743     const int ix   = ox  >> (16 + shift);
1744     const int iy   = oy  >> (16 + shift);
1745     const int oxs  = ox  >> 4;
1746     const int oys  = oy  >> 4;
1747     const int dxxs = dxx >> 4;
1748     const int dxys = dxy >> 4;
1749     const int dyxs = dyx >> 4;
1750     const int dyys = dyy >> 4;
1751     const uint16_t r4[4]   = { r, r, r, r };
1752     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1753     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1754     const uint64_t shift2 = 2 * shift;
1755 #define MAX_STRIDE 4096U
1756 #define MAX_H 8U
1757     uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
1758     int x, y;
1759
1760     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1761     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1762     const int dxh = dxy * (h - 1);
1763     const int dyw = dyx * (w - 1);
1764     int need_emu =  (unsigned)ix >= width  - w ||
1765                     (unsigned)iy >= height - h;
1766
1767     if ( // non-constant fullpel offset (3% of blocks)
1768         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1769          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1770         // uses more than 16 bits of subpel mv (only at huge resolution)
1771         || (dxx | dxy | dyx | dyy) & 15
1772         || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1773         // FIXME could still use mmx for some of the rows
1774         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1775                  shift, r, width, height);
1776         return;
1777     }
1778
1779     src += ix + iy * stride;
1780     if (need_emu) {
1781         emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1782         src = edge_buf;
1783     }
1784
1785     __asm__ volatile (
1786         "movd         %0, %%mm6         \n\t"
1787         "pxor      %%mm7, %%mm7         \n\t"
1788         "punpcklwd %%mm6, %%mm6         \n\t"
1789         "punpcklwd %%mm6, %%mm6         \n\t"
1790         :: "r"(1<<shift)
1791     );
1792
1793     for (x = 0; x < w; x += 4) {
1794         uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1795                             oxs - dxys + dxxs * (x + 1),
1796                             oxs - dxys + dxxs * (x + 2),
1797                             oxs - dxys + dxxs * (x + 3) };
1798         uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1799                             oys - dyys + dyxs * (x + 1),
1800                             oys - dyys + dyxs * (x + 2),
1801                             oys - dyys + dyxs * (x + 3) };
1802
1803         for (y = 0; y < h; y++) {
1804             __asm__ volatile (
1805                 "movq      %0, %%mm4    \n\t"
1806                 "movq      %1, %%mm5    \n\t"
1807                 "paddw     %2, %%mm4    \n\t"
1808                 "paddw     %3, %%mm5    \n\t"
1809                 "movq   %%mm4, %0       \n\t"
1810                 "movq   %%mm5, %1       \n\t"
1811                 "psrlw    $12, %%mm4    \n\t"
1812                 "psrlw    $12, %%mm5    \n\t"
1813                 : "+m"(*dx4), "+m"(*dy4)
1814                 : "m"(*dxy4), "m"(*dyy4)
1815             );
1816
1817             __asm__ volatile (
1818                 "movq      %%mm6, %%mm2 \n\t"
1819                 "movq      %%mm6, %%mm1 \n\t"
1820                 "psubw     %%mm4, %%mm2 \n\t"
1821                 "psubw     %%mm5, %%mm1 \n\t"
1822                 "movq      %%mm2, %%mm0 \n\t"
1823                 "movq      %%mm4, %%mm3 \n\t"
1824                 "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1825                 "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
1826                 "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
1827                 "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)
1828
1829                 "movd         %4, %%mm5 \n\t"
1830                 "movd         %3, %%mm4 \n\t"
1831                 "punpcklbw %%mm7, %%mm5 \n\t"
1832                 "punpcklbw %%mm7, %%mm4 \n\t"
1833                 "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1834                 "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1835
1836                 "movd         %2, %%mm5 \n\t"
1837                 "movd         %1, %%mm4 \n\t"
1838                 "punpcklbw %%mm7, %%mm5 \n\t"
1839                 "punpcklbw %%mm7, %%mm4 \n\t"
1840                 "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1841                 "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1842                 "paddw        %5, %%mm1 \n\t"
1843                 "paddw     %%mm3, %%mm2 \n\t"
1844                 "paddw     %%mm1, %%mm0 \n\t"
1845                 "paddw     %%mm2, %%mm0 \n\t"
1846
1847                 "psrlw        %6, %%mm0 \n\t"
1848                 "packuswb  %%mm0, %%mm0 \n\t"
1849                 "movd      %%mm0, %0    \n\t"
1850
1851                 : "=m"(dst[x + y * stride])
1852                 : "m"(src[0]), "m"(src[1]),
1853                   "m"(src[stride]), "m"(src[stride + 1]),
1854                   "m"(*r4), "m"(shift2)
1855             );
1856             src += stride;
1857         }
1858         src += 4 - h * stride;
1859     }
1860 }
1861
1862 #if HAVE_YASM
1863 #if ARCH_X86_32
1864 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1865                     int stride, int h, int ox, int oy,
1866                     int dxx, int dxy, int dyx, int dyy,
1867                     int shift, int r, int width, int height)
1868 {
1869     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1870         width, height, &emulated_edge_mc_mmx);
1871 }
1872 #endif
1873 static void gmc_sse(uint8_t *dst, uint8_t *src,
1874                     int stride, int h, int ox, int oy,
1875                     int dxx, int dxy, int dyx, int dyy,
1876                     int shift, int r, int width, int height)
1877 {
1878     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1879         width, height, &emulated_edge_mc_sse);
1880 }
1881 #else
1882 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1883                     int stride, int h, int ox, int oy,
1884                     int dxx, int dxy, int dyx, int dyy,
1885                     int shift, int r, int width, int height)
1886 {
1887     gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1888         width, height, &ff_emulated_edge_mc_8);
1889 }
1890 #endif
1891
1892 #define PREFETCH(name, op)                      \
1893 static void name(void *mem, int stride, int h)  \
1894 {                                               \
1895     const uint8_t *p = mem;                     \
1896     do {                                        \
1897         __asm__ volatile (#op" %0" :: "m"(*p)); \
1898         p += stride;                            \
1899     } while (--h);                              \
1900 }
1901
1902 PREFETCH(prefetch_mmxext, prefetcht0)
1903 PREFETCH(prefetch_3dnow, prefetch)
1904 #undef PREFETCH
1905
1906 #endif /* HAVE_INLINE_ASM */
1907
1908 #include "h264_qpel.c"
1909
1910 void ff_put_h264_chroma_mc8_rnd_mmx  (uint8_t *dst, uint8_t *src,
1911                                       int stride, int h, int x, int y);
1912 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1913                                        int stride, int h, int x, int y);
1914 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1915                                       int stride, int h, int x, int y);
1916
1917 void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
1918                                       int stride, int h, int x, int y);
1919 void ff_avg_h264_chroma_mc4_mmxext   (uint8_t *dst, uint8_t *src,
1920                                       int stride, int h, int x, int y);
1921 void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
1922                                       int stride, int h, int x, int y);
1923
1924 void ff_put_h264_chroma_mc2_mmxext   (uint8_t *dst, uint8_t *src,
1925                                       int stride, int h, int x, int y);
1926 void ff_avg_h264_chroma_mc2_mmxext   (uint8_t *dst, uint8_t *src,
1927                                       int stride, int h, int x, int y);
1928
1929 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1930                                       int stride, int h, int x, int y);
1931 void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
1932                                       int stride, int h, int x, int y);
1933
1934 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1935                                       int stride, int h, int x, int y);
1936 void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
1937                                       int stride, int h, int x, int y);
1938
1939 #define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
1940 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
1941                                       (uint8_t *dst, uint8_t *src,      \
1942                                        int stride, int h, int x, int y);
1943
1944 CHROMA_MC(put, 2, 10, mmxext)
1945 CHROMA_MC(avg, 2, 10, mmxext)
1946 CHROMA_MC(put, 4, 10, mmxext)
1947 CHROMA_MC(avg, 4, 10, mmxext)
1948 CHROMA_MC(put, 8, 10, sse2)
1949 CHROMA_MC(avg, 8, 10, sse2)
1950 CHROMA_MC(put, 8, 10, avx)
1951 CHROMA_MC(avg, 8, 10, avx)
1952
1953 #if HAVE_INLINE_ASM
1954
1955 /* CAVS-specific */
1956 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1957 {
1958     put_pixels8_mmx(dst, src, stride, 8);
1959 }
1960
1961 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1962 {
1963     avg_pixels8_mmx(dst, src, stride, 8);
1964 }
1965
1966 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1967 {
1968     put_pixels16_mmx(dst, src, stride, 16);
1969 }
1970
1971 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1972 {
1973     avg_pixels16_mmx(dst, src, stride, 16);
1974 }
1975
1976 /* VC-1-specific */
1977 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1978                                int stride, int rnd)
1979 {
1980     put_pixels8_mmx(dst, src, stride, 8);
1981 }
1982
1983 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1984                                   int stride, int rnd)
1985 {
1986     avg_pixels8_mmxext(dst, src, stride, 8);
1987 }
1988
1989 /* only used in VP3/5/6 */
1990 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
1991 {
1992 //    START_TIMER
1993     MOVQ_BFE(mm6);
1994     __asm__ volatile(
1995         "1:                             \n\t"
1996         "movq   (%1), %%mm0             \n\t"
1997         "movq   (%2), %%mm1             \n\t"
1998         "movq   (%1,%4), %%mm2          \n\t"
1999         "movq   (%2,%4), %%mm3          \n\t"
2000         PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
2001         "movq   %%mm4, (%3)             \n\t"
2002         "movq   %%mm5, (%3,%4)          \n\t"
2003
2004         "movq   (%1,%4,2), %%mm0        \n\t"
2005         "movq   (%2,%4,2), %%mm1        \n\t"
2006         "movq   (%1,%5), %%mm2          \n\t"
2007         "movq   (%2,%5), %%mm3          \n\t"
2008         "lea    (%1,%4,4), %1           \n\t"
2009         "lea    (%2,%4,4), %2           \n\t"
2010         PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
2011         "movq   %%mm4, (%3,%4,2)        \n\t"
2012         "movq   %%mm5, (%3,%5)          \n\t"
2013         "lea    (%3,%4,4), %3           \n\t"
2014         "subl   $4, %0                  \n\t"
2015         "jnz    1b                      \n\t"
2016         :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
2017         :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
2018         :"memory");
2019 //    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
2020 }
2021 static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
2022 {
2023     put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
2024     put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
2025 }
2026
2027 #if CONFIG_DIRAC_DECODER
2028 #define DIRAC_PIXOP(OPNAME, EXT)\
2029 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2030 {\
2031     OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
2032 }\
2033 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2034 {\
2035     OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
2036 }\
2037 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2038 {\
2039     OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
2040     OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
2041 }
2042
2043 DIRAC_PIXOP(put, mmx)
2044 DIRAC_PIXOP(avg, mmx)
2045 DIRAC_PIXOP(avg, mmxext)
2046
2047 #if HAVE_YASM
2048 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2049 {
2050     ff_put_pixels16_sse2(dst, src[0], stride, h);
2051 }
2052 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2053 {
2054     ff_avg_pixels16_sse2(dst, src[0], stride, h);
2055 }
2056 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2057 {
2058     ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
2059     ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
2060 }
2061 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2062 {
2063     ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
2064     ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
2065 }
2066 #endif
2067 #endif
2068
2069 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
2070  * converted. */
2071 #if CONFIG_GPL
2072 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
2073                                     DCTELEM *block)
2074 {
2075     ff_mmx_idct(block);
2076     ff_put_pixels_clamped_mmx(block, dest, line_size);
2077 }
2078
2079 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
2080                                     DCTELEM *block)
2081 {
2082     ff_mmx_idct(block);
2083     ff_add_pixels_clamped_mmx(block, dest, line_size);
2084 }
2085
2086 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
2087                                      DCTELEM *block)
2088 {
2089     ff_mmxext_idct(block);
2090     ff_put_pixels_clamped_mmx(block, dest, line_size);
2091 }
2092
2093 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
2094                                      DCTELEM *block)
2095 {
2096     ff_mmxext_idct(block);
2097     ff_add_pixels_clamped_mmx(block, dest, line_size);
2098 }
2099 #endif
2100
2101 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2102 {
2103     int i;
2104     __asm__ volatile ("pxor %%mm7, %%mm7":);
2105     for (i = 0; i < blocksize; i += 2) {
2106         __asm__ volatile (
2107             "movq       %0, %%mm0   \n\t"
2108             "movq       %1, %%mm1   \n\t"
2109             "movq    %%mm0, %%mm2   \n\t"
2110             "movq    %%mm1, %%mm3   \n\t"
2111             "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
2112             "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
2113             "pslld     $31, %%mm2   \n\t" // keep only the sign bit
2114             "pxor    %%mm2, %%mm1   \n\t"
2115             "movq    %%mm3, %%mm4   \n\t"
2116             "pand    %%mm1, %%mm3   \n\t"
2117             "pandn   %%mm1, %%mm4   \n\t"
2118             "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2119             "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2120             "movq    %%mm3, %1      \n\t"
2121             "movq    %%mm0, %0      \n\t"
2122             : "+m"(mag[i]), "+m"(ang[i])
2123             :: "memory"
2124         );
2125     }
2126     __asm__ volatile ("femms");
2127 }
2128
2129 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2130 {
2131     int i;
2132
2133     __asm__ volatile (
2134         "movaps  %0, %%xmm5 \n\t"
2135         :: "m"(ff_pdw_80000000[0])
2136     );
2137     for (i = 0; i < blocksize; i += 4) {
2138         __asm__ volatile (
2139             "movaps      %0, %%xmm0 \n\t"
2140             "movaps      %1, %%xmm1 \n\t"
2141             "xorps   %%xmm2, %%xmm2 \n\t"
2142             "xorps   %%xmm3, %%xmm3 \n\t"
2143             "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2144             "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2145             "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2146             "xorps   %%xmm2, %%xmm1 \n\t"
2147             "movaps  %%xmm3, %%xmm4 \n\t"
2148             "andps   %%xmm1, %%xmm3 \n\t"
2149             "andnps  %%xmm1, %%xmm4 \n\t"
2150             "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2151             "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2152             "movaps  %%xmm3, %1     \n\t"
2153             "movaps  %%xmm0, %0     \n\t"
2154             : "+m"(mag[i]), "+m"(ang[i])
2155             :: "memory"
2156         );
2157     }
2158 }
2159
2160 #if HAVE_6REGS
2161 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
2162                                         const float *src1, const float *win,
2163                                         int len)
2164 {
2165     x86_reg i = -len * 4;
2166     x86_reg j =  len * 4 - 8;
2167     __asm__ volatile (
2168         "1:                             \n"
2169         "pswapd (%5, %1), %%mm1         \n"
2170         "movq   (%5, %0), %%mm0         \n"
2171         "pswapd (%4, %1), %%mm5         \n"
2172         "movq   (%3, %0), %%mm4         \n"
2173         "movq      %%mm0, %%mm2         \n"
2174         "movq      %%mm1, %%mm3         \n"
2175         "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
2176         "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
2177         "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
2178         "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
2179         "pfadd     %%mm3, %%mm2         \n"
2180         "pfsub     %%mm0, %%mm1         \n"
2181         "pswapd    %%mm2, %%mm2         \n"
2182         "movq      %%mm1, (%2, %0)      \n"
2183         "movq      %%mm2, (%2, %1)      \n"
2184         "sub          $8, %1            \n"
2185         "add          $8, %0            \n"
2186         "jl           1b                \n"
2187         "femms                          \n"
2188         : "+r"(i), "+r"(j)
2189         : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2190     );
2191 }
2192
2193 static void vector_fmul_window_sse(float *dst, const float *src0,
2194                                    const float *src1, const float *win, int len)
2195 {
2196     x86_reg i = -len * 4;
2197     x86_reg j =  len * 4 - 16;
2198     __asm__ volatile (
2199         "1:                             \n"
2200         "movaps      (%5, %1), %%xmm1   \n"
2201         "movaps      (%5, %0), %%xmm0   \n"
2202         "movaps      (%4, %1), %%xmm5   \n"
2203         "movaps      (%3, %0), %%xmm4   \n"
2204         "shufps $0x1b, %%xmm1, %%xmm1   \n"
2205         "shufps $0x1b, %%xmm5, %%xmm5   \n"
2206         "movaps        %%xmm0, %%xmm2   \n"
2207         "movaps        %%xmm1, %%xmm3   \n"
2208         "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
2209         "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
2210         "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
2211         "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
2212         "addps         %%xmm3, %%xmm2   \n"
2213         "subps         %%xmm0, %%xmm1   \n"
2214         "shufps $0x1b, %%xmm2, %%xmm2   \n"
2215         "movaps        %%xmm1, (%2, %0) \n"
2216         "movaps        %%xmm2, (%2, %1) \n"
2217         "sub              $16, %1       \n"
2218         "add              $16, %0       \n"
2219         "jl                1b           \n"
2220         : "+r"(i), "+r"(j)
2221         : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2222     );
2223 }
2224 #endif /* HAVE_6REGS */
2225
2226 static void vector_clipf_sse(float *dst, const float *src,
2227                              float min, float max, int len)
2228 {
2229     x86_reg i = (len - 16) * 4;
2230     __asm__ volatile (
2231         "movss          %3, %%xmm4      \n\t"
2232         "movss          %4, %%xmm5      \n\t"
2233         "shufps $0, %%xmm4, %%xmm4      \n\t"
2234         "shufps $0, %%xmm5, %%xmm5      \n\t"
2235         "1:                             \n\t"
2236         "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
2237         "movaps 16(%2, %0), %%xmm1      \n\t"
2238         "movaps 32(%2, %0), %%xmm2      \n\t"
2239         "movaps 48(%2, %0), %%xmm3      \n\t"
2240         "maxps      %%xmm4, %%xmm0      \n\t"
2241         "maxps      %%xmm4, %%xmm1      \n\t"
2242         "maxps      %%xmm4, %%xmm2      \n\t"
2243         "maxps      %%xmm4, %%xmm3      \n\t"
2244         "minps      %%xmm5, %%xmm0      \n\t"
2245         "minps      %%xmm5, %%xmm1      \n\t"
2246         "minps      %%xmm5, %%xmm2      \n\t"
2247         "minps      %%xmm5, %%xmm3      \n\t"
2248         "movaps     %%xmm0,   (%1, %0)  \n\t"
2249         "movaps     %%xmm1, 16(%1, %0)  \n\t"
2250         "movaps     %%xmm2, 32(%1, %0)  \n\t"
2251         "movaps     %%xmm3, 48(%1, %0)  \n\t"
2252         "sub           $64, %0          \n\t"
2253         "jge            1b              \n\t"
2254         : "+&r"(i)
2255         : "r"(dst), "r"(src), "m"(min), "m"(max)
2256         : "memory"
2257     );
2258 }
2259
2260 #endif /* HAVE_INLINE_ASM */
2261
2262 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
2263                                       int order);
2264 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2265                                     int order);
2266 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
2267                                                const int16_t *v3,
2268                                                int order, int mul);
2269 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2270                                              const int16_t *v3,
2271                                              int order, int mul);
2272 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2273                                               const int16_t *v3,
2274                                               int order, int mul);
2275
2276 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
2277                                         const int16_t *window, unsigned int len);
2278 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
2279                                       const int16_t *window, unsigned int len);
2280 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
2281                                   const int16_t *window, unsigned int len);
2282 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
2283                                 const int16_t *window, unsigned int len);
2284 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
2285                                  const int16_t *window, unsigned int len);
2286 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2287                                       const int16_t *window, unsigned int len);
2288
2289 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2290 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2291
2292 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
2293                                           const uint8_t *diff, int w,
2294                                           int *left, int *left_top);
2295 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2296                                        int w, int left);
2297 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2298                                       int w, int left);
2299
2300 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2301
2302 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2303                                 const float *src1, int len);
2304 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2305                                 const float *src1, int len);
2306
2307 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2308                             const float *src2, int len);
2309 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2310                             const float *src2, int len);
2311
2312 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
2313                                    int32_t min, int32_t max, unsigned int len);
2314 void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
2315                                    int32_t min, int32_t max, unsigned int len);
2316 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2317                                    int32_t min, int32_t max, unsigned int len);
2318 void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
2319                                    int32_t min, int32_t max, unsigned int len);
2320
2321 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2322                                                 const float *src1, int len);
2323 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2324                                                 const float *src1, int len);
2325
2326 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
2327     do {                                                                     \
2328     c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2329     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2330     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2331     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2332     c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2333     c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2334     c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2335     c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2336     c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2337     c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2338     c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2339     c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2340     c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2341     c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2342     c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2343     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2344     } while (0)
2345
2346 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
2347     do {                                                                        \
2348         c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
2349         c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
2350         c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
2351         c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2352     } while (0)
2353
2354 #define H264_QPEL_FUNCS(x, y, CPU)                                                            \
2355     do {                                                                                      \
2356         c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2357         c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
2358         c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2359         c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
2360     } while (0)
2361
2362 #define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
2363     do {                                                                                            \
2364         c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2365         c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
2366         c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2367         c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
2368     } while (0)
2369
2370 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2371 {
2372     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2373
2374 #if HAVE_INLINE_ASM
2375     c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
2376     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2377     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
2378
2379     if (!high_bit_depth) {
2380         c->clear_block  = clear_block_mmx;
2381         c->clear_blocks = clear_blocks_mmx;
2382         c->draw_edges   = draw_edges_mmx;
2383
2384         SET_HPEL_FUNCS(put,        0, 16, mmx);
2385         SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2386         SET_HPEL_FUNCS(avg,        0, 16, mmx);
2387         SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2388         SET_HPEL_FUNCS(put,        1,  8, mmx);
2389         SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
2390         SET_HPEL_FUNCS(avg,        1,  8, mmx);
2391         SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
2392     }
2393
2394 #if ARCH_X86_32 || !HAVE_YASM
2395     c->gmc = gmc_mmx;
2396 #endif
2397
2398     c->add_bytes = add_bytes_mmx;
2399
2400     c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
2401     c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
2402
2403     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2404         c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2405         c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2406     }
2407 #endif /* HAVE_INLINE_ASM */
2408
2409 #if HAVE_YASM
2410 #if ARCH_X86_32
2411     if (!high_bit_depth)
2412         c->emulated_edge_mc = emulated_edge_mc_mmx;
2413 #endif
2414
2415     if (!high_bit_depth && CONFIG_H264CHROMA) {
2416         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
2417         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2418     }
2419
2420     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2421 #endif
2422
2423 }
2424
2425 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
2426                                 int mm_flags)
2427 {
2428     const int bit_depth      = avctx->bits_per_raw_sample;
2429     const int high_bit_depth = bit_depth > 8;
2430
2431 #if HAVE_INLINE_ASM
2432     c->prefetch = prefetch_mmxext;
2433
2434     SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
2435     SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
2436     SET_QPEL_FUNCS(avg_2tap_qpel,   0, 16, mmxext, );
2437     SET_QPEL_FUNCS(avg_2tap_qpel,   1,  8, mmxext, );
2438
2439     SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
2440     SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
2441     SET_QPEL_FUNCS(put_2tap_qpel,   0, 16, mmxext, );
2442     SET_QPEL_FUNCS(put_2tap_qpel,   1,  8, mmxext, );
2443     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
2444     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
2445
2446     if (!high_bit_depth) {
2447         c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
2448         c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
2449
2450         c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
2451         c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
2452         c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
2453
2454         c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
2455         c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
2456
2457         c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
2458         c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
2459         c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
2460     }
2461
2462     if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2463         if (!high_bit_depth) {
2464             c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
2465             c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
2466             c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
2467             c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2468
2469             c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2470             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
2471         }
2472     }
2473
2474     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2475                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
2476         c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2477         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2478     }
2479 #endif /* HAVE_INLINE_ASM */
2480
2481 #if HAVE_MMXEXT_EXTERNAL
2482     if (CONFIG_H264QPEL) {
2483         if (!high_bit_depth) {
2484             SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2485             SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
2486             SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
2487             SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2488             SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
2489             SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
2490         } else if (bit_depth == 10) {
2491 #if !ARCH_X86_64
2492             SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2493             SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2494             SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
2495             SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
2496 #endif
2497             SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
2498             SET_QPEL_FUNCS(avg_h264_qpel, 2, 4,  10_mmxext, ff_);
2499         }
2500     }
2501
2502     if (!high_bit_depth && CONFIG_H264CHROMA) {
2503         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
2504         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
2505         c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
2506         c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
2507     }
2508     if (bit_depth == 10 && CONFIG_H264CHROMA) {
2509         c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2510         c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2511         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2512         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2513     }
2514
2515     /* slower than cmov version on AMD */
2516     if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2517         c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
2518
2519     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
2520     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
2521
2522     if (avctx->flags & CODEC_FLAG_BITEXACT) {
2523         c->apply_window_int16 = ff_apply_window_int16_mmxext;
2524     } else {
2525         c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
2526     }
2527 #endif /* HAVE_MMXEXT_EXTERNAL */
2528 }
2529
2530 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2531                                int mm_flags)
2532 {
2533     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2534
2535 #if HAVE_INLINE_ASM
2536     c->prefetch = prefetch_3dnow;
2537
2538     if (!high_bit_depth) {
2539         c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2540         c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2541
2542         c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2543         c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2544         c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2545
2546         c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2547         c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2548
2549         c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2550         c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2551         c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2552
2553         if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2554             c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2555             c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2556             c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2557             c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2558
2559             c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2560             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2561         }
2562     }
2563
2564     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2565                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
2566         c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2567         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2568     }
2569
2570     c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2571 #endif /* HAVE_INLINE_ASM */
2572
2573 #if HAVE_YASM
2574     if (!high_bit_depth && CONFIG_H264CHROMA) {
2575         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
2576         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2577     }
2578 #endif /* HAVE_YASM */
2579 }
2580
2581 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
2582                                   int mm_flags)
2583 {
2584 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
2585     c->vector_fmul_window  = vector_fmul_window_3dnowext;
2586 #endif
2587 }
2588
2589 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2590 {
2591     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2592
2593 #if HAVE_INLINE_ASM
2594     if (!high_bit_depth) {
2595         if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2596             /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2597             c->clear_block  = clear_block_sse;
2598             c->clear_blocks = clear_blocks_sse;
2599         }
2600     }
2601
2602     c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2603
2604 #if HAVE_6REGS
2605     c->vector_fmul_window = vector_fmul_window_sse;
2606 #endif
2607
2608     c->vector_clipf = vector_clipf_sse;
2609 #endif /* HAVE_INLINE_ASM */
2610
2611 #if HAVE_YASM
2612     c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
2613     c->vector_fmul_add     = ff_vector_fmul_add_sse;
2614
2615     c->scalarproduct_float          = ff_scalarproduct_float_sse;
2616     c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2617
2618     if (!high_bit_depth)
2619         c->emulated_edge_mc = emulated_edge_mc_sse;
2620 #if HAVE_INLINE_ASM
2621     c->gmc = gmc_sse;
2622 #endif
2623 #endif /* HAVE_YASM */
2624 }
2625
2626 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2627                               int mm_flags)
2628 {
2629     const int bit_depth      = avctx->bits_per_raw_sample;
2630     const int high_bit_depth = bit_depth > 8;
2631
2632 #if HAVE_SSE2_INLINE
2633     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2634         c->idct_put              = ff_idct_xvid_sse2_put;
2635         c->idct_add              = ff_idct_xvid_sse2_add;
2636         c->idct                  = ff_idct_xvid_sse2;
2637         c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2638     }
2639 #endif /* HAVE_SSE2_INLINE */
2640
2641 #if HAVE_SSE2_EXTERNAL
2642     if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2643         // these functions are slower than mmx on AMD, but faster on Intel
2644         if (!high_bit_depth) {
2645             c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
2646             c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2647             c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
2648             if (CONFIG_H264QPEL)
2649                 H264_QPEL_FUNCS(0, 0, sse2);
2650         }
2651     }
2652
2653     if (!high_bit_depth && CONFIG_H264QPEL) {
2654         H264_QPEL_FUNCS(0, 1, sse2);
2655         H264_QPEL_FUNCS(0, 2, sse2);
2656         H264_QPEL_FUNCS(0, 3, sse2);
2657         H264_QPEL_FUNCS(1, 1, sse2);
2658         H264_QPEL_FUNCS(1, 2, sse2);
2659         H264_QPEL_FUNCS(1, 3, sse2);
2660         H264_QPEL_FUNCS(2, 1, sse2);
2661         H264_QPEL_FUNCS(2, 2, sse2);
2662         H264_QPEL_FUNCS(2, 3, sse2);
2663         H264_QPEL_FUNCS(3, 1, sse2);
2664         H264_QPEL_FUNCS(3, 2, sse2);
2665         H264_QPEL_FUNCS(3, 3, sse2);
2666     }
2667
2668     if (bit_depth == 10) {
2669         if (CONFIG_H264QPEL) {
2670             SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2671             SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
2672             SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2673             SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
2674             H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2675             H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2676             H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2677         }
2678         if (CONFIG_H264CHROMA) {
2679             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2680             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2681         }
2682     }
2683
2684     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
2685     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2686     if (mm_flags & AV_CPU_FLAG_ATOM) {
2687         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2688     } else {
2689         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2690     }
2691     if (avctx->flags & CODEC_FLAG_BITEXACT) {
2692         c->apply_window_int16 = ff_apply_window_int16_sse2;
2693     } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2694         c->apply_window_int16 = ff_apply_window_int16_round_sse2;
2695     }
2696     c->bswap_buf = ff_bswap32_buf_sse2;
2697 #endif /* HAVE_SSE2_EXTERNAL */
2698 }
2699
2700 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2701                                int mm_flags)
2702 {
2703 #if HAVE_SSSE3_EXTERNAL
2704     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2705     const int bit_depth      = avctx->bits_per_raw_sample;
2706
2707     if (!high_bit_depth && CONFIG_H264QPEL) {
2708         H264_QPEL_FUNCS(1, 0, ssse3);
2709         H264_QPEL_FUNCS(1, 1, ssse3);
2710         H264_QPEL_FUNCS(1, 2, ssse3);
2711         H264_QPEL_FUNCS(1, 3, ssse3);
2712         H264_QPEL_FUNCS(2, 0, ssse3);
2713         H264_QPEL_FUNCS(2, 1, ssse3);
2714         H264_QPEL_FUNCS(2, 2, ssse3);
2715         H264_QPEL_FUNCS(2, 3, ssse3);
2716         H264_QPEL_FUNCS(3, 0, ssse3);
2717         H264_QPEL_FUNCS(3, 1, ssse3);
2718         H264_QPEL_FUNCS(3, 2, ssse3);
2719         H264_QPEL_FUNCS(3, 3, ssse3);
2720     }
2721     if (bit_depth == 10 && CONFIG_H264QPEL) {
2722         H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2723         H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2724         H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2725     }
2726     if (!high_bit_depth && CONFIG_H264CHROMA) {
2727         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2728         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2729         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2730         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2731     }
2732     c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2733     if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2734         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2735
2736     if (mm_flags & AV_CPU_FLAG_ATOM)
2737         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2738     else
2739         c->apply_window_int16 = ff_apply_window_int16_ssse3;
2740     if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2741         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2742     c->bswap_buf = ff_bswap32_buf_ssse3;
2743 #endif /* HAVE_SSSE3_EXTERNAL */
2744 }
2745
2746 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2747                               int mm_flags)
2748 {
2749 #if HAVE_SSE4_EXTERNAL
2750     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2751 #endif /* HAVE_SSE4_EXTERNAL */
2752 }
2753
2754 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2755 {
2756 #if HAVE_AVX_EXTERNAL
2757     const int bit_depth = avctx->bits_per_raw_sample;
2758
2759     if (bit_depth == 10) {
2760         // AVX implies !cache64.
2761         // TODO: Port cache(32|64) detection from x264.
2762         if (CONFIG_H264QPEL) {
2763             H264_QPEL_FUNCS_10(1, 0, sse2);
2764             H264_QPEL_FUNCS_10(2, 0, sse2);
2765             H264_QPEL_FUNCS_10(3, 0, sse2);
2766         }
2767
2768         if (CONFIG_H264CHROMA) {
2769             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2770             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2771         }
2772     }
2773     c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2774     c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
2775     c->vector_fmul_add = ff_vector_fmul_add_avx;
2776 #endif /* HAVE_AVX_EXTERNAL */
2777 }
2778
2779 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2780 {
2781     int mm_flags = av_get_cpu_flags();
2782
2783 #if HAVE_7REGS && HAVE_INLINE_ASM
2784     if (mm_flags & AV_CPU_FLAG_CMOV)
2785         c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2786 #endif
2787
2788     if (mm_flags & AV_CPU_FLAG_MMX) {
2789 #if HAVE_INLINE_ASM
2790         const int idct_algo = avctx->idct_algo;
2791
2792         if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2793             if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
2794                 c->idct_put              = ff_simple_idct_put_mmx;
2795                 c->idct_add              = ff_simple_idct_add_mmx;
2796                 c->idct                  = ff_simple_idct_mmx;
2797                 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
2798 #if CONFIG_GPL
2799             } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
2800                 if (mm_flags & AV_CPU_FLAG_MMX2) {
2801                     c->idct_put = ff_libmpeg2mmx2_idct_put;
2802                     c->idct_add = ff_libmpeg2mmx2_idct_add;
2803                     c->idct     = ff_mmxext_idct;
2804                 } else {
2805                     c->idct_put = ff_libmpeg2mmx_idct_put;
2806                     c->idct_add = ff_libmpeg2mmx_idct_add;
2807                     c->idct     = ff_mmx_idct;
2808                 }
2809                 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2810 #endif
2811             } else if (idct_algo == FF_IDCT_XVIDMMX) {
2812                 if (mm_flags & AV_CPU_FLAG_SSE2) {
2813                     c->idct_put              = ff_idct_xvid_sse2_put;
2814                     c->idct_add              = ff_idct_xvid_sse2_add;
2815                     c->idct                  = ff_idct_xvid_sse2;
2816                     c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2817                 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
2818                     c->idct_put              = ff_idct_xvid_mmxext_put;
2819                     c->idct_add              = ff_idct_xvid_mmxext_add;
2820                     c->idct                  = ff_idct_xvid_mmxext;
2821                 } else {
2822                     c->idct_put              = ff_idct_xvid_mmx_put;
2823                     c->idct_add              = ff_idct_xvid_mmx_add;
2824                     c->idct                  = ff_idct_xvid_mmx;
2825                 }
2826             }
2827         }
2828 #endif /* HAVE_INLINE_ASM */
2829
2830         dsputil_init_mmx(c, avctx, mm_flags);
2831     }
2832
2833     if (mm_flags & AV_CPU_FLAG_MMXEXT)
2834         dsputil_init_mmxext(c, avctx, mm_flags);
2835
2836     if (mm_flags & AV_CPU_FLAG_3DNOW)
2837         dsputil_init_3dnow(c, avctx, mm_flags);
2838
2839     if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
2840         dsputil_init_3dnowext(c, avctx, mm_flags);
2841
2842     if (mm_flags & AV_CPU_FLAG_SSE)
2843         dsputil_init_sse(c, avctx, mm_flags);
2844
2845     if (mm_flags & AV_CPU_FLAG_SSE2)
2846         dsputil_init_sse2(c, avctx, mm_flags);
2847
2848     if (mm_flags & AV_CPU_FLAG_SSSE3)
2849         dsputil_init_ssse3(c, avctx, mm_flags);
2850
2851     if (mm_flags & AV_CPU_FLAG_SSE4)
2852         dsputil_init_sse4(c, avctx, mm_flags);
2853
2854     if (mm_flags & AV_CPU_FLAG_AVX)
2855         dsputil_init_avx(c, avctx, mm_flags);
2856
2857     if (CONFIG_ENCODERS)
2858         ff_dsputilenc_init_mmx(c, avctx);
2859 }