/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

//#undef NDEBUG
//#include <assert.h>

/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };


#if HAVE_YASM
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                     uint8_t *src2, int dstStride,
                                     int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                      int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride,
                                                 int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                        int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                        int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride,
                                                int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                        int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                        int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

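/* MOVQ_BFE fills regd with 0xfe in every byte without a memory load:
 * pcmpeqd regd,regd sets all bits (each byte 0xff), and paddb doubles
 * each byte, 0xff + 0xff -> 0xfe with the carry discarded. */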
#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// For shared libraries (PIC) it is better to generate these constants
// in registers than to access them through memory.
// pcmpeqd -> -1
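/* Derivation: pcmpeqd sets every bit, so each word is 0xFFFF; psrlw $15
 * leaves 0x0001 per word. MOVQ_BONE then packs words to bytes, giving
 * 0x01 per byte (ff_bone), while MOVQ_WTWO shifts left by one instead,
 * giving 0x0002 per word (ff_wtwo). */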
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t"::)

#endif

// regr is used as a temporary and holds the output result;
// the first argument is unmodified, the second is trashed.
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
    "movq   "#rega", "#regr"            \n\t"                    \
    "pand   "#regb", "#regr"            \n\t"                    \
    "pxor   "#rega", "#regb"            \n\t"                    \
    "pand  "#regfe", "#regb"            \n\t"                    \
    "psrlq       $1, "#regb"            \n\t"                    \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)                       \
    "movq   "#rega", "#regr"            \n\t"                    \
    "por    "#regb", "#regr"            \n\t"                    \
    "pxor   "#rega", "#regb"            \n\t"                    \
    "pand  "#regfe", "#regb"            \n\t"                    \
    "psrlq       $1, "#regb"            \n\t"                    \
    "psubb  "#regb", "#regr"            \n\t"
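/* Both macros build on the overflow-free byte-average identities
 *     (a + b) >> 1     == (a & b) + (((a ^ b) & 0xfe) >> 1)   (round down)
 *     (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)   (round up)
 * The 0xfe mask (regfe) clears the low bit of each byte so the single
 * 64-bit psrlq cannot shift bits across byte lanes. */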

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                    \
    "movq  "#regc", "#regp"             \n\t"                    \
    "pand  "#regb", "#regr"             \n\t"                    \
    "pand  "#regd", "#regp"             \n\t"                    \
    "pxor  "#rega", "#regb"             \n\t"                    \
    "pxor  "#regc", "#regd"             \n\t"                    \
    "pand    %%mm6, "#regb"             \n\t"                    \
    "pand    %%mm6, "#regd"             \n\t"                    \
    "psrlq      $1, "#regb"             \n\t"                    \
    "psrlq      $1, "#regd"             \n\t"                    \
    "paddb "#regb", "#regr"             \n\t"                    \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
    "movq  "#rega", "#regr"             \n\t"                    \
    "movq  "#regc", "#regp"             \n\t"                    \
    "por   "#regb", "#regr"             \n\t"                    \
    "por   "#regd", "#regp"             \n\t"                    \
    "pxor  "#rega", "#regb"             \n\t"                    \
    "pxor  "#regc", "#regd"             \n\t"                    \
    "pand    %%mm6, "#regb"             \n\t"                    \
    "pand    %%mm6, "#regd"             \n\t"                    \
    "psrlq      $1, "#regd"             \n\t"                    \
    "psrlq      $1, "#regb"             \n\t"                    \
    "psubb "#regb", "#regr"             \n\t"                    \
    "psubb "#regd", "#regp"             \n\t"

/***********************************/
/* MMX no rounding */
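/* The rnd/no-rnd variants are stamped out with the usual C "template"
 * trick: DEF, SET_RND, PAVGBP and PAVGB are defined, the template file
 * is #included to instantiate one set of functions, then the macros are
 * redefined and the template is included again. */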
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow

#include "dsputil_avg_template.c"

#undef DEF

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

#include "dsputil_avg_template.c"

#undef DEF

#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx

/***********************************/
/* standard MMX */

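/* Scalar sketch of the function below: for an 8x8 block,
 *     pixels[y * line_size + x] = av_clip_uint8(block[y * 8 + x]);
 * packuswb performs the unsigned clamp on eight int16 values at once. */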
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, hence the "r" constraint.
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1        \n\t"           \
    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
    "paddb              %%mm0, %%mm1        \n\t"           \
    "paddb              %%mm0, %%mm2        \n\t"           \
    "paddb              %%mm0, %%mm3        \n\t"           \
    "paddb              %%mm0, %%mm4        \n\t"           \
    "movq               %%mm1, (%0)         \n\t"           \
    "movq               %%mm2, (%0, %3)     \n\t"           \
    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
    "movq               %%mm4, (%0, %1)     \n\t"

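/* Scalar sketch of the function below:
 *     pixels[i] = av_clip(block[i], -128, 127) + 128;
 * packsswb performs the signed clamp; adding ff_pb_80 (0x80 per byte,
 * with paddb's wraparound) maps the signed range onto 0..255. */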
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

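/* Scalar sketch of the function below: add the residual to the existing
 * pixels with an unsigned clamp,
 *     pixels[i] = av_clip_uint8(pixels[i] + block[i]);
 * done here by widening with punpck*bw, adding with paddsw and clamping
 * with packuswb. */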
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels),  "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels),  "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

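/* Each generated function below is, in effect,
 *     memset(blocks, 0, n * 64 * sizeof(int16_t));
 * for n 8x8 blocks of coefficients, zeroing 32 bytes per loop iteration
 * with MMX stores. */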
#define CLEAR_BLOCKS(name, n)                           \
static void name(int16_t *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
    );
}

static void clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1,         %%"REG_a"   \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128,         %%"REG_a"   \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
    );
}

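/* The function below computes dst[i] += src[i] for w bytes: the MMX loop
 * handles 16 bytes per iteration (paddb adds per byte with wraparound,
 * matching the scalar uint8_t addition) and the scalar loop mops up the
 * remainder. */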
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
    );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

#if HAVE_7REGS
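/* HuffYUV median prediction: for each byte the predictor is
 *     pred = mid_pred(l, t, l + t - tl)
 * (the median of left, top and left+top-topleft); the cmov sequence
 * below computes that median branch-free, then adds the residual. */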
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
    );
    *left     = l;
    *left_top = tl;
}
#endif
#endif /* HAVE_INLINE_ASM */

void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);

#if HAVE_INLINE_ASM
/* Draw edges of width 'w' around an image of the given width and height;
 * this MMX version can only handle w == 8 or w == 16. */
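/* The border is filled by replication: each line's first/last pixel is
 * copied left/right, and the first/last lines are copied up/down, so
 * that motion compensation may safely read outside the picture. */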
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                                 \n\t"
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jb                1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM
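/* The qpelN_mcXY functions below implement MPEG-4 quarter-pel motion
 * compensation: X and Y are the horizontal and vertical quarter-pel
 * offsets (0..3), so mc00 is a plain copy, mc20 the horizontal half-pel
 * position, mc22 the diagonal half-pel position, and so on. Intermediate
 * half-pel planes (half, halfH, halfHV) are built in stack buffers and
 * combined with the pixelsN_l2 averaging helpers. */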
#define QPEL_OP(OPNAME, ROUNDER, RND, MMX)                              \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                   stride, 8);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
                                        stride, stride, 8);             \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,    \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                   stride, 8);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,     \
                                        stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                   8, stride);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
                                        stride, stride, 8);             \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src,            \
                                                   stride, stride);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                   8, stride);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
                                        stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,              \
                                        8, stride, 9);                  \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         ptrdiff_t stride)              \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           ptrdiff_t stride)            \
{                                                                       \
    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                    stride, 16);        \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
                                         stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,           \
                                                    stride, stride, 16);\
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                    stride, 16);        \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,            \
                                         stride, stride, 16);           \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                    stride);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
                                         stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src,           \
                                                    stride, stride);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                    stride);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,       \
                                         stride, stride, 16);           \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          ptrdiff_t stride)             \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
                                         stride, 16, 16);               \
1068 }                                                                       \
1069                                                                         \
1070 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
1071                                           ptrdiff_t stride)             \
1072 {                                                                       \
1073     uint64_t half[17 * 2];                                              \
1074     uint8_t * const halfH = ((uint8_t*)half);                           \
1075     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1076                                                     stride, 17);        \
1077     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
1078                                          stride, 17);                   \
1079     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1080                                                     stride, 16);        \
1081 }                                                                       \
1082                                                                         \
1083 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
1084                                           ptrdiff_t stride)             \
1085 {                                                                       \
1086     uint64_t half[17 * 2];                                              \
1087     uint8_t * const halfH = ((uint8_t*)half);                           \
1088     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1089                                                     stride, 17);        \
1090     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
1091                                          stride, 17);                   \
1092     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1093                                                     stride, 16);        \
1094 }                                                                       \
1095                                                                         \
1096 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
1097                                           ptrdiff_t stride)             \
1098 {                                                                       \
1099     uint64_t half[17 * 2];                                              \
1100     uint8_t * const halfH = ((uint8_t*)half);                           \
1101     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1102                                                     stride, 17);        \
1103     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1104                                                     stride, 16);        \
1105 }
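/* Instantiate the full set of quarter-pel MC functions: "put" and "avg"
 * with normal rounding, plus a no-rounding "put" variant; the
 * ff_pw_16/ff_pw_15 argument is the rounding constant used by the
 * lowpass filters. */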
QPEL_OP(put_,          ff_pw_16, _,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, mmxext)
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
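/* RV40 maps the (3,3) quarter-pel position onto the plain diagonal
 * half-pel interpolation, so these wrappers just reuse the xy2 hpel
 * helpers. */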
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

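/* Global motion compensation: bilinearly interpolate each output pixel
 * from the four source pixels around its affinely transformed position
 * (per output column the 16.16 fixed-point source position advances by
 * (dxx, dyx), per row by (dxy, dyy)).  The MMX path only handles blocks
 * with a constant full-pel offset, sub-pel vectors that fit in 16 bits
 * and a source area fully inside the frame; everything else falls back
 * to ff_gmc_c(). */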
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;

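    /* mm6 = s = (1 << shift) broadcast to four words, mm7 = zero for
     * the byte -> word unpacking below */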
    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1<<shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
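            /* advance the four per-pixel source coordinates by one row
             * and extract their sub-pel fractions into mm4/mm5 */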
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

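            /* bilinear blend of the four neighbouring source pixels:
             * dst = (src[0,0] * (s - dx) * (s - dy) +
             *        src[1,0] * dx       * (s - dy) +
             *        src[0,1] * (s - dx) * dy       +
             *        src[1,1] * dx       * dy       + r) >> (2 * shift) */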
            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
#endif /* HAVE_INLINE_ASM */

void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);

#if HAVE_INLINE_ASM

/* CAVS-specific */
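/* mc00 is the full-pel (0,0) position, so these wrappers are plain
 * copies and averages. */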
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

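/* Clamp len floats in src to [min, max], processing 64 bytes (16
 * floats) per iteration from the end of the buffer backwards; this
 * assumes 16-byte-aligned buffers (movaps) and len a multiple of 16. */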
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}

#endif /* HAVE_INLINE_ASM */

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

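/* Fill a 16-entry quarter-pel function table: entry x + 4 * y gets the
 * _mcXY_ function for quarter-pel position (x, y). */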
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

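/* Fill a 4-entry half-pel function table: full-pel copy, then the
 * horizontal (x2), vertical (y2) and diagonal (xy2) interpolations. */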
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

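/* Baseline MMX initialization; the !high_bit_depth checks gate code
 * paths that only handle 8-bit pixels. */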
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        [0], 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
        SET_HPEL_FUNCS(avg,        [0], 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
        SET_HPEL_FUNCS(put,        [1],  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
        SET_HPEL_FUNCS(avg,        [1],  8, mmx);

        switch (avctx->idct_algo) {
        case FF_IDCT_AUTO:
        case FF_IDCT_SIMPLEMMX:
            c->idct_put              = ff_simple_idct_put_mmx;
            c->idct_add              = ff_simple_idct_add_mmx;
            c->idct                  = ff_simple_idct_mmx;
            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            break;
        case FF_IDCT_XVIDMMX:
            c->idct_put              = ff_idct_xvid_mmx_put;
            c->idct_add              = ff_idct_xvid_mmx_add;
            c->idct                  = ff_idct_xvid_mmx;
            break;
        }
    }

    c->gmc = gmc_mmx;

    c->add_bytes = add_bytes_mmx;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                        int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_YASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
        }
    }
#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put = ff_idct_xvid_mmxext_put;
        c->idct_add = ff_idct_xvid_mmxext_add;
        c->idct     = ff_idct_xvid_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_YASM
    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */
}

static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }

    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;

    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;

    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

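/* Initialization is cumulative: each init function for a more capable
 * instruction set may overwrite function pointers installed by the
 * previous ones, so the fastest supported version ends up in the table. */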
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX)
        dsputil_init_mmx(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}