/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

//#undef NDEBUG
//#include <assert.h>

/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };


#if HAVE_YASM
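/* Prototypes for functions implemented in external (yasm) assembly. */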
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                     uint8_t *src2, int dstStride,
                                     int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                      int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}

void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride,
                                                 int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride,
                                                int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
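/* ".p2align 3" pads to an 8-byte (2^3) boundary, so the loop label that
 * follows a JUMPALIGN() starts on an aligned address. */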

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// For shared libraries (PIC) it is better to synthesize these constants
// in registers than to access them from memory.
// pcmpeqd -> -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t"::)

#endif

// using regr as the temporary and for the output result
// the first argument is unmodified and the second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
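// The averagers below rely on the byte-wise identities
//     (a + b)     >> 1 == (a & b) + (((a ^ b) & 0xfe) >> 1)    (round down)
//     (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xfe) >> 1)    (round up)
// Masking with 0xfe clears the low bit of each byte before the 64-bit psrlq
// so that no bit can leak into the neighbouring byte lane.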
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)                \
    "movq   "#rega", "#regr"            \n\t"                    \
    "pand   "#regb", "#regr"            \n\t"                    \
    "pxor   "#rega", "#regb"            \n\t"                    \
    "pand  "#regfe", "#regb"            \n\t"                    \
    "psrlq       $1, "#regb"            \n\t"                    \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)                       \
    "movq   "#rega", "#regr"            \n\t"                    \
    "por    "#regb", "#regr"            \n\t"                    \
    "pxor   "#rega", "#regb"            \n\t"                    \
    "pand  "#regfe", "#regb"            \n\t"                    \
    "psrlq       $1, "#regb"            \n\t"                    \
    "psubb  "#regb", "#regr"            \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                    \
    "movq  "#regc", "#regp"             \n\t"                    \
    "pand  "#regb", "#regr"             \n\t"                    \
    "pand  "#regd", "#regp"             \n\t"                    \
    "pxor  "#rega", "#regb"             \n\t"                    \
    "pxor  "#regc", "#regd"             \n\t"                    \
    "pand    %%mm6, "#regb"             \n\t"                    \
    "pand    %%mm6, "#regd"             \n\t"                    \
    "psrlq      $1, "#regb"             \n\t"                    \
    "psrlq      $1, "#regd"             \n\t"                    \
    "paddb "#regb", "#regr"             \n\t"                    \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)           \
    "movq  "#rega", "#regr"             \n\t"                    \
    "movq  "#regc", "#regp"             \n\t"                    \
    "por   "#regb", "#regr"             \n\t"                    \
    "por   "#regd", "#regp"             \n\t"                    \
    "pxor  "#rega", "#regb"             \n\t"                    \
    "pxor  "#regc", "#regd"             \n\t"                    \
    "pand    %%mm6, "#regb"             \n\t"                    \
    "pand    %%mm6, "#regd"             \n\t"                    \
    "psrlq      $1, "#regd"             \n\t"                    \
    "psrlq      $1, "#regb"             \n\t"                    \
    "psubb "#regb", "#regr"             \n\t"                    \
    "psubb "#regd", "#regp"             \n\t"

/***********************************/
/* MMX no rounding */
#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow

#include "dsputil_avg_template.c"

#undef DEF

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

#include "dsputil_avg_template.c"

#undef DEF

#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
#define put_pixels8_mmxext put_pixels8_mmx
#define put_pixels4_mmxext put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
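    /* packuswb saturates each signed 16-bit coefficient to the unsigned
     * 8-bit range [0, 255], which implements the clamping. */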
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, hence the "r" constraint here.
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1        \n\t"           \
    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
    "paddb              %%mm0, %%mm1        \n\t"           \
    "paddb              %%mm0, %%mm2        \n\t"           \
    "paddb              %%mm0, %%mm3        \n\t"           \
    "paddb              %%mm0, %%mm4        \n\t"           \
    "movq               %%mm1, (%0)         \n\t"           \
    "movq               %%mm2, (%0, %3)     \n\t"           \
    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
    "movq               %%mm4, (%0, %1)     \n\t"

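/* packsswb clamps the coefficients to the signed range [-128, 127]; adding
 * the ff_pb_80 bias byte-wise (paddb wraps modulo 256) then shifts that
 * range to the unsigned pixel range [0, 255]. */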
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels),  "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels),  "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

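/* Zero n consecutive DCT blocks of 64 int16_t coefficients (128 bytes each).
 * The index register counts up from -128 * n to zero, clearing 32 bytes per
 * iteration. */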
#define CLEAR_BLOCKS(name, n)                           \
static void name(int16_t *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
    );
}

static void clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1,         %%"REG_a"   \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128,         %%"REG_a"   \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
    );
}

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
    );
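    /* scalar tail: add the remaining w % 16 bytes (and all of w < 16) */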
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

#if HAVE_7REGS
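/* HuffYUV median prediction: for each byte,
 *     dst[i] = diff[i] + median(left, top[i], left + top[i] - top_left),
 * computed branchlessly with cmov. */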
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
    );
    *left     = l;
    *left_top = tl;
}
#endif
#endif /* HAVE_INLINE_ASM */

void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);

#if HAVE_INLINE_ASM
/* Draw edges of width 'w' around an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
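    /* The punpckl{bw,wd,dq} sequence replicates the first byte of each line
     * across 8 bytes for the left edge; the punpckh variants replicate the
     * last byte for the right edge. */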
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                                 \n\t"
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jb                1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#endif /* HAVE_INLINE_ASM */


#if HAVE_YASM
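/* OPNAME ## qpelN_mcXY interpolates an N x N block at quarter-pel offset
 * (X/4, Y/4): mc20 is the horizontal half-pel case, mc02 the vertical one
 * and mc22 both. The quarter-pel phases are built by averaging (the *_l2
 * helpers) the source with a half-pel lowpass plane, or two such planes. */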
#define QPEL_OP(OPNAME, ROUNDER, RND, MMX)                              \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                   stride, 8);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
                                        stride, stride, 8);             \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,    \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
                                                   stride, 8);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,     \
                                        stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                   8, stride);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
                                        stride, stride, 8);             \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src,            \
                                                   stride, stride);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
                                                   8, stride);          \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
                                        stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
                                        stride, 8, 8);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,              \
                                        8, stride, 9);                  \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
                                        stride, 9);                     \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
                                                   stride, 9);          \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
                                                   stride, 8);          \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                    stride, 16);        \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
                                         stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,           \
                                                    stride, stride, 16);\
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
                                                    stride, 16);        \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,            \
                                         stride, stride, 16);           \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                    stride);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
                                         stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src,           \
                                                    stride, stride);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
                                                    stride);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,       \
                                         stride, stride, 16);           \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
                                         stride, 17);                   \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
                                                    16, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
                                         stride, 16, 16);               \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
                                                    stride, 17);        \
1064     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
1065                                          stride, 17);                   \
1066     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
1067                                                     16, 16);            \
1068     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
1069                                          stride, 16, 16);               \
1070 }                                                                       \
1071                                                                         \
1072 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
1073                                           int stride)                   \
1074 {                                                                       \
1075     uint64_t half[16 * 2 + 17 * 2];                                     \
1076     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1077     uint8_t * const halfHV = ((uint8_t*)half);                          \
1078     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1079                                                     stride, 17);        \
1080     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
1081                                                     16, 16);            \
1082     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
1083                                          stride, 16, 16);               \
1084 }                                                                       \
1085                                                                         \
1086 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
1087                                           int stride)                   \
1088 {                                                                       \
1089     uint64_t half[16 * 2 + 17 * 2];                                     \
1090     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
1091     uint8_t * const halfHV = ((uint8_t*)half);                          \
1092     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1093                                                     stride, 17);        \
1094     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
1095                                                     16, 16);            \
1096     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
1097                                          stride, 16, 16);               \
1098 }                                                                       \
1099                                                                         \
1100 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
1101                                           int stride)                   \
1102 {                                                                       \
1103     uint64_t half[17 * 2];                                              \
1104     uint8_t * const halfH = ((uint8_t*)half);                           \
1105     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1106                                                     stride, 17);        \
1107     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
1108                                          stride, 17);                   \
1109     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1110                                                     stride, 16);        \
1111 }                                                                       \
1112                                                                         \
1113 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
1114                                           int stride)                   \
1115 {                                                                       \
1116     uint64_t half[17 * 2];                                              \
1117     uint8_t * const halfH = ((uint8_t*)half);                           \
1118     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1119                                                     stride, 17);        \
1120     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
1121                                          stride, 17);                   \
1122     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1123                                                     stride, 16);        \
1124 }                                                                       \
1125                                                                         \
1126 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
1127                                           int stride)                   \
1128 {                                                                       \
1129     uint64_t half[17 * 2];                                              \
1130     uint8_t * const halfH = ((uint8_t*)half);                           \
1131     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
1132                                                     stride, 17);        \
1133     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
1134                                                     stride, 16);        \
1135 }
1136
1137 QPEL_OP(put_,          ff_pw_16, _,        mmxext)
1138 QPEL_OP(avg_,          ff_pw_16, _,        mmxext)
1139 QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, mmxext)
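
/* Each QPEL_OP(OPNAME, ROUNDER, RND, MMX) instantiation above expands to the
 * complete set of 16 quarter-pel motion compensation functions per block
 * size (mc00 ... mc33, as wired up by SET_QPEL_FUNCS() below); e.g.
 * QPEL_OP(put_, ff_pw_16, _, mmxext) defines put_qpel16_mc01_mmxext and
 * friends. In the mcXY suffix, X is the horizontal and Y the vertical
 * quarter-sample offset. */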
#endif /* HAVE_YASM */


#if HAVE_INLINE_ASM
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
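
/* RV40's mc33 case (quarter-pel offset (3, 3)) is served by the plain
 * half-pel xy2 kernel above. As a rough scalar sketch (assuming the usual
 * +2 rounding bias of the rounding variants), the xy2 average computes
 *
 *     dst[x] = (src[x] + src[x + 1] +
 *               src[x + stride] + src[x + stride + 1] + 2) >> 2;
 *
 * for every pixel of the block. */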

static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
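
    /* Broadcast the scale factor s = 1 << shift into all four words of mm6
     * (two punpcklwd passes replicate the low word), and clear mm7, which
     * serves as the zero register for the byte -> word unpacking in the
     * per-row blend below. */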
    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1<<shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };
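
        /* Per row, the first asm block below advances the per-column subpel
         * coordinates dx4/dy4 by one row step and shifts out the integer
         * part, leaving the fractional weights dx and dy; the second block
         * then gathers the four neighbouring source pixels and blends them
         * bilinearly, as annotated on the pmullw instructions. */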
        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
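
/* For reference, each output pixel computed above corresponds to the
 * bilinear formula also used by ff_gmc_c() (with s = 1 << shift):
 *
 *     dst = (src[0, 0] * (s - dx) * (s - dy) +
 *            src[1, 0] * dx       * (s - dy) +
 *            src[0, 1] * (s - dx) * dy       +
 *            src[1, 1] * dx       * dy       + r) >> (2 * shift);
 */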
#endif /* HAVE_INLINE_ASM */

void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);

#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
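
/* mc00 is the zero-offset (fullpel) case for CAVS, so a plain block copy or
 * average suffices and no interpolation filter is involved. */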

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}
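
/* The rnd parameter is unused here: rounding only affects the interpolating
 * mspel cases, not this fullpel copy. */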

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
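
/* Scalar equivalent of vector_clipf_sse() (the loop above works backwards
 * from the end of the buffer, 16 floats per iteration, so len is assumed to
 * be a multiple of 16 and src/dst 16-byte aligned):
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = av_clipf(src[i], min, max);
 */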

#endif /* HAVE_INLINE_ASM */

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
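
/* For example, SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ) fills all 16 slots
 * of c->avg_qpel_pixels_tab[0] with avg_qpel16_mc00_mmxext through
 * avg_qpel16_mc33_mmxext, one function per quarter-pel position. */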

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
    do {                                                                        \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
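
/* The four half-pel slots per table row are: [0] fullpel copy, [1]
 * horizontal half-pel (x2), [2] vertical half-pel (y2) and [3] diagonal
 * half-pel (xy2). IDX is passed including the brackets, or left empty for
 * avg_no_rnd, whose table has a single row. */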

static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        [0], 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
        SET_HPEL_FUNCS(avg,        [0], 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
        SET_HPEL_FUNCS(put,        [1],  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
        SET_HPEL_FUNCS(avg,        [1],  8, mmx);

        switch (avctx->idct_algo) {
        case FF_IDCT_AUTO:
        case FF_IDCT_SIMPLEMMX:
            c->idct_put              = ff_simple_idct_put_mmx;
            c->idct_add              = ff_simple_idct_add_mmx;
            c->idct                  = ff_simple_idct_mmx;
            c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            break;
        case FF_IDCT_XVIDMMX:
            c->idct_put              = ff_idct_xvid_mmx_put;
            c->idct_add              = ff_idct_xvid_mmx_add;
            c->idct                  = ff_idct_xvid_mmx;
            break;
        }
    }
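
    /* Note on the IDCT selection above: the simple MMX IDCT outputs
     * coefficients in its own order, so idct_permutation_type is set to
     * FF_SIMPLE_IDCT_PERM and callers permute their scan/quant tables to
     * match; the Xvid case leaves the default permutation in place. */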

    c->gmc = gmc_mmx;

    c->add_bytes = add_bytes_mmx;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                        int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_YASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
        }
    }
#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put = ff_idct_xvid_mmxext_put;
        c->idct_add = ff_idct_xvid_mmxext_add;
        c->idct     = ff_idct_xvid_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_YASM
    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}

static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */
}

static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                                       int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
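
    /* Each init function below may override function pointers installed by
     * an earlier one, so for every operation the last applicable (and
     * typically most specialized) implementation for this CPU wins. */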
    if (mm_flags & AV_CPU_FLAG_MMX)
        dsputil_init_mmx(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}