2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
34 #include "diracdsp_mmx.h"
39 /* pixel operations */
/* 8-byte-aligned 64-bit constants, loadable into MMX registers via "m" operands. */
40 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
41 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* ff_pw_N: the 16-bit word value N replicated in every lane.
 * 16-byte-aligned xmm_reg entries serve both MMX (low half) and SSE2 code;
 * 8-byte uint64_t entries are MMX-only. */
43 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
50 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
54 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
58 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
60 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
65 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
66 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
/* ff_pb_N: the byte value N replicated in every 8-bit lane. */
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
75 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
79 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* packed-double constants for SSE2 floating-point paths */
82 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
83 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
87 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
88 ptrdiff_t line_size, int h);
89 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
90 ptrdiff_t line_size, int h);
91 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
92 int dstStride, int src1Stride, int h);
93 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
94 uint8_t *src2, int dstStride,
95 int src1Stride, int h);
96 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
97 int dstStride, int src1Stride, int h);
98 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
99 ptrdiff_t line_size, int h);
100 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
101 ptrdiff_t line_size, int h);
102 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
103 int dstStride, int src1Stride, int h);
104 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
105 int dstStride, int src1Stride, int h);
106 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
107 int dstStride, int src1Stride, int h);
108 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
109 ptrdiff_t line_size, int h);
110 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
111 ptrdiff_t line_size, int h);
112 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
113 const uint8_t *pixels,
114 ptrdiff_t line_size, int h);
115 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
116 const uint8_t *pixels,
117 ptrdiff_t line_size, int h);
118 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
119 ptrdiff_t line_size, int h);
120 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
121 ptrdiff_t line_size, int h);
122 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
123 ptrdiff_t line_size, int h);
124 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
125 ptrdiff_t line_size, int h);
126 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
127 const uint8_t *pixels,
128 ptrdiff_t line_size, int h);
129 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
130 const uint8_t *pixels,
131 ptrdiff_t line_size, int h);
132 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
133 ptrdiff_t line_size, int h);
134 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
135 ptrdiff_t line_size, int h);
136 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
137 ptrdiff_t line_size, int h);
138 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
139 ptrdiff_t line_size, int h);
140 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
141 ptrdiff_t line_size, int h);
142 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
143 ptrdiff_t line_size, int h);
144 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
145 ptrdiff_t line_size, int h);
146 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
147 ptrdiff_t line_size, int h);
149 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
/* 16-pixel-wide copy built from the 8-wide asm primitive: left half, then
 * the right half 8 bytes further in.  NOTE(review): the body's braces fall
 * outside this excerpt. */
150 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
151 int line_size, int h)
153 ff_put_pixels8_mmxext(block, pixels, line_size, h);
154 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
157 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
158 int dstStride, int srcStride, int h);
159 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
160 int dstStride, int srcStride, int h);
161 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
162 int dstStride, int srcStride,
164 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
165 int dstStride, int srcStride, int h);
166 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
167 int dstStride, int srcStride, int h);
168 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
169 int dstStride, int srcStride,
171 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
172 int dstStride, int srcStride);
173 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
174 int dstStride, int srcStride);
175 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
176 int dstStride, int srcStride);
177 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
178 int dstStride, int srcStride);
179 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
180 int dstStride, int srcStride);
181 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
182 int dstStride, int srcStride);
183 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
184 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
185 #endif /* HAVE_YASM */
/* Align the next asm label/jump target to an 8-byte boundary. */
190 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Zero an MMX register without touching memory. */
191 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/* BFE = 0xFEFEFE...: pcmpeqd gives all-ones bytes (0xFF), paddb doubles
 * each byte to 0xFE with wraparound. */
193 #define MOVQ_BFE(regd) \
195 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
196 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* memory-operand variants: load the constant straight from the tables above */
199 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
200 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
202 // for shared library it's better to use this way for accessing constants
/* register-only BONE: all-ones words >> 15 = 0x0001 per word, packed to bytes */
204 #define MOVQ_BONE(regd) \
206 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
207 "psrlw $15, %%"#regd" \n\t" \
208 "packuswb %%"#regd", %%"#regd" \n\t" ::)
/* register-only WTWO: 0x0001 per word shifted left once = 0x0002 per word */
210 #define MOVQ_WTWO(regd) \
212 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
213 "psrlw $15, %%"#regd" \n\t" \
214 "psllw $1, %%"#regd" \n\t"::)
218 // using regr as temporary and for the output result
219 // first argument is unmodifed and second is trashed
220 // regfe is supposed to contain 0xfefefefefefefefe
/* truncating byte average: (a & b) + (((a ^ b) & 0xfe) >> 1) */
221 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
222 "movq "#rega", "#regr" \n\t" \
223 "pand "#regb", "#regr" \n\t" \
224 "pxor "#rega", "#regb" \n\t" \
225 "pand "#regfe", "#regb" \n\t" \
226 "psrlq $1, "#regb" \n\t" \
227 "paddb "#regb", "#regr" \n\t"
/* rounding byte average: (a | b) - (((a ^ b) & 0xfe) >> 1) */
229 #define PAVGB_MMX(rega, regb, regr, regfe) \
230 "movq "#rega", "#regr" \n\t" \
231 "por "#regb", "#regr" \n\t" \
232 "pxor "#rega", "#regb" \n\t" \
233 "pand "#regfe", "#regb" \n\t" \
234 "psrlq $1, "#regb" \n\t" \
235 "psubb "#regb", "#regr" \n\t"
237 // mm6 is supposed to contain 0xfefefefefefefefe
/* paired variant: two truncating averages (a,b)->regr and (c,d)->regp at once */
238 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
239 "movq "#rega", "#regr" \n\t" \
240 "movq "#regc", "#regp" \n\t" \
241 "pand "#regb", "#regr" \n\t" \
242 "pand "#regd", "#regp" \n\t" \
243 "pxor "#rega", "#regb" \n\t" \
244 "pxor "#regc", "#regd" \n\t" \
245 "pand %%mm6, "#regb" \n\t" \
246 "pand %%mm6, "#regd" \n\t" \
247 "psrlq $1, "#regb" \n\t" \
248 "psrlq $1, "#regd" \n\t" \
249 "paddb "#regb", "#regr" \n\t" \
250 "paddb "#regd", "#regp" \n\t"
/* paired variant: two rounding averages at once */
252 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
253 "movq "#rega", "#regr" \n\t" \
254 "movq "#regc", "#regp" \n\t" \
255 "por "#regb", "#regr" \n\t" \
256 "por "#regd", "#regp" \n\t" \
257 "pxor "#rega", "#regb" \n\t" \
258 "pxor "#regc", "#regd" \n\t" \
259 "pand %%mm6, "#regb" \n\t" \
260 "pand %%mm6, "#regd" \n\t" \
261 "psrlq $1, "#regd" \n\t" \
262 "psrlq $1, "#regb" \n\t" \
263 "psubb "#regb", "#regr" \n\t" \
264 "psubb "#regd", "#regp" \n\t"
266 /***********************************/
267 /* MMX no rounding */
/* Bind the template hooks to the no-rounding primitives, then instantiate. */
269 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
270 #define SET_RND MOVQ_WONE
271 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
272 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
273 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
275 #include "dsputil_rnd_template.c"
282 /***********************************/
/* Re-bind the hooks to the rounding primitives and instantiate again. */
285 #define DEF(x, y) x ## _ ## y ## _mmx
286 #define SET_RND MOVQ_WTWO
287 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
288 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
290 #include "dsputil_rnd_template.c"
298 #endif /* HAVE_INLINE_ASM */
302 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
304 /***********************************/
/* 3DNow! flavour of the averaging template */
307 #define DEF(x) x ## _3dnow
309 #include "dsputil_avg_template.c"
313 /***********************************/
314 /* MMXEXT specific */
316 #define DEF(x) x ## _mmxext
318 #include "dsputil_avg_template.c"
322 #endif /* HAVE_YASM */
/* Fallback aliases: map the extended names onto the plain MMX versions. */
326 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
327 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
328 #define put_pixels16_mmxext put_pixels16_mmx
329 #define put_pixels8_mmxext put_pixels8_mmx
330 #define put_pixels4_mmxext put_pixels4_mmx
331 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
332 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
334 /***********************************/
/* Store an 8x8 block of 16-bit coefficients as pixels, clamping each value
 * to [0,255] via packuswb (unsigned saturation).  Four rows are handled per
 * asm statement; %3 points at the coefficients, %0 at the destination rows.
 * NOTE(review): the asm-statement openers/braces fall outside this excerpt. */
337 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
343 /* read the pixels */
/* rows 0..3: load 8 coefficients per register pair, saturate-pack to bytes */
348 "movq (%3), %%mm0 \n\t"
349 "movq 8(%3), %%mm1 \n\t"
350 "movq 16(%3), %%mm2 \n\t"
351 "movq 24(%3), %%mm3 \n\t"
352 "movq 32(%3), %%mm4 \n\t"
353 "movq 40(%3), %%mm5 \n\t"
354 "movq 48(%3), %%mm6 \n\t"
355 "movq 56(%3), %%mm7 \n\t"
356 "packuswb %%mm1, %%mm0 \n\t"
357 "packuswb %%mm3, %%mm2 \n\t"
358 "packuswb %%mm5, %%mm4 \n\t"
359 "packuswb %%mm7, %%mm6 \n\t"
/* scatter the four packed rows at stride, 2*stride, 3*stride */
360 "movq %%mm0, (%0) \n\t"
361 "movq %%mm2, (%0, %1) \n\t"
362 "movq %%mm4, (%0, %1, 2) \n\t"
363 "movq %%mm6, (%0, %2) \n\t"
364 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
367 pix += line_size * 4;
370 // if here would be an exact copy of the code above
371 // compiler would generate some very strange code
/* rows 4..7: same pattern, second half of the coefficient block */
374 "movq (%3), %%mm0 \n\t"
375 "movq 8(%3), %%mm1 \n\t"
376 "movq 16(%3), %%mm2 \n\t"
377 "movq 24(%3), %%mm3 \n\t"
378 "movq 32(%3), %%mm4 \n\t"
379 "movq 40(%3), %%mm5 \n\t"
380 "movq 48(%3), %%mm6 \n\t"
381 "movq 56(%3), %%mm7 \n\t"
382 "packuswb %%mm1, %%mm0 \n\t"
383 "packuswb %%mm3, %%mm2 \n\t"
384 "packuswb %%mm5, %%mm4 \n\t"
385 "packuswb %%mm7, %%mm6 \n\t"
386 "movq %%mm0, (%0) \n\t"
387 "movq %%mm2, (%0, %1) \n\t"
388 "movq %%mm4, (%0, %1, 2) \n\t"
389 "movq %%mm6, (%0, %2) \n\t"
390 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emit asm for four rows of the signed-clamp store: pack 16-bit coefficients
 * at byte offset `off` with signed saturation (packsswb), bias by ff_pb_80
 * (preloaded in %%mm0) to reach the unsigned pixel range, and store the rows
 * at stride %3 (with %1 holding 3*stride for the last row). */
394 #define put_signed_pixels_clamped_mmx_half(off) \
395 "movq "#off"(%2), %%mm1 \n\t" \
396 "movq 16 + "#off"(%2), %%mm2 \n\t" \
397 "movq 32 + "#off"(%2), %%mm3 \n\t" \
398 "movq 48 + "#off"(%2), %%mm4 \n\t" \
399 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
400 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
401 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
402 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
403 "paddb %%mm0, %%mm1 \n\t" \
404 "paddb %%mm0, %%mm2 \n\t" \
405 "paddb %%mm0, %%mm3 \n\t" \
406 "paddb %%mm0, %%mm4 \n\t" \
407 "movq %%mm1, (%0) \n\t" \
408 "movq %%mm2, (%0, %3) \n\t" \
409 "movq %%mm3, (%0, %3, 2) \n\t" \
410 "movq %%mm4, (%0, %1) \n\t"
/* Signed variant of put_pixels_clamped: two macro-expanded halves (offsets 0
 * and 64) cover all eight rows; the pixel pointer advances by 4*stride
 * between them.  NOTE(review): asm opener and clobber list are outside this
 * excerpt. */
412 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
415 x86_reg line_skip = line_size;
/* %%mm0 = 0x80 bias; %1 = 3 * line_skip for the fourth row of each half */
419 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
420 "lea (%3, %3, 2), %1 \n\t"
421 put_signed_pixels_clamped_mmx_half(0)
422 "lea (%0, %3, 4), %0 \n\t"
423 put_signed_pixels_clamped_mmx_half(64)
424 : "+&r"(pixels), "=&r"(line_skip3)
425 : "r"(block), "r"(line_skip)
/* Add 16-bit coefficients to existing pixels with unsigned saturation.
 * Two rows per asm statement: unpack pixel bytes to words against zero
 * (%%mm7 presumably holds zero — its setup is outside this excerpt),
 * saturating-add the coefficients, repack with packuswb, store back. */
429 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
436 /* read the pixels */
443 "movq (%2), %%mm0 \n\t"
444 "movq 8(%2), %%mm1 \n\t"
445 "movq 16(%2), %%mm2 \n\t"
446 "movq 24(%2), %%mm3 \n\t"
447 "movq %0, %%mm4 \n\t"
448 "movq %1, %%mm6 \n\t"
/* widen row 0 pixels and add the first 8 coefficients */
449 "movq %%mm4, %%mm5 \n\t"
450 "punpcklbw %%mm7, %%mm4 \n\t"
451 "punpckhbw %%mm7, %%mm5 \n\t"
452 "paddsw %%mm4, %%mm0 \n\t"
453 "paddsw %%mm5, %%mm1 \n\t"
/* widen row 1 pixels and add the next 8 coefficients */
454 "movq %%mm6, %%mm5 \n\t"
455 "punpcklbw %%mm7, %%mm6 \n\t"
456 "punpckhbw %%mm7, %%mm5 \n\t"
457 "paddsw %%mm6, %%mm2 \n\t"
458 "paddsw %%mm5, %%mm3 \n\t"
/* clamp back to bytes and store both rows in place */
459 "packuswb %%mm1, %%mm0 \n\t"
460 "packuswb %%mm3, %%mm2 \n\t"
461 "movq %%mm0, %0 \n\t"
462 "movq %%mm2, %1 \n\t"
463 : "+m"(*pix), "+m"(*(pix + line_size))
466 pix += line_size * 2;
/* Straight 8-byte-wide block copy, four rows per unrolled iteration:
 * REG_a caches 2*line_size so source and destination advance two rows
 * per add.  NOTE(review): loop label/branch lines are outside this excerpt. */
471 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
472 ptrdiff_t line_size, int h)
475 "lea (%3, %3), %%"REG_a" \n\t"
478 "movq (%1 ), %%mm0 \n\t"
479 "movq (%1, %3), %%mm1 \n\t"
480 "movq %%mm0, (%2) \n\t"
481 "movq %%mm1, (%2, %3) \n\t"
482 "add %%"REG_a", %1 \n\t"
483 "add %%"REG_a", %2 \n\t"
484 "movq (%1 ), %%mm0 \n\t"
485 "movq (%1, %3), %%mm1 \n\t"
486 "movq %%mm0, (%2) \n\t"
487 "movq %%mm1, (%2, %3) \n\t"
488 "add %%"REG_a", %1 \n\t"
489 "add %%"REG_a", %2 \n\t"
492 : "+g"(h), "+r"(pixels), "+r"(block)
493 : "r"((x86_reg)line_size)
/* 16-byte-wide block copy: same structure as put_pixels8_mmx but each row
 * moves two quadwords (offsets 0 and 8). */
498 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
499 ptrdiff_t line_size, int h)
502 "lea (%3, %3), %%"REG_a" \n\t"
505 "movq (%1 ), %%mm0 \n\t"
506 "movq 8(%1 ), %%mm4 \n\t"
507 "movq (%1, %3), %%mm1 \n\t"
508 "movq 8(%1, %3), %%mm5 \n\t"
509 "movq %%mm0, (%2) \n\t"
510 "movq %%mm4, 8(%2) \n\t"
511 "movq %%mm1, (%2, %3) \n\t"
512 "movq %%mm5, 8(%2, %3) \n\t"
513 "add %%"REG_a", %1 \n\t"
514 "add %%"REG_a", %2 \n\t"
515 "movq (%1 ), %%mm0 \n\t"
516 "movq 8(%1 ), %%mm4 \n\t"
517 "movq (%1, %3), %%mm1 \n\t"
518 "movq 8(%1, %3), %%mm5 \n\t"
519 "movq %%mm0, (%2) \n\t"
520 "movq %%mm4, 8(%2) \n\t"
521 "movq %%mm1, (%2, %3) \n\t"
522 "movq %%mm5, 8(%2, %3) \n\t"
523 "add %%"REG_a", %1 \n\t"
524 "add %%"REG_a", %2 \n\t"
527 : "+g"(h), "+r"(pixels), "+r"(block)
528 : "r"((x86_reg)line_size)
/* Generate a function that zeroes n consecutive 64-coefficient int16 blocks
 * (128 bytes each) with MMX stores, 32 bytes per loop iteration.  The base
 * pointer is biased to the end of the region so REG_a can count a negative
 * offset up toward zero. */
533 #define CLEAR_BLOCKS(name, n) \
534 static void name(int16_t *blocks) \
537 "pxor %%mm7, %%mm7 \n\t" \
538 "mov %1, %%"REG_a" \n\t" \
540 "movq %%mm7, (%0, %%"REG_a") \n\t" \
541 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
542 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
543 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
544 "add $32, %%"REG_a" \n\t" \
546 :: "r"(((uint8_t *)blocks) + 128 * n), \
551 CLEAR_BLOCKS(clear_blocks_mmx, 6)
552 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 64-coefficient block (128 bytes) with eight aligned 16-byte SSE
 * stores — no loop needed. */
554 static void clear_block_sse(int16_t *block)
557 "xorps %%xmm0, %%xmm0 \n"
558 "movaps %%xmm0, (%0) \n"
559 "movaps %%xmm0, 16(%0) \n"
560 "movaps %%xmm0, 32(%0) \n"
561 "movaps %%xmm0, 48(%0) \n"
562 "movaps %%xmm0, 64(%0) \n"
563 "movaps %%xmm0, 80(%0) \n"
564 "movaps %%xmm0, 96(%0) \n"
565 "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive blocks (6 * 128 bytes): one whole 128-byte block per
 * loop iteration, with the same negative-offset counting trick as
 * CLEAR_BLOCKS. */
571 static void clear_blocks_sse(int16_t *blocks)
574 "xorps %%xmm0, %%xmm0 \n"
575 "mov %1, %%"REG_a" \n"
577 "movaps %%xmm0, (%0, %%"REG_a") \n"
578 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
579 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
581 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
582 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
583 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
584 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
585 "add $128, %%"REG_a" \n"
587 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] (wrapping byte add), 16 bytes per iteration in MMX; the
 * scalar statement at the end handles the tail that the vector loop cannot
 * cover (bound "w - 15").  NOTE(review): loop control lines are outside this
 * excerpt. */
593 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
599 "movq (%1, %0), %%mm0 \n\t"
600 "movq (%2, %0), %%mm1 \n\t"
601 "paddb %%mm0, %%mm1 \n\t"
602 "movq %%mm1, (%2, %0) \n\t"
603 "movq 8(%1, %0), %%mm0 \n\t"
604 "movq 8(%2, %0), %%mm1 \n\t"
605 "paddb %%mm0, %%mm1 \n\t"
606 "movq %%mm1, 8(%2, %0) \n\t"
612 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
/* scalar tail */
615 dst[i + 0] += src[i + 0];
/* HuffYUV median-prediction reconstruction using cmov-style scalar asm.
 * l/tl carry the running left and top-left predictor bytes across calls via
 * *left / *left_top.  The pointers are biased by w so %4 can index with a
 * negative counter.  NOTE(review): most of the asm body is outside this
 * excerpt. */
619 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
620 const uint8_t *diff, int w,
621 int *left, int *left_top)
625 int l = *left & 0xff;
626 int tl = *left_top & 0xff;
631 "movzbl (%3, %4), %2 \n"
644 "add (%6, %4), %b0 \n"
645 "mov %b0, (%5, %4) \n"
648 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
649 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
655 #endif /* HAVE_INLINE_ASM */
657 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
658 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
661 /* Draw the edges of width 'w' of an image of size width, height
662 * this MMX version can only handle w == 8 || w == 16. */
663 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
664 int w, int h, int sides)
666 uint8_t *ptr, *last_line;
669 last_line = buf + (height - 1) * wrap;
/* left/right edges, w == 8: replicate the first byte of each row to the 8
 * bytes before it and the last byte to the 8 bytes after it */
675 "movd (%0), %%mm0 \n\t"
676 "punpcklbw %%mm0, %%mm0 \n\t"
677 "punpcklwd %%mm0, %%mm0 \n\t"
678 "punpckldq %%mm0, %%mm0 \n\t"
679 "movq %%mm0, -8(%0) \n\t"
680 "movq -8(%0, %2), %%mm1 \n\t"
681 "punpckhbw %%mm1, %%mm1 \n\t"
682 "punpckhwd %%mm1, %%mm1 \n\t"
683 "punpckhdq %%mm1, %%mm1 \n\t"
684 "movq %%mm1, (%0, %2) \n\t"
689 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* left/right edges, w == 16: same replication, two quadwords per side */
694 "movd (%0), %%mm0 \n\t"
695 "punpcklbw %%mm0, %%mm0 \n\t"
696 "punpcklwd %%mm0, %%mm0 \n\t"
697 "punpckldq %%mm0, %%mm0 \n\t"
698 "movq %%mm0, -8(%0) \n\t"
699 "movq %%mm0, -16(%0) \n\t"
700 "movq -8(%0, %2), %%mm1 \n\t"
701 "punpckhbw %%mm1, %%mm1 \n\t"
702 "punpckhwd %%mm1, %%mm1 \n\t"
703 "punpckhdq %%mm1, %%mm1 \n\t"
704 "movq %%mm1, (%0, %2) \n\t"
705 "movq %%mm1, 8(%0, %2) \n\t"
710 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* narrow (4-byte) edge variant */
716 "movd (%0), %%mm0 \n\t"
717 "punpcklbw %%mm0, %%mm0 \n\t"
718 "punpcklwd %%mm0, %%mm0 \n\t"
719 "movd %%mm0, -4(%0) \n\t"
720 "movd -4(%0, %2), %%mm1 \n\t"
721 "punpcklbw %%mm1, %%mm1 \n\t"
722 "punpckhwd %%mm1, %%mm1 \n\t"
723 "punpckhdq %%mm1, %%mm1 \n\t"
724 "movd %%mm1, (%0, %2) \n\t"
729 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
733 /* top and bottom (and hopefully also the corners) */
734 if (sides & EDGE_TOP) {
735 for (i = 0; i < h; i += 4) {
/* replicate the first image row upward, four edge rows per pass */
736 ptr = buf - (i + 1) * wrap - w;
739 "movq (%1, %0), %%mm0 \n\t"
740 "movq %%mm0, (%0) \n\t"
741 "movq %%mm0, (%0, %2) \n\t"
742 "movq %%mm0, (%0, %2, 2) \n\t"
743 "movq %%mm0, (%0, %3) \n\t"
748 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
749 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
754 if (sides & EDGE_BOTTOM) {
755 for (i = 0; i < h; i += 4) {
/* replicate the last image row downward, four edge rows per pass */
756 ptr = last_line + (i + 1) * wrap - w;
759 "movq (%1, %0), %%mm0 \n\t"
760 "movq %%mm0, (%0) \n\t"
761 "movq %%mm0, (%0, %2) \n\t"
762 "movq %%mm0, (%0, %2, 2) \n\t"
763 "movq %%mm0, (%0, %3) \n\t"
768 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
769 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
770 "r"(ptr + width + 2 * w)
775 #endif /* HAVE_INLINE_ASM */
/* Generate the full set of MPEG-4 quarter-pel motion-compensation functions
 * (qpel8_mcXY and qpel16_mcXY for every X,Y in 0..3) for one operation:
 *   OPNAME - put_ or avg_ (how the result is written to dst)
 *   ROUNDER/RND - rounding constant / name infix selecting the rnd variant
 *   OP - store macro (PUT_OP / AVG_MMXEXT_OP)
 *   MMX - ISA suffix of the helper primitives (e.g. mmxext)
 * mcXY positions combine the h/v lowpass filters and l2 averaging helpers
 * declared above; temporaries live in stack arrays ("half"/"halfH"/"halfHV").
 * NOTE(review): this excerpt omits some continuation lines of the macro;
 * the visible lines are kept byte-identical. */
779 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
780 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
783 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
786 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
790 uint8_t * const half = (uint8_t*)temp; \
791 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
793 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
794 stride, stride, 8); \
797 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
800 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
804 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
808 uint8_t * const half = (uint8_t*)temp; \
809 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
811 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
815 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
819 uint8_t * const half = (uint8_t*)temp; \
820 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
822 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
823 stride, stride, 8); \
826 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
829 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
833 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
837 uint8_t * const half = (uint8_t*)temp; \
838 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
840 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
844 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
847 uint64_t half[8 + 9]; \
848 uint8_t * const halfH = ((uint8_t*)half) + 64; \
849 uint8_t * const halfHV = ((uint8_t*)half); \
850 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
852 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
854 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
855 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
859 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
862 uint64_t half[8 + 9]; \
863 uint8_t * const halfH = ((uint8_t*)half) + 64; \
864 uint8_t * const halfHV = ((uint8_t*)half); \
865 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
867 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
869 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
870 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
874 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
877 uint64_t half[8 + 9]; \
878 uint8_t * const halfH = ((uint8_t*)half) + 64; \
879 uint8_t * const halfHV = ((uint8_t*)half); \
880 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
882 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
884 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
885 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
889 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
892 uint64_t half[8 + 9]; \
893 uint8_t * const halfH = ((uint8_t*)half) + 64; \
894 uint8_t * const halfHV = ((uint8_t*)half); \
895 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
897 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
899 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
900 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
904 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
907 uint64_t half[8 + 9]; \
908 uint8_t * const halfH = ((uint8_t*)half) + 64; \
909 uint8_t * const halfHV = ((uint8_t*)half); \
910 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
912 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
913 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
917 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
920 uint64_t half[8 + 9]; \
921 uint8_t * const halfH = ((uint8_t*)half) + 64; \
922 uint8_t * const halfHV = ((uint8_t*)half); \
923 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
925 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
926 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
930 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
933 uint64_t half[8 + 9]; \
934 uint8_t * const halfH = ((uint8_t*)half); \
935 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
937 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
939 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
943 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
946 uint64_t half[8 + 9]; \
947 uint8_t * const halfH = ((uint8_t*)half); \
948 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
950 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
952 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
956 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
960 uint8_t * const halfH = ((uint8_t*)half); \
961 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
963 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
967 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
970 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
973 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
977 uint8_t * const half = (uint8_t*)temp; \
978 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
980 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
984 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
987 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
988 stride, stride, 16);\
991 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
995 uint8_t * const half = (uint8_t*)temp; \
996 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
998 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
999 stride, stride, 16); \
1002 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1005 uint64_t temp[32]; \
1006 uint8_t * const half = (uint8_t*)temp; \
1007 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1009 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1013 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1016 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
1020 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1023 uint64_t temp[32]; \
1024 uint8_t * const half = (uint8_t*)temp; \
1025 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1027 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1028 stride, stride, 16); \
1031 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1034 uint64_t half[16 * 2 + 17 * 2]; \
1035 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1036 uint8_t * const halfHV = ((uint8_t*)half); \
1037 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1039 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1041 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1043 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1047 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1050 uint64_t half[16 * 2 + 17 * 2]; \
1051 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1052 uint8_t * const halfHV = ((uint8_t*)half); \
1053 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1055 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1057 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1059 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1063 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1066 uint64_t half[16 * 2 + 17 * 2]; \
1067 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1068 uint8_t * const halfHV = ((uint8_t*)half); \
1069 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1071 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1073 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1075 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1079 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1082 uint64_t half[16 * 2 + 17 * 2]; \
1083 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1084 uint8_t * const halfHV = ((uint8_t*)half); \
1085 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1087 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1089 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1091 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1095 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1098 uint64_t half[16 * 2 + 17 * 2]; \
1099 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1100 uint8_t * const halfHV = ((uint8_t*)half); \
1101 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1103 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1105 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1109 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1112 uint64_t half[16 * 2 + 17 * 2]; \
1113 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1114 uint8_t * const halfHV = ((uint8_t*)half); \
1115 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1117 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1119 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1123 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1126 uint64_t half[17 * 2]; \
1127 uint8_t * const halfH = ((uint8_t*)half); \
1128 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1130 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1132 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1136 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1139 uint64_t half[17 * 2]; \
1140 uint8_t * const halfH = ((uint8_t*)half); \
1141 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1143 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1145 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1149 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1152 uint64_t half[17 * 2]; \
1153 uint8_t * const halfH = ((uint8_t*)half); \
1154 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1156 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Store operator for the QPEL macros: plain "mov" of register a to memory b.
 * temp is unused here; size selects the mov width (q/d). */
1160 #define PUT_OP(a, b, temp, size) \
1161 "mov"#size" "#a", "#b" \n\t"
/* Averaging store operator (MMXEXT): loads the destination into temp,
 * averages it with a via pavgb (rounding byte average), and stores back. */
1163 #define AVG_MMXEXT_OP(a, b, temp, size) \
1164 "mov"#size" "#b", "#temp" \n\t" \
1165 "pavgb "#temp", "#a" \n\t" \
1166 "mov"#size" "#a", "#b" \n\t"
/* Instantiate the full qpel function families for MMXEXT:
 * put (rounding, ff_pw_16), avg (pavgb store), and put_no_rnd (ff_pw_15). */
1168 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1169 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1170 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1171 #endif /* HAVE_YASM */
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    /* RV40's (3/4, 3/4) subpel position maps onto the halfpel xy2 kernel. */
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    /* 16x16 variant: same (3,3) subpel position, halfpel xy2 copy. */
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    /* Averaging (bi-prediction) flavour of the 8x8 (3,3) position. */
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    /* Averaging (bi-prediction) flavour of the 16x16 (3,3) position. */
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
/* Signature of the edge-emulation helper: copies a block_w x block_h area
 * from src to dst, replicating picture borders when (src_x, src_y) lies
 * partially outside the w x h picture. */
1192 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1193 ptrdiff_t linesize, int block_w, int block_h,
1194 int src_x, int src_y, int w, int h);
/*
 * Global motion compensation with an affine, fixed-point motion field.
 *
 * NOTE(review): this extraction is missing interior lines (braces, the
 * __asm__ volatile(...) statement headers, parts of the loop framing),
 * as the embedded original line numbers skip; comments below describe
 * only what is visible.
 *
 * ox/oy: start offsets in 16.shift fixed point; dxx/dxy/dyx/dyy: per-pixel
 * increments; r: rounding constant; emu_edge_fn: used when the source block
 * reads outside the picture.
 */
1196 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1197 int stride, int h, int ox, int oy,
1198 int dxx, int dxy, int dyx, int dyy,
1199 int shift, int r, int width, int height,
1200 emulated_edge_mc_func *emu_edge_fn)
/* fullpel part of the start position */
1203 const int ix = ox >> (16 + shift);
1204 const int iy = oy >> (16 + shift);
/* subpel parts and increments rescaled by >> 4 for 16-bit MMX math */
1205 const int oxs = ox >> 4;
1206 const int oys = oy >> 4;
1207 const int dxxs = dxx >> 4;
1208 const int dxys = dxy >> 4;
1209 const int dyxs = dyx >> 4;
1210 const int dyys = dyy >> 4;
/* broadcast vectors for the MMX inner loop */
1211 const uint16_t r4[4] = { r, r, r, r };
1212 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1213 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1214 const uint64_t shift2 = 2 * shift;
1215 #define MAX_STRIDE 4096U
1217 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/* total subpel drift across the block, used to detect fullpel changes */
1220 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1221 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1222 const int dxh = dxy * (h - 1);
1223 const int dyw = dyx * (w - 1);
/* unsigned compare doubles as a negative-offset check */
1224 int need_emu = (unsigned)ix >= width - w ||
1225 (unsigned)iy >= height - h;
1227 if ( // non-constant fullpel offset (3% of blocks)
1228 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1229 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1230 // uses more than 16 bits of subpel mv (only at huge resolution)
1231 || (dxx | dxy | dyx | dyy) & 15
1232 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1233 // FIXME could still use mmx for some of the rows
/* fall back to the C reference implementation */
1234 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1235 shift, r, width, height);
1239 src += ix + iy * stride;
/* edge emulation: read from edge_buf instead of the picture */
1241 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
/* broadcast (1 << shift) into mm6; mm7 = 0 for byte unpacking */
1246 "movd %0, %%mm6 \n\t"
1247 "pxor %%mm7, %%mm7 \n\t"
1248 "punpcklwd %%mm6, %%mm6 \n\t"
1249 "punpcklwd %%mm6, %%mm6 \n\t"
/* process 4 output pixels per iteration */
1253 for (x = 0; x < w; x += 4) {
1254 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1255 oxs - dxys + dxxs * (x + 1),
1256 oxs - dxys + dxxs * (x + 2),
1257 oxs - dxys + dxxs * (x + 3) };
1258 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1259 oys - dyys + dyxs * (x + 1),
1260 oys - dyys + dyxs * (x + 2),
1261 oys - dyys + dyxs * (x + 3) };
1263 for (y = 0; y < h; y++) {
/* advance the per-pixel subpel coordinates, keep the 4-bit fractions */
1265 "movq %0, %%mm4 \n\t"
1266 "movq %1, %%mm5 \n\t"
1267 "paddw %2, %%mm4 \n\t"
1268 "paddw %3, %%mm5 \n\t"
1269 "movq %%mm4, %0 \n\t"
1270 "movq %%mm5, %1 \n\t"
1271 "psrlw $12, %%mm4 \n\t"
1272 "psrlw $12, %%mm5 \n\t"
1273 : "+m"(*dx4), "+m"(*dy4)
1274 : "m"(*dxy4), "m"(*dyy4)
/* bilinear weights: (s-dx)(s-dy), dx*dy, (s-dx)*dy, dx*(s-dy) */
1278 "movq %%mm6, %%mm2 \n\t"
1279 "movq %%mm6, %%mm1 \n\t"
1280 "psubw %%mm4, %%mm2 \n\t"
1281 "psubw %%mm5, %%mm1 \n\t"
1282 "movq %%mm2, %%mm0 \n\t"
1283 "movq %%mm4, %%mm3 \n\t"
1284 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1285 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1286 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1287 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
/* weight the four neighbouring source pixels */
1289 "movd %4, %%mm5 \n\t"
1290 "movd %3, %%mm4 \n\t"
1291 "punpcklbw %%mm7, %%mm5 \n\t"
1292 "punpcklbw %%mm7, %%mm4 \n\t"
1293 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1294 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1296 "movd %2, %%mm5 \n\t"
1297 "movd %1, %%mm4 \n\t"
1298 "punpcklbw %%mm7, %%mm5 \n\t"
1299 "punpcklbw %%mm7, %%mm4 \n\t"
1300 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1301 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
/* sum, add rounding constant r4, shift down and pack to bytes */
1302 "paddw %5, %%mm1 \n\t"
1303 "paddw %%mm3, %%mm2 \n\t"
1304 "paddw %%mm1, %%mm0 \n\t"
1305 "paddw %%mm2, %%mm0 \n\t"
1307 "psrlw %6, %%mm0 \n\t"
1308 "packuswb %%mm0, %%mm0 \n\t"
1309 "movd %%mm0, %0 \n\t"
1311 : "=m"(dst[x + y * stride])
1312 : "m"(src[0]), "m"(src[1]),
1313 "m"(src[stride]), "m"(src[stride + 1]),
1314 "m"(*r4), "m"(shift2)
/* rewind to the top of the next 4-pixel column */
1318 src += 4 - h * stride;
1325 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1326 int stride, int h, int ox, int oy,
1327 int dxx, int dxy, int dyx, int dyy,
1328 int shift, int r, int width, int height)
1330 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1331 width, height, &ff_emulated_edge_mc_8);
1334 static void gmc_sse(uint8_t *dst, uint8_t *src,
1335 int stride, int h, int ox, int oy,
1336 int dxx, int dxy, int dyx, int dyy,
1337 int shift, int r, int width, int height)
1339 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1340 width, height, &ff_emulated_edge_mc_8);
1343 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1344 int stride, int h, int ox, int oy,
1345 int dxx, int dxy, int dyx, int dyy,
1346 int shift, int r, int width, int height)
1348 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1349 width, height, &ff_emulated_edge_mc_8);
1354 #endif /* HAVE_INLINE_ASM */
1356 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1357 ptrdiff_t line_size, int h);
1358 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1359 ptrdiff_t line_size, int h);
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    /* CAVS (0,0) qpel position is a straight 8x8 copy. */
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    /* CAVS (0,0) qpel position, averaged into dst (bi-prediction). */
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    /* CAVS (0,0) qpel position, 16x16 straight copy. */
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    /* CAVS (0,0) qpel position, 16x16 averaged into dst. */
    avg_pixels16_mmx(dst, src, stride, 16);
}
1383 #endif /* HAVE_INLINE_ASM */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    /* The (0,0) mspel position is a plain copy; rnd is irrelevant here. */
    ff_put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
{
    /* Averaging flavour of the (0,0) copy; rnd is irrelevant here. */
    ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
1398 #endif /* HAVE_YASM */
1400 #if CONFIG_DIRAC_DECODER
/*
 * Generates Dirac pixel-op wrappers (8/16/32 wide) that dispatch between a
 * C fallback and the SIMD pixels routine.
 * NOTE(review): the dispatch conditions between the _c call and the SIMD
 * call are missing from this extraction (original line numbers skip);
 * presumably they test h or alignment — verify against the original file.
 */
1401 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
1402 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1405 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
1407 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1409 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1412 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
1414 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1416 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1419 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
1421 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1422 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
/* Instantiate Dirac wrappers for MMX (put/avg) and MMXEXT (avg only). */
1427 DIRAC_PIXOP(put, put, mmx)
1428 DIRAC_PIXOP(avg, avg, mmx)
1432 DIRAC_PIXOP(avg, ff_avg, mmxext)
/*
 * SSE2 Dirac pixel wrappers: each dispatches between the C fallback and the
 * SSE2 16-wide copy/average (the 32-wide ones issue two 16-wide calls).
 * NOTE(review): the dispatch condition between the _c call and the SSE2
 * call is missing from this extraction in all four functions — presumably
 * an h-alignment test; confirm against the original file.
 */
1434 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1437 ff_put_dirac_pixels16_c(dst, src, stride, h);
1439 ff_put_pixels16_sse2(dst, src[0], stride, h);
1441 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1444 ff_avg_dirac_pixels16_c(dst, src, stride, h);
1446 ff_avg_pixels16_sse2(dst, src[0], stride, h);
1448 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1451 ff_put_dirac_pixels32_c(dst, src, stride, h);
1453 ff_put_pixels16_sse2(dst , src[0] , stride, h);
1454 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1457 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1460 ff_avg_dirac_pixels32_c(dst, src, stride, h);
1462 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1463 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1469 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
/*
 * IDCT-then-store wrappers pairing an MMX/MMXEXT IDCT with the clamped
 * put/add pixel routines.
 * NOTE(review): the first wrapper's IDCT call (presumably ff_mmx_idct)
 * is missing from this extraction — original line numbers skip 1473-1475;
 * verify against the original file.
 */
1472 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1476 ff_put_pixels_clamped_mmx(block, dest, line_size);
1479 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1483 ff_add_pixels_clamped_mmx(block, dest, line_size);
1486 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
/* MMXEXT IDCT, then clamped store of the result */
1489 ff_mmxext_idct(block);
1490 ff_put_pixels_clamped_mmx(block, dest, line_size);
1493 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1496 ff_mmxext_idct(block);
1497 ff_add_pixels_clamped_mmx(block, dest, line_size);
/*
 * Clamp len floats from src into [min, max] and write them to dst,
 * 16 floats (4 xmm registers) per iteration using maxps/minps.
 * NOTE(review): the __asm__ volatile header, the loop label and the index
 * decrement are missing from this extraction (original line numbers skip);
 * i presumably counts down by 64 bytes per pass — verify against the
 * original file. src/dst are assumed 16-byte aligned (movaps).
 */
1502 static void vector_clipf_sse(float *dst, const float *src,
1503 float min, float max, int len)
/* byte offset of the last 16-float chunk; loop runs backwards */
1505 x86_reg i = (len - 16) * 4;
/* broadcast min into xmm4 and max into xmm5 */
1507 "movss %3, %%xmm4 \n\t"
1508 "movss %4, %%xmm5 \n\t"
1509 "shufps $0, %%xmm4, %%xmm4 \n\t"
1510 "shufps $0, %%xmm5, %%xmm5 \n\t"
1512 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1513 "movaps 16(%2, %0), %%xmm1 \n\t"
1514 "movaps 32(%2, %0), %%xmm2 \n\t"
1515 "movaps 48(%2, %0), %%xmm3 \n\t"
/* lower clamp */
1516 "maxps %%xmm4, %%xmm0 \n\t"
1517 "maxps %%xmm4, %%xmm1 \n\t"
1518 "maxps %%xmm4, %%xmm2 \n\t"
1519 "maxps %%xmm4, %%xmm3 \n\t"
/* upper clamp */
1520 "minps %%xmm5, %%xmm0 \n\t"
1521 "minps %%xmm5, %%xmm1 \n\t"
1522 "minps %%xmm5, %%xmm2 \n\t"
1523 "minps %%xmm5, %%xmm3 \n\t"
1524 "movaps %%xmm0, (%1, %0) \n\t"
1525 "movaps %%xmm1, 16(%1, %0) \n\t"
1526 "movaps %%xmm2, 32(%1, %0) \n\t"
1527 "movaps %%xmm3, 48(%1, %0) \n\t"
1531 : "r"(dst), "r"(src), "m"(min), "m"(max)
1538 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1540 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1542 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1544 int order, int mul);
1545 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1547 int order, int mul);
1548 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1550 int order, int mul);
1552 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1553 const int16_t *window, unsigned int len);
1554 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1555 const int16_t *window, unsigned int len);
1556 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1557 const int16_t *window, unsigned int len);
1558 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1559 const int16_t *window, unsigned int len);
1560 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1561 const int16_t *window, unsigned int len);
1562 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1563 const int16_t *window, unsigned int len);
1565 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1566 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1568 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1569 const uint8_t *diff, int w,
1570 int *left, int *left_top);
1571 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1573 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1576 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1577 int32_t min, int32_t max, unsigned int len);
1578 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1579 int32_t min, int32_t max, unsigned int len);
1580 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1581 int32_t min, int32_t max, unsigned int len);
1582 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1583 int32_t min, int32_t max, unsigned int len);
/* Fill all 16 quarter-pel positions (mc00..mc33) of one qpel function
 * table slot from the PREFIX/PFX/SIZE/CPU name pattern. */
1585 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1587 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1588 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1589 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1590 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1591 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1592 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1593 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1594 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1595 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1596 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1597 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1598 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1599 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1600 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1601 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1602 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill the 4 half-pel positions (copy, x2, y2, xy2) of one hpel function
 * table slot; IDX carries its own brackets so it may be empty. */
1605 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1607 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1608 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1609 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1610 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/*
 * Install the baseline MMX implementations into the DSPContext.
 * NOTE(review): several #if/#endif lines are missing from this extraction
 * (original line numbers skip), so some assignments below are presumably
 * conditionally compiled — verify against the original file.
 */
1613 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1616 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1619 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1620 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1621 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
/* pixel copy/average tables only apply to 8-bit content */
1623 if (!high_bit_depth) {
1624 c->clear_block = clear_block_mmx;
1625 c->clear_blocks = clear_blocks_mmx;
1626 c->draw_edges = draw_edges_mmx;
1628 SET_HPEL_FUNCS(put, [0], 16, mmx);
1629 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1630 SET_HPEL_FUNCS(avg, [0], 16, mmx);
1631 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1632 SET_HPEL_FUNCS(put, [1], 8, mmx);
1633 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1634 SET_HPEL_FUNCS(avg, [1], 8, mmx);
1637 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
1641 c->add_bytes = add_bytes_mmx;
1642 #endif /* HAVE_INLINE_ASM */
1645 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1646 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1647 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1650 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/*
 * Install MMXEXT implementations (qpel tables, pixel copy/average,
 * scalarproduct and windowing helpers).
 * NOTE(review): some enclosing #if lines are missing from this extraction;
 * the trailing #endif markers below indicate where they belonged.
 */
1655 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1658 const int bit_depth = avctx->bits_per_raw_sample;
1659 const int high_bit_depth = bit_depth > 8;
/* populate all 16 quarter-pel positions per table slot */
1662 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1663 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1665 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1666 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1667 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1668 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1670 if (!high_bit_depth) {
1671 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1672 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1674 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1675 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1676 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1678 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1679 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1681 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1682 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1683 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* non-bitexact rounding variants */
1686 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1687 if (!high_bit_depth) {
1688 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1689 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1690 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1691 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1693 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1694 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1697 #endif /* HAVE_YASM */
1699 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora require the bit-exact "_exact" no-rnd variants */
1700 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1701 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1702 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1703 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1706 /* slower than cmov version on AMD */
1707 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1708 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1710 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1711 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
1713 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1714 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1716 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1718 #endif /* HAVE_MMXEXT_EXTERNAL */
/*
 * Install 3DNow! pixel copy/average implementations (8-bit content only).
 * NOTE(review): the enclosing #if lines are missing from this extraction;
 * the trailing #endif indicates a HAVE_YASM-guarded tail.
 */
1721 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1724 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1727 if (!high_bit_depth) {
1728 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1729 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1731 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1732 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1733 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1735 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1736 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1738 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1739 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1740 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* rounding variants only when bit-exactness is not requested */
1742 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1743 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1744 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1745 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1746 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1748 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1749 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3/Theora need the bit-exact no-rnd variants */
1753 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1754 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1755 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1756 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1758 #endif /* HAVE_YASM */
/*
 * Install SSE implementations (clear_block(s), float clipping).
 * NOTE(review): several #if lines are missing from this extraction; the
 * visible #endif markers indicate HAVE_INLINE_ASM / HAVE_YASM guards.
 */
1761 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1764 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1767 if (!high_bit_depth) {
1768 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1769 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1770 c->clear_block = clear_block_sse;
1771 c->clear_blocks = clear_blocks_sse;
1775 c->vector_clipf = vector_clipf_sse;
1776 #endif /* HAVE_INLINE_ASM */
1779 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
1782 #endif /* HAVE_YASM */
/*
 * Install SSE2 implementations: Xvid IDCT (inline-asm build), 16-wide
 * pixel copy/average, scalarproduct, int32 clipping, windowing, bswap.
 */
1785 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1788 const int bit_depth = avctx->bits_per_raw_sample;
1789 const int high_bit_depth = bit_depth > 8;
1791 #if HAVE_SSE2_INLINE
1792 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1793 c->idct_put = ff_idct_xvid_sse2_put;
1794 c->idct_add = ff_idct_xvid_sse2_add;
1795 c->idct = ff_idct_xvid_sse2;
1796 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1798 #endif /* HAVE_SSE2_INLINE */
1800 #if HAVE_SSE2_EXTERNAL
1801 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1802 // these functions are slower than mmx on AMD, but faster on Intel
1803 if (!high_bit_depth) {
1804 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1805 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1806 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1810 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1811 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* Atom prefers the int variant of the clip routine */
1812 if (mm_flags & AV_CPU_FLAG_ATOM) {
1813 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1815 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1817 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1818 c->apply_window_int16 = ff_apply_window_int16_sse2;
1819 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1820 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1822 c->bswap_buf = ff_bswap32_buf_sse2;
1823 #endif /* HAVE_SSE2_EXTERNAL */
/*
 * Install SSSE3 implementations (HuffYUV left prediction, windowing,
 * scalarproduct, bswap), with CPU-model-specific overrides.
 */
1826 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1829 #if HAVE_SSSE3_EXTERNAL
1830 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1831 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1832 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1834 if (mm_flags & AV_CPU_FLAG_ATOM)
1835 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1837 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1838 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1839 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1840 c->bswap_buf = ff_bswap32_buf_ssse3;
1841 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the lone SSE4 implementation: int32 vector clipping. */
1844 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1847 #if HAVE_SSE4_EXTERNAL
1848 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1849 #endif /* HAVE_SSE4_EXTERNAL */
1852 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1854 int mm_flags = av_get_cpu_flags();
1856 #if HAVE_7REGS && HAVE_INLINE_ASM
1857 if (mm_flags & AV_CPU_FLAG_CMOV)
1858 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1861 if (mm_flags & AV_CPU_FLAG_MMX) {
1863 const int idct_algo = avctx->idct_algo;
1865 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
1866 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
1867 c->idct_put = ff_simple_idct_put_mmx;
1868 c->idct_add = ff_simple_idct_add_mmx;
1869 c->idct = ff_simple_idct_mmx;
1870 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1872 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
1873 if (mm_flags & AV_CPU_FLAG_MMX2) {
1874 c->idct_put = ff_libmpeg2mmx2_idct_put;
1875 c->idct_add = ff_libmpeg2mmx2_idct_add;
1876 c->idct = ff_mmxext_idct;
1878 c->idct_put = ff_libmpeg2mmx_idct_put;
1879 c->idct_add = ff_libmpeg2mmx_idct_add;
1880 c->idct = ff_mmx_idct;
1882 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
1884 } else if (idct_algo == FF_IDCT_XVIDMMX) {
1885 if (mm_flags & AV_CPU_FLAG_SSE2) {
1886 c->idct_put = ff_idct_xvid_sse2_put;
1887 c->idct_add = ff_idct_xvid_sse2_add;
1888 c->idct = ff_idct_xvid_sse2;
1889 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1890 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1891 c->idct_put = ff_idct_xvid_mmxext_put;
1892 c->idct_add = ff_idct_xvid_mmxext_add;
1893 c->idct = ff_idct_xvid_mmxext;
1895 c->idct_put = ff_idct_xvid_mmx_put;
1896 c->idct_add = ff_idct_xvid_mmx_add;
1897 c->idct = ff_idct_xvid_mmx;
1901 #endif /* HAVE_INLINE_ASM */
1903 dsputil_init_mmx(c, avctx, mm_flags);
1906 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1907 dsputil_init_mmxext(c, avctx, mm_flags);
1909 if (mm_flags & AV_CPU_FLAG_3DNOW)
1910 dsputil_init_3dnow(c, avctx, mm_flags);
1912 if (mm_flags & AV_CPU_FLAG_SSE)
1913 dsputil_init_sse(c, avctx, mm_flags);
1915 if (mm_flags & AV_CPU_FLAG_SSE2)
1916 dsputil_init_sse2(c, avctx, mm_flags);
1918 if (mm_flags & AV_CPU_FLAG_SSSE3)
1919 dsputil_init_ssse3(c, avctx, mm_flags);
1921 if (mm_flags & AV_CPU_FLAG_SSE4)
1922 dsputil_init_sse4(c, avctx, mm_flags);
1924 if (CONFIG_ENCODERS)
1925 ff_dsputilenc_init_mmx(c, avctx);