2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/* Constant vectors for the MMX/SSE assembly below.
 * Naming convention: ff_pw_N  = packed 16-bit words, each element == N;
 *                    ff_pb_XX = packed bytes, each element == 0xXX;
 *                    ff_pd_N  = packed doubles, each element == N.
 * The 8-byte uint64_t constants serve MMX-only code; the 16-byte xmm_reg
 * constants are shared with SSE2 code paths (both halves identical). */
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
43 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
49 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
58 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
60 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
65 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
81 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
82 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Prototypes for pixel copy/average primitives implemented in external
 * (yasm) assembly.  All operate on 8- or 16-pixel-wide blocks of h rows.
 * _x2/_y2/_xy2 variants average with the neighbour to the right/below/
 * diagonal (half-pel interpolation); _l2 variants average two separate
 * sources into dst; _no_rnd/_exact variants use round-down averaging. */
86 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
87 ptrdiff_t line_size, int h);
88 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
89 ptrdiff_t line_size, int h);
90 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
91 int dstStride, int src1Stride, int h);
92 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
93 uint8_t *src2, int dstStride,
94 int src1Stride, int h);
95 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
96 int dstStride, int src1Stride, int h);
97 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
98 ptrdiff_t line_size, int h);
99 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
100 ptrdiff_t line_size, int h);
101 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
102 int dstStride, int src1Stride, int h);
103 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106 int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
108 ptrdiff_t line_size, int h);
109 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
110 ptrdiff_t line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
112 const uint8_t *pixels,
113 ptrdiff_t line_size, int h);
114 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
115 const uint8_t *pixels,
116 ptrdiff_t line_size, int h);
117 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
118 ptrdiff_t line_size, int h);
119 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
120 ptrdiff_t line_size, int h);
121 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
122 ptrdiff_t line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
124 ptrdiff_t line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
126 const uint8_t *pixels,
127 ptrdiff_t line_size, int h);
128 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
129 const uint8_t *pixels,
130 ptrdiff_t line_size, int h);
131 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
132 ptrdiff_t line_size, int h);
133 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
134 ptrdiff_t line_size, int h);
135 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
136 ptrdiff_t line_size, int h);
137 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
138 ptrdiff_t line_size, int h);
139 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
140 ptrdiff_t line_size, int h);
141 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
142 ptrdiff_t line_size, int h);
143 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
144 ptrdiff_t line_size, int h);
/* 16-pixel-wide put composed of two 8-wide external-asm calls.
 * NOTE(review): the braces of this function body are not visible in this
 * listing (lines appear lost) -- verify against the original file. */
146 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
147 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
148 ptrdiff_t line_size, int h)
150 ff_put_pixels8_mmxext(block, pixels, line_size, h);
151 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
/* MPEG-4 quarter-pel 6-tap lowpass filters (external asm): horizontal and
 * vertical passes, in put/avg/no_rnd flavours, for 8- and 16-wide blocks. */
154 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
155 int dstStride, int srcStride, int h);
156 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
157 int dstStride, int srcStride, int h);
158 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride,
161 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
162 int dstStride, int srcStride, int h);
163 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
164 int dstStride, int srcStride, int h);
165 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride,
168 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
169 int dstStride, int srcStride);
170 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
171 int dstStride, int srcStride);
172 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
/* For full-pel copies there is nothing to round, so no_rnd put == put. */
180 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
181 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
182 #endif /* HAVE_YASM */
/* Inline-asm helper macros (MMX register setup and byte averaging).
 * NOTE(review): this listing appears to be missing the `__asm__ volatile (`
 * opener lines of several macros below -- verify against the original file. */
187 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
188 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/* Fill regd with 0xfe bytes: pcmpeqd gives all-ones, paddb doubles each
 * byte (0xff + 0xff == 0xfe modulo 256). */
190 #define MOVQ_BFE(regd) \
192 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
193 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Two variants of BONE (0x01 bytes) / WTWO (words of 2): a direct memory
 * load, and a computed form preferred for shared libraries (no PIC-unfriendly
 * absolute reference).  The preprocessor conditional selecting between them
 * is not visible in this listing. */
196 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
197 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
199 // for shared library it's better to use this way for accessing constants
201 #define MOVQ_BONE(regd) \
203 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
204 "psrlw $15, %%"#regd" \n\t" \
205 "packuswb %%"#regd", %%"#regd" \n\t" ::)
207 #define MOVQ_WTWO(regd) \
209 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
210 "psrlw $15, %%"#regd" \n\t" \
211 "psllw $1, %%"#regd" \n\t"::)
215 // using regr as temporary and for the output result
216 // first argument is unmodified and second is trashed
217 // regfe is supposed to contain 0xfefefefefefefefe
/* Round-down byte average: regr = (a & b) + (((a ^ b) & 0xfe) >> 1). */
218 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
219 "movq "#rega", "#regr" \n\t" \
220 "pand "#regb", "#regr" \n\t" \
221 "pxor "#rega", "#regb" \n\t" \
222 "pand "#regfe", "#regb" \n\t" \
223 "psrlq $1, "#regb" \n\t" \
224 "paddb "#regb", "#regr" \n\t"
/* Round-up byte average: regr = (a | b) - (((a ^ b) & 0xfe) >> 1). */
226 #define PAVGB_MMX(rega, regb, regr, regfe) \
227 "movq "#rega", "#regr" \n\t" \
228 "por "#regb", "#regr" \n\t" \
229 "pxor "#rega", "#regb" \n\t" \
230 "pand "#regfe", "#regb" \n\t" \
231 "psrlq $1, "#regb" \n\t" \
232 "psubb "#regb", "#regr" \n\t"
234 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired (two averages at once) round-down variant; mask lives in mm6. */
235 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
236 "movq "#rega", "#regr" \n\t" \
237 "movq "#regc", "#regp" \n\t" \
238 "pand "#regb", "#regr" \n\t" \
239 "pand "#regd", "#regp" \n\t" \
240 "pxor "#rega", "#regb" \n\t" \
241 "pxor "#regc", "#regd" \n\t" \
242 "pand %%mm6, "#regb" \n\t" \
243 "pand %%mm6, "#regd" \n\t" \
244 "psrlq $1, "#regb" \n\t" \
245 "psrlq $1, "#regd" \n\t" \
246 "paddb "#regb", "#regr" \n\t" \
247 "paddb "#regd", "#regp" \n\t"
/* Paired round-up variant. */
249 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
250 "movq "#rega", "#regr" \n\t" \
251 "movq "#regc", "#regp" \n\t" \
252 "por "#regb", "#regr" \n\t" \
253 "por "#regd", "#regp" \n\t" \
254 "pxor "#rega", "#regb" \n\t" \
255 "pxor "#regc", "#regd" \n\t" \
256 "pand %%mm6, "#regb" \n\t" \
257 "pand %%mm6, "#regd" \n\t" \
258 "psrlq $1, "#regd" \n\t" \
259 "psrlq $1, "#regb" \n\t" \
260 "psubb "#regb", "#regr" \n\t" \
261 "psubb "#regd", "#regp" \n\t"
263 /***********************************/
264 /* MMX no rounding */
/* Instantiate the rounding template twice: once with no-rnd averaging
 * (functions named *_no_rnd_*_mmx) and once with normal rounding.  The
 * matching #undef lines between the two instantiations are not visible
 * in this listing. */
266 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
267 #define SET_RND MOVQ_WONE
268 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
269 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
270 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
272 #include "dsputil_rnd_template.c"
279 /***********************************/
282 #define DEF(x, y) x ## _ ## y ## _mmx
283 #define SET_RND MOVQ_WTWO
284 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
285 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
287 #include "dsputil_rnd_template.c"
295 #endif /* HAVE_INLINE_ASM */
/* Instantiate the averaging template for 3DNow! and for MMXEXT; DEF()
 * controls the suffix of every generated function name. */
300 /***********************************/
303 #define DEF(x) x ## _3dnow
305 #include "dsputil_avg_template.c"
309 /***********************************/
310 /* MMXEXT specific */
312 #define DEF(x) x ## _mmxext
314 #include "dsputil_avg_template.c"
318 #endif /* HAVE_YASM */
/* Full-pel copies need no rounding, so alias no_rnd put to plain put. */
322 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
323 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
325 /***********************************/
/* Clamp the 64 int16 coefficients of block[] to unsigned bytes via
 * packuswb (saturating pack) and store them as an 8x8 block at pixels
 * with stride line_size.  Done in two 4-row halves; the second half is
 * written out explicitly rather than looped (see the comment below).
 * NOTE(review): the declaration tail, asm statement openers and closing
 * braces are missing from this listing -- verify against the original. */
328 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
334 /* read the pixels */
339 "movq (%3), %%mm0 \n\t"
340 "movq 8(%3), %%mm1 \n\t"
341 "movq 16(%3), %%mm2 \n\t"
342 "movq 24(%3), %%mm3 \n\t"
343 "movq 32(%3), %%mm4 \n\t"
344 "movq 40(%3), %%mm5 \n\t"
345 "movq 48(%3), %%mm6 \n\t"
346 "movq 56(%3), %%mm7 \n\t"
347 "packuswb %%mm1, %%mm0 \n\t"
348 "packuswb %%mm3, %%mm2 \n\t"
349 "packuswb %%mm5, %%mm4 \n\t"
350 "packuswb %%mm7, %%mm6 \n\t"
351 "movq %%mm0, (%0) \n\t"
352 "movq %%mm2, (%0, %1) \n\t"
353 "movq %%mm4, (%0, %1, 2) \n\t"
354 "movq %%mm6, (%0, %2) \n\t"
355 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
358 pix += line_size * 4;
361 // if here would be an exact copy of the code above
362 // compiler would generate some very strange code
365 "movq (%3), %%mm0 \n\t"
366 "movq 8(%3), %%mm1 \n\t"
367 "movq 16(%3), %%mm2 \n\t"
368 "movq 24(%3), %%mm3 \n\t"
369 "movq 32(%3), %%mm4 \n\t"
370 "movq 40(%3), %%mm5 \n\t"
371 "movq 48(%3), %%mm6 \n\t"
372 "movq 56(%3), %%mm7 \n\t"
373 "packuswb %%mm1, %%mm0 \n\t"
374 "packuswb %%mm3, %%mm2 \n\t"
375 "packuswb %%mm5, %%mm4 \n\t"
376 "packuswb %%mm7, %%mm6 \n\t"
377 "movq %%mm0, (%0) \n\t"
378 "movq %%mm2, (%0, %1) \n\t"
379 "movq %%mm4, (%0, %1, 2) \n\t"
380 "movq %%mm6, (%0, %2) \n\t"
381 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emit asm that packs 4 rows (32 int16 coeffs starting at byte offset
 * `off` into %2) to signed bytes with packsswb, then adds the 0x80 bias
 * held in mm0 -- mapping [-128,127] to [0,255].  Rows are stored at
 * %0, %0+%3, %0+2*%3 and %0+%1, where %1 must hold 3*line_skip. */
385 #define put_signed_pixels_clamped_mmx_half(off) \
386 "movq "#off"(%2), %%mm1 \n\t" \
387 "movq 16 + "#off"(%2), %%mm2 \n\t" \
388 "movq 32 + "#off"(%2), %%mm3 \n\t" \
389 "movq 48 + "#off"(%2), %%mm4 \n\t" \
390 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
391 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
392 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
393 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
394 "paddb %%mm0, %%mm1 \n\t" \
395 "paddb %%mm0, %%mm2 \n\t" \
396 "paddb %%mm0, %%mm3 \n\t" \
397 "movq %%mm1, (%0) \n\t" \
398 "movq %%mm2, (%0, %3) \n\t" \
399 "movq %%mm3, (%0, %3, 2) \n\t" \
400 "movq %%mm4, (%0, %1) \n\t"
/* Signed variant of put_pixels_clamped: bias by +128 (ff_pb_80) instead
 * of clamping at 0.  lea computes 3*line_skip into %1 (line_skip3); the
 * macro above is expanded once per 4-row half (offsets 0 and 64). */
403 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
406 x86_reg line_skip = line_size;
410 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
411 "lea (%3, %3, 2), %1 \n\t"
412 put_signed_pixels_clamped_mmx_half(0)
413 "lea (%0, %3, 4), %0 \n\t"
414 put_signed_pixels_clamped_mmx_half(64)
415 : "+&r"(pixels), "=&r"(line_skip3)
416 : "r"(block), "r"(line_skip)
/* pixels[] += block[] with unsigned clamping: unpack existing bytes to
 * words against mm7 (mm7 is used as zero throughout -- its pxor
 * initialization is on a line not visible in this listing), add the
 * coefficients with signed saturation (paddsw), then pack back with
 * unsigned saturation (packuswb).  Two rows are processed per asm
 * statement. */
420 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
427 /* read the pixels */
434 "movq (%2), %%mm0 \n\t"
435 "movq 8(%2), %%mm1 \n\t"
436 "movq 16(%2), %%mm2 \n\t"
437 "movq 24(%2), %%mm3 \n\t"
438 "movq %0, %%mm4 \n\t"
439 "movq %1, %%mm6 \n\t"
440 "movq %%mm4, %%mm5 \n\t"
441 "punpcklbw %%mm7, %%mm4 \n\t"
442 "punpckhbw %%mm7, %%mm5 \n\t"
443 "paddsw %%mm4, %%mm0 \n\t"
444 "paddsw %%mm5, %%mm1 \n\t"
445 "movq %%mm6, %%mm5 \n\t"
446 "punpcklbw %%mm7, %%mm6 \n\t"
447 "punpckhbw %%mm7, %%mm5 \n\t"
448 "paddsw %%mm6, %%mm2 \n\t"
449 "paddsw %%mm5, %%mm3 \n\t"
450 "packuswb %%mm1, %%mm0 \n\t"
451 "packuswb %%mm3, %%mm2 \n\t"
452 "movq %%mm0, %0 \n\t"
453 "movq %%mm2, %1 \n\t"
454 : "+m"(*pix), "+m"(*(pix + line_size))
457 pix += line_size * 2;
/* Copy an 8-byte-wide, h-row block from pixels to block (stride
 * line_size), four rows per loop iteration; REG_a holds 2*line_size.
 * NOTE(review): the asm opener, loop label/branch and closing braces are
 * missing from this listing -- verify against the original file. */
462 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
463 ptrdiff_t line_size, int h)
466 "lea (%3, %3), %%"REG_a" \n\t"
469 "movq (%1 ), %%mm0 \n\t"
470 "movq (%1, %3), %%mm1 \n\t"
471 "movq %%mm0, (%2) \n\t"
472 "movq %%mm1, (%2, %3) \n\t"
473 "add %%"REG_a", %1 \n\t"
474 "add %%"REG_a", %2 \n\t"
475 "movq (%1 ), %%mm0 \n\t"
476 "movq (%1, %3), %%mm1 \n\t"
477 "movq %%mm0, (%2) \n\t"
478 "movq %%mm1, (%2, %3) \n\t"
479 "add %%"REG_a", %1 \n\t"
480 "add %%"REG_a", %2 \n\t"
483 : "+g"(h), "+r"(pixels), "+r"(block)
484 : "r"((x86_reg)line_size)
/* 16-byte-wide variant of put_pixels8_mmx: two movq per row (offsets 0
 * and 8), four rows per loop iteration; REG_a holds 2*line_size.
 * NOTE(review): asm opener, loop label/branch and braces are missing from
 * this listing. */
489 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
490 ptrdiff_t line_size, int h)
493 "lea (%3, %3), %%"REG_a" \n\t"
496 "movq (%1 ), %%mm0 \n\t"
497 "movq 8(%1 ), %%mm4 \n\t"
498 "movq (%1, %3), %%mm1 \n\t"
499 "movq 8(%1, %3), %%mm5 \n\t"
500 "movq %%mm0, (%2) \n\t"
501 "movq %%mm4, 8(%2) \n\t"
502 "movq %%mm1, (%2, %3) \n\t"
503 "movq %%mm5, 8(%2, %3) \n\t"
504 "add %%"REG_a", %1 \n\t"
505 "add %%"REG_a", %2 \n\t"
506 "movq (%1 ), %%mm0 \n\t"
507 "movq 8(%1 ), %%mm4 \n\t"
508 "movq (%1, %3), %%mm1 \n\t"
509 "movq 8(%1, %3), %%mm5 \n\t"
510 "movq %%mm0, (%2) \n\t"
511 "movq %%mm4, 8(%2) \n\t"
512 "movq %%mm1, (%2, %3) \n\t"
513 "movq %%mm5, 8(%2, %3) \n\t"
514 "add %%"REG_a", %1 \n\t"
515 "add %%"REG_a", %2 \n\t"
518 : "+g"(h), "+r"(pixels), "+r"(block)
519 : "r"((x86_reg)line_size)
/* Generate a function zeroing n consecutive 8x8 int16 blocks (128*n
 * bytes) with MMX stores, 32 bytes per iteration.  %0 points one past the
 * end of the data and REG_a counts a byte index loaded from %1 (the input
 * supplying its initial value, presumably -128*n, is on a line not
 * visible in this listing -- verify against the original file). */
524 #define CLEAR_BLOCKS(name, n) \
525 static void name(int16_t *blocks) \
528 "pxor %%mm7, %%mm7 \n\t" \
529 "mov %1, %%"REG_a" \n\t" \
531 "movq %%mm7, (%0, %%"REG_a") \n\t" \
532 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
533 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
534 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
535 "add $32, %%"REG_a" \n\t" \
537 :: "r"(((uint8_t *)blocks) + 128 * n), \
542 CLEAR_BLOCKS(clear_blocks_mmx, 6)
543 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 8x8 int16 block (128 bytes) with eight 16-byte SSE stores;
 * movaps requires block to be 16-byte aligned. */
545 static void clear_block_sse(int16_t *block)
548 "xorps %%xmm0, %%xmm0 \n"
549 "movaps %%xmm0, (%0) \n"
550 "movaps %%xmm0, 16(%0) \n"
551 "movaps %%xmm0, 32(%0) \n"
552 "movaps %%xmm0, 48(%0) \n"
553 "movaps %%xmm0, 64(%0) \n"
554 "movaps %%xmm0, 80(%0) \n"
555 "movaps %%xmm0, 96(%0) \n"
556 "movaps %%xmm0, 112(%0) \n"
/* Zero six 8x8 int16 blocks (768 bytes) with SSE, 128 bytes per loop
 * iteration.  %0 points one past the end; REG_a is a byte index loaded
 * from %1 (the line supplying its initial value, presumably -128*6, is
 * not visible in this listing). */
562 static void clear_blocks_sse(int16_t *blocks)
565 "xorps %%xmm0, %%xmm0 \n"
566 "mov %1, %%"REG_a" \n"
568 "movaps %%xmm0, (%0, %%"REG_a") \n"
569 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
570 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
571 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
572 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
573 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
574 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
575 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
576 "add $128, %%"REG_a" \n"
578 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] bytewise (wrapping paddb), 16 bytes per MMX iteration
 * over the first w & ~15 bytes; the scalar tail loop below handles the
 * remaining 0..15 bytes.  NOTE(review): the loop control and tail-loop
 * header are on lines not visible in this listing. */
584 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
590 "movq (%1, %0), %%mm0 \n\t"
591 "movq (%2, %0), %%mm1 \n\t"
592 "paddb %%mm0, %%mm1 \n\t"
593 "movq %%mm1, (%2, %0) \n\t"
594 "movq 8(%1, %0), %%mm0 \n\t"
595 "movq 8(%2, %0), %%mm1 \n\t"
596 "paddb %%mm0, %%mm1 \n\t"
597 "movq %%mm1, 8(%2, %0) \n\t"
603 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
606 dst[i + 0] += src[i + 0];
/* HuffYUV median prediction using cmov: reconstructs dst from diff and
 * the top row, carrying the running left/left-top predictors in l/tl
 * (masked to one byte on entry).  Heavily abridged in this listing --
 * only fragments of the asm body are visible; verify the full loop
 * against the original file before modifying. */
610 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
611 const uint8_t *diff, int w,
612 int *left, int *left_top)
616 int l = *left & 0xff;
617 int tl = *left_top & 0xff;
622 "movzbl (%3, %4), %2 \n"
635 "add (%6, %4), %b0 \n"
636 "mov %b0, (%5, %4) \n"
639 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
640 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
/* H.263 deblocking loop filters, implemented elsewhere. */
648 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
649 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
652 /* Draw the edges of width 'w' of an image of size width, height
653 * this MMX version can only handle w == 8 || w == 16.
 * Left/right edges are filled by splatting the first/last pixel of each
 * row (punpck chains replicate one byte across a qword); top/bottom by
 * copying whole rows.  `sides` is a bitmask tested against EDGE_TOP /
 * EDGE_BOTTOM.  NOTE(review): several loop-control lines of this function
 * are missing from this listing. */
654 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
655 int w, int h, int sides)
657 uint8_t *ptr, *last_line;
660 last_line = buf + (height - 1) * wrap;
666 "movd (%0), %%mm0 \n\t"
667 "punpcklbw %%mm0, %%mm0 \n\t"
668 "punpcklwd %%mm0, %%mm0 \n\t"
669 "punpckldq %%mm0, %%mm0 \n\t"
670 "movq %%mm0, -8(%0) \n\t"
671 "movq -8(%0, %2), %%mm1 \n\t"
672 "punpckhbw %%mm1, %%mm1 \n\t"
673 "punpckhwd %%mm1, %%mm1 \n\t"
674 "punpckhdq %%mm1, %%mm1 \n\t"
675 "movq %%mm1, (%0, %2) \n\t"
680 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* w == 16 case: each edge pixel is replicated into two qwords. */
685 "movd (%0), %%mm0 \n\t"
686 "punpcklbw %%mm0, %%mm0 \n\t"
687 "punpcklwd %%mm0, %%mm0 \n\t"
688 "punpckldq %%mm0, %%mm0 \n\t"
689 "movq %%mm0, -8(%0) \n\t"
690 "movq %%mm0, -16(%0) \n\t"
691 "movq -8(%0, %2), %%mm1 \n\t"
692 "punpckhbw %%mm1, %%mm1 \n\t"
693 "punpckhwd %%mm1, %%mm1 \n\t"
694 "punpckhdq %%mm1, %%mm1 \n\t"
695 "movq %%mm1, (%0, %2) \n\t"
696 "movq %%mm1, 8(%0, %2) \n\t"
701 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
705 /* top and bottom (and hopefully also the corners) */
706 if (sides & EDGE_TOP) {
707 for (i = 0; i < h; i += 4) {
708 ptr = buf - (i + 1) * wrap - w;
/* Replicate the first image row upward, 4 border rows at a time. */
711 "movq (%1, %0), %%mm0 \n\t"
712 "movq %%mm0, (%0) \n\t"
713 "movq %%mm0, (%0, %2) \n\t"
714 "movq %%mm0, (%0, %2, 2) \n\t"
715 "movq %%mm0, (%0, %3) \n\t"
720 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
721 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
726 if (sides & EDGE_BOTTOM) {
727 for (i = 0; i < h; i += 4) {
728 ptr = last_line + (i + 1) * wrap - w;
/* Replicate the last image row downward, 4 border rows at a time. */
731 "movq (%1, %0), %%mm0 \n\t"
732 "movq %%mm0, (%0) \n\t"
733 "movq %%mm0, (%0, %2) \n\t"
734 "movq %%mm0, (%0, %2, 2) \n\t"
735 "movq %%mm0, (%0, %3) \n\t"
740 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
741 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
742 "r"(ptr + width + 2 * w)
747 #endif /* HAVE_INLINE_ASM */
751 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
752 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
755 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
758 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
762 uint8_t * const half = (uint8_t*)temp; \
763 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
765 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
766 stride, stride, 8); \
769 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
772 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
776 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
780 uint8_t * const half = (uint8_t*)temp; \
781 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
783 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
787 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
791 uint8_t * const half = (uint8_t*)temp; \
792 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
794 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
795 stride, stride, 8); \
798 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
801 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
805 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
809 uint8_t * const half = (uint8_t*)temp; \
810 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
812 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
816 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
819 uint64_t half[8 + 9]; \
820 uint8_t * const halfH = ((uint8_t*)half) + 64; \
821 uint8_t * const halfHV = ((uint8_t*)half); \
822 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
824 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
826 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
827 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
831 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
834 uint64_t half[8 + 9]; \
835 uint8_t * const halfH = ((uint8_t*)half) + 64; \
836 uint8_t * const halfHV = ((uint8_t*)half); \
837 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
839 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
841 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
842 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
846 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
849 uint64_t half[8 + 9]; \
850 uint8_t * const halfH = ((uint8_t*)half) + 64; \
851 uint8_t * const halfHV = ((uint8_t*)half); \
852 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
854 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
856 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
857 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
861 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
864 uint64_t half[8 + 9]; \
865 uint8_t * const halfH = ((uint8_t*)half) + 64; \
866 uint8_t * const halfHV = ((uint8_t*)half); \
867 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
869 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
871 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
872 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
876 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
879 uint64_t half[8 + 9]; \
880 uint8_t * const halfH = ((uint8_t*)half) + 64; \
881 uint8_t * const halfHV = ((uint8_t*)half); \
882 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
884 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
885 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
889 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
892 uint64_t half[8 + 9]; \
893 uint8_t * const halfH = ((uint8_t*)half) + 64; \
894 uint8_t * const halfHV = ((uint8_t*)half); \
895 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
897 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
898 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
902 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
905 uint64_t half[8 + 9]; \
906 uint8_t * const halfH = ((uint8_t*)half); \
907 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
909 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
911 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
915 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
918 uint64_t half[8 + 9]; \
919 uint8_t * const halfH = ((uint8_t*)half); \
920 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
922 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
924 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
928 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
932 uint8_t * const halfH = ((uint8_t*)half); \
933 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
935 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
939 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
942 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
945 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
949 uint8_t * const half = (uint8_t*)temp; \
950 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
952 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
956 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
959 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
960 stride, stride, 16);\
963 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
967 uint8_t * const half = (uint8_t*)temp; \
968 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
970 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
971 stride, stride, 16); \
974 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
978 uint8_t * const half = (uint8_t*)temp; \
979 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
981 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
985 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
988 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
992 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
996 uint8_t * const half = (uint8_t*)temp; \
997 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
999 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1000 stride, stride, 16); \
1003 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1006 uint64_t half[16 * 2 + 17 * 2]; \
1007 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1008 uint8_t * const halfHV = ((uint8_t*)half); \
1009 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1011 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1013 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1015 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1019 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1022 uint64_t half[16 * 2 + 17 * 2]; \
1023 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1024 uint8_t * const halfHV = ((uint8_t*)half); \
1025 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1027 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1029 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1031 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1035 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1038 uint64_t half[16 * 2 + 17 * 2]; \
1039 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1040 uint8_t * const halfHV = ((uint8_t*)half); \
1041 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1043 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1045 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1047 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1051 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1054 uint64_t half[16 * 2 + 17 * 2]; \
1055 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1056 uint8_t * const halfHV = ((uint8_t*)half); \
1057 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1059 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1061 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1063 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1067 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1070 uint64_t half[16 * 2 + 17 * 2]; \
1071 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1072 uint8_t * const halfHV = ((uint8_t*)half); \
1073 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1075 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1077 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1081 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1084 uint64_t half[16 * 2 + 17 * 2]; \
1085 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1086 uint8_t * const halfHV = ((uint8_t*)half); \
1087 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1089 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1091 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1095 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1098 uint64_t half[17 * 2]; \
1099 uint8_t * const halfH = ((uint8_t*)half); \
1100 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1102 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1104 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1108 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1111 uint64_t half[17 * 2]; \
1112 uint8_t * const halfH = ((uint8_t*)half); \
1113 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1115 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1117 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1121 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1124 uint64_t half[17 * 2]; \
1125 uint8_t * const halfH = ((uint8_t*)half); \
1126 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1128 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Instantiate the quarter-pel motion-compensation function families for
 * MMXEXT: rounding "put", rounding "avg", and no-rounding "put".  The
 * rounder constant differs between the two modes (pw_16 vs pw_15). */
1132 QPEL_OP(put_, ff_pw_16, _, mmxext)
1133 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1134 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
1135 #endif /* HAVE_YASM */
/* RV40 quarter-pel position (3,3): implemented as the plain half-pel
 * xy2 average, so it maps directly onto the generic MMX helper (8x8 put). */
1139 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1141 put_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 quarter-pel position (3,3): half-pel xy2 average, 16x16 put. */
1143 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1145 put_pixels16_xy2_mmx(dst, src, stride, 16);
/* RV40 quarter-pel position (3,3): half-pel xy2 average, 8x8 averaging store. */
1147 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1149 avg_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 quarter-pel position (3,3): half-pel xy2 average, 16x16 averaging store. */
1151 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1153 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Global motion compensation with bilinear interpolation (MMX).
 * (ox, oy) is the start offset and (dxx, dxy, dyx, dyy) the affine
 * per-pixel deltas, all in 16.shift fixed point; r is the rounder and
 * shift the interpolation precision.  Falls back to the C version for
 * blocks the MMX path cannot handle. */
1156 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1157 int stride, int h, int ox, int oy,
1158 int dxx, int dxy, int dyx, int dyy,
1159 int shift, int r, int width, int height)
/* fullpel part of the start offset */
1162 const int ix = ox >> (16 + shift);
1163 const int iy = oy >> (16 + shift);
/* start offsets and affine deltas with the fraction reduced by 4 bits,
 * so they fit 16-bit lanes for the MMX arithmetic */
1164 const int oxs = ox >> 4;
1165 const int oys = oy >> 4;
1166 const int dxxs = dxx >> 4;
1167 const int dxys = dxy >> 4;
1168 const int dyxs = dyx >> 4;
1169 const int dyys = dyy >> 4;
/* constants replicated into 4 x 16-bit lanes for the SIMD inner loop */
1170 const uint16_t r4[4] = { r, r, r, r };
1171 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1172 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
/* final right-shift for the bilinear blend: dx and dy each carry
 * 'shift' fractional bits, hence 2 * shift after the multiply */
1173 const uint64_t shift2 = 2 * shift;
/* NOTE(review): 'w' is defined on a line elided from this extract;
 * presumably the block width — confirm in the full file. */
/* total x/y displacement across the block, used to detect a
 * non-constant fullpel offset */
1176 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1177 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1178 const int dxh = dxy * (h - 1);
1179 const int dyw = dyx * (w - 1);
1180 if ( // non-constant fullpel offset (3% of blocks)
1181 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1182 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1183 // uses more than 16 bits of subpel mv (only at huge resolution)
1184 || (dxx | dxy | dyx | dyy) & 15 ||
1185 (unsigned)ix >= width - w ||
1186 (unsigned)iy >= height - h) {
1187 // FIXME could still use mmx for some of the rows
1188 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1189 shift, r, width, height);
/* advance src to the fullpel start position */
1193 src += ix + iy * stride;
/* broadcast the interpolation scale constant into all four words of
 * mm6 (the asm statement header and its input operand are on lines
 * elided from this extract); mm7 is zeroed for byte->word unpacking */
1196 "movd %0, %%mm6 \n\t"
1197 "pxor %%mm7, %%mm7 \n\t"
1198 "punpcklwd %%mm6, %%mm6 \n\t"
1199 "punpcklwd %%mm6, %%mm6 \n\t"
/* process the block 4 output columns at a time */
1203 for (x = 0; x < w; x += 4) {
/* per-column fixed-point x/y positions, seeded one row step back:
 * the first paddw in the row loop advances them to row 0 */
1204 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1205 oxs - dxys + dxxs * (x + 1),
1206 oxs - dxys + dxxs * (x + 2),
1207 oxs - dxys + dxxs * (x + 3) };
1208 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1209 oys - dyys + dyxs * (x + 1),
1210 oys - dyys + dyxs * (x + 2),
1211 oys - dyys + dyxs * (x + 3) };
1213 for (y = 0; y < h; y++) {
/* advance the positions by one row, store them back, and keep only
 * the subpel fractions dx (mm4) and dy (mm5) */
1215 "movq %0, %%mm4 \n\t"
1216 "movq %1, %%mm5 \n\t"
1217 "paddw %2, %%mm4 \n\t"
1218 "paddw %3, %%mm5 \n\t"
1219 "movq %%mm4, %0 \n\t"
1220 "movq %%mm5, %1 \n\t"
1221 "psrlw $12, %%mm4 \n\t"
1222 "psrlw $12, %%mm5 \n\t"
1223 : "+m"(*dx4), "+m"(*dy4)
1224 : "m"(*dxy4), "m"(*dyy4)
/* bilinear blend: weight the four neighbours by
 * (s-dx)(s-dy), dx(s-dy), (s-dx)dy and dx*dy (s = scale in mm6),
 * add the rounder r4, then shift right by shift2 and pack to bytes */
1228 "movq %%mm6, %%mm2 \n\t"
1229 "movq %%mm6, %%mm1 \n\t"
1230 "psubw %%mm4, %%mm2 \n\t"
1231 "psubw %%mm5, %%mm1 \n\t"
1232 "movq %%mm2, %%mm0 \n\t"
1233 "movq %%mm4, %%mm3 \n\t"
1234 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1235 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1236 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1237 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1239 "movd %4, %%mm5 \n\t"
1240 "movd %3, %%mm4 \n\t"
1241 "punpcklbw %%mm7, %%mm5 \n\t"
1242 "punpcklbw %%mm7, %%mm4 \n\t"
1243 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1244 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1246 "movd %2, %%mm5 \n\t"
1247 "movd %1, %%mm4 \n\t"
1248 "punpcklbw %%mm7, %%mm5 \n\t"
1249 "punpcklbw %%mm7, %%mm4 \n\t"
1250 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1251 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1252 "paddw %5, %%mm1 \n\t"
1253 "paddw %%mm3, %%mm2 \n\t"
1254 "paddw %%mm1, %%mm0 \n\t"
1255 "paddw %%mm2, %%mm0 \n\t"
1257 "psrlw %6, %%mm0 \n\t"
1258 "packuswb %%mm0, %%mm0 \n\t"
1259 "movd %%mm0, %0 \n\t"
1261 : "=m"(dst[x + y * stride])
1262 : "m"(src[0]), "m"(src[1]),
1263 "m"(src[stride]), "m"(src[stride + 1]),
1264 "m"(*r4), "m"(shift2)
/* step to the next 4 columns, undoing the h per-row stride advances
 * (the per-row 'src += stride' line is elided from this extract) */
1268 src += 4 - h * stride;
1271 #endif /* HAVE_INLINE_ASM */
/* SSE2 whole-block 16-pixel copy/average primitives (implemented in yasm). */
1273 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1274 ptrdiff_t line_size, int h);
1275 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1276 ptrdiff_t line_size, int h);
/* CAVS quarter-pel position (0,0) is a plain fullpel copy: reuse the
 * generic MMX 8x8 put. */
1281 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1283 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS quarter-pel position (0,0): fullpel averaging store, 8x8. */
1286 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1288 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS quarter-pel position (0,0): fullpel copy, 16x16. */
1291 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1293 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS quarter-pel position (0,0): fullpel averaging store, 16x16. */
1296 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1298 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 mspel position (0,0): plain fullpel 8x8 copy; the 'rnd'
 * rounding parameter has no effect for this position. */
1302 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1303 ptrdiff_t stride, int rnd)
1305 put_pixels8_mmx(dst, src, stride, 8);
/* Clamp each float in src[] to [min, max] and store into dst[], using
 * SSE on 16 floats per iteration (4 xmm registers x 4 lanes).
 * NOTE(review): uses movaps, so both buffers must be 16-byte aligned,
 * and 'len' is presumably a multiple of 16 — confirm at call sites;
 * the loop-control asm lines are elided from this extract. */
1308 static void vector_clipf_sse(float *dst, const float *src,
1309 float min, float max, int len)
/* byte offset of the last 16-float group; the loop counts down to 0 */
1311 x86_reg i = (len - 16) * 4;
/* load min/max and broadcast each across all four lanes */
1313 "movss %3, %%xmm4 \n\t"
1314 "movss %4, %%xmm5 \n\t"
1315 "shufps $0, %%xmm4, %%xmm4 \n\t"
1316 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* clamp 16 floats: max() against the lower bound, min() against the upper */
1318 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1319 "movaps 16(%2, %0), %%xmm1 \n\t"
1320 "movaps 32(%2, %0), %%xmm2 \n\t"
1321 "movaps 48(%2, %0), %%xmm3 \n\t"
1322 "maxps %%xmm4, %%xmm0 \n\t"
1323 "maxps %%xmm4, %%xmm1 \n\t"
1324 "maxps %%xmm4, %%xmm2 \n\t"
1325 "maxps %%xmm4, %%xmm3 \n\t"
1326 "minps %%xmm5, %%xmm0 \n\t"
1327 "minps %%xmm5, %%xmm1 \n\t"
1328 "minps %%xmm5, %%xmm2 \n\t"
1329 "minps %%xmm5, %%xmm3 \n\t"
1330 "movaps %%xmm0, (%1, %0) \n\t"
1331 "movaps %%xmm1, 16(%1, %0) \n\t"
1332 "movaps %%xmm2, 32(%1, %0) \n\t"
1333 "movaps %%xmm3, 48(%1, %0) \n\t"
1337 : "r"(dst), "r"(src), "m"(min), "m"(max)
1342 #endif /* HAVE_INLINE_ASM */
/* Prototypes for yasm-implemented integer DSP routines; the per-CPU
 * init functions in this file select among the variants at runtime. */
/* dot products (with optional multiply-accumulate back into v1) */
1344 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1346 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1348 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1350 int order, int mul);
1351 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1353 int order, int mul);
1354 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1356 int order, int mul);
/* windowing of int16 samples; "_round" variants apply rounding and are
 * used unless CODEC_FLAG_BITEXACT is set */
1358 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1359 const int16_t *window, unsigned int len);
1360 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1361 const int16_t *window, unsigned int len);
1362 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1363 const int16_t *window, unsigned int len);
1364 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1365 const int16_t *window, unsigned int len);
1366 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1367 const int16_t *window, unsigned int len);
1368 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1369 const int16_t *window, unsigned int len);
/* 32-bit byte-swap of a buffer */
1371 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1372 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
/* HuffYUV median/left predictors */
1374 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1375 const uint8_t *diff, int w,
1376 int *left, int *left_top);
1377 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1379 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
/* clip int32 samples to [min, max]; "_int" variant is tuned for Atom */
1382 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1383 int32_t min, int32_t max, unsigned int len);
1384 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1385 int32_t min, int32_t max, unsigned int len);
1386 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1387 int32_t min, int32_t max, unsigned int len);
1388 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1389 int32_t min, int32_t max, unsigned int len);
/* Fill one 16-entry row of c->PFX_pixels_tab with the CPU-specific
 * quarter-pel functions.  Entry [x + 4*y] gets the mc<x><y> function,
 * i.e. the digits encode the (x, y) subpel position.  (The do/while
 * wrapper lines of this macro are elided from this extract.) */
1391 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1393 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1394 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1395 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1396 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1397 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1398 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1399 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1400 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1401 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1402 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1403 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1404 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1405 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1406 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1407 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1408 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill one 4-entry half-pel row: [0] = fullpel, [1] = x half-pel,
 * [2] = y half-pel, [3] = xy half-pel.  IDX is pasted verbatim so it
 * can be "[0]", "[1]", or empty for a single-row table (see the
 * avg_no_rnd use).  (Wrapper lines of this macro are elided here.) */
1411 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1413 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1414 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1415 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1416 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install the MMX implementations into the DSPContext; called when the
 * CPU reports AV_CPU_FLAG_MMX.  SIMD pixel and IDCT routines are only
 * wired up for 8-bit content. */
1419 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1422 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/* clamped block store/add helpers (used by the IDCT paths) */
1425 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1426 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1427 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
/* 8-bit-only block utilities and half-pel copy/average tables */
1429 if (!high_bit_depth) {
1430 c->clear_block = clear_block_mmx;
1431 c->clear_blocks = clear_blocks_mmx;
1432 c->draw_edges = draw_edges_mmx;
1434 SET_HPEL_FUNCS(put, [0], 16, mmx);
1435 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1436 SET_HPEL_FUNCS(avg, [0], 16, mmx);
/* avg_no_rnd has a single-row table, hence the empty IDX */
1437 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1438 SET_HPEL_FUNCS(put, [1], 8, mmx);
1439 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1440 SET_HPEL_FUNCS(avg, [1], 8, mmx);
/* choose the IDCT requested by the caller (8-bit only) */
1442 switch (avctx->idct_algo) {
1444 case FF_IDCT_SIMPLEMMX:
1445 c->idct_put = ff_simple_idct_put_mmx;
1446 c->idct_add = ff_simple_idct_add_mmx;
1447 c->idct = ff_simple_idct_mmx;
1448 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1450 case FF_IDCT_XVIDMMX:
1451 c->idct_put = ff_idct_xvid_mmx_put;
1452 c->idct_add = ff_idct_xvid_mmx_add;
1453 c->idct = ff_idct_xvid_mmx;
1460 c->add_bytes = add_bytes_mmx;
1461 #endif /* HAVE_INLINE_ASM */
/* H.263 loop filters, only when an H.263 codec is compiled in */
1464 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1465 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1466 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1469 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT implementations; overrides some of the plain MMX
 * entries with faster variants and adds the quarter-pel tables. */
1474 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1477 const int bit_depth = avctx->bits_per_raw_sample;
1478 const int high_bit_depth = bit_depth > 8;
/* quarter-pel tables produced by the QPEL_OP instantiations */
1481 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1482 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1484 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1485 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1486 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1487 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
/* 8-bit half-pel copy/average overrides */
1489 if (!high_bit_depth) {
1490 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1491 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1493 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1494 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1495 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1497 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1498 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1500 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1501 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1502 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* the no-rnd / xy2 variants are not bit-exact; skip when exactness is
 * requested */
1505 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1506 if (!high_bit_depth) {
1507 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1508 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1509 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1510 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1512 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1513 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1516 #endif /* HAVE_YASM */
1519 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1520 c->idct_put = ff_idct_xvid_mmxext_put;
1521 c->idct_add = ff_idct_xvid_mmxext_add;
1522 c->idct = ff_idct_xvid_mmxext;
1524 #endif /* HAVE_INLINE_ASM */
1526 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora needs the "exact" no-rounding variants */
1527 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1528 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1529 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1530 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1533 /* slower than cmov version on AMD */
1534 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1535 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1537 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1538 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* bit-exact mode uses the non-rounding window function */
1540 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1541 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1543 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1545 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install AMD 3DNow! half-pel copy/average variants (8-bit only). */
1548 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1551 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1554 if (!high_bit_depth) {
1555 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1556 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1558 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1559 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1560 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1562 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1563 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1565 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1566 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1567 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* non-bit-exact rounding variants */
1569 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1570 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1571 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1572 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1573 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1575 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1576 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3/Theora needs the "exact" no-rounding variants */
1580 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1581 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1582 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1583 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1585 #endif /* HAVE_YASM */
/* Install SSE implementations (8-bit only). */
1588 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1591 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1594 if (!high_bit_depth) {
1595 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1596 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1597 c->clear_block = clear_block_sse;
1598 c->clear_blocks = clear_blocks_sse;
1602 c->vector_clipf = vector_clipf_sse;
1603 #endif /* HAVE_INLINE_ASM */
/* Install SSE2 implementations; some are gated on !SSE2SLOW because
 * they lose to MMX on CPUs with slow 128-bit execution. */
1606 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1609 const int bit_depth = avctx->bits_per_raw_sample;
1610 const int high_bit_depth = bit_depth > 8;
1612 #if HAVE_SSE2_INLINE
1613 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1614 c->idct_put = ff_idct_xvid_sse2_put;
1615 c->idct_add = ff_idct_xvid_sse2_add;
1616 c->idct = ff_idct_xvid_sse2;
1617 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1619 #endif /* HAVE_SSE2_INLINE */
1621 #if HAVE_SSE2_EXTERNAL
1622 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1623 // these functions are slower than mmx on AMD, but faster on Intel
1624 if (!high_bit_depth) {
1625 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1626 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1627 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1631 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1632 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* the "_int" clip variant is faster on Atom */
1633 if (mm_flags & AV_CPU_FLAG_ATOM) {
1634 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1636 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/* bit-exact mode uses the non-rounding window function */
1638 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1639 c->apply_window_int16 = ff_apply_window_int16_sse2;
1640 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1641 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1643 c->bswap_buf = ff_bswap32_buf_sse2;
1644 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 implementations, with per-microarchitecture tweaks. */
1647 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1650 #if HAVE_SSSE3_EXTERNAL
1651 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1652 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1653 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
/* Atom gets its dedicated window routine */
1655 if (mm_flags & AV_CPU_FLAG_ATOM)
1656 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1658 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1659 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1660 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1661 c->bswap_buf = ff_bswap32_buf_ssse3;
1662 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install SSE4 implementations (currently only the int32 clip). */
1665 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1668 #if HAVE_SSE4_EXTERNAL
1669 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1670 #endif /* HAVE_SSE4_EXTERNAL */
1673 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1675 int mm_flags = av_get_cpu_flags();
1677 #if HAVE_7REGS && HAVE_INLINE_ASM
1678 if (mm_flags & AV_CPU_FLAG_CMOV)
1679 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1682 if (mm_flags & AV_CPU_FLAG_MMX)
1683 dsputil_init_mmx(c, avctx, mm_flags);
1685 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1686 dsputil_init_mmxext(c, avctx, mm_flags);
1688 if (mm_flags & AV_CPU_FLAG_3DNOW)
1689 dsputil_init_3dnow(c, avctx, mm_flags);
1691 if (mm_flags & AV_CPU_FLAG_SSE)
1692 dsputil_init_sse(c, avctx, mm_flags);
1694 if (mm_flags & AV_CPU_FLAG_SSE2)
1695 dsputil_init_sse2(c, avctx, mm_flags);
1697 if (mm_flags & AV_CPU_FLAG_SSSE3)
1698 dsputil_init_ssse3(c, avctx, mm_flags);
1700 if (mm_flags & AV_CPU_FLAG_SSE4)
1701 dsputil_init_sse4(c, avctx, mm_flags);
1703 if (CONFIG_ENCODERS)
1704 ff_dsputilenc_init_mmx(c, avctx);