2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/* pixel operations */
/* Splat constants used by the inline-asm and yasm kernels.
 * Naming: ff_pw_N  = packed 16-bit words, each element == N
 *         ff_pb_XX = packed bytes, each element == 0xXX
 *         ff_pd_N  = packed doubles, each element == N
 * 8-byte-aligned uint64_t values are MMX-register-sized; 16-byte-aligned
 * xmm_reg values are XMM-sized and usable from both MMX and SSE2 code. */
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* Packed-double constants for SSE2 floating-point kernels. */
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
86 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
87 ptrdiff_t line_size, int h);
88 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
89 ptrdiff_t line_size, int h);
90 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
91 int dstStride, int src1Stride, int h);
92 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
93 uint8_t *src2, int dstStride,
94 int src1Stride, int h);
95 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
96 int dstStride, int src1Stride, int h);
97 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
98 ptrdiff_t line_size, int h);
99 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
100 ptrdiff_t line_size, int h);
101 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
102 int dstStride, int src1Stride, int h);
103 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106 int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
108 ptrdiff_t line_size, int h);
109 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
110 ptrdiff_t line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
112 const uint8_t *pixels,
113 ptrdiff_t line_size, int h);
114 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
115 const uint8_t *pixels,
116 ptrdiff_t line_size, int h);
117 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
118 ptrdiff_t line_size, int h);
119 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
120 ptrdiff_t line_size, int h);
121 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
122 ptrdiff_t line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
124 ptrdiff_t line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
126 const uint8_t *pixels,
127 ptrdiff_t line_size, int h);
128 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
129 const uint8_t *pixels,
130 ptrdiff_t line_size, int h);
131 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
132 ptrdiff_t line_size, int h);
133 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
134 ptrdiff_t line_size, int h);
135 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
136 ptrdiff_t line_size, int h);
137 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
138 ptrdiff_t line_size, int h);
139 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
140 ptrdiff_t line_size, int h);
141 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
142 ptrdiff_t line_size, int h);
143 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
144 ptrdiff_t line_size, int h);
146 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
/**
 * Copy a 16-pixel-wide block of height h by issuing two 8-wide copies
 * (left half, then right half) through the MMXEXT 8-pixel copy routine.
 * The span as extracted had lost the function braces; restored here.
 */
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
154 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
155 int dstStride, int srcStride, int h);
156 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
157 int dstStride, int srcStride, int h);
158 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride,
161 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
162 int dstStride, int srcStride, int h);
163 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
164 int dstStride, int srcStride, int h);
165 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride,
168 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
169 int dstStride, int srcStride);
170 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
171 int dstStride, int srcStride);
172 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
180 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
181 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
182 #endif /* HAVE_YASM */
/* Helpers to initialize an MMX register without touching memory. */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

/* Fill regd with 0xFE bytes: all-ones (pcmpeqd) then paddb doubles each
 * 0xFF byte to 0xFE (mod 256).
 * NOTE(review): the "__asm__ volatile (" opener for this macro appears to
 * have been lost from this extract — confirm against the upstream file. */
#define MOVQ_BFE(regd) \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)

/* Memory-operand variants: load the constant from the tables above. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
// for shared library it's better to use this way for accessing constants
// NOTE(review): these redefinitions of MOVQ_BONE/MOVQ_WTWO would clash with
// the memory-operand versions above; upstream selects one set via an
// #if/#else that seems to be missing from this extract — verify.
/* Synthesize 0x0101..01 in regd: all-ones, >>15 per word (0x0001 each),
 * then packuswb to bytes. */
#define MOVQ_BONE(regd) \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"packuswb %%"#regd", %%"#regd" \n\t" ::)

/* Synthesize 0x0002 in each word of regd: all-ones, >>15, <<1. */
#define MOVQ_WTWO(regd) \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average rounding DOWN:
 *   avg = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * The 0xfe mask clears the bit shifted in from the neighboring byte so a
 * single 64-bit psrlq can stand in for a per-byte shift. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"

/* Byte-wise average rounding UP:
 *   avg = (a | b) - (((a ^ b) & 0xfe) >> 1) */
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* Two independent round-down averages in one go:
 * regr = avg_down(rega, regb), regp = avg_down(regc, regd);
 * regb and regd are trashed, the 0xfe mask lives in %%mm6. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"

/* Same as above but rounding UP (or/sub form), 0xfe mask in %%mm6. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
/***********************************/
/* MMX no rounding */
/* Instantiate the rounding-template kernels in their "no rounding"
 * flavor: names get a _no_rnd_ infix and the round-down PAVGB forms are
 * plugged in. NOTE(review): upstream #undef's these helper macros between
 * the two template inclusions; those lines appear to be missing from this
 * extract — confirm before building. */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "dsputil_rnd_template.c"
/***********************************/
/* Second instantiation: standard (rounding) flavor with plain names. */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#include "dsputil_rnd_template.c"
295 #endif /* HAVE_INLINE_ASM */
/***********************************/
/* Instantiate the byte-average template once per CPU flavor; DEF picks
 * the per-flavor name suffix. */
#define DEF(x) x ## _3dnow
#include "dsputil_avg_template.c"
/***********************************/
/* MMXEXT specific */
#define DEF(x) x ## _mmxext
#include "dsputil_avg_template.c"
#endif /* HAVE_YASM */
/* For full-pel motion the "no rounding" and rounding copies are
 * identical, and plain MMX copies serve as the mmxext entry points. */
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
#define put_pixels8_mmxext put_pixels8_mmx
#define put_pixels4_mmxext put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
330 /***********************************/
/* Store an 8x8 block of 16-bit IDCT coefficients as unsigned pixels,
 * clamping each value to 0..255 via packuswb (unsigned saturation).
 * Each asm statement below handles 4 rows (64 bytes of coefficients).
 * NOTE(review): this extract is missing several lines of the function —
 * the rest of the signature, the opening brace, local setup and the
 * "__asm__ volatile (" openers / closers — confirm against upstream. */
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
/* read the pixels */
/* Load 4 rows (8 words each), saturate-pack word pairs to bytes, and
 * store 4 rows of 8 pixels at strides 0, 1, 2 and 3 (%2 = 3*stride). */
"movq (%3), %%mm0 \n\t"
"movq 8(%3), %%mm1 \n\t"
"movq 16(%3), %%mm2 \n\t"
"movq 24(%3), %%mm3 \n\t"
"movq 32(%3), %%mm4 \n\t"
"movq 40(%3), %%mm5 \n\t"
"movq 48(%3), %%mm6 \n\t"
"movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"packuswb %%mm7, %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, (%0, %1) \n\t"
"movq %%mm4, (%0, %1, 2) \n\t"
"movq %%mm6, (%0, %2) \n\t"
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
/* Advance to the next 4 output rows. */
pix += line_size * 4;
// if here would be an exact copy of the code above
// compiler would generate some very strange code
/* Second 4-row batch — deliberately a separate asm statement (see the
 * original comment above about compiler behavior on duplicated asm). */
"movq (%3), %%mm0 \n\t"
"movq 8(%3), %%mm1 \n\t"
"movq 16(%3), %%mm2 \n\t"
"movq 24(%3), %%mm3 \n\t"
"movq 32(%3), %%mm4 \n\t"
"movq 40(%3), %%mm5 \n\t"
"movq 48(%3), %%mm6 \n\t"
"movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"packuswb %%mm7, %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, (%0, %1) \n\t"
"movq %%mm4, (%0, %1, 2) \n\t"
"movq %%mm6, (%0, %2) \n\t"
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emit asm for 4 rows of the signed-clamped store: load 4 rows of 16-bit
 * coefficients starting at byte offset `off`, saturate-pack to signed
 * bytes (packsswb clamps to -128..127), add the 0x80 bias held in %%mm0
 * to map that to 0..255, and store at strides %3, 2*%3 and %1 (= 3*%3). */
#define put_signed_pixels_clamped_mmx_half(off) \
"movq "#off"(%2), %%mm1 \n\t" \
"movq 16 + "#off"(%2), %%mm2 \n\t" \
"movq 32 + "#off"(%2), %%mm3 \n\t" \
"movq 48 + "#off"(%2), %%mm4 \n\t" \
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \
"paddb %%mm0, %%mm1 \n\t" \
"paddb %%mm0, %%mm2 \n\t" \
"paddb %%mm0, %%mm3 \n\t" \
"paddb %%mm0, %%mm4 \n\t" \
"movq %%mm1, (%0) \n\t" \
"movq %%mm2, (%0, %3) \n\t" \
"movq %%mm3, (%0, %3, 2) \n\t" \
"movq %%mm4, (%0, %1) \n\t"
/* Store an 8x8 block of signed 16-bit coefficients as pixels, clamping
 * to -128..127 and re-biasing by +128 (ff_pb_80 in %%mm0). Two macro
 * expansions cover rows 0-3 and 4-7 (coefficient offsets 0 and 64).
 * NOTE(review): lines are missing from this extract (rest of signature,
 * braces, asm opener/closer, the line_skip3 declaration) — confirm. */
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
x86_reg line_skip = line_size;
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
/* %1 (line_skip3) = 3 * line_skip, used by the macro's last store. */
"lea (%3, %3, 2), %1 \n\t"
put_signed_pixels_clamped_mmx_half(0)
/* Advance the output pointer by 4 rows for the second half. */
"lea (%0, %3, 4), %0 \n\t"
put_signed_pixels_clamped_mmx_half(64)
: "+&r"(pixels), "=&r"(line_skip3)
: "r"(block), "r"(line_skip)
/* Add an 8x8 block of 16-bit IDCT residuals onto existing pixels, with
 * signed-saturating add (paddsw) and unsigned-saturating pack back to
 * bytes. Each asm statement processes two rows.
 * NOTE(review): this extract is missing the rest of the signature, the
 * braces, the loop and the asm opener/closer; %%mm7 is presumably zeroed
 * (pxor) on a missing line, since it is used as the unpack source. */
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
/* read the pixels */
"movq (%2), %%mm0 \n\t"
"movq 8(%2), %%mm1 \n\t"
"movq 16(%2), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t"
/* Load two destination rows and widen bytes to words via mm7. */
"movq %0, %%mm4 \n\t"
"movq %1, %%mm6 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddsw %%mm4, %%mm0 \n\t"
"paddsw %%mm5, %%mm1 \n\t"
"movq %%mm6, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddsw %%mm6, %%mm2 \n\t"
"paddsw %%mm5, %%mm3 \n\t"
/* Clamp to 0..255 and write the two rows back in place. */
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm2, %1 \n\t"
: "+m"(*pix), "+m"(*(pix + line_size))
pix += line_size * 2;
467 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
468 int line_size, int h)
471 "lea (%3, %3), %%"REG_a" \n\t"
474 "movq (%1 ), %%mm0 \n\t"
475 "movq (%1, %3), %%mm1 \n\t"
476 "movq %%mm0, (%2) \n\t"
477 "movq %%mm1, (%2, %3) \n\t"
478 "add %%"REG_a", %1 \n\t"
479 "add %%"REG_a", %2 \n\t"
480 "movq (%1 ), %%mm0 \n\t"
481 "movq (%1, %3), %%mm1 \n\t"
482 "movq %%mm0, (%2) \n\t"
483 "movq %%mm1, (%2, %3) \n\t"
484 "add %%"REG_a", %1 \n\t"
485 "add %%"REG_a", %2 \n\t"
488 : "+g"(h), "+r"(pixels), "+r"(block)
489 : "r"((x86_reg)line_size)
494 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
495 int line_size, int h)
498 "lea (%3, %3), %%"REG_a" \n\t"
501 "movq (%1 ), %%mm0 \n\t"
502 "movq 8(%1 ), %%mm4 \n\t"
503 "movq (%1, %3), %%mm1 \n\t"
504 "movq 8(%1, %3), %%mm5 \n\t"
505 "movq %%mm0, (%2) \n\t"
506 "movq %%mm4, 8(%2) \n\t"
507 "movq %%mm1, (%2, %3) \n\t"
508 "movq %%mm5, 8(%2, %3) \n\t"
509 "add %%"REG_a", %1 \n\t"
510 "add %%"REG_a", %2 \n\t"
511 "movq (%1 ), %%mm0 \n\t"
512 "movq 8(%1 ), %%mm4 \n\t"
513 "movq (%1, %3), %%mm1 \n\t"
514 "movq 8(%1, %3), %%mm5 \n\t"
515 "movq %%mm0, (%2) \n\t"
516 "movq %%mm4, 8(%2) \n\t"
517 "movq %%mm1, (%2, %3) \n\t"
518 "movq %%mm5, 8(%2, %3) \n\t"
519 "add %%"REG_a", %1 \n\t"
520 "add %%"REG_a", %2 \n\t"
523 : "+g"(h), "+r"(pixels), "+r"(block)
524 : "r"((x86_reg)line_size)
529 #define CLEAR_BLOCKS(name, n) \
530 static void name(int16_t *blocks) \
533 "pxor %%mm7, %%mm7 \n\t" \
534 "mov %1, %%"REG_a" \n\t" \
536 "movq %%mm7, (%0, %%"REG_a") \n\t" \
537 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
538 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
539 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
540 "add $32, %%"REG_a" \n\t" \
542 :: "r"(((uint8_t *)blocks) + 128 * n), \
547 CLEAR_BLOCKS(clear_blocks_mmx, 6)
548 CLEAR_BLOCKS(clear_block_mmx, 1)
/**
 * Zero one 64-coefficient int16_t block (128 bytes) with eight aligned
 * 16-byte SSE stores. The block pointer must be 16-byte aligned (movaps).
 * The extract had lost the braces and asm statement scaffolding;
 * restored to the standard form.
 */
static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory");
}
567 static void clear_blocks_sse(int16_t *blocks)
570 "xorps %%xmm0, %%xmm0 \n"
571 "mov %1, %%"REG_a" \n"
573 "movaps %%xmm0, (%0, %%"REG_a") \n"
574 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
575 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
576 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
577 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
578 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
579 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
581 "add $128, %%"REG_a" \n"
583 :: "r"(((uint8_t *)blocks) + 128 * 6),
589 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
595 "movq (%1, %0), %%mm0 \n\t"
596 "movq (%2, %0), %%mm1 \n\t"
597 "paddb %%mm0, %%mm1 \n\t"
598 "movq %%mm1, (%2, %0) \n\t"
599 "movq 8(%1, %0), %%mm0 \n\t"
600 "movq 8(%2, %0), %%mm1 \n\t"
601 "paddb %%mm0, %%mm1 \n\t"
602 "movq %%mm1, 8(%2, %0) \n\t"
608 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
611 dst[i + 0] += src[i + 0];
/* HuffYUV median prediction using cmov: reconstructs dst[i] from diff[i]
 * plus the median of (left, top, left+top-topleft). l and tl carry the
 * running left / top-left predictors, masked to one byte.
 * NOTE(review): the bulk of the asm body (the cmov-based median
 * computation, loop control, and the trailing *left/*left_top writeback)
 * is missing from this extract — only fragments remain; confirm against
 * the upstream file before modifying. */
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top)
int l = *left & 0xff;
int tl = *left_top & 0xff;
/* Load top[i] (negative index %4 counts up toward 0). */
"movzbl (%3, %4), %2 \n"
/* Add the residual and store the reconstructed pixel. */
"add (%6, %4), %b0 \n"
"mov %b0, (%5, %4) \n"
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
: "r"(dst + w), "r"(diff + w), "rm"(top + w)
651 #endif /* HAVE_INLINE_ASM */
653 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
654 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
/* Draw the edges of width 'w' of an image of size width, height
 * this MMX version can only handle w == 8 || w == 16. */
/* Replicates border pixels outward: left/right columns are splatted from
 * the first/last pixel of each row, then whole rows are copied upward
 * (EDGE_TOP) and downward (EDGE_BOTTOM).
 * NOTE(review): several lines are missing from this extract (opening
 * brace, the w==8 vs w==16 dispatch, asm openers/closers and loop
 * control) — confirm against the upstream file before modifying. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides)
uint8_t *ptr, *last_line;
last_line = buf + (height - 1) * wrap;
/* w == 8 variant: splat first byte of the row across the 8 bytes to the
 * left, and last byte across the 8 bytes to the right. */
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* w == 16 variant: same splat, stored twice (16 bytes) on each side. */
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq %%mm0, -16(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"movq %%mm1, 8(%0, %2) \n\t"
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* top and bottom (and hopefully also the corners) */
if (sides & EDGE_TOP) {
for (i = 0; i < h; i += 4) {
ptr = buf - (i + 1) * wrap - w;
/* Copy the first image row into 4 edge rows above it (strides are
 * negative: %2 = -wrap, %3 = -3*wrap). */
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
: "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
"r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
if (sides & EDGE_BOTTOM) {
for (i = 0; i < h; i += 4) {
ptr = last_line + (i + 1) * wrap - w;
/* Mirror of the TOP case with positive strides below the last row. */
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
: "r"((x86_reg)last_line - (x86_reg)ptr - w),
"r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
"r"(ptr + width + 2 * w)
752 #endif /* HAVE_INLINE_ASM */
756 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
757 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
760 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
763 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
767 uint8_t * const half = (uint8_t*)temp; \
768 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
770 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
771 stride, stride, 8); \
774 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
777 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
781 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
785 uint8_t * const half = (uint8_t*)temp; \
786 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
788 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
792 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
796 uint8_t * const half = (uint8_t*)temp; \
797 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
799 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
800 stride, stride, 8); \
803 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
806 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
810 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
814 uint8_t * const half = (uint8_t*)temp; \
815 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
817 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
821 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
824 uint64_t half[8 + 9]; \
825 uint8_t * const halfH = ((uint8_t*)half) + 64; \
826 uint8_t * const halfHV = ((uint8_t*)half); \
827 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
829 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
831 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
832 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
836 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
839 uint64_t half[8 + 9]; \
840 uint8_t * const halfH = ((uint8_t*)half) + 64; \
841 uint8_t * const halfHV = ((uint8_t*)half); \
842 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
844 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
846 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
847 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
851 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
854 uint64_t half[8 + 9]; \
855 uint8_t * const halfH = ((uint8_t*)half) + 64; \
856 uint8_t * const halfHV = ((uint8_t*)half); \
857 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
859 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
861 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
862 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
866 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
869 uint64_t half[8 + 9]; \
870 uint8_t * const halfH = ((uint8_t*)half) + 64; \
871 uint8_t * const halfHV = ((uint8_t*)half); \
872 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
874 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
876 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
877 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
881 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
884 uint64_t half[8 + 9]; \
885 uint8_t * const halfH = ((uint8_t*)half) + 64; \
886 uint8_t * const halfHV = ((uint8_t*)half); \
887 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
889 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
890 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
894 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
897 uint64_t half[8 + 9]; \
898 uint8_t * const halfH = ((uint8_t*)half) + 64; \
899 uint8_t * const halfHV = ((uint8_t*)half); \
900 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
902 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
903 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
907 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
910 uint64_t half[8 + 9]; \
911 uint8_t * const halfH = ((uint8_t*)half); \
912 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
914 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
916 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
920 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
923 uint64_t half[8 + 9]; \
924 uint8_t * const halfH = ((uint8_t*)half); \
925 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
927 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
929 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
933 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
937 uint8_t * const halfH = ((uint8_t*)half); \
938 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
940 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
944 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
947 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
950 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
954 uint8_t * const half = (uint8_t*)temp; \
955 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
957 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
961 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
964 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
965 stride, stride, 16);\
968 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
972 uint8_t * const half = (uint8_t*)temp; \
973 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
975 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
976 stride, stride, 16); \
979 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
983 uint8_t * const half = (uint8_t*)temp; \
984 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
986 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
990 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
993 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
997 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1000 uint64_t temp[32]; \
1001 uint8_t * const half = (uint8_t*)temp; \
1002 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1004 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1005 stride, stride, 16); \
1008 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1011 uint64_t half[16 * 2 + 17 * 2]; \
1012 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1013 uint8_t * const halfHV = ((uint8_t*)half); \
1014 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1016 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1018 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1020 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1024 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1027 uint64_t half[16 * 2 + 17 * 2]; \
1028 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1029 uint8_t * const halfHV = ((uint8_t*)half); \
1030 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1032 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1034 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1036 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1040 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1043 uint64_t half[16 * 2 + 17 * 2]; \
1044 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1045 uint8_t * const halfHV = ((uint8_t*)half); \
1046 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1048 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1050 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1052 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1056 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1059 uint64_t half[16 * 2 + 17 * 2]; \
1060 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1061 uint8_t * const halfHV = ((uint8_t*)half); \
1062 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1064 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1066 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1068 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1072 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1075 uint64_t half[16 * 2 + 17 * 2]; \
1076 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1077 uint8_t * const halfHV = ((uint8_t*)half); \
1078 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1080 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1082 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1086 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1089 uint64_t half[16 * 2 + 17 * 2]; \
1090 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1091 uint8_t * const halfHV = ((uint8_t*)half); \
1092 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1094 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1096 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1100 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1103 uint64_t half[17 * 2]; \
1104 uint8_t * const halfH = ((uint8_t*)half); \
1105 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1107 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1109 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1113 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1116 uint64_t half[17 * 2]; \
1117 uint8_t * const halfH = ((uint8_t*)half); \
1118 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1120 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1122 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1126 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1129 uint64_t half[17 * 2]; \
1130 uint8_t * const halfH = ((uint8_t*)half); \
1131 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1133 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Instantiate the mmxext quarter-pel motion-compensation families:
 * rounding put/avg and a no-rounding put variant.  The RND argument
 * ('_' vs '_no_rnd_') selects which lowpass/l2 helpers get pasted in. */
1137 QPEL_OP(put_, ff_pw_16, _, mmxext)
1138 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1139 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
1140 #endif /* HAVE_YASM */
/* RV40 8x8 qpel (3/4, 3/4) position: implemented via the generic MMX
 * xy2 half-pel averaging routine. */
1144 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1146 put_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 16x16 qpel (3/4, 3/4) position: implemented via the generic MMX
 * xy2 half-pel averaging routine. */
1148 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1150 put_pixels16_xy2_mmx(dst, src, stride, 16);
/* RV40 8x8 qpel (3/4, 3/4) averaging variant: averages the xy2
 * interpolation result into dst. */
1152 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1154 avg_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 16x16 qpel (3/4, 3/4) averaging variant: averages the xy2
 * interpolation result into dst. */
1156 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1158 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Global motion compensation with bilinear interpolation using MMX,
 * processing one 4-pixel-wide column strip at a time.  Falls back to
 * the C implementation (ff_gmc_c) for cases the MMX path cannot
 * handle (see the guard below).  NOTE(review): several lines of this
 * function are not visible in this excerpt; comments describe only
 * what is shown. */
1161 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1162 int stride, int h, int ox, int oy,
1163 int dxx, int dxy, int dyx, int dyy,
1164 int shift, int r, int width, int height)
/* fullpel part of the source start offset */
1167 const int ix = ox >> (16 + shift);
1168 const int iy = oy >> (16 + shift);
/* subpel offsets/deltas with 4 bits of precision dropped */
1169 const int oxs = ox >> 4;
1170 const int oys = oy >> 4;
1171 const int dxxs = dxx >> 4;
1172 const int dxys = dxy >> 4;
1173 const int dyxs = dyx >> 4;
1174 const int dyys = dyy >> 4;
/* values replicated into four 16-bit lanes for the MMX code below */
1175 const uint16_t r4[4] = { r, r, r, r };
1176 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1177 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1178 const uint64_t shift2 = 2 * shift;
/* NOTE(review): 'w' is declared on lines not shown in this excerpt */
1181 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1182 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1183 const int dxh = dxy * (h - 1);
1184 const int dyw = dyx * (w - 1);
1185 if ( // non-constant fullpel offset (3% of blocks)
1186 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1187 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1188 // uses more than 16 bits of subpel mv (only at huge resolution)
1189 || (dxx | dxy | dyx | dyy) & 15 ||
1190 (unsigned)ix >= width - w ||
1191 (unsigned)iy >= height - h) {
1192 // FIXME could still use mmx for some of the rows
1193 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1194 shift, r, width, height);
/* advance to the fullpel start position */
1198 src += ix + iy * stride;
/* replicate a 16-bit value into all four words of mm6 and clear mm7;
 * the asm operand list is on lines not shown in this excerpt */
1201 "movd %0, %%mm6 \n\t"
1202 "pxor %%mm7, %%mm7 \n\t"
1203 "punpcklwd %%mm6, %%mm6 \n\t"
1204 "punpcklwd %%mm6, %%mm6 \n\t"
/* one 4-pixel-wide column strip per iteration */
1208 for (x = 0; x < w; x += 4) {
/* initial subpel x/y coordinates for the strip's four columns */
1209 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1210 oxs - dxys + dxxs * (x + 1),
1211 oxs - dxys + dxxs * (x + 2),
1212 oxs - dxys + dxxs * (x + 3) };
1213 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1214 oys - dyys + dyxs * (x + 1),
1215 oys - dyys + dyxs * (x + 2),
1216 oys - dyys + dyxs * (x + 3) };
/* walk down the strip one row at a time */
1218 for (y = 0; y < h; y++) {
1220 "movq %0, %%mm4 \n\t"
1221 "movq %1, %%mm5 \n\t"
1222 "paddw %2, %%mm4 \n\t"
1223 "paddw %3, %%mm5 \n\t"
1224 "movq %%mm4, %0 \n\t"
1225 "movq %%mm5, %1 \n\t"
1226 "psrlw $12, %%mm4 \n\t"
1227 "psrlw $12, %%mm5 \n\t"
1228 : "+m"(*dx4), "+m"(*dy4)
1229 : "m"(*dxy4), "m"(*dyy4)
/* second asm block: bilinear-weighted sum of the four neighbouring
 * source pixels (weights derived from mm4/mm5 and mm6, per the
 * per-line comments below) */
1233 "movq %%mm6, %%mm2 \n\t"
1234 "movq %%mm6, %%mm1 \n\t"
1235 "psubw %%mm4, %%mm2 \n\t"
1236 "psubw %%mm5, %%mm1 \n\t"
1237 "movq %%mm2, %%mm0 \n\t"
1238 "movq %%mm4, %%mm3 \n\t"
1239 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1240 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1241 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1242 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1244 "movd %4, %%mm5 \n\t"
1245 "movd %3, %%mm4 \n\t"
1246 "punpcklbw %%mm7, %%mm5 \n\t"
1247 "punpcklbw %%mm7, %%mm4 \n\t"
1248 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1249 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1251 "movd %2, %%mm5 \n\t"
1252 "movd %1, %%mm4 \n\t"
1253 "punpcklbw %%mm7, %%mm5 \n\t"
1254 "punpcklbw %%mm7, %%mm4 \n\t"
1255 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1256 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1257 "paddw %5, %%mm1 \n\t"
1258 "paddw %%mm3, %%mm2 \n\t"
1259 "paddw %%mm1, %%mm0 \n\t"
1260 "paddw %%mm2, %%mm0 \n\t"
1262 "psrlw %6, %%mm0 \n\t"
1263 "packuswb %%mm0, %%mm0 \n\t"
1264 "movd %%mm0, %0 \n\t"
1266 : "=m"(dst[x + y * stride])
1267 : "m"(src[0]), "m"(src[1]),
1268 "m"(src[stride]), "m"(src[stride + 1]),
1269 "m"(*r4), "m"(shift2)
/* rewind to the top row and move right to the next 4-column strip */
1273 src += 4 - h * stride;
1276 #endif /* HAVE_INLINE_ASM */
/* Prototypes for 16-pixel put/avg helpers implemented in external
 * assembly (definitions live in the x86 asm files). */
1278 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1279 ptrdiff_t line_size, int h);
1280 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1281 ptrdiff_t line_size, int h);
/* CAVS 8x8 mc00 (fullpel) position: a plain MMX 8-pixel copy. */
1286 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1288 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS 8x8 mc00 averaging variant: MMX 8-pixel average into dst. */
1291 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1293 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS 16x16 mc00 (fullpel) position: a plain MMX 16-pixel copy. */
1296 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1298 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS 16x16 mc00 averaging variant: MMX 16-pixel average into dst. */
1301 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1303 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 mspel mc00 (fullpel) position: plain MMX 8-pixel copy; the
 * rnd parameter is unused for the fullpel case. */
1307 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1308 int stride, int rnd)
1310 put_pixels8_mmx(dst, src, stride, 8);
/* Clamp each float in src[0..len) to [min, max] using SSE, 16 floats
 * (four xmm vectors) per iteration.  NOTE(review): the loop label,
 * branch, and the asm statement's closing lines are not visible in
 * this excerpt. */
1313 static void vector_clipf_sse(float *dst, const float *src,
1314 float min, float max, int len)
/* byte offset of the last 16-float group; the loop presumably walks
 * this offset down to 0 -- loop-control lines not shown */
1316 x86_reg i = (len - 16) * 4;
/* broadcast min into xmm4 and max into xmm5 */
1318 "movss %3, %%xmm4 \n\t"
1319 "movss %4, %%xmm5 \n\t"
1320 "shufps $0, %%xmm4, %%xmm4 \n\t"
1321 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* per-iteration body: load 4 vectors, clamp with maxps/minps, store */
1323 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1324 "movaps 16(%2, %0), %%xmm1 \n\t"
1325 "movaps 32(%2, %0), %%xmm2 \n\t"
1326 "movaps 48(%2, %0), %%xmm3 \n\t"
1327 "maxps %%xmm4, %%xmm0 \n\t"
1328 "maxps %%xmm4, %%xmm1 \n\t"
1329 "maxps %%xmm4, %%xmm2 \n\t"
1330 "maxps %%xmm4, %%xmm3 \n\t"
1331 "minps %%xmm5, %%xmm0 \n\t"
1332 "minps %%xmm5, %%xmm1 \n\t"
1333 "minps %%xmm5, %%xmm2 \n\t"
1334 "minps %%xmm5, %%xmm3 \n\t"
1335 "movaps %%xmm0, (%1, %0) \n\t"
1336 "movaps %%xmm1, 16(%1, %0) \n\t"
1337 "movaps %%xmm2, 32(%1, %0) \n\t"
1338 "movaps %%xmm3, 48(%1, %0) \n\t"
1342 : "r"(dst), "r"(src), "m"(min), "m"(max)
1347 #endif /* HAVE_INLINE_ASM */
/* Prototypes for helpers implemented in external assembly; the
 * definitions live in the x86 asm files.  Some declarations are split
 * across lines not shown in this excerpt. */
1349 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1351 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1353 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1355 int order, int mul);
1356 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1358 int order, int mul);
1359 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1361 int order, int mul);
/* windowing: _round_ variants vs. bitexact variants, per CPU level */
1363 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1364 const int16_t *window, unsigned int len);
1365 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1366 const int16_t *window, unsigned int len);
1367 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1368 const int16_t *window, unsigned int len);
1369 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1370 const int16_t *window, unsigned int len);
1371 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1372 const int16_t *window, unsigned int len);
1373 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1374 const int16_t *window, unsigned int len);
/* 32-bit byte swapping of whole buffers */
1376 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1377 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
/* HuffYUV median/left prediction */
1379 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1380 const uint8_t *diff, int w,
1381 int *left, int *left_top);
1382 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1384 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
/* int32 clamping to [min, max] */
1387 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1388 int32_t min, int32_t max, unsigned int len);
1389 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1390 int32_t min, int32_t max, unsigned int len);
1391 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1392 int32_t min, int32_t max, unsigned int len);
1393 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1394 int32_t min, int32_t max, unsigned int len);
/* Populate all 16 quarter-pel (dx, dy) entries of one *_pixels_tab
 * row with the CPU-specific mcXY functions; index = dy * 4 + dx.
 * NOTE(review): the macro's wrapper lines (e.g. do/while) are not
 * visible in this excerpt, so no comments are inserted inside the
 * backslash-continued body. */
1396 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1398     c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1399     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1400     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1401     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1402     c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1403     c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1404     c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1405     c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1406     c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1407     c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1408     c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1409     c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1410     c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1411     c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1412     c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1413     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Populate the 4 half-pel entries of one *_pixels_tab row:
 * [0] plain copy, [1] x2 (horizontal), [2] y2 (vertical),
 * [3] xy2 (diagonal) averaging. */
1416 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1418     c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1419     c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1420     c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1421     c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install baseline MMX implementations into the DSPContext.
 * NOTE(review): parts of this function (parameter-list tail, some
 * braces and #if guards) are not visible in this excerpt. */
1424 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1427     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1430     c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1431     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1432     c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
/* 8-bit-depth-only paths */
1434     if (!high_bit_depth) {
1435         c->clear_block = clear_block_mmx;
1436         c->clear_blocks = clear_blocks_mmx;
1437         c->draw_edges = draw_edges_mmx;
/* half-pel put/avg tables: row 0 = 16-pixel, row 1 = 8-pixel */
1439         SET_HPEL_FUNCS(put, [0], 16, mmx);
1440         SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1441         SET_HPEL_FUNCS(avg, [0], 16, mmx);
1442         SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1443         SET_HPEL_FUNCS(put, [1], 8, mmx);
1444         SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1445         SET_HPEL_FUNCS(avg, [1], 8, mmx);
/* select the MMX IDCT implementation requested by the caller;
 * NOTE(review): break statements between cases are on lines not shown */
1447     switch (avctx->idct_algo) {
1449     case FF_IDCT_SIMPLEMMX:
1450         c->idct_put = ff_simple_idct_put_mmx;
1451         c->idct_add = ff_simple_idct_add_mmx;
1452         c->idct = ff_simple_idct_mmx;
1453         c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1455     case FF_IDCT_XVIDMMX:
1456         c->idct_put = ff_idct_xvid_mmx_put;
1457         c->idct_add = ff_idct_xvid_mmx_add;
1458         c->idct = ff_idct_xvid_mmx;
1465     c->add_bytes = add_bytes_mmx;
1466 #endif /* HAVE_INLINE_ASM */
/* H.263 loop filters only matter when an H.263 codec is compiled in */
1469     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1470         c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1471         c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1474     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT (SSE-integer) implementations, overriding the plain
 * MMX pointers set earlier.  NOTE(review): some lines (braces, #if
 * guards, else keywords) are not visible in this excerpt. */
1479 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1482     const int bit_depth = avctx->bits_per_raw_sample;
1483     const int high_bit_depth = bit_depth > 8;
/* quarter-pel MC tables: rows 0/1 are 16- and 8-pixel variants */
1486     SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1487     SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1489     SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1490     SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1491     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1492     SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
/* half-pel put/avg overrides (8-bit only) */
1494     if (!high_bit_depth) {
1495         c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1496         c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1498         c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1499         c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1500         c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1502         c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1503         c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1505         c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1506         c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1507         c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* these overrides are skipped in bitexact mode */
1510     if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1511         if (!high_bit_depth) {
1512             c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1513             c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1514             c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1515             c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1517             c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1518             c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1521 #endif /* HAVE_YASM */
/* Xvid IDCT upgrade when the caller asked for FF_IDCT_XVIDMMX */
1524     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1525         c->idct_put = ff_idct_xvid_mmxext_put;
1526         c->idct_add = ff_idct_xvid_mmxext_add;
1527         c->idct = ff_idct_xvid_mmxext;
1529 #endif /* HAVE_INLINE_ASM */
1531 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora need the bit-exact "_exact" rounding variants */
1532     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1533                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
1534         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1535         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1538     /* slower than cmov version on AMD */
1539     if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1540         c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1542     c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1543     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* bitexact mode selects the non-rounding window function */
1545     if (avctx->flags & CODEC_FLAG_BITEXACT) {
1546         c->apply_window_int16 = ff_apply_window_int16_mmxext;
1548         c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1550 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install AMD 3DNow! implementations of the half-pel put/avg tables.
 * NOTE(review): some lines (braces, #if guards) are not visible in
 * this excerpt. */
1553 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1556     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/* 8-bit-only half-pel overrides */
1559     if (!high_bit_depth) {
1560         c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1561         c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1563         c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1564         c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1565         c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1567         c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1568         c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1570         c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1571         c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1572         c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* rounding-variant overrides skipped in bitexact mode */
1574         if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1575             c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1576             c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1577             c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1578             c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1580             c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1581             c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3/Theora need the bit-exact "_exact" rounding variants */
1585     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1586                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
1587         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1588         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1590 #endif /* HAVE_YASM */
/* Install SSE (float) implementations.  NOTE(review): some lines
 * (braces, #if guards) are not visible in this excerpt. */
1593 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1596     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/* SSE clear_block(s) require 16-byte aligned blocks, hence the XvMC
 * exclusion below */
1599     if (!high_bit_depth) {
1600         if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1601             /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1602             c->clear_block = clear_block_sse;
1603             c->clear_blocks = clear_blocks_sse;
1607     c->vector_clipf = vector_clipf_sse;
1608 #endif /* HAVE_INLINE_ASM */
/* Install SSE2 implementations (inline-asm IDCT plus external-asm
 * helpers).  NOTE(review): some lines (braces, else keywords) are not
 * visible in this excerpt. */
1611 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1614     const int bit_depth = avctx->bits_per_raw_sample;
1615     const int high_bit_depth = bit_depth > 8;
1617 #if HAVE_SSE2_INLINE
/* SSE2 Xvid IDCT uses its own coefficient permutation */
1618     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1619         c->idct_put = ff_idct_xvid_sse2_put;
1620         c->idct_add = ff_idct_xvid_sse2_add;
1621         c->idct = ff_idct_xvid_sse2;
1622         c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1624 #endif /* HAVE_SSE2_INLINE */
1626 #if HAVE_SSE2_EXTERNAL
1627     if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1628         // these functions are slower than mmx on AMD, but faster on Intel
1629         if (!high_bit_depth) {
1630             c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1631             c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1632             c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1636     c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1637     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* Atom prefers the "_int_" clip variant */
1638     if (mm_flags & AV_CPU_FLAG_ATOM) {
1639         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1641         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/* bitexact mode selects the non-rounding window function */
1643     if (avctx->flags & CODEC_FLAG_BITEXACT) {
1644         c->apply_window_int16 = ff_apply_window_int16_sse2;
1645     } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1646         c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1648     c->bswap_buf = ff_bswap32_buf_sse2;
1649 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 implementations; several choices are refined by extra
 * CPU flags (SSE4, Atom, SSE4.2/3DNow cache-split avoidance). */
1652 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1655 #if HAVE_SSSE3_EXTERNAL
1656     c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1657     if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1658         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
/* Atom gets its dedicated window-apply variant */
1660     if (mm_flags & AV_CPU_FLAG_ATOM)
1661         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1663         c->apply_window_int16 = ff_apply_window_int16_ssse3;
1664     if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1665         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1666     c->bswap_buf = ff_bswap32_buf_ssse3;
1667 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the lone SSE4 override: int32 clipping. */
1670 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1673 #if HAVE_SSE4_EXTERNAL
1674     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1675 #endif /* HAVE_SSE4_EXTERNAL */
/* Entry point: probe the CPU feature flags once, then chain the
 * per-feature-set initializers from MMX up through SSE4 so each level
 * overrides the previous one's function pointers.  NOTE(review): the
 * closing lines of this function are past the end of this excerpt. */
1678 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1680     int mm_flags = av_get_cpu_flags();
/* cmov-based median prediction needs 7 GP registers and inline asm */
1682 #if HAVE_7REGS && HAVE_INLINE_ASM
1683     if (mm_flags & AV_CPU_FLAG_CMOV)
1684         c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1687     if (mm_flags & AV_CPU_FLAG_MMX)
1688         dsputil_init_mmx(c, avctx, mm_flags);
1690     if (mm_flags & AV_CPU_FLAG_MMXEXT)
1691         dsputil_init_mmxext(c, avctx, mm_flags);
1693     if (mm_flags & AV_CPU_FLAG_3DNOW)
1694         dsputil_init_3dnow(c, avctx, mm_flags);
1696     if (mm_flags & AV_CPU_FLAG_SSE)
1697         dsputil_init_sse(c, avctx, mm_flags);
1699     if (mm_flags & AV_CPU_FLAG_SSE2)
1700         dsputil_init_sse2(c, avctx, mm_flags);
1702     if (mm_flags & AV_CPU_FLAG_SSSE3)
1703         dsputil_init_ssse3(c, avctx, mm_flags);
1705     if (mm_flags & AV_CPU_FLAG_SSE4)
1706         dsputil_init_sse4(c, avctx, mm_flags);
/* encoder-side DSP (SAD/SSE metrics etc.) is wired separately */
1708     if (CONFIG_ENCODERS)
1709         ff_dsputilenc_init_mmx(c, avctx);