2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/* Bit-pattern constants shared by the inline-asm and external-asm code
 * below.  8-byte (uint64_t) values feed 64-bit MMX registers; 16-byte
 * (xmm_reg) values feed 128-bit SSE2 registers.  Naming convention:
 * ff_pw_* = packed 16-bit words, ff_pb_* = packed bytes,
 * ff_pd_* = packed doubles. */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* Packed-word constants (mostly filter rounders and multipliers). */
42 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
43 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
49 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
58 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
60 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
65 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
/* Packed-byte constants (masks, biases such as the 0x80 sign-flip bias). */
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* Packed-double constants (used by float DSP paths). */
81 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
82 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
86 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
87 ptrdiff_t line_size, int h);
88 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
89 ptrdiff_t line_size, int h);
90 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
91 int dstStride, int src1Stride, int h);
92 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
93 uint8_t *src2, int dstStride,
94 int src1Stride, int h);
95 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
96 int dstStride, int src1Stride, int h);
97 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
98 ptrdiff_t line_size, int h);
99 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
100 ptrdiff_t line_size, int h);
101 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
102 int dstStride, int src1Stride, int h);
103 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106 int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
108 ptrdiff_t line_size, int h);
109 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
110 ptrdiff_t line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
112 const uint8_t *pixels,
113 ptrdiff_t line_size, int h);
114 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
115 const uint8_t *pixels,
116 ptrdiff_t line_size, int h);
117 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
118 ptrdiff_t line_size, int h);
119 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
120 ptrdiff_t line_size, int h);
121 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
122 ptrdiff_t line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
124 ptrdiff_t line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
126 const uint8_t *pixels,
127 ptrdiff_t line_size, int h);
128 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
129 const uint8_t *pixels,
130 ptrdiff_t line_size, int h);
131 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
132 ptrdiff_t line_size, int h);
133 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
134 ptrdiff_t line_size, int h);
135 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
136 ptrdiff_t line_size, int h);
137 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
138 ptrdiff_t line_size, int h);
139 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
140 ptrdiff_t line_size, int h);
141 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
142 ptrdiff_t line_size, int h);
143 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
144 ptrdiff_t line_size, int h);
146 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
/**
 * Copy a 16xh pixel block using the external 8-wide MMXEXT copy.
 *
 * A 16-pixel-wide copy is built from two independent 8-pixel-wide
 * copies: the right half (offset by 8 bytes) and the left half.
 * The two halves never overlap, so the order of the calls is
 * irrelevant.
 *
 * @param block     destination, line_size-strided
 * @param pixels    source, line_size-strided
 * @param line_size stride of both buffers in bytes
 * @param h         number of rows to copy
 */
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
}
154 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
155 int dstStride, int srcStride, int h);
156 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
157 int dstStride, int srcStride, int h);
158 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride,
161 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
162 int dstStride, int srcStride, int h);
163 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
164 int dstStride, int srcStride, int h);
165 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride,
168 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
169 int dstStride, int srcStride);
170 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
171 int dstStride, int srcStride);
172 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
/* Full-pel copies are exact, so the "no rounding" variants are simply
 * the plain put functions. */
180 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
181 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
182 #endif /* HAVE_YASM */
/* Small helpers that load often-needed constants into an MMX register.
 * NOTE(review): the #if/#else that selects between the memory-loading
 * and the register-synthesizing (PIC-friendly) definitions of
 * MOVQ_BONE/MOVQ_WTWO is not visible in this extraction. */
187 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
188 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/* regd := 0xFEFEFEFEFEFEFEFE (all-ones bytes doubled: 0xFF+0xFF = 0xFE
 * per byte with byte-wise wraparound). */
190 #define MOVQ_BFE(regd) \
192 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
193 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Non-PIC versions: load the constants straight from memory. */
196 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
197 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
199 // for shared library it's better to use this way for accessing constants
/* PIC version: synthesize 0x0101...01 in-register (all-ones words,
 * shifted down to 0x0001, then byte-packed to 0x01 per byte). */
201 #define MOVQ_BONE(regd) \
203 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
204 "psrlw $15, %%"#regd" \n\t" \
205 "packuswb %%"#regd", %%"#regd" \n\t" ::)
/* PIC version: synthesize 0x0002 in every word (0x0001 shifted left). */
207 #define MOVQ_WTWO(regd) \
209 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
210 "psrlw $15, %%"#regd" \n\t" \
211 "psllw $1, %%"#regd" \n\t"::)
215 // using regr as temporary and for the output result
216 // first argument is unmodifed and second is trashed
217 // regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average rounding DOWN, without a hardware PAVGB:
 *   avg = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * The 0xfe mask stops the per-byte shift from leaking bits across
 * byte lanes (psrlq is a 64-bit shift). */
218 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
219 "movq "#rega", "#regr" \n\t" \
220 "pand "#regb", "#regr" \n\t" \
221 "pxor "#rega", "#regb" \n\t" \
222 "pand "#regfe", "#regb" \n\t" \
223 "psrlq $1, "#regb" \n\t" \
224 "paddb "#regb", "#regr" \n\t"
/* Byte-wise average rounding UP:
 *   avg = (a | b) - (((a ^ b) & 0xfe) >> 1) */
226 #define PAVGB_MMX(rega, regb, regr, regfe) \
227 "movq "#rega", "#regr" \n\t" \
228 "por "#regb", "#regr" \n\t" \
229 "pxor "#rega", "#regb" \n\t" \
230 "pand "#regfe", "#regb" \n\t" \
231 "psrlq $1, "#regb" \n\t" \
232 "psubb "#regb", "#regr" \n\t"
234 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired variant: two independent round-down averages per expansion
 * ((a,b)->regr and (c,d)->regp), interleaved for better scheduling. */
235 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
236 "movq "#rega", "#regr" \n\t" \
237 "movq "#regc", "#regp" \n\t" \
238 "pand "#regb", "#regr" \n\t" \
239 "pand "#regd", "#regp" \n\t" \
240 "pxor "#rega", "#regb" \n\t" \
241 "pxor "#regc", "#regd" \n\t" \
242 "pand %%mm6, "#regb" \n\t" \
243 "pand %%mm6, "#regd" \n\t" \
244 "psrlq $1, "#regb" \n\t" \
245 "psrlq $1, "#regd" \n\t" \
246 "paddb "#regb", "#regr" \n\t" \
247 "paddb "#regd", "#regp" \n\t"
/* Paired variant of the round-up average. */
249 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
250 "movq "#rega", "#regr" \n\t" \
251 "movq "#regc", "#regp" \n\t" \
252 "por "#regb", "#regr" \n\t" \
253 "por "#regd", "#regp" \n\t" \
254 "pxor "#rega", "#regb" \n\t" \
255 "pxor "#regc", "#regd" \n\t" \
256 "pand %%mm6, "#regb" \n\t" \
257 "pand %%mm6, "#regd" \n\t" \
258 "psrlq $1, "#regd" \n\t" \
259 "psrlq $1, "#regb" \n\t" \
260 "psubb "#regb", "#regr" \n\t" \
261 "psubb "#regd", "#regp" \n\t"
263 /***********************************/
264 /* MMX no rounding */
/* Instantiate dsputil_rnd_template.c with the no-rounding average
 * primitives; DEF() shapes the generated names (e.g.
 * put_no_rnd_pixels8_x2_mmx).
 * NOTE(review): SET_RND references MOVQ_WONE, which is not defined in
 * this chunk — presumably defined near the MOVQ_* macros above; the
 * matching #undef lines between the two instantiations are also not
 * visible here. */
266 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
267 #define SET_RND MOVQ_WONE
268 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
269 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
270 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
272 #include "dsputil_rnd_template.c"
279 /***********************************/
/* Second instantiation: the rounding variants (names without the
 * _no_rnd_ infix). */
282 #define DEF(x, y) x ## _ ## y ## _mmx
283 #define SET_RND MOVQ_WTWO
284 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
285 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
287 #include "dsputil_rnd_template.c"
295 #endif /* HAVE_INLINE_ASM */
/* Route the 8-pixel MMX put through the external MMXEXT copy. */
299 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
301 /***********************************/
/* Instantiate dsputil_avg_template.c for 3DNow! (DEF appends _3dnow). */
304 #define DEF(x) x ## _3dnow
306 #include "dsputil_avg_template.c"
310 /***********************************/
311 /* MMXEXT specific */
/* Re-instantiate the averaging template for MMXEXT (DEF appends
 * _mmxext). */
313 #define DEF(x) x ## _mmxext
315 #include "dsputil_avg_template.c"
319 #endif /* HAVE_YASM */
/* Full-pel copies are exact; alias the no-rnd and mmxext names to the
 * plain MMX copy functions defined below. */
323 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
324 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
325 #define put_pixels16_mmxext put_pixels16_mmx
326 #define put_pixels8_mmxext put_pixels8_mmx
327 #define put_pixels4_mmxext put_pixels4_mmx
328 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
329 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
331 /***********************************/
/* Store an 8x8 block of 16-bit IDCT coefficients as 8-bit pixels.
 * PACKUSWB saturates each signed word into the 0..255 byte range, so
 * out-of-range coefficients are clamped rather than wrapped.  Each asm
 * statement loads 64 coefficients through %3 and writes four
 * line_size-strided rows through %0/%1/%2.
 * NOTE(review): this extraction is missing several original lines
 * (asm-statement openers, loop setup, function tail); the comments
 * describe only the visible code. */
334 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
340 /* read the pixels */
345 "movq (%3), %%mm0 \n\t"
346 "movq 8(%3), %%mm1 \n\t"
347 "movq 16(%3), %%mm2 \n\t"
348 "movq 24(%3), %%mm3 \n\t"
349 "movq 32(%3), %%mm4 \n\t"
350 "movq 40(%3), %%mm5 \n\t"
351 "movq 48(%3), %%mm6 \n\t"
352 "movq 56(%3), %%mm7 \n\t"
/* Narrow words to bytes with unsigned saturation (the clamp). */
353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2 \n\t"
355 "packuswb %%mm5, %%mm4 \n\t"
356 "packuswb %%mm7, %%mm6 \n\t"
/* Write rows 0..3; %2 holds line_size * 3. */
357 "movq %%mm0, (%0) \n\t"
358 "movq %%mm2, (%0, %1) \n\t"
359 "movq %%mm4, (%0, %1, 2) \n\t"
360 "movq %%mm6, (%0, %2) \n\t"
361 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
364 pix += line_size * 4;
367 // if here would be an exact copy of the code above
368 // compiler would generate some very strange code
/* Second half: rows 4..7, same pattern as above. */
371 "movq (%3), %%mm0 \n\t"
372 "movq 8(%3), %%mm1 \n\t"
373 "movq 16(%3), %%mm2 \n\t"
374 "movq 24(%3), %%mm3 \n\t"
375 "movq 32(%3), %%mm4 \n\t"
376 "movq 40(%3), %%mm5 \n\t"
377 "movq 48(%3), %%mm6 \n\t"
378 "movq 56(%3), %%mm7 \n\t"
379 "packuswb %%mm1, %%mm0 \n\t"
380 "packuswb %%mm3, %%mm2 \n\t"
381 "packuswb %%mm5, %%mm4 \n\t"
382 "packuswb %%mm7, %%mm6 \n\t"
383 "movq %%mm0, (%0) \n\t"
384 "movq %%mm2, (%0, %1) \n\t"
385 "movq %%mm4, (%0, %1, 2) \n\t"
386 "movq %%mm6, (%0, %2) \n\t"
387 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emit asm that converts 64 signed words (at byte offset "off" in the
 * coefficient block, %2) into 32... four rows of 8 pixels: PACKSSWB
 * clamps to -128..127, then adding the 0x80 bias in %%mm0 maps the
 * result into 0..255.  Rows are stored at %0 with stride %3; %1 holds
 * stride * 3.  (Expanded twice by ff_put_signed_pixels_clamped_mmx.) */
391 #define put_signed_pixels_clamped_mmx_half(off) \
392 "movq "#off"(%2), %%mm1 \n\t" \
393 "movq 16 + "#off"(%2), %%mm2 \n\t" \
394 "movq 32 + "#off"(%2), %%mm3 \n\t" \
395 "movq 48 + "#off"(%2), %%mm4 \n\t" \
396 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
397 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
398 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
399 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
400 "paddb %%mm0, %%mm1 \n\t" \
401 "paddb %%mm0, %%mm2 \n\t" \
402 "paddb %%mm0, %%mm3 \n\t" \
403 "paddb %%mm0, %%mm4 \n\t" \
404 "movq %%mm1, (%0) \n\t" \
405 "movq %%mm2, (%0, %3) \n\t" \
406 "movq %%mm3, (%0, %3, 2) \n\t" \
407 "movq %%mm4, (%0, %1) \n\t"
/* Store an 8x8 block of signed 16-bit coefficients as unsigned pixels:
 * clamp to -128..127, then add the 0x80 bias (loaded into %%mm0 from
 * ff_pb_80).  The helper macro above is expanded once per 4-row half;
 * the lea advances %0 by four lines in between.
 * NOTE(review): the asm opener, the declaration of line_skip3 and the
 * function tail are missing from this extraction. */
409 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
412 x86_reg line_skip = line_size;
416 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
417 "lea (%3, %3, 2), %1 \n\t"
418 put_signed_pixels_clamped_mmx_half(0)
419 "lea (%0, %3, 4), %0 \n\t"
420 put_signed_pixels_clamped_mmx_half(64)
421 : "+&r"(pixels), "=&r"(line_skip3)
422 : "r"(block), "r"(line_skip)
/* Add an 8x8 block of 16-bit coefficients to existing 8-bit pixels,
 * clamping the sums to 0..255.  Two pixel rows are processed per asm
 * statement: rows are widened to words (punpck with a zero register,
 * presumably %%mm7 — its pxor is not visible here), summed with
 * saturation (paddsw), then re-narrowed with unsigned saturation
 * (packuswb).
 * NOTE(review): several original lines (asm opener, loop, tail) are
 * missing from this extraction. */
426 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
433 /* read the pixels */
440 "movq (%2), %%mm0 \n\t"
441 "movq 8(%2), %%mm1 \n\t"
442 "movq 16(%2), %%mm2 \n\t"
443 "movq 24(%2), %%mm3 \n\t"
/* Load two destination rows (%0 = row, %1 = next row). */
444 "movq %0, %%mm4 \n\t"
445 "movq %1, %%mm6 \n\t"
446 "movq %%mm4, %%mm5 \n\t"
447 "punpcklbw %%mm7, %%mm4 \n\t"
448 "punpckhbw %%mm7, %%mm5 \n\t"
449 "paddsw %%mm4, %%mm0 \n\t"
450 "paddsw %%mm5, %%mm1 \n\t"
451 "movq %%mm6, %%mm5 \n\t"
452 "punpcklbw %%mm7, %%mm6 \n\t"
453 "punpckhbw %%mm7, %%mm5 \n\t"
454 "paddsw %%mm6, %%mm2 \n\t"
455 "paddsw %%mm5, %%mm3 \n\t"
/* Clamp back down to bytes and store both rows. */
456 "packuswb %%mm1, %%mm0 \n\t"
457 "packuswb %%mm3, %%mm2 \n\t"
458 "movq %%mm0, %0 \n\t"
459 "movq %%mm2, %1 \n\t"
460 : "+m"(*pix), "+m"(*(pix + line_size))
463 pix += line_size * 2;
/* Copy an 8-wide block of h rows, four rows per loop iteration.
 * REG_a caches 2 * line_size so two rows can be copied per address
 * step.  Loop control (label, "subl $4, %0", branch) is part of the
 * original lines missing from this extraction. */
468 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
469 int line_size, int h)
472 "lea (%3, %3), %%"REG_a" \n\t"
/* Rows 0 and 1. */
475 "movq (%1 ), %%mm0 \n\t"
476 "movq (%1, %3), %%mm1 \n\t"
477 "movq %%mm0, (%2) \n\t"
478 "movq %%mm1, (%2, %3) \n\t"
479 "add %%"REG_a", %1 \n\t"
480 "add %%"REG_a", %2 \n\t"
/* Rows 2 and 3. */
481 "movq (%1 ), %%mm0 \n\t"
482 "movq (%1, %3), %%mm1 \n\t"
483 "movq %%mm0, (%2) \n\t"
484 "movq %%mm1, (%2, %3) \n\t"
485 "add %%"REG_a", %1 \n\t"
486 "add %%"REG_a", %2 \n\t"
489 : "+g"(h), "+r"(pixels), "+r"(block)
490 : "r"((x86_reg)line_size)
/* Copy a 16-wide block of h rows, four rows per loop iteration.  Same
 * structure as put_pixels8_mmx but with two 8-byte movq pairs per row
 * to cover the full 16-byte width.  Loop control lines are missing
 * from this extraction. */
495 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
496 int line_size, int h)
499 "lea (%3, %3), %%"REG_a" \n\t"
/* Rows 0 and 1, left and right 8-byte halves. */
502 "movq (%1 ), %%mm0 \n\t"
503 "movq 8(%1 ), %%mm4 \n\t"
504 "movq (%1, %3), %%mm1 \n\t"
505 "movq 8(%1, %3), %%mm5 \n\t"
506 "movq %%mm0, (%2) \n\t"
507 "movq %%mm4, 8(%2) \n\t"
508 "movq %%mm1, (%2, %3) \n\t"
509 "movq %%mm5, 8(%2, %3) \n\t"
510 "add %%"REG_a", %1 \n\t"
511 "add %%"REG_a", %2 \n\t"
/* Rows 2 and 3. */
512 "movq (%1 ), %%mm0 \n\t"
513 "movq 8(%1 ), %%mm4 \n\t"
514 "movq (%1, %3), %%mm1 \n\t"
515 "movq 8(%1, %3), %%mm5 \n\t"
516 "movq %%mm0, (%2) \n\t"
517 "movq %%mm4, 8(%2) \n\t"
518 "movq %%mm1, (%2, %3) \n\t"
519 "movq %%mm5, 8(%2, %3) \n\t"
520 "add %%"REG_a", %1 \n\t"
521 "add %%"REG_a", %2 \n\t"
524 : "+g"(h), "+r"(pixels), "+r"(block)
525 : "r"((x86_reg)line_size)
/* Generate a function that zeroes n consecutive 8x8 int16 blocks
 * (128 bytes each) with MMX stores, 32 bytes per iteration.  The base
 * pointer is biased to the END of the region and REG_a counts up from
 * a negative offset (presumably -128*n; the %1 operand and the loop
 * branch are on original lines missing from this extraction).
 * Instantiated below as clear_blocks_mmx (6 blocks) and
 * clear_block_mmx (1 block). */
530 #define CLEAR_BLOCKS(name, n) \
531 static void name(int16_t *blocks) \
534 "pxor %%mm7, %%mm7 \n\t" \
535 "mov %1, %%"REG_a" \n\t" \
537 "movq %%mm7, (%0, %%"REG_a") \n\t" \
538 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
539 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
540 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
541 "add $32, %%"REG_a" \n\t" \
543 :: "r"(((uint8_t *)blocks) + 128 * n), \
548 CLEAR_BLOCKS(clear_blocks_mmx, 6)
549 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero a single 8x8 int16 block (128 bytes) with eight aligned 16-byte
 * SSE stores; the block must be 16-byte aligned for movaps. */
551 static void clear_block_sse(int16_t *block)
554 "xorps %%xmm0, %%xmm0 \n"
555 "movaps %%xmm0, (%0) \n"
556 "movaps %%xmm0, 16(%0) \n"
557 "movaps %%xmm0, 32(%0) \n"
558 "movaps %%xmm0, 48(%0) \n"
559 "movaps %%xmm0, 64(%0) \n"
560 "movaps %%xmm0, 80(%0) \n"
561 "movaps %%xmm0, 96(%0) \n"
562 "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 8x8 int16 blocks (6 * 128 bytes) with SSE,
 * 128 bytes per loop iteration.  As in CLEAR_BLOCKS, the base pointer
 * is biased past the end and REG_a counts up from a negative offset;
 * the %1 operand and loop branch are on lines missing from this
 * extraction. */
568 static void clear_blocks_sse(int16_t *blocks)
571 "xorps %%xmm0, %%xmm0 \n"
572 "mov %1, %%"REG_a" \n"
574 "movaps %%xmm0, (%0, %%"REG_a") \n"
575 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
576 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
577 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
578 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
579 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
581 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
582 "add $128, %%"REG_a" \n"
584 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] byte-wise (with wraparound, paddb) for w bytes.
 * The MMX loop handles 16 bytes per iteration; the "w - 15" bound and
 * the visible scalar statement indicate a scalar tail loop finishes
 * the remainder.  Loop control lines are missing from this
 * extraction. */
590 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
596 "movq (%1, %0), %%mm0 \n\t"
597 "movq (%2, %0), %%mm1 \n\t"
598 "paddb %%mm0, %%mm1 \n\t"
599 "movq %%mm1, (%2, %0) \n\t"
600 "movq 8(%1, %0), %%mm0 \n\t"
601 "movq 8(%2, %0), %%mm1 \n\t"
602 "paddb %%mm0, %%mm1 \n\t"
603 "movq %%mm1, 8(%2, %0) \n\t"
609 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
/* Scalar tail for the last (w % 16) bytes. */
612 dst[i + 0] += src[i + 0];
/* HuffYUV median prediction using cmov:
 * dst[i] = diff[i] + median(left, top[i], left + top[i] - top-left),
 * carried byte-by-byte with l/tl tracking the running left and
 * top-left values.  Addressing is end-biased: %4 (w2) indexes
 * backwards from dst+w / diff+w / top+w.
 * NOTE(review): most of the asm body (the cmp/cmov median selection
 * and loop control) is on lines missing from this extraction; only
 * the load, the diff add and the store are visible. */
616 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
617 const uint8_t *diff, int w,
618 int *left, int *left_top)
622 int l = *left & 0xff;
623 int tl = *left_top & 0xff;
628 "movzbl (%3, %4), %2 \n"
641 "add (%6, %4), %b0 \n"
642 "mov %b0, (%5, %4) \n"
645 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
646 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
652 #endif /* HAVE_INLINE_ASM */
654 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
655 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
658 /* Draw the edges of width 'w' of an image of size width, height
659 * this MMX version can only handle w == 8 || w == 16. */
/* Replicates the outermost pixels of the picture into a border of
 * width w on all requested sides.  Left/right borders replicate the
 * edge pixel of each row via punpck byte-splatting; top/bottom borders
 * copy the first/last row outward, four border rows per asm statement.
 * NOTE(review): asm-statement openers, the per-row loops and closing
 * braces are on lines missing from this extraction. */
660 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
661 int w, int h, int sides)
663 uint8_t *ptr, *last_line;
666 last_line = buf + (height - 1) * wrap;
/* w == 8 case: splat the first byte of the row into the 8 bytes to the
 * left, and the last byte into the 8 bytes to the right. */
672 "movd (%0), %%mm0 \n\t"
673 "punpcklbw %%mm0, %%mm0 \n\t"
674 "punpcklwd %%mm0, %%mm0 \n\t"
675 "punpckldq %%mm0, %%mm0 \n\t"
676 "movq %%mm0, -8(%0) \n\t"
677 "movq -8(%0, %2), %%mm1 \n\t"
678 "punpckhbw %%mm1, %%mm1 \n\t"
679 "punpckhwd %%mm1, %%mm1 \n\t"
680 "punpckhdq %%mm1, %%mm1 \n\t"
681 "movq %%mm1, (%0, %2) \n\t"
686 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* w == 16 case: same splat, written twice on each side. */
691 "movd (%0), %%mm0 \n\t"
692 "punpcklbw %%mm0, %%mm0 \n\t"
693 "punpcklwd %%mm0, %%mm0 \n\t"
694 "punpckldq %%mm0, %%mm0 \n\t"
695 "movq %%mm0, -8(%0) \n\t"
696 "movq %%mm0, -16(%0) \n\t"
697 "movq -8(%0, %2), %%mm1 \n\t"
698 "punpckhbw %%mm1, %%mm1 \n\t"
699 "punpckhwd %%mm1, %%mm1 \n\t"
700 "punpckhdq %%mm1, %%mm1 \n\t"
701 "movq %%mm1, (%0, %2) \n\t"
702 "movq %%mm1, 8(%0, %2) \n\t"
707 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
711 /* top and bottom (and hopefully also the corners) */
712 if (sides & EDGE_TOP) {
713 for (i = 0; i < h; i += 4) {
714 ptr = buf - (i + 1) * wrap - w;
/* Copy the first picture row upward into four border rows; %2/%3 are
 * -wrap and -wrap*3 so the stores step away from the image. */
717 "movq (%1, %0), %%mm0 \n\t"
718 "movq %%mm0, (%0) \n\t"
719 "movq %%mm0, (%0, %2) \n\t"
720 "movq %%mm0, (%0, %2, 2) \n\t"
721 "movq %%mm0, (%0, %3) \n\t"
726 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
727 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
732 if (sides & EDGE_BOTTOM) {
733 for (i = 0; i < h; i += 4) {
734 ptr = last_line + (i + 1) * wrap - w;
/* Mirror of the TOP case: copy the last picture row downward. */
737 "movq (%1, %0), %%mm0 \n\t"
738 "movq %%mm0, (%0) \n\t"
739 "movq %%mm0, (%0, %2) \n\t"
740 "movq %%mm0, (%0, %2, 2) \n\t"
741 "movq %%mm0, (%0, %3) \n\t"
746 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
747 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
748 "r"(ptr + width + 2 * w)
753 #endif /* HAVE_INLINE_ASM */
/* Generate the full set of MPEG-4 quarter-pel motion-compensation
 * functions for one operation (OPNAME = put_/avg_/put_no_rnd_) and one
 * instruction set (MMX = mmxext here).  Each qpelN_mcXY function
 * implements one of the 16 quarter-pel positions (X = horizontal
 * fraction, Y = vertical fraction, in quarters) for an NxN block:
 *   mc00          plain copy;
 *   mcX0 / mc0Y   one lowpass (h or v), blended with the source for
 *                 the 1/4 and 3/4 positions (l2 = average of two);
 *   mcXY          h lowpass into halfH, v lowpass into halfHV, then
 *                 blended as required by the position.
 * Temporaries live in on-stack uint64_t arrays (halfH region sized
 * N x (N+1) rows for the vertical filter's extra context).
 * NOTE(review): many continuation lines of this macro (closing braces,
 * trailing arguments) are missing from this extraction; no interior
 * comments can be added without breaking the backslash continuations. */
757 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
758 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
761 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
764 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
768 uint8_t * const half = (uint8_t*)temp; \
769 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
771 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
772 stride, stride, 8); \
775 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
778 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
782 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
786 uint8_t * const half = (uint8_t*)temp; \
787 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
789 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
793 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
797 uint8_t * const half = (uint8_t*)temp; \
798 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
800 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
801 stride, stride, 8); \
804 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
807 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
811 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
815 uint8_t * const half = (uint8_t*)temp; \
816 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
818 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
822 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
825 uint64_t half[8 + 9]; \
826 uint8_t * const halfH = ((uint8_t*)half) + 64; \
827 uint8_t * const halfHV = ((uint8_t*)half); \
828 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
830 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
832 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
833 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
837 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
840 uint64_t half[8 + 9]; \
841 uint8_t * const halfH = ((uint8_t*)half) + 64; \
842 uint8_t * const halfHV = ((uint8_t*)half); \
843 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
845 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
847 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
848 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
852 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
855 uint64_t half[8 + 9]; \
856 uint8_t * const halfH = ((uint8_t*)half) + 64; \
857 uint8_t * const halfHV = ((uint8_t*)half); \
858 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
860 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
862 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
863 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
867 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
870 uint64_t half[8 + 9]; \
871 uint8_t * const halfH = ((uint8_t*)half) + 64; \
872 uint8_t * const halfHV = ((uint8_t*)half); \
873 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
875 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
877 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
878 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
882 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
885 uint64_t half[8 + 9]; \
886 uint8_t * const halfH = ((uint8_t*)half) + 64; \
887 uint8_t * const halfHV = ((uint8_t*)half); \
888 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
890 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
891 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
895 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
898 uint64_t half[8 + 9]; \
899 uint8_t * const halfH = ((uint8_t*)half) + 64; \
900 uint8_t * const halfHV = ((uint8_t*)half); \
901 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
903 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
904 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
908 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
911 uint64_t half[8 + 9]; \
912 uint8_t * const halfH = ((uint8_t*)half); \
913 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
915 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
917 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
921 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
924 uint64_t half[8 + 9]; \
925 uint8_t * const halfH = ((uint8_t*)half); \
926 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
928 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
930 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
934 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
938 uint8_t * const halfH = ((uint8_t*)half); \
939 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
941 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
945 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
948 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
951 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
955 uint8_t * const half = (uint8_t*)temp; \
956 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
958 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
962 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
965 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
966 stride, stride, 16);\
969 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
973 uint8_t * const half = (uint8_t*)temp; \
974 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
976 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
977 stride, stride, 16); \
980 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
984 uint8_t * const half = (uint8_t*)temp; \
985 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
987 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
991 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
994 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
998 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1001 uint64_t temp[32]; \
1002 uint8_t * const half = (uint8_t*)temp; \
1003 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1005 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1006 stride, stride, 16); \
1009 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1012 uint64_t half[16 * 2 + 17 * 2]; \
1013 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1014 uint8_t * const halfHV = ((uint8_t*)half); \
1015 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1017 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1019 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1021 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1025 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1028 uint64_t half[16 * 2 + 17 * 2]; \
1029 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1030 uint8_t * const halfHV = ((uint8_t*)half); \
1031 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1033 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1035 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1037 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1041 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1044 uint64_t half[16 * 2 + 17 * 2]; \
1045 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1046 uint8_t * const halfHV = ((uint8_t*)half); \
1047 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1049 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1051 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1053 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1057 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1060 uint64_t half[16 * 2 + 17 * 2]; \
1061 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1062 uint8_t * const halfHV = ((uint8_t*)half); \
1063 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1065 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1067 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1069 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1073 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1076 uint64_t half[16 * 2 + 17 * 2]; \
1077 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1078 uint8_t * const halfHV = ((uint8_t*)half); \
1079 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1081 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1083 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1087 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1090 uint64_t half[16 * 2 + 17 * 2]; \
1091 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1092 uint8_t * const halfHV = ((uint8_t*)half); \
1093 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1095 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1097 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1101 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1104 uint64_t half[17 * 2]; \
1105 uint8_t * const halfH = ((uint8_t*)half); \
1106 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1108 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1110 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1114 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1117 uint64_t half[17 * 2]; \
1118 uint8_t * const halfH = ((uint8_t*)half); \
1119 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1121 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1123 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1127 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1130 uint64_t half[17 * 2]; \
1131 uint8_t * const halfH = ((uint8_t*)half); \
1132 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1134 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Instantiate the full qpel motion-compensation function sets for MMXEXT
 * via the QPEL_OP macro above: plain put, averaging (avg), and the
 * no-rounding put variant. ff_pw_16 / ff_pw_15 select the rounding
 * constant used by the lowpass filters. */
1138 QPEL_OP(put_, ff_pw_16, _, mmxext)
1139 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1140 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
1141 #endif /* HAVE_YASM */
/* RV40 (3,3) subpel wrappers: each simply forwards to the corresponding
 * MPEG-style xy2 halfpel kernel — presumably because RV40's (3,3) position
 * reduces to the same diagonal halfpel filter (NOTE(review): confirm against
 * the RV40 spec / rv40dsp callers). */
1145 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1147     put_pixels8_xy2_mmx(dst, src, stride, 8);
1149 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1151     put_pixels16_xy2_mmx(dst, src, stride, 16);
1153 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1155     avg_pixels8_xy2_mmx(dst, src, stride, 8);
1157 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1159     avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Global motion compensation, MMX path.
 * Interpolates an h-row block from src into dst using per-pixel bilinear
 * weights derived from the affine motion parameters (ox, oy, dxx..dyy,
 * fixed-point with 'shift' fractional bits). Falls back to the C
 * implementation ff_gmc_c() for parameter combinations the MMX loop does
 * not handle (see the big condition below). 'r' is the rounding constant
 * and 'width'/'height' bound the source reads. */
1162 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1163 int stride, int h, int ox, int oy,
1164 int dxx, int dxy, int dyx, int dyy,
1165 int shift, int r, int width, int height)
1168     const int ix = ox >> (16 + shift);
1169     const int iy = oy >> (16 + shift);
     /* Reduce the motion vectors / deltas to 12 fractional bits for the
      * 16-bit MMX arithmetic below. */
1170     const int oxs = ox >> 4;
1171     const int oys = oy >> 4;
1172     const int dxxs = dxx >> 4;
1173     const int dxys = dxy >> 4;
1174     const int dyxs = dyx >> 4;
1175     const int dyys = dyy >> 4;
     /* Broadcast constants into 4x16-bit vectors for the asm loop. */
1176     const uint16_t r4[4] = { r, r, r, r };
1177     const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1178     const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1179     const uint64_t shift2 = 2 * shift;
1182     const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1183     const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1184     const int dxh = dxy * (h - 1);
1185     const int dyw = dyx * (w - 1);
1186     if ( // non-constant fullpel offset (3% of blocks)
1187         ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1188          (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1189         // uses more than 16 bits of subpel mv (only at huge resolution)
1190         || (dxx | dxy | dyx | dyy) & 15 ||
1191         (unsigned)ix >= width - w ||
1192         (unsigned)iy >= height - h) {
1193         // FIXME could still use mmx for some of the rows
1194         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1195                  shift, r, width, height);
     /* Advance to the fullpel source origin of the block. */
1199     src += ix + iy * stride;
     /* Broadcast a 16-bit value into all four lanes of %%mm6 and zero %%mm7.
      * NOTE(review): the operand list of this asm statement is not visible in
      * this excerpt — presumably %0 is the bilinear weight scale (1 << shift);
      * confirm against the full source. */
1202         "movd %0, %%mm6 \n\t"
1203         "pxor %%mm7, %%mm7 \n\t"
1204         "punpcklwd %%mm6, %%mm6 \n\t"
1205         "punpcklwd %%mm6, %%mm6 \n\t"
     /* Process the block 4 output pixels at a time. */
1209     for (x = 0; x < w; x += 4) {
         /* Per-pixel subpel x/y coordinates for the next 4 columns
          * (12 fractional bits; dyys/dxys pre-subtracted so the per-row
          * add in the asm below yields row 0's coordinates first). */
1210         uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1211                             oxs - dxys + dxxs * (x + 1),
1212                             oxs - dxys + dxxs * (x + 2),
1213                             oxs - dxys + dxxs * (x + 3) };
1214         uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1215                             oys - dyys + dyxs * (x + 1),
1216                             oys - dyys + dyxs * (x + 2),
1217                             oys - dyys + dyxs * (x + 3) };
1219         for (y = 0; y < h; y++) {
             /* Advance the coordinate vectors by one row (dxy4/dyy4),
              * store them back, and keep the high 4 bits (>> 12) as the
              * fractional bilinear weights dx (mm4) and dy (mm5). */
1221                 "movq %0, %%mm4 \n\t"
1222                 "movq %1, %%mm5 \n\t"
1223                 "paddw %2, %%mm4 \n\t"
1224                 "paddw %3, %%mm5 \n\t"
1225                 "movq %%mm4, %0 \n\t"
1226                 "movq %%mm5, %1 \n\t"
1227                 "psrlw $12, %%mm4 \n\t"
1228                 "psrlw $12, %%mm5 \n\t"
1229                 : "+m"(*dx4), "+m"(*dy4)
1230                 : "m"(*dxy4), "m"(*dyy4)
             /* Compute the four bilinear weight products from s = mm6:
              * (s-dx)(s-dy), dx*dy, (s-dx)*dy, dx*(s-dy). */
1234                 "movq %%mm6, %%mm2 \n\t"
1235                 "movq %%mm6, %%mm1 \n\t"
1236                 "psubw %%mm4, %%mm2 \n\t"
1237                 "psubw %%mm5, %%mm1 \n\t"
1238                 "movq %%mm2, %%mm0 \n\t"
1239                 "movq %%mm4, %%mm3 \n\t"
1240                 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1241                 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1242                 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1243                 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
             /* Weight the 2x2 source neighborhood. */
1245                 "movd %4, %%mm5 \n\t"
1246                 "movd %3, %%mm4 \n\t"
1247                 "punpcklbw %%mm7, %%mm5 \n\t"
1248                 "punpcklbw %%mm7, %%mm4 \n\t"
1249                 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1250                 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1252                 "movd %2, %%mm5 \n\t"
1253                 "movd %1, %%mm4 \n\t"
1254                 "punpcklbw %%mm7, %%mm5 \n\t"
1255                 "punpcklbw %%mm7, %%mm4 \n\t"
1256                 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1257                 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
             /* Sum the four terms, add the rounding constant r4, scale
              * back by 2*shift and pack to unsigned bytes. */
1258                 "paddw %5, %%mm1 \n\t"
1259                 "paddw %%mm3, %%mm2 \n\t"
1260                 "paddw %%mm1, %%mm0 \n\t"
1261                 "paddw %%mm2, %%mm0 \n\t"
1263                 "psrlw %6, %%mm0 \n\t"
1264                 "packuswb %%mm0, %%mm0 \n\t"
1265                 "movd %%mm0, %0 \n\t"
1267                 : "=m"(dst[x + y * stride])
1268                 : "m"(src[0]), "m"(src[1]),
1269                   "m"(src[stride]), "m"(src[stride + 1]),
1270                   "m"(*r4), "m"(shift2)
         /* Rewind to the top of the next 4-column strip. */
1274         src += 4 - h * stride;
/* Prototypes for SSE2 16-pixel copy/average kernels implemented outside
 * this file (external assembly); used by dsputil_init_sse2() below. */
1279 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1280                           ptrdiff_t line_size, int h);
1281 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1282                           ptrdiff_t line_size, int h);
/* CAVS (0,0) subpel wrappers: the fullpel case is a plain pixel copy or
 * average, so forward to the generic MMX pixel kernels. */
1287 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1289     put_pixels8_mmx(dst, src, stride, 8);
1292 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1294     avg_pixels8_mmx(dst, src, stride, 8);
1297 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1299     put_pixels16_mmx(dst, src, stride, 16);
1302 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1304     avg_pixels16_mmx(dst, src, stride, 16);
1306 #endif /* HAVE_INLINE_ASM */
/* VC-1 (0,0) mspel: fullpel copy — forward to the 8-pixel MMX copy kernel.
 * 'rnd' is unused here since no filtering takes place. */
1310 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1311                                int stride, int rnd)
1313     ff_put_pixels8_mmx(dst, src, stride, 8);
1315 #endif /* HAVE_YASM */
/* Clamp each float in src[] to [min, max] and store into dst[], using SSE
 * maxps/minps on 16 floats (four 16-byte vectors) per iteration.
 * NOTE(review): the loop-control asm lines are not visible in this excerpt;
 * 'i' starts at byte offset (len - 16) * 4, so the loop presumably walks
 * backwards in 64-byte steps and 'len' must be a multiple of 16 with
 * 16-byte-aligned buffers (movaps) — confirm against callers. */
1318 static void vector_clipf_sse(float *dst, const float *src,
1319                              float min, float max, int len)
1321     x86_reg i = (len - 16) * 4;
     /* Broadcast min into xmm4 and max into xmm5. */
1323         "movss %3, %%xmm4 \n\t"
1324         "movss %4, %%xmm5 \n\t"
1325         "shufps $0, %%xmm4, %%xmm4 \n\t"
1326         "shufps $0, %%xmm5, %%xmm5 \n\t"
1328         "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1329         "movaps 16(%2, %0), %%xmm1 \n\t"
1330         "movaps 32(%2, %0), %%xmm2 \n\t"
1331         "movaps 48(%2, %0), %%xmm3 \n\t"
     /* Lower bound: x = max(x, min). */
1332         "maxps %%xmm4, %%xmm0 \n\t"
1333         "maxps %%xmm4, %%xmm1 \n\t"
1334         "maxps %%xmm4, %%xmm2 \n\t"
1335         "maxps %%xmm4, %%xmm3 \n\t"
     /* Upper bound: x = min(x, max). */
1336         "minps %%xmm5, %%xmm0 \n\t"
1337         "minps %%xmm5, %%xmm1 \n\t"
1338         "minps %%xmm5, %%xmm2 \n\t"
1339         "minps %%xmm5, %%xmm3 \n\t"
1340         "movaps %%xmm0, (%1, %0) \n\t"
1341         "movaps %%xmm1, 16(%1, %0) \n\t"
1342         "movaps %%xmm2, 32(%1, %0) \n\t"
1343         "movaps %%xmm3, 48(%1, %0) \n\t"
1347         : "r"(dst), "r"(src), "m"(min), "m"(max)
1352 #endif /* HAVE_INLINE_ASM */
/* Prototypes for external assembly routines wired into DSPContext by the
 * per-CPU init functions below. Declarations only; no code in this file. */
/* Integer scalar products (dot product / multiply-accumulate). */
1354 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1356 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1358 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1360                                                int order, int mul);
1361 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1363                                              int order, int mul);
1364 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1366                                               int order, int mul);
/* Windowing for audio (rounding and non-rounding variants per CPU level). */
1368 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1369                                         const int16_t *window, unsigned int len);
1370 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1371                                       const int16_t *window, unsigned int len);
1372 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1373                                   const int16_t *window, unsigned int len);
1374 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1375                                 const int16_t *window, unsigned int len);
1376 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1377                                  const int16_t *window, unsigned int len);
1378 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1379                                       const int16_t *window, unsigned int len);
/* 32-bit byteswap of a buffer. */
1381 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1382 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
/* HuffYUV median/left prediction. */
1384 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1385                                           const uint8_t *diff, int w,
1386                                           int *left, int *left_top);
1387 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1389 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
/* 32-bit integer clipping to [min, max]. */
1392 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1393                                     int32_t min, int32_t max, unsigned int len);
1394 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1395                                     int32_t min, int32_t max, unsigned int len);
1396 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1397                                    int32_t min, int32_t max, unsigned int len);
1398 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1399                                     int32_t min, int32_t max, unsigned int len);
/* Fill all 16 quarter-pel motion-compensation slots (mc00..mc33) of
 * c->PFX_pixels_tab[IDX] with the CPU-specific implementations.
 * PREFIX lets callers select externally-defined (ff_-prefixed) symbols. */
1401 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1403     c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1404     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1405     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1406     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1407     c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1408     c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1409     c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1410     c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1411     c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1412     c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1413     c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1414     c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1415     c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1416     c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1417     c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1418     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill the 4 half-pel slots (copy, x2, y2, xy2) of c->PFX_pixels_tab IDX.
 * IDX includes its own brackets (or is empty for non-indexed tables,
 * e.g. avg_no_rnd). */
1421 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1423     c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1424     c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1425     c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1426     c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install baseline MMX implementations into the DSPContext: clamped pixel
 * stores, block clears, half-pel tables, the selected IDCT, H.263 loop
 * filters and 32-bit clipping. Only 8-bit content gets the pixel/IDCT
 * replacements (high_bit_depth check). */
1429 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1432     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1435     c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1436     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1437     c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1439     if (!high_bit_depth) {
1440         c->clear_block = clear_block_mmx;
1441         c->clear_blocks = clear_blocks_mmx;
1442         c->draw_edges = draw_edges_mmx;
     /* Half-pel tables: index [0] = 16x16, [1] = 8x8; avg_no_rnd has no
      * size index (empty IDX argument). */
1444         SET_HPEL_FUNCS(put, [0], 16, mmx);
1445         SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1446         SET_HPEL_FUNCS(avg, [0], 16, mmx);
1447         SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1448         SET_HPEL_FUNCS(put, [1], 8, mmx);
1449         SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1450         SET_HPEL_FUNCS(avg, [1], 8, mmx);
     /* Pick the MMX IDCT implementation requested by the codec context. */
1452     switch (avctx->idct_algo) {
1454     case FF_IDCT_SIMPLEMMX:
1455         c->idct_put = ff_simple_idct_put_mmx;
1456         c->idct_add = ff_simple_idct_add_mmx;
1457         c->idct = ff_simple_idct_mmx;
1458         c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1460     case FF_IDCT_XVIDMMX:
1461         c->idct_put = ff_idct_xvid_mmx_put;
1462         c->idct_add = ff_idct_xvid_mmx_add;
1463         c->idct = ff_idct_xvid_mmx;
1470     c->add_bytes = add_bytes_mmx;
1471 #endif /* HAVE_INLINE_ASM */
1474     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1475         c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1476         c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1479     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
1484 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1487 const int bit_depth = avctx->bits_per_raw_sample;
1488 const int high_bit_depth = bit_depth > 8;
1491 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1492 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1494 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1495 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1496 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1497 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1499 if (!high_bit_depth) {
1500 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1501 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1503 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1504 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1505 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1507 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1508 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1510 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1511 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1512 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
1515 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1516 if (!high_bit_depth) {
1517 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1518 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1519 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1520 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1522 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1523 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1526 #endif /* HAVE_YASM */
1529 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1530 c->idct_put = ff_idct_xvid_mmxext_put;
1531 c->idct_add = ff_idct_xvid_mmxext_add;
1532 c->idct = ff_idct_xvid_mmxext;
1534 #endif /* HAVE_INLINE_ASM */
1536 #if HAVE_MMXEXT_EXTERNAL
1537 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1538 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1539 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1540 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1543 /* slower than cmov version on AMD */
1544 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1545 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1547 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1548 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
1550 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1551 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1553 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1555 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install 3DNow! implementations of the half-pel put/avg kernels (8-bit
 * only) plus the VP3/Theora exact-rounding variants. */
1558 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1561     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1564     if (!high_bit_depth) {
1565         c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1566         c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1568         c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1569         c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1570         c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1572         c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1573         c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1575         c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1576         c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1577         c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
     /* no_rnd variants are not bit-exact; skip them for bit-exact mode. */
1579         if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1580             c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1581             c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1582             c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1583             c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1585             c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1586             c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
     /* VP3/Theora require exact (no-rounding-bias) averaging. */
1590     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1591                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
1592         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1593         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1595 #endif /* HAVE_YASM */
/* Install SSE implementations: 16-byte-aligned block clears (skipped under
 * XvMC, whose blocks may be unaligned) and float clipping. */
1598 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1601     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1604     if (!high_bit_depth) {
1605         if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1606             /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1607             c->clear_block = clear_block_sse;
1608             c->clear_blocks = clear_blocks_sse;
1612     c->vector_clipf = vector_clipf_sse;
1613 #endif /* HAVE_INLINE_ASM */
/* Install SSE2 implementations: XviD IDCT, 16-pixel copy/avg (skipped on
 * SSE2SLOW CPUs), scalar products, int32 clipping (Atom-specific variant),
 * windowing and byteswap. */
1616 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1619     const int bit_depth = avctx->bits_per_raw_sample;
1620     const int high_bit_depth = bit_depth > 8;
1622 #if HAVE_SSE2_INLINE
1623     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1624         c->idct_put = ff_idct_xvid_sse2_put;
1625         c->idct_add = ff_idct_xvid_sse2_add;
1626         c->idct = ff_idct_xvid_sse2;
1627         c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1629 #endif /* HAVE_SSE2_INLINE */
1631 #if HAVE_SSE2_EXTERNAL
1632     if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1633         // these functions are slower than mmx on AMD, but faster on Intel
1634         if (!high_bit_depth) {
1635             c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1636             c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1637             c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1641     c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1642     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
     /* Atom prefers the "int" variant of vector_clip_int32. */
1643     if (mm_flags & AV_CPU_FLAG_ATOM) {
1644         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1646         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1648     if (avctx->flags & CODEC_FLAG_BITEXACT) {
1649         c->apply_window_int16 = ff_apply_window_int16_sse2;
1650     } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1651         c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1653     c->bswap_buf = ff_bswap32_buf_sse2;
1654 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 implementations: HuffYUV left prediction (SSE4 variant on
 * CPUs flagged SSE4 — used here as a "not slow on Conroe" marker),
 * windowing (Atom-specific variant), scalar product and byteswap. */
1657 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1660 #if HAVE_SSSE3_EXTERNAL
1661     c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1662     if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1663         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1665     if (mm_flags & AV_CPU_FLAG_ATOM)
1666         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1668         c->apply_window_int16 = ff_apply_window_int16_ssse3;
     /* Skip CPUs where cache-line-split loads make the SSSE3 version slow. */
1669     if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1670         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1671     c->bswap_buf = ff_bswap32_buf_ssse3;
1672 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the SSE4 implementation of 32-bit integer clipping. */
1675 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1678 #if HAVE_SSE4_EXTERNAL
1679     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1680 #endif /* HAVE_SSE4_EXTERNAL */
/* Entry point: detect CPU features once, then layer per-CPU init functions
 * from MMX up to SSE4 so higher levels override lower ones. Also installs
 * the cmov-based HuffYUV median prediction and the encoder DSP inits. */
1683 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1685     int mm_flags = av_get_cpu_flags();
1687 #if HAVE_7REGS && HAVE_INLINE_ASM
1688     if (mm_flags & AV_CPU_FLAG_CMOV)
1689         c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1692     if (mm_flags & AV_CPU_FLAG_MMX)
1693         dsputil_init_mmx(c, avctx, mm_flags);
1695     if (mm_flags & AV_CPU_FLAG_MMXEXT)
1696         dsputil_init_mmxext(c, avctx, mm_flags);
1698     if (mm_flags & AV_CPU_FLAG_3DNOW)
1699         dsputil_init_3dnow(c, avctx, mm_flags);
1701     if (mm_flags & AV_CPU_FLAG_SSE)
1702         dsputil_init_sse(c, avctx, mm_flags);
1704     if (mm_flags & AV_CPU_FLAG_SSE2)
1705         dsputil_init_sse2(c, avctx, mm_flags);
1707     if (mm_flags & AV_CPU_FLAG_SSSE3)
1708         dsputil_init_ssse3(c, avctx, mm_flags);
1710     if (mm_flags & AV_CPU_FLAG_SSE4)
1711         dsputil_init_sse4(c, avctx, mm_flags);
1713     if (CONFIG_ENCODERS)
1714         ff_dsputilenc_init_mmx(c, avctx);