2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "dsputil_mmx.h"
32 #include "idct_xvid.h"
33 #include "diracdsp_mmx.h"
/* NOTE(review): every line in this extract begins with a stray number that
 * looks like a line number from a pasted listing — strip these and re-diff
 * against upstream FFmpeg before building. */
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* Packed-word (16-bit lane) constants; the value is replicated in every lane.
 * 16-byte-aligned xmm_reg entries are usable by both MMX and SSE2 code,
 * 8-byte uint64_t entries by MMX only. */
42 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
43 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
49 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
58 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
60 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
65 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
/* Packed-byte (8-bit lane) constants. */
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* Packed-double constants (two doubles per 16-byte vector). */
81 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
82 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Prototypes for pixel put/avg primitives implemented in external (yasm)
 * assembly.  Naming convention visible below:
 *   _x2 / _y2 / _xy2 : half-pel interpolation in x, y, or both;
 *   _l2              : average of two source planes (src1, src2);
 *   no_rnd / exact   : no-rounding variants used by some codecs;
 *   _mmxext / _3dnow : instruction-set-specific implementations. */
86 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
87 int line_size, int h);
88 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
89 int line_size, int h);
90 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
91 int dstStride, int src1Stride, int h);
92 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
93 uint8_t *src2, int dstStride,
94 int src1Stride, int h);
95 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
96 int dstStride, int src1Stride, int h);
97 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
98 int line_size, int h);
99 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
100 int line_size, int h);
101 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
102 int dstStride, int src1Stride, int h);
103 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106 int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
108 int line_size, int h);
109 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
110 int line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
112 const uint8_t *pixels,
113 int line_size, int h);
114 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
115 const uint8_t *pixels,
116 int line_size, int h);
117 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
118 int line_size, int h);
119 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
120 int line_size, int h);
121 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
122 int line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
124 int line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
126 const uint8_t *pixels,
127 int line_size, int h);
128 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
129 const uint8_t *pixels,
130 int line_size, int h);
131 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
132 int line_size, int h);
133 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
134 int line_size, int h);
135 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
136 int line_size, int h);
137 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
138 int line_size, int h);
139 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
140 int line_size, int h);
141 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
142 int line_size, int h);
143 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
144 int line_size, int h);
145 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
146 int line_size, int h);
148 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
/* 16-wide copy built from two 8-wide asm calls (left and right halves).
 * NOTE(review): the opening/closing braces of this function are absent from
 * this extract (embedded numbering jumps 150->152 and 153->156) — restore
 * them from upstream before compiling. */
149 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
150 int line_size, int h)
152 ff_put_pixels8_mmxext(block, pixels, line_size, h);
153 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
/* MPEG-4 qpel 6-tap lowpass filters (yasm); h = horizontal, v = vertical. */
156 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
157 int dstStride, int srcStride, int h);
158 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride, int h);
160 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
161 int dstStride, int srcStride,
163 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
164 int dstStride, int srcStride, int h);
165 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride, int h);
167 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
168 int dstStride, int srcStride,
170 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
171 int dstStride, int srcStride);
172 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
180 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
181 int dstStride, int srcStride);
/* Full-pel "no rounding" put is identical to the rounded put (no averaging
 * happens at full-pel positions), so alias the symbols. */
182 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
183 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
184 #endif /* HAVE_YASM */
/* Inline-asm helper macros.  MOVQ_ZERO/BFE/BONE/WTWO materialise common
 * constants directly in an MMX register.
 * NOTE(review): gaps in the embedded numbering (191, 193, 196-197, 200, 202,
 * 204, 208, 210, 214-216) show that the `__asm__ volatile (` opener lines and
 * the #if/#else/#endif that selects between the memory-load and the
 * computed-constant variants of MOVQ_BONE/MOVQ_WTWO are missing from this
 * extract — as written, both pairs of definitions would collide. */
189 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
190 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
192 #define MOVQ_BFE(regd) \
194 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
195 "paddb %%"#regd", %%"#regd" \n\t" ::)
198 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
199 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
201 // for shared library it's better to use this way for accessing constants
203 #define MOVQ_BONE(regd) \
205 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
206 "psrlw $15, %%"#regd" \n\t" \
207 "packuswb %%"#regd", %%"#regd" \n\t" ::)
209 #define MOVQ_WTWO(regd) \
211 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
212 "psrlw $15, %%"#regd" \n\t" \
213 "psllw $1, %%"#regd" \n\t"::)
/* Byte-wise average macros.  PAVGB_MMX_NO_RND computes
 * (a & b) + (((a ^ b) & 0xfe) >> 1)  — i.e. average rounding down —
 * and PAVGB_MMX computes (a | b) - (((a ^ b) & 0xfe) >> 1) — average
 * rounding up.  The PAVGBP_* variants do two averages in parallel. */
217 // using regr as temporary and for the output result
218 // first argument is unmodifed and second is trashed
219 // regfe is supposed to contain 0xfefefefefefefefe
220 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
221 "movq "#rega", "#regr" \n\t" \
222 "pand "#regb", "#regr" \n\t" \
223 "pxor "#rega", "#regb" \n\t" \
224 "pand "#regfe", "#regb" \n\t" \
225 "psrlq $1, "#regb" \n\t" \
226 "paddb "#regb", "#regr" \n\t"
228 #define PAVGB_MMX(rega, regb, regr, regfe) \
229 "movq "#rega", "#regr" \n\t" \
230 "por "#regb", "#regr" \n\t" \
231 "pxor "#rega", "#regb" \n\t" \
232 "pand "#regfe", "#regb" \n\t" \
233 "psrlq $1, "#regb" \n\t" \
234 "psubb "#regb", "#regr" \n\t"
236 // mm6 is supposed to contain 0xfefefefefefefefe
237 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
238 "movq "#rega", "#regr" \n\t" \
239 "movq "#regc", "#regp" \n\t" \
240 "pand "#regb", "#regr" \n\t" \
241 "pand "#regd", "#regp" \n\t" \
242 "pxor "#rega", "#regb" \n\t" \
243 "pxor "#regc", "#regd" \n\t" \
244 "pand %%mm6, "#regb" \n\t" \
245 "pand %%mm6, "#regd" \n\t" \
246 "psrlq $1, "#regb" \n\t" \
247 "psrlq $1, "#regd" \n\t" \
248 "paddb "#regb", "#regr" \n\t" \
249 "paddb "#regd", "#regp" \n\t"
251 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
252 "movq "#rega", "#regr" \n\t" \
253 "movq "#regc", "#regp" \n\t" \
254 "por "#regb", "#regr" \n\t" \
255 "por "#regd", "#regp" \n\t" \
256 "pxor "#rega", "#regb" \n\t" \
257 "pxor "#regc", "#regd" \n\t" \
258 "pand %%mm6, "#regb" \n\t" \
259 "pand %%mm6, "#regd" \n\t" \
260 "psrlq $1, "#regd" \n\t" \
261 "psrlq $1, "#regb" \n\t" \
262 "psubb "#regb", "#regr" \n\t" \
263 "psubb "#regd", "#regp" \n\t"
/* Instantiate the rounding / no-rounding pixel templates.  The DEF/SET_RND/
 * PAVGB* macros parameterise dsputil_rnd_template.c; the matching #undef
 * lines appear to be among the missing lines of this extract. */
265 /***********************************/
266 /* MMX no rounding */
268 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
269 #define SET_RND MOVQ_WONE
270 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
271 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
272 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
274 #include "dsputil_rnd_template.c"
281 /***********************************/
284 #define DEF(x, y) x ## _ ## y ## _mmx
285 #define SET_RND MOVQ_WTWO
286 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
287 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
289 #include "dsputil_rnd_template.c"
297 #endif /* HAVE_INLINE_ASM */
301 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
/* Instantiate the averaging templates for 3DNow! and MMXEXT. */
303 /***********************************/
306 #define DEF(x) x ## _3dnow
308 #include "dsputil_avg_template.c"
312 /***********************************/
313 /* MMXEXT specific */
315 #define DEF(x) x ## _mmxext
317 #include "dsputil_avg_template.c"
321 #endif /* HAVE_YASM */
/* Full-pel aliases: no-rounding and mmxext variants fall back to the plain
 * MMX copies, since straight copies need no rounding or extra instructions. */
325 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
326 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
327 #define put_pixels16_mmxext put_pixels16_mmx
328 #define put_pixels8_mmxext put_pixels8_mmx
329 #define put_pixels4_mmxext put_pixels4_mmx
330 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
331 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
333 /***********************************/
/* Stores an 8x8 block of 16-bit IDCT coefficients into an 8-bit pixel plane,
 * clamping each value to [0,255] via packuswb (unsigned saturation).
 * Processes four rows per asm statement: 8 movq loads (64 coefficients),
 * pack to bytes, 4 movq stores at pixels, +line_size, +2*line_size,
 * +3*line_size.  The block is duplicated because, per the comment below,
 * a loop over identical asm produced bad compiler output.
 * NOTE(review): the `__asm__ volatile (` openers, the closing `)`/`;` lines
 * and the local-variable setup (pix, p) are missing from this extract. */
336 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
342 /* read the pixels */
347 "movq (%3), %%mm0 \n\t"
348 "movq 8(%3), %%mm1 \n\t"
349 "movq 16(%3), %%mm2 \n\t"
350 "movq 24(%3), %%mm3 \n\t"
351 "movq 32(%3), %%mm4 \n\t"
352 "movq 40(%3), %%mm5 \n\t"
353 "movq 48(%3), %%mm6 \n\t"
354 "movq 56(%3), %%mm7 \n\t"
355 "packuswb %%mm1, %%mm0 \n\t"
356 "packuswb %%mm3, %%mm2 \n\t"
357 "packuswb %%mm5, %%mm4 \n\t"
358 "packuswb %%mm7, %%mm6 \n\t"
359 "movq %%mm0, (%0) \n\t"
360 "movq %%mm2, (%0, %1) \n\t"
361 "movq %%mm4, (%0, %1, 2) \n\t"
362 "movq %%mm6, (%0, %2) \n\t"
363 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
/* advance to the second group of four rows */
366 pix += line_size * 4;
369 // if here would be an exact copy of the code above
370 // compiler would generate some very strange code
373 "movq (%3), %%mm0 \n\t"
374 "movq 8(%3), %%mm1 \n\t"
375 "movq 16(%3), %%mm2 \n\t"
376 "movq 24(%3), %%mm3 \n\t"
377 "movq 32(%3), %%mm4 \n\t"
378 "movq 40(%3), %%mm5 \n\t"
379 "movq 48(%3), %%mm6 \n\t"
380 "movq 56(%3), %%mm7 \n\t"
381 "packuswb %%mm1, %%mm0 \n\t"
382 "packuswb %%mm3, %%mm2 \n\t"
383 "packuswb %%mm5, %%mm4 \n\t"
384 "packuswb %%mm7, %%mm6 \n\t"
385 "movq %%mm0, (%0) \n\t"
386 "movq %%mm2, (%0, %1) \n\t"
387 "movq %%mm4, (%0, %1, 2) \n\t"
388 "movq %%mm6, (%0, %2) \n\t"
389 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emits asm for one half (four rows) of put_signed_pixels_clamped:
 * loads 4 rows of 16-bit coefficients at byte offset `off` into %2 (block),
 * packs to signed bytes (packsswb clamps to [-128,127]), adds 0x80 from
 * %%mm0 to shift into unsigned range, then stores the four rows at
 * %0, %0+%3, %0+2*%3 and %0+%1 (%1 holds 3*line_skip, see caller). */
393 #define put_signed_pixels_clamped_mmx_half(off) \
394 "movq "#off"(%2), %%mm1 \n\t" \
395 "movq 16 + "#off"(%2), %%mm2 \n\t" \
396 "movq 32 + "#off"(%2), %%mm3 \n\t" \
397 "movq 48 + "#off"(%2), %%mm4 \n\t" \
398 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
399 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
400 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
401 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
402 "paddb %%mm0, %%mm1 \n\t" \
403 "paddb %%mm0, %%mm2 \n\t" \
404 "paddb %%mm0, %%mm3 \n\t" \
405 "paddb %%mm0, %%mm4 \n\t" \
406 "movq %%mm1, (%0) \n\t" \
407 "movq %%mm2, (%0, %3) \n\t" \
408 "movq %%mm3, (%0, %3, 2) \n\t" \
409 "movq %%mm4, (%0, %1) \n\t"
/* Stores an 8x8 block of signed 16-bit coefficients as unsigned pixels:
 * clamp to [-128,127], add 128 (via ff_pb_80), write 8 rows.  Uses the
 * _half macro twice (offsets 0 and 64 bytes = rows 0-3 and 4-7), advancing
 * the pixel pointer by 4*line_skip in between; %1 caches 3*line_skip.
 * NOTE(review): the `{`, `__asm__ volatile (` opener, the line_skip3
 * declaration and the closing lines are missing from this extract. */
411 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
414 x86_reg line_skip = line_size;
418 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
419 "lea (%3, %3, 2), %1 \n\t"
420 put_signed_pixels_clamped_mmx_half(0)
421 "lea (%0, %3, 4), %0 \n\t"
422 put_signed_pixels_clamped_mmx_half(64)
423 : "+&r"(pixels), "=&r"(line_skip3)
424 : "r"(block), "r"(line_skip)
/* Adds an 8x8 block of 16-bit coefficients to existing pixels with
 * saturation: widen two pixel rows to 16 bits (punpck with zeroed %%mm7 —
 * presumably cleared in a missing line; TODO confirm), paddsw with the
 * coefficients, packuswb back to bytes, store.  Two rows per asm iteration.
 * NOTE(review): braces, asm open/close and the loop around the body are
 * missing from this extract. */
428 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
435 /* read the pixels */
442 "movq (%2), %%mm0 \n\t"
443 "movq 8(%2), %%mm1 \n\t"
444 "movq 16(%2), %%mm2 \n\t"
445 "movq 24(%2), %%mm3 \n\t"
446 "movq %0, %%mm4 \n\t"
447 "movq %1, %%mm6 \n\t"
448 "movq %%mm4, %%mm5 \n\t"
449 "punpcklbw %%mm7, %%mm4 \n\t"
450 "punpckhbw %%mm7, %%mm5 \n\t"
451 "paddsw %%mm4, %%mm0 \n\t"
452 "paddsw %%mm5, %%mm1 \n\t"
453 "movq %%mm6, %%mm5 \n\t"
454 "punpcklbw %%mm7, %%mm6 \n\t"
455 "punpckhbw %%mm7, %%mm5 \n\t"
456 "paddsw %%mm6, %%mm2 \n\t"
457 "paddsw %%mm5, %%mm3 \n\t"
458 "packuswb %%mm1, %%mm0 \n\t"
459 "packuswb %%mm3, %%mm2 \n\t"
460 "movq %%mm0, %0 \n\t"
461 "movq %%mm2, %1 \n\t"
462 : "+m"(*pix), "+m"(*(pix + line_size))
465 pix += line_size * 2;
/* Copies an 8xh byte block from pixels to block.  REG_a caches
 * 2*line_size; each unrolled pass copies four rows (two movq load/store
 * pairs, advancing both pointers by 2*line_size twice).
 * NOTE(review): `{`, the asm opener, the loop label/decrement/branch
 * (h is "+g" so presumably decremented by 4 per pass — TODO confirm) and
 * the closing lines are missing from this extract. */
470 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
471 int line_size, int h)
474 "lea (%3, %3), %%"REG_a" \n\t"
477 "movq (%1 ), %%mm0 \n\t"
478 "movq (%1, %3), %%mm1 \n\t"
479 "movq %%mm0, (%2) \n\t"
480 "movq %%mm1, (%2, %3) \n\t"
481 "add %%"REG_a", %1 \n\t"
482 "add %%"REG_a", %2 \n\t"
483 "movq (%1 ), %%mm0 \n\t"
484 "movq (%1, %3), %%mm1 \n\t"
485 "movq %%mm0, (%2) \n\t"
486 "movq %%mm1, (%2, %3) \n\t"
487 "add %%"REG_a", %1 \n\t"
488 "add %%"REG_a", %2 \n\t"
491 : "+g"(h), "+r"(pixels), "+r"(block)
492 : "r"((x86_reg)line_size)
/* Copies a 16xh byte block from pixels to block — same structure as
 * put_pixels8_mmx but with two movq pairs per row (offsets 0 and 8).
 * NOTE(review): same missing lines as put_pixels8_mmx (braces, asm
 * open/close, loop label and branch). */
497 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
498 int line_size, int h)
501 "lea (%3, %3), %%"REG_a" \n\t"
504 "movq (%1 ), %%mm0 \n\t"
505 "movq 8(%1 ), %%mm4 \n\t"
506 "movq (%1, %3), %%mm1 \n\t"
507 "movq 8(%1, %3), %%mm5 \n\t"
508 "movq %%mm0, (%2) \n\t"
509 "movq %%mm4, 8(%2) \n\t"
510 "movq %%mm1, (%2, %3) \n\t"
511 "movq %%mm5, 8(%2, %3) \n\t"
512 "add %%"REG_a", %1 \n\t"
513 "add %%"REG_a", %2 \n\t"
514 "movq (%1 ), %%mm0 \n\t"
515 "movq 8(%1 ), %%mm4 \n\t"
516 "movq (%1, %3), %%mm1 \n\t"
517 "movq 8(%1, %3), %%mm5 \n\t"
518 "movq %%mm0, (%2) \n\t"
519 "movq %%mm4, 8(%2) \n\t"
520 "movq %%mm1, (%2, %3) \n\t"
521 "movq %%mm5, 8(%2, %3) \n\t"
522 "add %%"REG_a", %1 \n\t"
523 "add %%"REG_a", %2 \n\t"
526 : "+g"(h), "+r"(pixels), "+r"(block)
527 : "r"((x86_reg)line_size)
/* Generates a function that zeroes n consecutive 128-byte (64-coefficient)
 * int16_t blocks: REG_a counts from -(128*n) up toward 0 while writing
 * 32 bytes of zeros (four movq) per iteration.  Instantiated below for
 * 6 blocks (clear_blocks) and 1 block (clear_block).
 * NOTE(review): the asm opener, the input constraint with -128*n, the loop
 * branch and the closing `)` / `}` lines are missing from this extract. */
532 #define CLEAR_BLOCKS(name, n) \
533 static void name(int16_t *blocks) \
536 "pxor %%mm7, %%mm7 \n\t" \
537 "mov %1, %%"REG_a" \n\t" \
539 "movq %%mm7, (%0, %%"REG_a") \n\t" \
540 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
541 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
542 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
543 "add $32, %%"REG_a" \n\t" \
545 :: "r"(((uint8_t *)blocks) + 128 * n), \
550 CLEAR_BLOCKS(clear_blocks_mmx, 6)
551 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zeroes one 64-coefficient int16_t block (128 bytes) with eight aligned
 * 16-byte SSE stores — requires the block to be 16-byte aligned.
 * NOTE(review): `{`, the asm opener and the closing lines are missing
 * from this extract. */
553 static void clear_block_sse(int16_t *block)
556 "xorps %%xmm0, %%xmm0 \n"
557 "movaps %%xmm0, (%0) \n"
558 "movaps %%xmm0, 16(%0) \n"
559 "movaps %%xmm0, 32(%0) \n"
560 "movaps %%xmm0, 48(%0) \n"
561 "movaps %%xmm0, 64(%0) \n"
562 "movaps %%xmm0, 80(%0) \n"
563 "movaps %%xmm0, 96(%0) \n"
564 "movaps %%xmm0, 112(%0) \n"
/* Zeroes six consecutive 128-byte blocks with aligned SSE stores;
 * REG_a counts from -(128*6) toward 0, one 128-byte block per iteration.
 * NOTE(review): `{`, the asm opener, the loop branch and closing lines
 * are missing from this extract. */
570 static void clear_blocks_sse(int16_t *blocks)
573 "xorps %%xmm0, %%xmm0 \n"
574 "mov %1, %%"REG_a" \n"
576 "movaps %%xmm0, (%0, %%"REG_a") \n"
577 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
578 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
579 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
581 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
582 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
583 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
584 "add $128, %%"REG_a" \n"
586 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] (byte-wise, wrapping paddb) for i in [0, w).
 * The asm loop processes 16 bytes per iteration (two movq pairs); the
 * "(x86_reg)w - 15" constraint suggests the vector loop stops before the
 * last <16 bytes, which the scalar tail below handles — TODO confirm
 * against upstream, the loop counter lines are missing here.
 * NOTE(review): braces, asm open/close and the scalar-tail for-loop header
 * are absent from this extract. */
592 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
598 "movq (%1, %0), %%mm0 \n\t"
599 "movq (%2, %0), %%mm1 \n\t"
600 "paddb %%mm0, %%mm1 \n\t"
601 "movq %%mm1, (%2, %0) \n\t"
602 "movq 8(%1, %0), %%mm0 \n\t"
603 "movq 8(%2, %0), %%mm1 \n\t"
604 "paddb %%mm0, %%mm1 \n\t"
605 "movq %%mm1, 8(%2, %0) \n\t"
611 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
614 dst[i + 0] += src[i + 0];
/* HuffYUV median prediction using cmov (presumably median(left, top,
 * left+top-topleft) added to diff — most of the asm body is missing from
 * this extract, so verify against upstream).  l/tl carry the running
 * left and top-left predictor values in and out via *left / *left_top. */
618 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
619 const uint8_t *diff, int w,
620 int *left, int *left_top)
624 int l = *left & 0xff;
625 int tl = *left_top & 0xff;
630 "movzbl (%3, %4), %2 \n"
643 "add (%6, %4), %b0 \n"
644 "mov %b0, (%5, %4) \n"
647 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
648 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
/* Transposes a 4x4 byte block from src (src_stride) into dst (dst_stride)
 * using the classic punpcklbw / punpcklwd / punpckhwd shuffle, then four
 * movd stores.  NOTE(review): the pointer-advance lines between the first
 * movd load/store pairs (embedded numbers 658, 668) and the constraint/
 * closing lines are missing from this extract. */
655 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
656 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
657 "movd (%1), %%mm0 \n\t"
659 "movd (%1), %%mm1 \n\t"
660 "movd (%1,%3,1), %%mm2 \n\t"
661 "movd (%1,%3,2), %%mm3 \n\t"
662 "punpcklbw %%mm1, %%mm0 \n\t"
663 "punpcklbw %%mm3, %%mm2 \n\t"
664 "movq %%mm0, %%mm1 \n\t"
665 "punpcklwd %%mm2, %%mm0 \n\t"
666 "punpckhwd %%mm2, %%mm1 \n\t"
667 "movd %%mm0, (%0) \n\t"
669 "punpckhdq %%mm0, %%mm0 \n\t"
670 "movd %%mm0, (%0) \n\t"
671 "movd %%mm1, (%0,%2,1) \n\t"
672 "punpckhdq %%mm1, %%mm1 \n\t"
673 "movd %%mm1, (%0,%2,2) \n\t"
/* Core of the H.263 in-loop deblocking filter, shared by the vertical and
 * horizontal wrappers below.  Operands (see h263_v_loop_filter_mmx):
 *   %0..%3 = the four 8-pixel lines straddling the block edge,
 *   %4     = 2*strength, %5 = ff_pb_FC mask.
 * On exit mm5/mm3/mm4/mm6 hold the filtered versions of lines %0/%1/%2/%3
 * respectively (the callers store them back in that order).  The body
 * computes the edge difference, clips it against the strength, and applies
 * the +/- correction via sign-handled saturated byte arithmetic. */
683 #define H263_LOOP_FILTER \
684 "pxor %%mm7, %%mm7 \n\t" \
685 "movq %0, %%mm0 \n\t" \
686 "movq %0, %%mm1 \n\t" \
687 "movq %3, %%mm2 \n\t" \
688 "movq %3, %%mm3 \n\t" \
689 "punpcklbw %%mm7, %%mm0 \n\t" \
690 "punpckhbw %%mm7, %%mm1 \n\t" \
691 "punpcklbw %%mm7, %%mm2 \n\t" \
692 "punpckhbw %%mm7, %%mm3 \n\t" \
693 "psubw %%mm2, %%mm0 \n\t" \
694 "psubw %%mm3, %%mm1 \n\t" \
695 "movq %1, %%mm2 \n\t" \
696 "movq %1, %%mm3 \n\t" \
697 "movq %2, %%mm4 \n\t" \
698 "movq %2, %%mm5 \n\t" \
699 "punpcklbw %%mm7, %%mm2 \n\t" \
700 "punpckhbw %%mm7, %%mm3 \n\t" \
701 "punpcklbw %%mm7, %%mm4 \n\t" \
702 "punpckhbw %%mm7, %%mm5 \n\t" \
703 "psubw %%mm2, %%mm4 \n\t" \
704 "psubw %%mm3, %%mm5 \n\t" \
705 "psllw $2, %%mm4 \n\t" \
706 "psllw $2, %%mm5 \n\t" \
707 "paddw %%mm0, %%mm4 \n\t" \
708 "paddw %%mm1, %%mm5 \n\t" \
709 "pxor %%mm6, %%mm6 \n\t" \
710 "pcmpgtw %%mm4, %%mm6 \n\t" \
711 "pcmpgtw %%mm5, %%mm7 \n\t" \
712 "pxor %%mm6, %%mm4 \n\t" \
713 "pxor %%mm7, %%mm5 \n\t" \
714 "psubw %%mm6, %%mm4 \n\t" \
715 "psubw %%mm7, %%mm5 \n\t" \
716 "psrlw $3, %%mm4 \n\t" \
717 "psrlw $3, %%mm5 \n\t" \
718 "packuswb %%mm5, %%mm4 \n\t" \
719 "packsswb %%mm7, %%mm6 \n\t" \
720 "pxor %%mm7, %%mm7 \n\t" \
721 "movd %4, %%mm2 \n\t" \
722 "punpcklbw %%mm2, %%mm2 \n\t" \
723 "punpcklbw %%mm2, %%mm2 \n\t" \
724 "punpcklbw %%mm2, %%mm2 \n\t" \
725 "psubusb %%mm4, %%mm2 \n\t" \
726 "movq %%mm2, %%mm3 \n\t" \
727 "psubusb %%mm4, %%mm3 \n\t" \
728 "psubb %%mm3, %%mm2 \n\t" \
729 "movq %1, %%mm3 \n\t" \
730 "movq %2, %%mm4 \n\t" \
731 "pxor %%mm6, %%mm3 \n\t" \
732 "pxor %%mm6, %%mm4 \n\t" \
733 "paddusb %%mm2, %%mm3 \n\t" \
734 "psubusb %%mm2, %%mm4 \n\t" \
735 "pxor %%mm6, %%mm3 \n\t" \
736 "pxor %%mm6, %%mm4 \n\t" \
737 "paddusb %%mm2, %%mm2 \n\t" \
738 "packsswb %%mm1, %%mm0 \n\t" \
739 "pcmpgtb %%mm0, %%mm7 \n\t" \
740 "pxor %%mm7, %%mm0 \n\t" \
741 "psubb %%mm7, %%mm0 \n\t" \
742 "movq %%mm0, %%mm1 \n\t" \
743 "psubusb %%mm2, %%mm0 \n\t" \
744 "psubb %%mm0, %%mm1 \n\t" \
745 "pand %5, %%mm1 \n\t" \
746 "psrlw $2, %%mm1 \n\t" \
747 "pxor %%mm7, %%mm1 \n\t" \
748 "psubb %%mm7, %%mm1 \n\t" \
749 "movq %0, %%mm5 \n\t" \
750 "movq %3, %%mm6 \n\t" \
751 "psubb %%mm1, %%mm5 \n\t" \
752 "paddb %%mm1, %%mm6 \n\t"
/* Vertical (horizontal-edge) H.263 deblocking filter: runs H263_LOOP_FILTER
 * over the four lines src-2*stride .. src+1*stride in place, with strength
 * looked up from the per-qscale table.  The "+m" constraints make the four
 * lines both inputs and outputs.  NOTE(review): `{`, the asm opener with
 * H263_LOOP_FILTER and the closing lines are missing from this extract. */
754 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
756 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
757 const int strength = ff_h263_loop_filter_strength[qscale];
762 "movq %%mm3, %1 \n\t"
763 "movq %%mm4, %2 \n\t"
764 "movq %%mm5, %0 \n\t"
765 "movq %%mm6, %3 \n\t"
766 : "+m"(*(uint64_t*)(src - 2 * stride)),
767 "+m"(*(uint64_t*)(src - 1 * stride)),
768 "+m"(*(uint64_t*)(src + 0 * stride)),
769 "+m"(*(uint64_t*)(src + 1 * stride))
770 : "g"(2 * strength), "m"(ff_pb_FC)
/* Horizontal (vertical-edge) H.263 deblocking filter: transposes the 8x4
 * edge region into an aligned temp buffer (two 4x4 transposes), applies
 * H263_LOOP_FILTER on the rows of the temp, then transposes the filtered
 * mm5/mm3/mm4/mm6 results back into the image with the punpck/movd sequence
 * below.  NOTE(review): several connective lines (asm openers/closers, the
 * output constraints of the filter asm) are missing from this extract. */
775 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
777 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
778 const int strength = ff_h263_loop_filter_strength[qscale];
779 DECLARE_ALIGNED(8, uint64_t, temp)[4];
780 uint8_t *btemp = (uint8_t*)temp;
784 transpose4x4(btemp, src, 8, stride);
785 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
787 H263_LOOP_FILTER // 5 3 4 6
793 : "g"(2 * strength), "m"(ff_pb_FC)
/* transpose the filtered 4x8 result back into the picture */
797 "movq %%mm5, %%mm1 \n\t"
798 "movq %%mm4, %%mm0 \n\t"
799 "punpcklbw %%mm3, %%mm5 \n\t"
800 "punpcklbw %%mm6, %%mm4 \n\t"
801 "punpckhbw %%mm3, %%mm1 \n\t"
802 "punpckhbw %%mm6, %%mm0 \n\t"
803 "movq %%mm5, %%mm3 \n\t"
804 "movq %%mm1, %%mm6 \n\t"
805 "punpcklwd %%mm4, %%mm5 \n\t"
806 "punpcklwd %%mm0, %%mm1 \n\t"
807 "punpckhwd %%mm4, %%mm3 \n\t"
808 "punpckhwd %%mm0, %%mm6 \n\t"
809 "movd %%mm5, (%0) \n\t"
810 "punpckhdq %%mm5, %%mm5 \n\t"
811 "movd %%mm5, (%0, %2) \n\t"
812 "movd %%mm3, (%0, %2, 2) \n\t"
813 "punpckhdq %%mm3, %%mm3 \n\t"
814 "movd %%mm3, (%0, %3) \n\t"
815 "movd %%mm1, (%1) \n\t"
816 "punpckhdq %%mm1, %%mm1 \n\t"
817 "movd %%mm1, (%1, %2) \n\t"
818 "movd %%mm6, (%1, %2, 2) \n\t"
819 "punpckhdq %%mm6, %%mm6 \n\t"
820 "movd %%mm6, (%1, %3) \n\t"
822 "r"(src + 4 * stride),
823 "r"((x86_reg)stride),
824 "r"((x86_reg)(3 * stride))
829 /* Draw the edges of width 'w' of an image of size width, height
830 * this MMX version can only handle w == 8 || w == 16. */
/* Replicates the image border outward: the three asm variants below handle
 * the left/right edges for w == 8, w == 16 and (apparently) a 4-byte case,
 * by broadcasting the first/last pixel of each row with punpck*; the
 * EDGE_TOP / EDGE_BOTTOM sections then copy whole edge rows up/down four
 * rows at a time.  NOTE(review): the if/else selecting among the three
 * width variants, the loop labels/branches and the asm open/close lines
 * are missing from this extract — consult upstream for control flow. */
831 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
832 int w, int h, int sides)
834 uint8_t *ptr, *last_line;
837 last_line = buf + (height - 1) * wrap;
843 "movd (%0), %%mm0 \n\t"
844 "punpcklbw %%mm0, %%mm0 \n\t"
845 "punpcklwd %%mm0, %%mm0 \n\t"
846 "punpckldq %%mm0, %%mm0 \n\t"
847 "movq %%mm0, -8(%0) \n\t"
848 "movq -8(%0, %2), %%mm1 \n\t"
849 "punpckhbw %%mm1, %%mm1 \n\t"
850 "punpckhwd %%mm1, %%mm1 \n\t"
851 "punpckhdq %%mm1, %%mm1 \n\t"
852 "movq %%mm1, (%0, %2) \n\t"
857 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
862 "movd (%0), %%mm0 \n\t"
863 "punpcklbw %%mm0, %%mm0 \n\t"
864 "punpcklwd %%mm0, %%mm0 \n\t"
865 "punpckldq %%mm0, %%mm0 \n\t"
866 "movq %%mm0, -8(%0) \n\t"
867 "movq %%mm0, -16(%0) \n\t"
868 "movq -8(%0, %2), %%mm1 \n\t"
869 "punpckhbw %%mm1, %%mm1 \n\t"
870 "punpckhwd %%mm1, %%mm1 \n\t"
871 "punpckhdq %%mm1, %%mm1 \n\t"
872 "movq %%mm1, (%0, %2) \n\t"
873 "movq %%mm1, 8(%0, %2) \n\t"
878 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
884 "movd (%0), %%mm0 \n\t"
885 "punpcklbw %%mm0, %%mm0 \n\t"
886 "punpcklwd %%mm0, %%mm0 \n\t"
887 "movd %%mm0, -4(%0) \n\t"
888 "movd -4(%0, %2), %%mm1 \n\t"
889 "punpcklbw %%mm1, %%mm1 \n\t"
890 "punpckhwd %%mm1, %%mm1 \n\t"
891 "punpckhdq %%mm1, %%mm1 \n\t"
892 "movd %%mm1, (%0, %2) \n\t"
897 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
901 /* top and bottom (and hopefully also the corners) */
902 if (sides & EDGE_TOP) {
903 for (i = 0; i < h; i += 4) {
904 ptr = buf - (i + 1) * wrap - w;
907 "movq (%1, %0), %%mm0 \n\t"
908 "movq %%mm0, (%0) \n\t"
909 "movq %%mm0, (%0, %2) \n\t"
910 "movq %%mm0, (%0, %2, 2) \n\t"
911 "movq %%mm0, (%0, %3) \n\t"
916 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
917 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
922 if (sides & EDGE_BOTTOM) {
923 for (i = 0; i < h; i += 4) {
924 ptr = last_line + (i + 1) * wrap - w;
927 "movq (%1, %0), %%mm0 \n\t"
928 "movq %%mm0, (%0) \n\t"
929 "movq %%mm0, (%0, %2) \n\t"
930 "movq %%mm0, (%0, %2, 2) \n\t"
931 "movq %%mm0, (%0, %3) \n\t"
936 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
937 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
938 "r"(ptr + width + 2 * w)
943 #endif /* HAVE_INLINE_ASM */
947 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
948 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
951 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
954 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
958 uint8_t * const half = (uint8_t*)temp; \
959 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
961 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
962 stride, stride, 8); \
965 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
968 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
972 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
976 uint8_t * const half = (uint8_t*)temp; \
977 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
979 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
983 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
987 uint8_t * const half = (uint8_t*)temp; \
988 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
990 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
991 stride, stride, 8); \
994 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
997 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
1001 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1005 uint8_t * const half = (uint8_t*)temp; \
1006 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
1008 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
1012 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1015 uint64_t half[8 + 9]; \
1016 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1017 uint8_t * const halfHV = ((uint8_t*)half); \
1018 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1020 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
1022 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1023 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
1027 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1030 uint64_t half[8 + 9]; \
1031 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1032 uint8_t * const halfHV = ((uint8_t*)half); \
1033 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1035 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1037 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1038 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
1042 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1045 uint64_t half[8 + 9]; \
1046 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1047 uint8_t * const halfHV = ((uint8_t*)half); \
1048 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1050 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
1052 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1053 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
1057 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1060 uint64_t half[8 + 9]; \
1061 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1062 uint8_t * const halfHV = ((uint8_t*)half); \
1063 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1065 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1067 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1068 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
1072 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1075 uint64_t half[8 + 9]; \
1076 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1077 uint8_t * const halfHV = ((uint8_t*)half); \
1078 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1080 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1081 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
1085 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1088 uint64_t half[8 + 9]; \
1089 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1090 uint8_t * const halfHV = ((uint8_t*)half); \
1091 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1093 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1094 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
1098 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1101 uint64_t half[8 + 9]; \
1102 uint8_t * const halfH = ((uint8_t*)half); \
1103 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1105 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
1107 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
1111 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1114 uint64_t half[8 + 9]; \
1115 uint8_t * const halfH = ((uint8_t*)half); \
1116 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1118 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1120 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
1124 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1128 uint8_t * const halfH = ((uint8_t*)half); \
1129 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1131 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
1135 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1138 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1141 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1144 uint64_t temp[32]; \
1145 uint8_t * const half = (uint8_t*)temp; \
1146 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1148 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1152 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1155 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1156 stride, stride, 16);\
1159 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1162 uint64_t temp[32]; \
1163 uint8_t * const half = (uint8_t*)temp; \
1164 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1166 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1167 stride, stride, 16); \
1170 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1173 uint64_t temp[32]; \
1174 uint8_t * const half = (uint8_t*)temp; \
1175 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1177 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1181 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1184 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
1188 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1191 uint64_t temp[32]; \
1192 uint8_t * const half = (uint8_t*)temp; \
1193 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1195 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1196 stride, stride, 16); \
1199 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1202 uint64_t half[16 * 2 + 17 * 2]; \
1203 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1204 uint8_t * const halfHV = ((uint8_t*)half); \
1205 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1207 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1209 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1211 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1215 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1218 uint64_t half[16 * 2 + 17 * 2]; \
1219 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1220 uint8_t * const halfHV = ((uint8_t*)half); \
1221 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1223 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1225 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1227 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1231 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1234 uint64_t half[16 * 2 + 17 * 2]; \
1235 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1236 uint8_t * const halfHV = ((uint8_t*)half); \
1237 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1239 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1241 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1243 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1247 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1250 uint64_t half[16 * 2 + 17 * 2]; \
1251 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1252 uint8_t * const halfHV = ((uint8_t*)half); \
1253 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1255 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1257 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1259 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1263 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1266 uint64_t half[16 * 2 + 17 * 2]; \
1267 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1268 uint8_t * const halfHV = ((uint8_t*)half); \
1269 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1271 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1273 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1277 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1280 uint64_t half[16 * 2 + 17 * 2]; \
1281 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1282 uint8_t * const halfHV = ((uint8_t*)half); \
1283 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1285 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1287 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1291 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1294 uint64_t half[17 * 2]; \
1295 uint8_t * const halfH = ((uint8_t*)half); \
1296 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1298 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1300 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1304 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1307 uint64_t half[17 * 2]; \
1308 uint8_t * const halfH = ((uint8_t*)half); \
1309 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1311 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1313 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1317 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1320 uint64_t half[17 * 2]; \
1321 uint8_t * const halfH = ((uint8_t*)half); \
1322 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1324 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Store operator plugged into the QPEL_OP template: plain move of result
 * register "a" into destination operand "b"; "temp" is unused. */
1328 #define PUT_OP(a, b, temp, size) \
1329 "mov"#size" "#a", "#b" \n\t"
/* Averaging store operator: load current destination into "temp", compute the
 * rounded byte average with "a" via the MMXEXT pavgb instruction, then write
 * the averaged result back to "b". */
1331 #define AVG_MMXEXT_OP(a, b, temp, size) \
1332 "mov"#size" "#b", "#temp" \n\t" \
1333 "pavgb "#temp", "#a" \n\t" \
1334 "mov"#size" "#a", "#b" \n\t"
/* Instantiate the quarter-pel MC function families for MMXEXT:
 * rounding put, averaging, and no-rounding put (note the rounder constants
 * ff_pw_16 vs. ff_pw_15 for the rounded vs. truncated variants). */
1336 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1337 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1338 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1339 #endif /* HAVE_YASM */
/* RV40 (3,3) quarter-pel positions map onto the plain half-pel xy2 kernels,
 * so each wrapper just forwards to the corresponding pixels*_xy2_mmx. */
1343 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1345 put_pixels8_xy2_mmx(dst, src, stride, 8);
/* 16x16 put variant of the above. */
1347 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1349 put_pixels16_xy2_mmx(dst, src, stride, 16);
/* 8x8 averaging variant. */
1351 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1353 avg_pixels8_xy2_mmx(dst, src, stride, 8);
/* 16x16 averaging variant. */
1355 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1357 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Signature of an edge-emulation helper: copies a (block_w x block_h) area
 * around (src_x, src_y) into dst, replicating border pixels where the source
 * region extends outside the w x h picture. Passed into gmc() below. */
1360 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1361 ptrdiff_t linesize, int block_w, int block_h,
1362 int src_x, int src_y, int w, int h);
/* Global motion compensation with bilinear interpolation (MMX inner loop).
 * (ox, oy) is the affine offset and (dxx, dxy, dyx, dyy) the per-pixel
 * increments, all in 1/(1 << (16 + shift)) pel units; r is the rounder added
 * before the final shift. Falls back to the C version (ff_gmc_c) for cases
 * the asm cannot handle. emu_edge_fn is used when the source area leaves
 * the picture. */
1364 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1365 int stride, int h, int ox, int oy,
1366 int dxx, int dxy, int dyx, int dyy,
1367 int shift, int r, int width, int height,
1368 emulated_edge_mc_func *emu_edge_fn)
/* Integer (fullpel) part of the start offset. */
1371 const int ix = ox >> (16 + shift);
1372 const int iy = oy >> (16 + shift);
/* Offsets/deltas reduced by 4 bits so they fit 16-bit words for the
 * paddw/pmullw arithmetic below. */
1373 const int oxs = ox >> 4;
1374 const int oys = oy >> 4;
1375 const int dxxs = dxx >> 4;
1376 const int dxys = dxy >> 4;
1377 const int dyxs = dyx >> 4;
1378 const int dyys = dyy >> 4;
/* Rounder and per-row increments broadcast into 4-word vectors. */
1379 const uint16_t r4[4] = { r, r, r, r };
1380 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1381 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1382 const uint64_t shift2 = 2 * shift;
1383 #define MAX_STRIDE 4096U
1385 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/* Spans of the offset across the block, used to detect whether the fullpel
 * offset stays constant over the whole block. */
1388 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1389 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1390 const int dxh = dxy * (h - 1);
1391 const int dyw = dyx * (w - 1);
/* Unsigned compare folds the "< 0" and ">= limit" checks into one. */
1392 int need_emu = (unsigned)ix >= width - w ||
1393 (unsigned)iy >= height - h;
1395 if ( // non-constant fullpel offset (3% of blocks)
1396 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1397 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1398 // uses more than 16 bits of subpel mv (only at huge resolution)
1399 || (dxx | dxy | dyx | dyy) & 15
1400 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1401 // FIXME could still use mmx for some of the rows
1402 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1403 shift, r, width, height);
/* Advance to the fullpel start position. */
1407 src += ix + iy * stride;
/* Replicate border pixels into edge_buf when the (w+1) x (h+1) source
 * region is not fully inside the picture. */
1409 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
/* Broadcast a 16-bit value to all four words of mm6; mm7 is zeroed and
 * serves as the zero register for byte->word unpacking below. */
1414 "movd %0, %%mm6 \n\t"
1415 "pxor %%mm7, %%mm7 \n\t"
1416 "punpcklwd %%mm6, %%mm6 \n\t"
1417 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process the block in columns of 4 pixels. */
1421 for (x = 0; x < w; x += 4) {
/* Per-pixel subpel x/y coordinates for the 4 pixels of this column,
 * pre-decremented by one row step (the row step is added back at the
 * top of the y loop). */
1422 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1423 oxs - dxys + dxxs * (x + 1),
1424 oxs - dxys + dxxs * (x + 2),
1425 oxs - dxys + dxxs * (x + 3) };
1426 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1427 oys - dyys + dyxs * (x + 1),
1428 oys - dyys + dyxs * (x + 2),
1429 oys - dyys + dyxs * (x + 3) };
1431 for (y = 0; y < h; y++) {
/* Advance dx4/dy4 by one row step in memory, and extract the top
 * 4 fractional bits into mm4/mm5 for the bilinear weights. */
1433 "movq %0, %%mm4 \n\t"
1434 "movq %1, %%mm5 \n\t"
1435 "paddw %2, %%mm4 \n\t"
1436 "paddw %3, %%mm5 \n\t"
1437 "movq %%mm4, %0 \n\t"
1438 "movq %%mm5, %1 \n\t"
1439 "psrlw $12, %%mm4 \n\t"
1440 "psrlw $12, %%mm5 \n\t"
1441 : "+m"(*dx4), "+m"(*dy4)
1442 : "m"(*dxy4), "m"(*dyy4)
/* Compute the four bilinear weights from dx (mm4) and dy (mm5). */
1446 "movq %%mm6, %%mm2 \n\t"
1447 "movq %%mm6, %%mm1 \n\t"
1448 "psubw %%mm4, %%mm2 \n\t"
1449 "psubw %%mm5, %%mm1 \n\t"
1450 "movq %%mm2, %%mm0 \n\t"
1451 "movq %%mm4, %%mm3 \n\t"
1452 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1453 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1454 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1455 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
/* Weight the four neighbouring source pixels. */
1457 "movd %4, %%mm5 \n\t"
1458 "movd %3, %%mm4 \n\t"
1459 "punpcklbw %%mm7, %%mm5 \n\t"
1460 "punpcklbw %%mm7, %%mm4 \n\t"
1461 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1462 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1464 "movd %2, %%mm5 \n\t"
1465 "movd %1, %%mm4 \n\t"
1466 "punpcklbw %%mm7, %%mm5 \n\t"
1467 "punpcklbw %%mm7, %%mm4 \n\t"
1468 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1469 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
/* Sum the four terms plus the rounder r4, then scale down by
 * shift2 (= 2 * shift) and pack back to bytes. */
1470 "paddw %5, %%mm1 \n\t"
1471 "paddw %%mm3, %%mm2 \n\t"
1472 "paddw %%mm1, %%mm0 \n\t"
1473 "paddw %%mm2, %%mm0 \n\t"
1475 "psrlw %6, %%mm0 \n\t"
1476 "packuswb %%mm0, %%mm0 \n\t"
1477 "movd %%mm0, %0 \n\t"
1479 : "=m"(dst[x + y * stride])
1480 : "m"(src[0]), "m"(src[1]),
1481 "m"(src[stride]), "m"(src[stride + 1]),
1482 "m"(*r4), "m"(shift2)
/* Step to the top of the next 4-pixel column. */
1486 src += 4 - h * stride;
/* Entry points binding gmc() to a concrete edge-emulation function; the
 * alternative definitions are selected by preprocessor conditionals (the
 * guards sit between these functions). All forward to ff_emulated_edge_mc_8. */
1493 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1494 int stride, int h, int ox, int oy,
1495 int dxx, int dxy, int dyx, int dyy,
1496 int shift, int r, int width, int height)
1498 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1499 width, height, &ff_emulated_edge_mc_8);
/* SSE-flagged variant (same edge handler). */
1502 static void gmc_sse(uint8_t *dst, uint8_t *src,
1503 int stride, int h, int ox, int oy,
1504 int dxx, int dxy, int dyx, int dyy,
1505 int shift, int r, int width, int height)
1507 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1508 width, height, &ff_emulated_edge_mc_8);
/* Fallback gmc_mmx definition for the other configuration branch. */
1511 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1512 int stride, int h, int ox, int oy,
1513 int dxx, int dxy, int dyx, int dyy,
1514 int shift, int r, int width, int height)
1516 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1517 width, height, &ff_emulated_edge_mc_8);
1522 #endif /* HAVE_INLINE_ASM */
1524 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1525 int line_size, int h);
1526 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1527 int line_size, int h);
1529 void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
1530 int stride, int h, int x, int y);
1531 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1532 int stride, int h, int x, int y);
1533 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1534 int stride, int h, int x, int y);
1536 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1537 int stride, int h, int x, int y);
1538 void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
1539 int stride, int h, int x, int y);
1540 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1541 int stride, int h, int x, int y);
1543 void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1544 int stride, int h, int x, int y);
1545 void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1546 int stride, int h, int x, int y);
1548 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1549 int stride, int h, int x, int y);
1550 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1551 int stride, int h, int x, int y);
1553 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1554 int stride, int h, int x, int y);
1555 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1556 int stride, int h, int x, int y);
/* Declares an external (yasm) H.264 chroma MC prototype:
 * ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>. */
1558 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1559 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1560 (uint8_t *dst, uint8_t *src, \
1561 int stride, int h, int x, int y);
/* 10-bit chroma MC prototypes for the optimizations referenced below. */
1563 CHROMA_MC(put, 2, 10, mmxext)
1564 CHROMA_MC(avg, 2, 10, mmxext)
1565 CHROMA_MC(put, 4, 10, mmxext)
1566 CHROMA_MC(avg, 4, 10, mmxext)
1567 CHROMA_MC(put, 8, 10, sse2)
1568 CHROMA_MC(avg, 8, 10, sse2)
1569 CHROMA_MC(put, 8, 10, avx)
1570 CHROMA_MC(avg, 8, 10, avx)
/* CAVS (0,0) qpel positions are plain pixel copies/averages; forward to the
 * corresponding MMX hpel kernels. */
1575 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1577 put_pixels8_mmx(dst, src, stride, 8);
/* 8x8 averaging variant. */
1580 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1582 avg_pixels8_mmx(dst, src, stride, 8);
/* 16x16 put variant. */
1585 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1587 put_pixels16_mmx(dst, src, stride, 16);
/* 16x16 averaging variant. */
1590 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1592 avg_pixels16_mmx(dst, src, stride, 16);
1594 #endif /* HAVE_INLINE_ASM */
/* VC-1 (0,0) mspel position is a plain 8x8 copy; rnd is unused here. */
1598 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1599 int stride, int rnd)
1601 ff_put_pixels8_mmx(dst, src, stride, 8);
/* Averaging variant using the MMXEXT 8x8 average kernel. */
1604 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1605 int stride, int rnd)
1607 ff_avg_pixels8_mmxext(dst, src, stride, 8);
1609 #endif /* HAVE_YASM */
1611 #if CONFIG_DIRAC_DECODER
/* Generates the Dirac pixel-op wrappers ff_<OPNAME2>_dirac_pixels{8,16,32}_<EXT>:
 * each falls back to the C implementation in one branch and otherwise forwards
 * to the <OPNAME>_pixels* kernel for src[0]; the 32-wide version is built from
 * two 16-wide calls. */
1612 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
1613 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1616 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
1618 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1620 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1623 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
1625 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1627 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1630 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
1632 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1633 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
/* Instantiations: MMX put/avg and an MMXEXT avg (which uses the ff_-prefixed
 * external average kernels). */
1638 DIRAC_PIXOP(put, put, mmx)
1639 DIRAC_PIXOP(avg, avg, mmx)
1643 DIRAC_PIXOP(avg, ff_avg, mmxext)
/* SSE2 Dirac pixel ops: same structure as the DIRAC_PIXOP-generated wrappers
 * above — a C fallback branch, otherwise the SSE2 16-wide kernels (the 32-wide
 * versions issue two 16-wide calls). */
1645 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1648 ff_put_dirac_pixels16_c(dst, src, stride, h);
1650 ff_put_pixels16_sse2(dst, src[0], stride, h);
/* 16-wide averaging variant. */
1652 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1655 ff_avg_dirac_pixels16_c(dst, src, stride, h);
1657 ff_avg_pixels16_sse2(dst, src[0], stride, h);
/* 32-wide put: two 16-wide SSE2 calls. */
1659 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1662 ff_put_dirac_pixels32_c(dst, src, stride, h);
1664 ff_put_pixels16_sse2(dst , src[0] , stride, h);
1665 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
/* 32-wide avg: two 16-wide SSE2 calls. */
1668 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1671 ff_avg_dirac_pixels32_c(dst, src, stride, h);
1673 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1674 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1680 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
/* libmpeg2-style IDCT glue: run the IDCT on the coefficient block, then
 * store the clamped result into the destination plane. */
1683 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1687 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Same, but adds the IDCT output onto the existing destination pixels. */
1690 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1694 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* MMXEXT IDCT variant, put flavour. */
1697 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1700 ff_mmxext_idct(block);
1701 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* MMXEXT IDCT variant, add flavour. */
1704 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1707 ff_mmxext_idct(block);
1708 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Clamp each float of src into [min, max] and store to dst, using SSE:
 * min is broadcast into xmm4 (applied with maxps, the lower clamp) and max
 * into xmm5 (applied with minps, the upper clamp), 16 floats per iteration.
 * NOTE(review): the loop appears to walk backwards from byte offset
 * (len - 16) * 4, so len is presumably a multiple of 16 — confirm at callers. */
1713 static void vector_clipf_sse(float *dst, const float *src,
1714 float min, float max, int len)
1716 x86_reg i = (len - 16) * 4;
/* Broadcast the scalar bounds across all four lanes. */
1718 "movss %3, %%xmm4 \n\t"
1719 "movss %4, %%xmm5 \n\t"
1720 "shufps $0, %%xmm4, %%xmm4 \n\t"
1721 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* Load 64 bytes, clamp low then high, store 64 bytes. */
1723 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1724 "movaps 16(%2, %0), %%xmm1 \n\t"
1725 "movaps 32(%2, %0), %%xmm2 \n\t"
1726 "movaps 48(%2, %0), %%xmm3 \n\t"
1727 "maxps %%xmm4, %%xmm0 \n\t"
1728 "maxps %%xmm4, %%xmm1 \n\t"
1729 "maxps %%xmm4, %%xmm2 \n\t"
1730 "maxps %%xmm4, %%xmm3 \n\t"
1731 "minps %%xmm5, %%xmm0 \n\t"
1732 "minps %%xmm5, %%xmm1 \n\t"
1733 "minps %%xmm5, %%xmm2 \n\t"
1734 "minps %%xmm5, %%xmm3 \n\t"
1735 "movaps %%xmm0, (%1, %0) \n\t"
1736 "movaps %%xmm1, 16(%1, %0) \n\t"
1737 "movaps %%xmm2, 32(%1, %0) \n\t"
1738 "movaps %%xmm3, 48(%1, %0) \n\t"
1742 : "r"(dst), "r"(src), "m"(min), "m"(max)
1749 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1751 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1753 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1755 int order, int mul);
1756 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1758 int order, int mul);
1759 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1761 int order, int mul);
1763 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1764 const int16_t *window, unsigned int len);
1765 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1766 const int16_t *window, unsigned int len);
1767 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1768 const int16_t *window, unsigned int len);
1769 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1770 const int16_t *window, unsigned int len);
1771 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1772 const int16_t *window, unsigned int len);
1773 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1774 const int16_t *window, unsigned int len);
1776 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1777 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1779 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1780 const uint8_t *diff, int w,
1781 int *left, int *left_top);
1782 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1784 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1787 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1788 int32_t min, int32_t max, unsigned int len);
1789 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1790 int32_t min, int32_t max, unsigned int len);
1791 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1792 int32_t min, int32_t max, unsigned int len);
1793 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1794 int32_t min, int32_t max, unsigned int len);
/* Fills all 16 quarter-pel positions (mc00..mc33) of
 * c->PFX_pixels_tab[IDX] with the PREFIX##PFX##SIZE##_mcXY_##CPU functions. */
1796 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1798 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1799 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1800 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1801 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1802 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1803 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1804 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1805 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1806 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1807 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1808 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1809 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1810 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1811 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1812 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1813 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fills the 4 half-pel positions (plain, x2, y2, xy2) of
 * c->PFX_pixels_tab IDX with the PFX##_pixelsSIZE##_*_CPU functions.
 * IDX includes its own brackets so an empty index can be passed. */
1816 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1818 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1819 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1820 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1821 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install baseline MMX implementations into the DSPContext. Called from
 * ff_dsputil_init_mmx() when AV_CPU_FLAG_MMX is set; 8-bit-only kernels are
 * gated on high_bit_depth. */
1824 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
1826 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/* Clamped block store/add used by the IDCT glue. */
1829 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1830 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1831 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1833 if (!high_bit_depth) {
1834 c->clear_block = clear_block_mmx;
1835 c->clear_blocks = clear_blocks_mmx;
1836 c->draw_edges = draw_edges_mmx;
/* Half-pel tables: [0] = 16-wide, [1] = 8-wide. */
1838 SET_HPEL_FUNCS(put, [0], 16, mmx);
1839 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1840 SET_HPEL_FUNCS(avg, [0], 16, mmx);
1841 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1842 SET_HPEL_FUNCS(put, [1], 8, mmx);
1843 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1844 SET_HPEL_FUNCS(avg, [1], 8, mmx);
1847 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
1851 c->add_bytes = add_bytes_mmx;
1853 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1854 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
1855 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
1857 #endif /* HAVE_INLINE_ASM */
1860 if (!high_bit_depth && CONFIG_H264CHROMA) {
1861 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
1862 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
1865 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT implementations; selection depends on bit depth, codec,
 * CODEC_FLAG_BITEXACT and CPU quirks (3DNow/AMD). */
1870 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1873 const int bit_depth = avctx->bits_per_raw_sample;
1874 const int high_bit_depth = bit_depth > 8;
/* Full qpel tables generated by the QPEL_OP instantiations above. */
1877 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1878 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1880 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1881 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1882 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1883 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1885 if (!high_bit_depth) {
1886 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1887 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1889 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1890 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1891 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1893 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1894 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1896 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1897 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1898 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* no_rnd / xy2 kernels are not bit-exact, so skip them in bitexact mode. */
1901 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1902 if (!high_bit_depth) {
1903 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1904 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1905 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1906 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1908 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1909 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1912 #endif /* HAVE_YASM */
1914 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora require the "exact" no-rounding kernels. */
1915 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1916 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1917 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1918 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1921 if (!high_bit_depth && CONFIG_H264CHROMA) {
1922 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
1923 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
1924 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
1925 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
1927 if (bit_depth == 10 && CONFIG_H264CHROMA) {
1928 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
1929 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
1930 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
1931 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
1934 /* slower than cmov version on AMD */
1935 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1936 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1938 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1939 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* Bitexact mode uses the non-rounding window function. */
1941 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1942 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1944 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1946 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install 3DNow! implementations; mirrors the MMXEXT half-pel setup with the
 * _3dnow kernels and the 3DNow chroma averagers. */
1949 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1952 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1955 if (!high_bit_depth) {
1956 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1957 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1959 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1960 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1961 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1963 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1964 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1966 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1967 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1968 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* Non-bitexact-only kernels, as in the MMXEXT path. */
1970 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1971 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1972 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1973 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1974 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1976 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1977 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3/Theora need the bit-exact no-rounding kernels. */
1981 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1982 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1983 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1984 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1987 if (!high_bit_depth && CONFIG_H264CHROMA) {
1988 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
1989 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
1991 #endif /* HAVE_YASM */
/* Install SSE implementations (block clearing and float clipping). */
1994 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
1996 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1999 if (!high_bit_depth) {
2000 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2001 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2002 c->clear_block = clear_block_sse;
2003 c->clear_blocks = clear_blocks_sse;
2007 c->vector_clipf = vector_clipf_sse;
2008 #endif /* HAVE_INLINE_ASM */
2011 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
2014 #endif /* HAVE_YASM */
/* Install SSE2 implementations: Xvid IDCT, 16-wide pixel ops (skipped on
 * CPUs flagged SSE2SLOW), 10-bit chroma MC and various vector helpers. */
2017 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2020 const int bit_depth = avctx->bits_per_raw_sample;
2021 const int high_bit_depth = bit_depth > 8;
2023 #if HAVE_SSE2_INLINE
2024 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2025 c->idct_put = ff_idct_xvid_sse2_put;
2026 c->idct_add = ff_idct_xvid_sse2_add;
2027 c->idct = ff_idct_xvid_sse2;
2028 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2030 #endif /* HAVE_SSE2_INLINE */
2032 #if HAVE_SSE2_EXTERNAL
2033 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2034 // these functions are slower than mmx on AMD, but faster on Intel
2035 if (!high_bit_depth) {
2036 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
2037 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2038 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
2042 if (bit_depth == 10) {
2043 if (CONFIG_H264CHROMA) {
2044 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2045 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2049 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2050 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* Atom gets the integer-register clip variant. */
2051 if (mm_flags & AV_CPU_FLAG_ATOM) {
2052 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2054 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2056 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2057 c->apply_window_int16 = ff_apply_window_int16_sse2;
2058 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2059 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
2061 c->bswap_buf = ff_bswap32_buf_sse2;
2062 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 implementations: chroma MC, left prediction, window function
 * (with an Atom-specific variant) and byte-swapping. */
2065 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2068 #if HAVE_SSSE3_EXTERNAL
2069 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2071 if (!high_bit_depth && CONFIG_H264CHROMA) {
2072 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2073 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2074 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2075 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2077 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2078 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2079 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2081 if (mm_flags & AV_CPU_FLAG_ATOM)
2082 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2084 c->apply_window_int16 = ff_apply_window_int16_ssse3;
/* The madd variant cache-splits on some CPUs; skip it there. */
2085 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2086 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2087 c->bswap_buf = ff_bswap32_buf_ssse3;
2088 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the single SSE4 implementation: int32 vector clipping. */
2091 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2094 #if HAVE_SSE4_EXTERNAL
2095 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2096 #endif /* HAVE_SSE4_EXTERNAL */
/* Install AVX implementations: only the 10-bit H.264 chroma MC kernels. */
2099 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2101 #if HAVE_AVX_EXTERNAL
2102 const int bit_depth = avctx->bits_per_raw_sample;
2104 if (bit_depth == 10) {
2105 if (CONFIG_H264CHROMA) {
2106 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2107 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2110 #endif /* HAVE_AVX_EXTERNAL */
/*
 * Public entry point: query the host CPU once and install the best
 * available x86 implementations into the DSPContext, layering each
 * instruction-set tier (MMX -> MMXEXT -> 3DNow! -> SSE -> SSE2 ->
 * SSSE3 -> SSE4 -> AVX) on top of the previous one.
 * NOTE(review): several structural lines (braces, `} else {` branches,
 * intermediate #endif/#if pairs) appear elided by extraction — e.g.
 * the #if below opens with HAVE_7REGS && HAVE_INLINE_ASM while the
 * #endif further down says only HAVE_INLINE_ASM. Verify against the
 * full file before editing.
 */
2113 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2115 int mm_flags = av_get_cpu_flags();
/* CMOV-based median prediction needs 7 usable GPRs and inline asm. */
2117 #if HAVE_7REGS && HAVE_INLINE_ASM
2118 if (mm_flags & AV_CPU_FLAG_CMOV)
2119 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2122 if (mm_flags & AV_CPU_FLAG_MMX) {
2124 const int idct_algo = avctx->idct_algo;
/* IDCT selection: the x86 IDCTs are 8-bit, full-resolution only. */
2126 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2127 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
2128 c->idct_put = ff_simple_idct_put_mmx;
2129 c->idct_add = ff_simple_idct_add_mmx;
2130 c->idct = ff_simple_idct_mmx;
/* Each IDCT family brings its own coefficient permutation. */
2131 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
2133 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
/* NOTE(review): AV_CPU_FLAG_MMX2 here vs AV_CPU_FLAG_MMXEXT below —
 * these are historical aliases for the same flag in FFmpeg; confirm
 * both names are defined in this tree's libavutil/cpu.h. */
2134 if (mm_flags & AV_CPU_FLAG_MMX2) {
2135 c->idct_put = ff_libmpeg2mmx2_idct_put;
2136 c->idct_add = ff_libmpeg2mmx2_idct_add;
2137 c->idct = ff_mmxext_idct;
2139 c->idct_put = ff_libmpeg2mmx_idct_put;
2140 c->idct_add = ff_libmpeg2mmx_idct_add;
2141 c->idct = ff_mmx_idct;
2143 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2145 } else if (idct_algo == FF_IDCT_XVIDMMX) {
/* Xvid IDCT: pick the widest available SIMD variant. */
2146 if (mm_flags & AV_CPU_FLAG_SSE2) {
2147 c->idct_put = ff_idct_xvid_sse2_put;
2148 c->idct_add = ff_idct_xvid_sse2_add;
2149 c->idct = ff_idct_xvid_sse2;
2150 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2151 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
2152 c->idct_put = ff_idct_xvid_mmxext_put;
2153 c->idct_add = ff_idct_xvid_mmxext_add;
2154 c->idct = ff_idct_xvid_mmxext;
2156 c->idct_put = ff_idct_xvid_mmx_put;
2157 c->idct_add = ff_idct_xvid_mmx_add;
2158 c->idct = ff_idct_xvid_mmx;
2162 #endif /* HAVE_INLINE_ASM */
/* Layered dispatch: later tiers overwrite earlier pointers, so the
 * strongest supported instruction set wins. */
2164 dsputil_init_mmx(c, avctx, mm_flags);
2167 if (mm_flags & AV_CPU_FLAG_MMXEXT)
2168 dsputil_init_mmxext(c, avctx, mm_flags);
2170 if (mm_flags & AV_CPU_FLAG_3DNOW)
2171 dsputil_init_3dnow(c, avctx, mm_flags);
2173 if (mm_flags & AV_CPU_FLAG_SSE)
2174 dsputil_init_sse(c, avctx, mm_flags);
2176 if (mm_flags & AV_CPU_FLAG_SSE2)
2177 dsputil_init_sse2(c, avctx, mm_flags);
2179 if (mm_flags & AV_CPU_FLAG_SSSE3)
2180 dsputil_init_ssse3(c, avctx, mm_flags);
2182 if (mm_flags & AV_CPU_FLAG_SSE4)
2183 dsputil_init_sse4(c, avctx, mm_flags);
2185 if (mm_flags & AV_CPU_FLAG_AVX)
2186 dsputil_init_avx(c, avctx, mm_flags);
/* Encoder-side DSP (SAD/SSE/DCT etc.) is installed separately. */
2188 if (CONFIG_ENCODERS)
2189 ff_dsputilenc_init_mmx(c, avctx);