/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "dsputil_mmx.h"
32 #include "idct_xvid.h"
33 #include "diracdsp_mmx.h"
/* pixel operations */
/* SIMD constant tables used by the inline asm below.
 * Naming: pw_* = packed 16-bit words, pb_* = packed bytes, pd_* = packed
 * doubles.  8-byte-aligned uint64_t values are loaded into MMX registers,
 * 16-byte-aligned xmm_reg values into SSE registers. */
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Register-setup helpers.  The original file wrapped the constant-loading
 * variants of MOVQ_BONE/MOVQ_WTWO in an #ifndef PIC / #else / #endif pair
 * (the second variants synthesize the constants without a memory load so
 * that position-independent code needs no GOT access); the conditionals and
 * the "__asm__ volatile (" openers had been lost, leaving duplicate macro
 * definitions and unbalanced asm statements — restored here. */
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

/* Fill regd with 0xFE bytes: all-ones (pcmpeqd) added to itself. */
#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)

#endif
// Byte-wise averaging of MMX registers, predating the pavgb instruction.
// No-round form uses a+b = (a&b) + ((a^b)>>1); rounding form uses
// a|b - ((a^b)>>1).  The masked shift needs regfe/mm6 = 0xFE bytes so the
// per-byte right shift does not leak bits between lanes.
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq   "#rega", "#regr"            \n\t" \
    "pand   "#regb", "#regr"            \n\t" \
    "pxor   "#rega", "#regb"            \n\t" \
    "pand  "#regfe", "#regb"            \n\t" \
    "psrlq       $1, "#regb"            \n\t" \
    "paddb  "#regb", "#regr"            \n\t"

/* Rounding variant: rounds halves up instead of down. */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq   "#rega", "#regr"            \n\t" \
    "por    "#regb", "#regr"            \n\t" \
    "pxor   "#rega", "#regb"            \n\t" \
    "pand  "#regfe", "#regb"            \n\t" \
    "psrlq       $1, "#regb"            \n\t" \
    "psubb  "#regb", "#regr"            \n\t"

// Paired (two-at-once) versions of the above.
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr"             \n\t" \
    "movq  "#regc", "#regp"             \n\t" \
    "pand  "#regb", "#regr"             \n\t" \
    "pand  "#regd", "#regp"             \n\t" \
    "pxor  "#rega", "#regb"             \n\t" \
    "pxor  "#regc", "#regd"             \n\t" \
    "pand    %%mm6, "#regb"             \n\t" \
    "pand    %%mm6, "#regd"             \n\t" \
    "psrlq      $1, "#regb"             \n\t" \
    "psrlq      $1, "#regd"             \n\t" \
    "paddb "#regb", "#regr"             \n\t" \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr"             \n\t" \
    "movq  "#regc", "#regp"             \n\t" \
    "por   "#regb", "#regr"             \n\t" \
    "por   "#regd", "#regp"             \n\t" \
    "pxor  "#rega", "#regb"             \n\t" \
    "pxor  "#regc", "#regd"             \n\t" \
    "pand    %%mm6, "#regb"             \n\t" \
    "pand    %%mm6, "#regd"             \n\t" \
    "psrlq      $1, "#regd"             \n\t" \
    "psrlq      $1, "#regb"             \n\t" \
    "psubb "#regb", "#regr"             \n\t" \
    "psubb "#regd", "#regp"             \n\t"
162 /***********************************/
163 /* MMX no rounding */
165 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
166 #define SET_RND MOVQ_WONE
167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
168 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
169 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
171 #include "dsputil_rnd_template.c"
178 /***********************************/
181 #define DEF(x, y) x ## _ ## y ## _mmx
182 #define SET_RND MOVQ_WTWO
183 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
184 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
186 #include "dsputil_rnd_template.c"
194 /***********************************/
197 #define DEF(x) x ## _3dnow
198 #define PAVGB "pavgusb"
199 #define SKIP_FOR_3DNOW
201 #include "dsputil_avg_template.c"
205 #undef SKIP_FOR_3DNOW
207 /***********************************/
208 /* MMXEXT specific */
210 #define DEF(x) x ## _mmxext
212 /* Introduced only in MMXEXT set */
213 #define PAVGB "pavgb"
215 #include "dsputil_avg_template.c"
/* Full-pel copies are identical with and without rounding, and plain MMX is
 * already optimal for them, so alias instead of instantiating duplicates. */
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
#define put_pixels8_mmxext put_pixels8_mmx
#define put_pixels4_mmxext put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
228 /***********************************/
231 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
237 /* read the pixels */
242 "movq (%3), %%mm0 \n\t"
243 "movq 8(%3), %%mm1 \n\t"
244 "movq 16(%3), %%mm2 \n\t"
245 "movq 24(%3), %%mm3 \n\t"
246 "movq 32(%3), %%mm4 \n\t"
247 "movq 40(%3), %%mm5 \n\t"
248 "movq 48(%3), %%mm6 \n\t"
249 "movq 56(%3), %%mm7 \n\t"
250 "packuswb %%mm1, %%mm0 \n\t"
251 "packuswb %%mm3, %%mm2 \n\t"
252 "packuswb %%mm5, %%mm4 \n\t"
253 "packuswb %%mm7, %%mm6 \n\t"
254 "movq %%mm0, (%0) \n\t"
255 "movq %%mm2, (%0, %1) \n\t"
256 "movq %%mm4, (%0, %1, 2) \n\t"
257 "movq %%mm6, (%0, %2) \n\t"
258 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
261 pix += line_size * 4;
264 // if here would be an exact copy of the code above
265 // compiler would generate some very strange code
268 "movq (%3), %%mm0 \n\t"
269 "movq 8(%3), %%mm1 \n\t"
270 "movq 16(%3), %%mm2 \n\t"
271 "movq 24(%3), %%mm3 \n\t"
272 "movq 32(%3), %%mm4 \n\t"
273 "movq 40(%3), %%mm5 \n\t"
274 "movq 48(%3), %%mm6 \n\t"
275 "movq 56(%3), %%mm7 \n\t"
276 "packuswb %%mm1, %%mm0 \n\t"
277 "packuswb %%mm3, %%mm2 \n\t"
278 "packuswb %%mm5, %%mm4 \n\t"
279 "packuswb %%mm7, %%mm6 \n\t"
280 "movq %%mm0, (%0) \n\t"
281 "movq %%mm2, (%0, %1) \n\t"
282 "movq %%mm4, (%0, %1, 2) \n\t"
283 "movq %%mm6, (%0, %2) \n\t"
284 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Process half (64 coefficients) of a signed-clamped store: pack 16-bit
 * values to signed bytes, bias by mm0 (0x80 bytes, making them unsigned
 * with implicit clamping), and write 4 rows.  %3 = stride, %1 = 3*stride. */
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1        \n\t"           \
    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
    "paddb              %%mm0, %%mm1        \n\t"           \
    "paddb              %%mm0, %%mm2        \n\t"           \
    "paddb              %%mm0, %%mm3        \n\t"           \
    "paddb              %%mm0, %%mm4        \n\t"           \
    "movq               %%mm1, (%0)         \n\t"           \
    "movq               %%mm2, (%0, %3)     \n\t"           \
    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
    "movq               %%mm4, (%0, %1)     \n\t"
306 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
309 x86_reg line_skip = line_size;
313 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
314 "lea (%3, %3, 2), %1 \n\t"
315 put_signed_pixels_clamped_mmx_half(0)
316 "lea (%0, %3, 4), %0 \n\t"
317 put_signed_pixels_clamped_mmx_half(64)
318 : "+&r"(pixels), "=&r"(line_skip3)
319 : "r"(block), "r"(line_skip)
323 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
330 /* read the pixels */
337 "movq (%2), %%mm0 \n\t"
338 "movq 8(%2), %%mm1 \n\t"
339 "movq 16(%2), %%mm2 \n\t"
340 "movq 24(%2), %%mm3 \n\t"
341 "movq %0, %%mm4 \n\t"
342 "movq %1, %%mm6 \n\t"
343 "movq %%mm4, %%mm5 \n\t"
344 "punpcklbw %%mm7, %%mm4 \n\t"
345 "punpckhbw %%mm7, %%mm5 \n\t"
346 "paddsw %%mm4, %%mm0 \n\t"
347 "paddsw %%mm5, %%mm1 \n\t"
348 "movq %%mm6, %%mm5 \n\t"
349 "punpcklbw %%mm7, %%mm6 \n\t"
350 "punpckhbw %%mm7, %%mm5 \n\t"
351 "paddsw %%mm6, %%mm2 \n\t"
352 "paddsw %%mm5, %%mm3 \n\t"
353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2 \n\t"
355 "movq %%mm0, %0 \n\t"
356 "movq %%mm2, %1 \n\t"
357 : "+m"(*pix), "+m"(*(pix + line_size))
360 pix += line_size * 2;
365 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
366 int line_size, int h)
369 "lea (%3, %3), %%"REG_a" \n\t"
372 "movq (%1 ), %%mm0 \n\t"
373 "movq (%1, %3), %%mm1 \n\t"
374 "movq %%mm0, (%2) \n\t"
375 "movq %%mm1, (%2, %3) \n\t"
376 "add %%"REG_a", %1 \n\t"
377 "add %%"REG_a", %2 \n\t"
378 "movq (%1 ), %%mm0 \n\t"
379 "movq (%1, %3), %%mm1 \n\t"
380 "movq %%mm0, (%2) \n\t"
381 "movq %%mm1, (%2, %3) \n\t"
382 "add %%"REG_a", %1 \n\t"
383 "add %%"REG_a", %2 \n\t"
386 : "+g"(h), "+r"(pixels), "+r"(block)
387 : "r"((x86_reg)line_size)
392 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
393 int line_size, int h)
396 "lea (%3, %3), %%"REG_a" \n\t"
399 "movq (%1 ), %%mm0 \n\t"
400 "movq 8(%1 ), %%mm4 \n\t"
401 "movq (%1, %3), %%mm1 \n\t"
402 "movq 8(%1, %3), %%mm5 \n\t"
403 "movq %%mm0, (%2) \n\t"
404 "movq %%mm4, 8(%2) \n\t"
405 "movq %%mm1, (%2, %3) \n\t"
406 "movq %%mm5, 8(%2, %3) \n\t"
407 "add %%"REG_a", %1 \n\t"
408 "add %%"REG_a", %2 \n\t"
409 "movq (%1 ), %%mm0 \n\t"
410 "movq 8(%1 ), %%mm4 \n\t"
411 "movq (%1, %3), %%mm1 \n\t"
412 "movq 8(%1, %3), %%mm5 \n\t"
413 "movq %%mm0, (%2) \n\t"
414 "movq %%mm4, 8(%2) \n\t"
415 "movq %%mm1, (%2, %3) \n\t"
416 "movq %%mm5, 8(%2, %3) \n\t"
417 "add %%"REG_a", %1 \n\t"
418 "add %%"REG_a", %2 \n\t"
421 : "+g"(h), "+r"(pixels), "+r"(block)
422 : "r"((x86_reg)line_size)
427 #define CLEAR_BLOCKS(name, n) \
428 static void name(int16_t *blocks) \
431 "pxor %%mm7, %%mm7 \n\t" \
432 "mov %1, %%"REG_a" \n\t" \
434 "movq %%mm7, (%0, %%"REG_a") \n\t" \
435 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
436 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
437 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
438 "add $32, %%"REG_a" \n\t" \
440 :: "r"(((uint8_t *)blocks) + 128 * n), \
445 CLEAR_BLOCKS(clear_blocks_mmx, 6)
446 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 64-coefficient 16-bit block (128 bytes) with eight aligned SSE
 * stores.  Missing braces and constraint/clobber closing restored. */
static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
        );
}
465 static void clear_blocks_sse(int16_t *blocks)
468 "xorps %%xmm0, %%xmm0 \n"
469 "mov %1, %%"REG_a" \n"
471 "movaps %%xmm0, (%0, %%"REG_a") \n"
472 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
473 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
474 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
475 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
476 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
477 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
478 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
479 "add $128, %%"REG_a" \n"
481 :: "r"(((uint8_t *)blocks) + 128 * 6),
487 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
493 "movq (%1, %0), %%mm0 \n\t"
494 "movq (%2, %0), %%mm1 \n\t"
495 "paddb %%mm0, %%mm1 \n\t"
496 "movq %%mm1, (%2, %0) \n\t"
497 "movq 8(%1, %0), %%mm0 \n\t"
498 "movq 8(%2, %0), %%mm1 \n\t"
499 "paddb %%mm0, %%mm1 \n\t"
500 "movq %%mm1, 8(%2, %0) \n\t"
506 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
509 dst[i + 0] += src[i + 0];
513 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
514 const uint8_t *diff, int w,
515 int *left, int *left_top)
519 int l = *left & 0xff;
520 int tl = *left_top & 0xff;
525 "movzbl (%3, %4), %2 \n"
538 "add (%6, %4), %b0 \n"
539 "mov %b0, (%5, %4) \n"
542 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
543 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
550 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
551 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
552 "movd (%1), %%mm0 \n\t"
554 "movd (%1), %%mm1 \n\t"
555 "movd (%1,%3,1), %%mm2 \n\t"
556 "movd (%1,%3,2), %%mm3 \n\t"
557 "punpcklbw %%mm1, %%mm0 \n\t"
558 "punpcklbw %%mm3, %%mm2 \n\t"
559 "movq %%mm0, %%mm1 \n\t"
560 "punpcklwd %%mm2, %%mm0 \n\t"
561 "punpckhwd %%mm2, %%mm1 \n\t"
562 "movd %%mm0, (%0) \n\t"
564 "punpckhdq %%mm0, %%mm0 \n\t"
565 "movd %%mm0, (%0) \n\t"
566 "movd %%mm1, (%0,%2,1) \n\t"
567 "punpckhdq %%mm1, %%mm1 \n\t"
568 "movd %%mm1, (%0,%2,2) \n\t"
/* H.263 in-loop deblocking filter core for one 8-pixel edge.
 * Operands: %0/%1/%2/%3 = the four rows (or transposed columns) p2,p1,p0,q0
 * as 8 bytes each, %4 = 2*strength, %5 = ff_pb_FC mask.
 * On exit mm3/mm4 hold the filtered middle rows, mm5/mm6 the filtered
 * outer rows; the caller stores them back.  The arithmetic follows the
 * spec's d = (p2 - q0 + 4*(p0 - p1)) / 8 clipping scheme, done on
 * sign/magnitude form with unsigned-saturating ops. */
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
649 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
651 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
652 const int strength = ff_h263_loop_filter_strength[qscale];
657 "movq %%mm3, %1 \n\t"
658 "movq %%mm4, %2 \n\t"
659 "movq %%mm5, %0 \n\t"
660 "movq %%mm6, %3 \n\t"
661 : "+m"(*(uint64_t*)(src - 2 * stride)),
662 "+m"(*(uint64_t*)(src - 1 * stride)),
663 "+m"(*(uint64_t*)(src + 0 * stride)),
664 "+m"(*(uint64_t*)(src + 1 * stride))
665 : "g"(2 * strength), "m"(ff_pb_FC)
670 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
672 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
673 const int strength = ff_h263_loop_filter_strength[qscale];
674 DECLARE_ALIGNED(8, uint64_t, temp)[4];
675 uint8_t *btemp = (uint8_t*)temp;
679 transpose4x4(btemp, src, 8, stride);
680 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
682 H263_LOOP_FILTER // 5 3 4 6
688 : "g"(2 * strength), "m"(ff_pb_FC)
692 "movq %%mm5, %%mm1 \n\t"
693 "movq %%mm4, %%mm0 \n\t"
694 "punpcklbw %%mm3, %%mm5 \n\t"
695 "punpcklbw %%mm6, %%mm4 \n\t"
696 "punpckhbw %%mm3, %%mm1 \n\t"
697 "punpckhbw %%mm6, %%mm0 \n\t"
698 "movq %%mm5, %%mm3 \n\t"
699 "movq %%mm1, %%mm6 \n\t"
700 "punpcklwd %%mm4, %%mm5 \n\t"
701 "punpcklwd %%mm0, %%mm1 \n\t"
702 "punpckhwd %%mm4, %%mm3 \n\t"
703 "punpckhwd %%mm0, %%mm6 \n\t"
704 "movd %%mm5, (%0) \n\t"
705 "punpckhdq %%mm5, %%mm5 \n\t"
706 "movd %%mm5, (%0, %2) \n\t"
707 "movd %%mm3, (%0, %2, 2) \n\t"
708 "punpckhdq %%mm3, %%mm3 \n\t"
709 "movd %%mm3, (%0, %3) \n\t"
710 "movd %%mm1, (%1) \n\t"
711 "punpckhdq %%mm1, %%mm1 \n\t"
712 "movd %%mm1, (%1, %2) \n\t"
713 "movd %%mm6, (%1, %2, 2) \n\t"
714 "punpckhdq %%mm6, %%mm6 \n\t"
715 "movd %%mm6, (%1, %3) \n\t"
717 "r"(src + 4 * stride),
718 "r"((x86_reg)stride),
719 "r"((x86_reg)(3 * stride))
724 /* Draw the edges of width 'w' of an image of size width, height
725 * this MMX version can only handle w == 8 || w == 16. */
726 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
727 int w, int h, int sides)
729 uint8_t *ptr, *last_line;
732 last_line = buf + (height - 1) * wrap;
738 "movd (%0), %%mm0 \n\t"
739 "punpcklbw %%mm0, %%mm0 \n\t"
740 "punpcklwd %%mm0, %%mm0 \n\t"
741 "punpckldq %%mm0, %%mm0 \n\t"
742 "movq %%mm0, -8(%0) \n\t"
743 "movq -8(%0, %2), %%mm1 \n\t"
744 "punpckhbw %%mm1, %%mm1 \n\t"
745 "punpckhwd %%mm1, %%mm1 \n\t"
746 "punpckhdq %%mm1, %%mm1 \n\t"
747 "movq %%mm1, (%0, %2) \n\t"
752 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
757 "movd (%0), %%mm0 \n\t"
758 "punpcklbw %%mm0, %%mm0 \n\t"
759 "punpcklwd %%mm0, %%mm0 \n\t"
760 "punpckldq %%mm0, %%mm0 \n\t"
761 "movq %%mm0, -8(%0) \n\t"
762 "movq %%mm0, -16(%0) \n\t"
763 "movq -8(%0, %2), %%mm1 \n\t"
764 "punpckhbw %%mm1, %%mm1 \n\t"
765 "punpckhwd %%mm1, %%mm1 \n\t"
766 "punpckhdq %%mm1, %%mm1 \n\t"
767 "movq %%mm1, (%0, %2) \n\t"
768 "movq %%mm1, 8(%0, %2) \n\t"
773 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
779 "movd (%0), %%mm0 \n\t"
780 "punpcklbw %%mm0, %%mm0 \n\t"
781 "punpcklwd %%mm0, %%mm0 \n\t"
782 "movd %%mm0, -4(%0) \n\t"
783 "movd -4(%0, %2), %%mm1 \n\t"
784 "punpcklbw %%mm1, %%mm1 \n\t"
785 "punpckhwd %%mm1, %%mm1 \n\t"
786 "punpckhdq %%mm1, %%mm1 \n\t"
787 "movd %%mm1, (%0, %2) \n\t"
792 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
796 /* top and bottom (and hopefully also the corners) */
797 if (sides & EDGE_TOP) {
798 for (i = 0; i < h; i += 4) {
799 ptr = buf - (i + 1) * wrap - w;
802 "movq (%1, %0), %%mm0 \n\t"
803 "movq %%mm0, (%0) \n\t"
804 "movq %%mm0, (%0, %2) \n\t"
805 "movq %%mm0, (%0, %2, 2) \n\t"
806 "movq %%mm0, (%0, %3) \n\t"
811 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
812 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
817 if (sides & EDGE_BOTTOM) {
818 for (i = 0; i < h; i += 4) {
819 ptr = last_line + (i + 1) * wrap - w;
822 "movq (%1, %0), %%mm0 \n\t"
823 "movq %%mm0, (%0) \n\t"
824 "movq %%mm0, (%0, %2) \n\t"
825 "movq %%mm0, (%0, %2, 2) \n\t"
826 "movq %%mm0, (%0, %3) \n\t"
831 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
832 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
833 "r"(ptr + width + 2 * w)
/* One output row of the vertical MPEG-4 qpel 6-tap lowpass filter:
 * out = clip((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5), where x1..x4 are the
 * symmetric tap sums built from m3..m6 and the in0/in1/in2/in7 memory
 * operands.  mm4-mm6 are used as scratch; OP is the store/average op. */
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
    "movq               "#in7", "#m3"   \n\t" /* d */                     \
    "movq               "#in0", %%mm5   \n\t" /* D */                     \
    "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
    "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
    "movq               "#in1", %%mm5   \n\t" /* C */                     \
    "movq               "#in2", %%mm6   \n\t" /* B */                     \
    "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
    "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
    "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
    "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
    "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
    "paddw              "#rnd", %%mm4   \n\t" /* x2 */                    \
    "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                  $5, %%mm5   \n\t"                             \
    "packuswb            %%mm5, %%mm5   \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
/* Instantiate the horizontal MPEG-4 qpel 6-tap (1,-6,20,20,-6,1)-family
 * lowpass filters for 16- and 8-pixel rows using MMXEXT (pshufw).
 * OPNAME selects put/avg naming, ROUNDER the rounding constant, and
 * OP_MMXEXT the final store/average operation.  The function signatures,
 * braces, loop tails and constraint closings had been lost — restored;
 * NOTE(review): verify restored boilerplate against the canonical source. */
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                            \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,             \
                                                    uint8_t *src,             \
                                                    int dstStride,            \
                                                    int srcStride,            \
                                                    int h)                    \
{                                                                             \
    uint64_t temp;                                                            \
                                                                              \
    __asm__ volatile (                                                        \
        "pxor %%mm7, %%mm7                \n\t"                               \
        "1:                               \n\t"                               \
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */                \
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */                \
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */                \
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */                \
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */                \
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */                \
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */                \
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */                \
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */                \
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */                \
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */                \
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */                \
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */                \
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */                \
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */                \
        "paddw %%mm3, %%mm5               \n\t" /* b */                       \
        "paddw %%mm2, %%mm6               \n\t" /* c */                       \
        "paddw %%mm5, %%mm5               \n\t" /* 2b */                      \
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */                  \
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */                \
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */                 \
        "paddw %%mm4, %%mm0               \n\t" /* a */                       \
        "paddw %%mm1, %%mm5               \n\t" /* d */                       \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */                     \
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */                 \
        "paddw %6, %%mm6                  \n\t"                               \
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */       \
        "psraw $5, %%mm0                  \n\t"                               \
        "movq %%mm0, %5                   \n\t"                               \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */         \
                                                                              \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */                \
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */                \
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */                \
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */                \
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */                \
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */                \
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */                \
        "paddw %%mm0, %%mm2               \n\t" /* b */                       \
        "paddw %%mm5, %%mm3               \n\t" /* c */                       \
        "paddw %%mm2, %%mm2               \n\t" /* 2b */                      \
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */                  \
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */                \
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */                \
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */                \
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */                \
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */                 \
        "paddw %%mm2, %%mm1               \n\t" /* a */                       \
        "paddw %%mm6, %%mm4               \n\t" /* d */                       \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */                     \
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */            \
        "paddw %6, %%mm1                  \n\t"                               \
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */        \
        "psraw $5, %%mm3                  \n\t"                               \
        "movq %5, %%mm1                   \n\t"                               \
        "packuswb %%mm3, %%mm1            \n\t"                               \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                      \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */         \
                                                                              \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */                \
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */                \
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */                \
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */                \
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */                \
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */                \
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */                \
        "paddw %%mm1, %%mm5               \n\t" /* b */                       \
        "paddw %%mm4, %%mm0               \n\t" /* c */                       \
        "paddw %%mm5, %%mm5               \n\t" /* 2b */                      \
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */                  \
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */                \
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */                \
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */                 \
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */                \
        "paddw %%mm3, %%mm2               \n\t" /* d */                       \
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */            \
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */                \
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */                \
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */                \
        "paddw %%mm2, %%mm6               \n\t" /* a */                       \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */                     \
        "paddw %6, %%mm0                  \n\t"                               \
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */       \
        "psraw $5, %%mm0                  \n\t"                               \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                             \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                                  \
                                                                              \
        "paddw %%mm5, %%mm3               \n\t" /* a */                       \
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */                \
        "paddw %%mm4, %%mm6               \n\t" /* b */                       \
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */                \
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */                \
        "paddw %%mm1, %%mm4               \n\t" /* c */                       \
        "paddw %%mm2, %%mm5               \n\t" /* d */                       \
        "paddw %%mm6, %%mm6               \n\t" /* 2b */                      \
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */                  \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */                     \
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */                 \
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */            \
        "paddw %6, %%mm4                  \n\t"                               \
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */       \
        "psraw $5, %%mm4                  \n\t"                               \
        "packuswb %%mm4, %%mm0            \n\t"                               \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                     \
        "add %3, %0                       \n\t"                               \
        "add %4, %1                       \n\t"                               \
        "decl %2                          \n\t"                               \
        "jnz 1b                           \n\t"                               \
        : "+a"(src), "+c"(dst), "+D"(h)                                       \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),                   \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)          \
        : "memory"                                                            \
        );                                                                    \
}                                                                             \
                                                                              \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,              \
                                                   uint8_t *src,              \
                                                   int dstStride,             \
                                                   int srcStride,             \
                                                   int h)                     \
{                                                                             \
    __asm__ volatile (                                                        \
        "pxor %%mm7, %%mm7                \n\t"                               \
        "1:                               \n\t"                               \
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */                \
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */                \
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */                \
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */                \
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */                \
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */                \
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */                \
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */                \
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */                \
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */                \
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */                \
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */                \
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */                \
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */                \
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */                \
        "paddw %%mm3, %%mm5               \n\t" /* b */                       \
        "paddw %%mm2, %%mm6               \n\t" /* c */                       \
        "paddw %%mm5, %%mm5               \n\t" /* 2b */                      \
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */                  \
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */                \
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */                 \
        "paddw %%mm4, %%mm0               \n\t" /* a */                       \
        "paddw %%mm1, %%mm5               \n\t" /* d */                       \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */                     \
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */                 \
        "paddw %5, %%mm6                  \n\t"                               \
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */       \
        "psraw $5, %%mm0                  \n\t"                               \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */         \
                                                                              \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */                    \
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */                \
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */                \
        "paddw %%mm5, %%mm1               \n\t" /* a */                       \
        "paddw %%mm6, %%mm2               \n\t" /* b */                       \
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */                \
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */                \
        "paddw %%mm6, %%mm3               \n\t" /* c */                       \
        "paddw %%mm5, %%mm4               \n\t" /* d */                       \
        "paddw %%mm2, %%mm2               \n\t" /* 2b */                      \
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */                  \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */                     \
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */                 \
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */            \
        "paddw %5, %%mm1                  \n\t"                               \
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */       \
        "psraw $5, %%mm3                  \n\t"                               \
        "packuswb %%mm3, %%mm0            \n\t"                               \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                      \
        "add %3, %0                       \n\t"                               \
        "add %4, %1                       \n\t"                               \
        "decl %2                          \n\t"                               \
        "jnz 1b                           \n\t"                               \
        : "+a"(src), "+c"(dst), "+d"(h)                                       \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),                   \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                     \
        : "memory"                                                            \
        );                                                                    \
}
/* QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX): emits one full family of MPEG-4
 * quarter-pel motion-compensation functions for a given rounding mode and
 * store op (put/avg).  First member: 16-wide vertical lowpass.
 * NOTE(review): this chunk is an elided listing -- lines are missing
 * between the numbered lines below (braces, loop bodies, asm closers). */
1059 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1060 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
/* Pass 1: unpack src bytes to 16-bit words into a 17*4-qword scratch     \
 * buffer; columns are stored at offsets 0, 17*8, 2*17*8, 3*17*8.  */ \
1065 uint64_t temp[17 * 4]; \
1066 uint64_t *temp_ptr = temp; \
1069 /* FIXME unroll */ \
1070 __asm__ volatile ( \
1071 "pxor %%mm7, %%mm7 \n\t" \
1073 "movq (%0), %%mm0 \n\t" \
1074 "movq (%0), %%mm1 \n\t" \
1075 "movq 8(%0), %%mm2 \n\t" \
1076 "movq 8(%0), %%mm3 \n\t" \
1077 "punpcklbw %%mm7, %%mm0 \n\t" \
1078 "punpckhbw %%mm7, %%mm1 \n\t" \
1079 "punpcklbw %%mm7, %%mm2 \n\t" \
1080 "punpckhbw %%mm7, %%mm3 \n\t" \
1081 "movq %%mm0, (%1) \n\t" \
1082 "movq %%mm1, 17 * 8(%1) \n\t" \
1083 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1084 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1089 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1090 : "r"((x86_reg)srcStride) \
/* Pass 2: run the 4-tap QPEL_V_LOW filter (20a - 6b + 3c - d, >>5 with  \
 * ROUNDER -- see the filter comments earlier in this file) down each     \
 * column of the scratch buffer, writing to dst via OP.  */ \
1097 /* FIXME reorder for speed */ \
1098 __asm__ volatile ( \
1099 /* "pxor %%mm7, %%mm7 \n\t" */ \
1101 "movq (%0), %%mm0 \n\t" \
1102 "movq 8(%0), %%mm1 \n\t" \
1103 "movq 16(%0), %%mm2 \n\t" \
1104 "movq 24(%0), %%mm3 \n\t" \
1105 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1106 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1108 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1110 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1112 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1113 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1115 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1116 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1118 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1119 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1121 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1122 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1124 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
/* Bottom rows: the "d" tap offsets stop increasing (128, 120, 112) --    \
 * presumably edge clamping at the buffer end; TODO confirm vs original. */ \
1126 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
1128 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1129 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1131 "add $136, %0 \n\t" \
1136 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1137 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1138 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1139 "g"(4 - 14 * (x86_reg)dstStride) \
/* 8-wide vertical lowpass: same two-pass scheme as the 16-wide version   \
 * above, but with a 9*2-qword scratch buffer and 8 filter rows.  */ \
1144 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
1149 uint64_t temp[9 * 2]; \
1150 uint64_t *temp_ptr = temp; \
1153 /* FIXME unroll */ \
1154 __asm__ volatile ( \
1155 "pxor %%mm7, %%mm7 \n\t" \
1157 "movq (%0), %%mm0 \n\t" \
1158 "movq (%0), %%mm1 \n\t" \
1159 "punpcklbw %%mm7, %%mm0 \n\t" \
1160 "punpckhbw %%mm7, %%mm1 \n\t" \
1161 "movq %%mm0, (%1) \n\t" \
1162 "movq %%mm1, 9*8(%1) \n\t" \
1167 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1168 : "r"((x86_reg)srcStride) \
1175 /* FIXME reorder for speed */ \
1176 __asm__ volatile ( \
1177 /* "pxor %%mm7, %%mm7 \n\t" */ \
1179 "movq (%0), %%mm0 \n\t" \
1180 "movq 8(%0), %%mm1 \n\t" \
1181 "movq 16(%0), %%mm2 \n\t" \
1182 "movq 24(%0), %%mm3 \n\t" \
1183 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1184 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1186 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1188 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1190 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1192 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1194 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1195 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1197 "add $72, %0 \n\t" \
1202 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1203 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1204 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1205 "g"(4 - 6 * (x86_reg)dstStride) \
/* 8x8 quarter-pel position functions mcXY (X = horizontal quarter,       \
 * Y = vertical quarter).  Each combines the h/v lowpass primitives and   \
 * pixels8_l2 averaging to synthesize the 16 sub-pel positions.  */ \
1210 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1213 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
1216 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1220 uint8_t * const half = (uint8_t*)temp; \
1221 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1223 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1226 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1229 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1233 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1237 uint8_t * const half = (uint8_t*)temp; \
1238 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1240 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
1244 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1248 uint8_t * const half = (uint8_t*)temp; \
1249 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1250 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1253 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1256 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1259 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1263 uint8_t * const half = (uint8_t*)temp; \
1264 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1265 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
/* Diagonal positions: halfH holds the 8x9 horizontal lowpass, halfHV     \
 * the 8x8 horizontal+vertical result; both share one uint64_t[8+9].  */ \
1269 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1272 uint64_t half[8 + 9]; \
1273 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1274 uint8_t * const halfHV = ((uint8_t*)half); \
1275 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1277 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1278 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1279 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1282 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1285 uint64_t half[8 + 9]; \
1286 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1287 uint8_t * const halfHV = ((uint8_t*)half); \
1288 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1290 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1292 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1293 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1296 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1299 uint64_t half[8 + 9]; \
1300 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1301 uint8_t * const halfHV = ((uint8_t*)half); \
1302 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1304 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1305 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1306 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1309 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1312 uint64_t half[8 + 9]; \
1313 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1314 uint8_t * const halfHV = ((uint8_t*)half); \
1315 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1317 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1319 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1320 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1323 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1326 uint64_t half[8 + 9]; \
1327 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1328 uint8_t * const halfHV = ((uint8_t*)half); \
1329 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1331 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1332 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1335 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1338 uint64_t half[8 + 9]; \
1339 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1340 uint8_t * const halfHV = ((uint8_t*)half); \
1341 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1343 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1344 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1347 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1350 uint64_t half[8 + 9]; \
1351 uint8_t * const halfH = ((uint8_t*)half); \
1352 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1354 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1355 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1358 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1361 uint64_t half[8 + 9]; \
1362 uint8_t * const halfH = ((uint8_t*)half); \
1363 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1365 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1367 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1370 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1374 uint8_t * const halfH = ((uint8_t*)half); \
1375 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1377 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
/* 16x16 quarter-pel position functions: same structure as the 8x8 set    \
 * above, with 16-wide primitives and larger scratch buffers              \
 * (uint64_t[32] for one intermediate plane, [16*2 + 17*2] for two).  */ \
1380 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1383 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1386 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1389 uint64_t temp[32]; \
1390 uint8_t * const half = (uint8_t*)temp; \
1391 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1393 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1396 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1399 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1400 stride, stride, 16); \
1403 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1406 uint64_t temp[32]; \
1407 uint8_t * const half = (uint8_t*)temp; \
1408 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1410 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1411 stride, stride, 16); \
1414 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1417 uint64_t temp[32]; \
1418 uint8_t * const half = (uint8_t*)temp; \
1419 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1421 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1424 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1427 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1430 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1433 uint64_t temp[32]; \
1434 uint8_t * const half = (uint8_t*)temp; \
1435 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1437 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1438 stride, stride, 16); \
1441 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1444 uint64_t half[16 * 2 + 17 * 2]; \
1445 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1446 uint8_t * const halfHV = ((uint8_t*)half); \
1447 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1449 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1451 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1453 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1456 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1459 uint64_t half[16 * 2 + 17 * 2]; \
1460 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1461 uint8_t * const halfHV = ((uint8_t*)half); \
1462 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1464 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1466 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1468 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1471 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1474 uint64_t half[16 * 2 + 17 * 2]; \
1475 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1476 uint8_t * const halfHV = ((uint8_t*)half); \
1477 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1479 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1481 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1483 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1487 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1490 uint64_t half[16 * 2 + 17 * 2]; \
1491 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1492 uint8_t * const halfHV = ((uint8_t*)half); \
1493 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1495 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1497 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1499 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1503 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1506 uint64_t half[16 * 2 + 17 * 2]; \
1507 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1508 uint8_t * const halfHV = ((uint8_t*)half); \
1509 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1511 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1513 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1516 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1519 uint64_t half[16 * 2 + 17 * 2]; \
1520 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1521 uint8_t * const halfHV = ((uint8_t*)half); \
1522 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1526 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1530 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1533 uint64_t half[17 * 2]; \
1534 uint8_t * const halfH = ((uint8_t*)half); \
1535 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1537 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1539 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1542 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1545 uint64_t half[17 * 2]; \
1546 uint8_t * const halfH = ((uint8_t*)half); \
1547 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1549 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1551 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1554 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1557 uint64_t half[17 * 2]; \
1558 uint8_t * const halfH = ((uint8_t*)half); \
1559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1561 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
/* PUT_OP: store op for the QPEL macros -- plain mov of register a to
 * memory b; temp is unused here (kept for signature parity with AVG). */
1564 #define PUT_OP(a, b, temp, size) \
1565 "mov"#size" "#a", "#b" \n\t"
/* AVG_MMXEXT_OP: averaging store op -- loads the existing destination
 * into temp, rounds-averages it into a with pavgb, then stores back. */
1567 #define AVG_MMXEXT_OP(a, b, temp, size) \
1568 "mov"#size" "#b", "#temp" \n\t" \
1569 "pavgb "#temp", "#a" \n\t" \
1570 "mov"#size" "#a", "#b" \n\t"
/* Instantiate the qpel function families: put / avg / put_no_rnd,
 * with ff_pw_16 (rounding) or ff_pw_15 (no-round) as the rounder. */
1572 QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
1573 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
1574 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1575 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1576 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1577 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
/* RV40 (3,3) sub-pel position wrappers: RV40's mc33 maps onto the plain
 * xy2 (center half-pel) averaging primitives. */
1579 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1581 put_pixels8_xy2_mmx(dst, src, stride, 8);
1583 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1585 put_pixels16_xy2_mmx(dst, src, stride, 16);
1587 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1589 avg_pixels8_xy2_mmx(dst, src, stride, 8);
1591 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1593 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Function type for edge emulation: copies a (block_w x block_h) area at
 * (src_x, src_y) from a (w x h) plane into dst, replicating out-of-frame
 * samples.  Used by gmc() below to handle blocks near picture borders. */
1596 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1597 ptrdiff_t linesize, int block_w, int block_h,
1598 int src_x, int src_y, int w, int h);
/* gmc: MPEG-4 global motion compensation, MMX path.  Applies the affine
 * motion field (ox,oy) + x*(dxx,dxy) + y*(dyx,dyy) (16.16-ish fixed point,
 * scaled by shift) and bilinearly interpolates 4 pixels at a time.
 * Falls back to the C implementation (ff_gmc_c) when the fullpel offset
 * varies across the block, when the sub-pel mv needs more than 16 bits,
 * or when edge emulation would overflow the local buffer.
 * NOTE(review): elided listing -- several lines (braces, loop headers,
 * asm openers/closers) are missing between the numbered lines. */
1600 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1601 int stride, int h, int ox, int oy,
1602 int dxx, int dxy, int dyx, int dyy,
1603 int shift, int r, int width, int height,
1604 emulated_edge_mc_func *emu_edge_fn)
/* Integer (fullpel) part of the start offset. */
1607 const int ix = ox >> (16 + shift);
1608 const int iy = oy >> (16 + shift);
/* Sub-pel increments pre-shifted by 4 so they fit 16-bit words. */
1609 const int oxs = ox >> 4;
1610 const int oys = oy >> 4;
1611 const int dxxs = dxx >> 4;
1612 const int dxys = dxy >> 4;
1613 const int dyxs = dyx >> 4;
1614 const int dyys = dyy >> 4;
/* Broadcast constants for the MMX loop: rounder and per-row deltas. */
1615 const uint16_t r4[4] = { r, r, r, r };
1616 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1617 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1618 const uint64_t shift2 = 2 * shift;
1619 #define MAX_STRIDE 4096U
1621 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/* Spread of the fullpel offset over the block, used to detect whether
 * the integer source position is constant for the whole block. */
1624 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1625 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1626 const int dxh = dxy * (h - 1);
1627 const int dyw = dyx * (w - 1);
/* Unsigned compare catches both negative and >= bounds in one test. */
1628 int need_emu = (unsigned)ix >= width - w ||
1629 (unsigned)iy >= height - h;
1631 if ( // non-constant fullpel offset (3% of blocks)
1632 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1633 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1634 // uses more than 16 bits of subpel mv (only at huge resolution)
1635 || (dxx | dxy | dyx | dyy) & 15
1636 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1637 // FIXME could still use mmx for some of the rows
1638 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1639 shift, r, width, height);
1643 src += ix + iy * stride;
/* Block touches outside the frame: copy through the edge replicator. */
1645 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
/* mm6 = broadcast scale constant, mm7 = zero for byte unpacking. */
1650 "movd %0, %%mm6 \n\t"
1651 "pxor %%mm7, %%mm7 \n\t"
1652 "punpcklwd %%mm6, %%mm6 \n\t"
1653 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process the block in 4-pixel-wide columns. */
1657 for (x = 0; x < w; x += 4) {
1658 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1659 oxs - dxys + dxxs * (x + 1),
1660 oxs - dxys + dxxs * (x + 2),
1661 oxs - dxys + dxxs * (x + 3) };
1662 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1663 oys - dyys + dyxs * (x + 1),
1664 oys - dyys + dyxs * (x + 2),
1665 oys - dyys + dyxs * (x + 3) };
1667 for (y = 0; y < h; y++) {
/* Advance the four sub-pel positions by one row and extract the
 * bilinear weights dx, dy (top 4 bits after >>12). */
1669 "movq %0, %%mm4 \n\t"
1670 "movq %1, %%mm5 \n\t"
1671 "paddw %2, %%mm4 \n\t"
1672 "paddw %3, %%mm5 \n\t"
1673 "movq %%mm4, %0 \n\t"
1674 "movq %%mm5, %1 \n\t"
1675 "psrlw $12, %%mm4 \n\t"
1676 "psrlw $12, %%mm5 \n\t"
1677 : "+m"(*dx4), "+m"(*dy4)
1678 : "m"(*dxy4), "m"(*dyy4)
/* Bilinear weights: (s-dx)(s-dy), dx*dy, (s-dx)*dy, dx*(s-dy). */
1682 "movq %%mm6, %%mm2 \n\t"
1683 "movq %%mm6, %%mm1 \n\t"
1684 "psubw %%mm4, %%mm2 \n\t"
1685 "psubw %%mm5, %%mm1 \n\t"
1686 "movq %%mm2, %%mm0 \n\t"
1687 "movq %%mm4, %%mm3 \n\t"
1688 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1689 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1690 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1691 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1693 "movd %4, %%mm5 \n\t"
1694 "movd %3, %%mm4 \n\t"
1695 "punpcklbw %%mm7, %%mm5 \n\t"
1696 "punpcklbw %%mm7, %%mm4 \n\t"
1697 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1698 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1700 "movd %2, %%mm5 \n\t"
1701 "movd %1, %%mm4 \n\t"
1702 "punpcklbw %%mm7, %%mm5 \n\t"
1703 "punpcklbw %%mm7, %%mm4 \n\t"
1704 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1705 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
/* Sum the four weighted taps, add rounder r4, shift down, pack. */
1706 "paddw %5, %%mm1 \n\t"
1707 "paddw %%mm3, %%mm2 \n\t"
1708 "paddw %%mm1, %%mm0 \n\t"
1709 "paddw %%mm2, %%mm0 \n\t"
1711 "psrlw %6, %%mm0 \n\t"
1712 "packuswb %%mm0, %%mm0 \n\t"
1713 "movd %%mm0, %0 \n\t"
1715 : "=m"(dst[x + y * stride])
1716 : "m"(src[0]), "m"(src[1]),
1717 "m"(src[stride]), "m"(src[stride + 1]),
1718 "m"(*r4), "m"(shift2)
/* Step to the next 4-pixel column: undo h rows, advance 4 pixels. */
1722 src += 4 - h * stride;
/* CPU-flavored entry points for gmc(): all forward to the shared inline
 * implementation with the 8-bit edge emulator.  Two gmc_mmx definitions
 * appear here because they sit in different preprocessor branches of the
 * original file (the conditionals are elided in this listing). */
1729 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1730 int stride, int h, int ox, int oy,
1731 int dxx, int dxy, int dyx, int dyy,
1732 int shift, int r, int width, int height)
1734 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1735 width, height, &ff_emulated_edge_mc_8);
1738 static void gmc_sse(uint8_t *dst, uint8_t *src,
1739 int stride, int h, int ox, int oy,
1740 int dxx, int dxy, int dyx, int dyy,
1741 int shift, int r, int width, int height)
1743 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1744 width, height, &ff_emulated_edge_mc_8);
1747 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1748 int stride, int h, int ox, int oy,
1749 int dxx, int dxy, int dyx, int dyy,
1750 int shift, int r, int width, int height)
1752 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1753 width, height, &ff_emulated_edge_mc_8);
1758 #endif /* HAVE_INLINE_ASM */
1760 #include "h264_qpel.c"
1762 void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
1763 int stride, int h, int x, int y);
1764 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1765 int stride, int h, int x, int y);
1766 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1767 int stride, int h, int x, int y);
1769 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1770 int stride, int h, int x, int y);
1771 void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
1772 int stride, int h, int x, int y);
1773 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1774 int stride, int h, int x, int y);
1776 void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1777 int stride, int h, int x, int y);
1778 void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1779 int stride, int h, int x, int y);
1781 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1782 int stride, int h, int x, int y);
1783 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1784 int stride, int h, int x, int y);
1786 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1787 int stride, int h, int x, int y);
1788 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1789 int stride, int h, int x, int y);
/* CHROMA_MC: declares an external-asm H.264 chroma MC prototype for a
 * given op (put/avg), block size, bit depth and CPU flavor. */
1791 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1792 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1793 (uint8_t *dst, uint8_t *src, \
1794 int stride, int h, int x, int y);
/* 10-bit chroma MC prototypes for the mmxext/sse2/avx implementations. */
1796 CHROMA_MC(put, 2, 10, mmxext)
1797 CHROMA_MC(avg, 2, 10, mmxext)
1798 CHROMA_MC(put, 4, 10, mmxext)
1799 CHROMA_MC(avg, 4, 10, mmxext)
1800 CHROMA_MC(put, 8, 10, sse2)
1801 CHROMA_MC(avg, 8, 10, sse2)
1802 CHROMA_MC(put, 8, 10, avx)
1803 CHROMA_MC(avg, 8, 10, avx)
/* CAVS fullpel (mc00) wrappers: delegate to the generic MMX copy/average
 * pixel primitives. */
1808 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1810 put_pixels8_mmx(dst, src, stride, 8);
1813 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1815 avg_pixels8_mmx(dst, src, stride, 8);
1818 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1820 put_pixels16_mmx(dst, src, stride, 16);
1823 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1825 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 fullpel (mc00) wrappers; rnd is unused at this position since no
 * interpolation happens. */
1829 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1830 int stride, int rnd)
1832 put_pixels8_mmx(dst, src, stride, 8);
1835 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1836 int stride, int rnd)
1838 avg_pixels8_mmxext(dst, src, stride, 8);
1841 #if CONFIG_DIRAC_DECODER
/* DIRAC_PIXOP: emits Dirac pixel-copy/average wrappers (8/16/32 wide)
 * for one op and CPU extension.  Only src[0] is used here -- the
 * remaining pointers of src[5] apply to the (elsewhere-implemented)
 * weighted cases. */
1842 #define DIRAC_PIXOP(OPNAME, EXT)\
1843 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1845 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1847 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1849 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
/* 32-wide is composed of two 16-wide calls. */\
1851 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1853 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1854 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
1857 DIRAC_PIXOP(put, mmx)
1858 DIRAC_PIXOP(avg, mmx)
1859 DIRAC_PIXOP(avg, mmxext)
/* SSE2 variants call the external yasm primitives directly. */
1862 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1864 ff_put_pixels16_sse2(dst, src[0], stride, h);
1866 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1868 ff_avg_pixels16_sse2(dst, src[0], stride, h);
1870 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1872 ff_put_pixels16_sse2(dst , src[0] , stride, h);
1873 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1875 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1877 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1878 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1883 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
/* IDCT + clamped-store wrappers pairing the libmpeg2 MMX IDCT with the
 * put/add pixel clamp primitives.  The mmx variants presumably run the
 * IDCT on an elided line before the store -- TODO confirm against the
 * unabridged file; the mmx2 variants show ff_mmxext_idct explicitly. */
1886 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1890 ff_put_pixels_clamped_mmx(block, dest, line_size);
1893 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1897 ff_add_pixels_clamped_mmx(block, dest, line_size);
1900 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1903 ff_mmxext_idct(block);
1904 ff_put_pixels_clamped_mmx(block, dest, line_size);
1907 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1910 ff_mmxext_idct(block);
1911 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* vector_clipf_sse: clamps len floats of src into [min, max] and writes
 * them to dst, 16 floats (4 aligned xmm loads) per iteration.  xmm4/xmm5
 * hold min/max broadcast across all four lanes via shufps.  The loop
 * counter i starts at the last 64-byte group and (on an elided line)
 * presumably steps backwards to 0 -- TODO confirm loop direction. */
1915 static void vector_clipf_sse(float *dst, const float *src,
1916 float min, float max, int len)
1918 x86_reg i = (len - 16) * 4;
1920 "movss %3, %%xmm4 \n\t"
1921 "movss %4, %%xmm5 \n\t"
1922 "shufps $0, %%xmm4, %%xmm4 \n\t"
1923 "shufps $0, %%xmm5, %%xmm5 \n\t"
1925 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1926 "movaps 16(%2, %0), %%xmm1 \n\t"
1927 "movaps 32(%2, %0), %%xmm2 \n\t"
1928 "movaps 48(%2, %0), %%xmm3 \n\t"
1929 "maxps %%xmm4, %%xmm0 \n\t"
1930 "maxps %%xmm4, %%xmm1 \n\t"
1931 "maxps %%xmm4, %%xmm2 \n\t"
1932 "maxps %%xmm4, %%xmm3 \n\t"
1933 "minps %%xmm5, %%xmm0 \n\t"
1934 "minps %%xmm5, %%xmm1 \n\t"
1935 "minps %%xmm5, %%xmm2 \n\t"
1936 "minps %%xmm5, %%xmm3 \n\t"
1937 "movaps %%xmm0, (%1, %0) \n\t"
1938 "movaps %%xmm1, 16(%1, %0) \n\t"
1939 "movaps %%xmm2, 32(%1, %0) \n\t"
1940 "movaps %%xmm3, 48(%1, %0) \n\t"
1944 : "r"(dst), "r"(src), "m"(min), "m"(max)
1949 #endif /* HAVE_INLINE_ASM */
1951 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1953 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1955 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1957 int order, int mul);
1958 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1960 int order, int mul);
1961 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1963 int order, int mul);
1965 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1966 const int16_t *window, unsigned int len);
1967 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1968 const int16_t *window, unsigned int len);
1969 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1970 const int16_t *window, unsigned int len);
1971 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1972 const int16_t *window, unsigned int len);
1973 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1974 const int16_t *window, unsigned int len);
1975 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1976 const int16_t *window, unsigned int len);
1978 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1979 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1981 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1982 const uint8_t *diff, int w,
1983 int *left, int *left_top);
1984 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1986 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1989 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1990 int32_t min, int32_t max, unsigned int len);
1991 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1992 int32_t min, int32_t max, unsigned int len);
1993 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1994 int32_t min, int32_t max, unsigned int len);
1995 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1996 int32_t min, int32_t max, unsigned int len);
/* SET_QPEL_FUNCS: fills one 16-entry row of a qpel function table with  \
 * the mcXY functions of the given size/CPU.  Table index is x + 4*y of  \
 * the quarter-pel position.  PREFIX distinguishes inline-asm (empty)    \
 * from external-asm (ff_) symbol names.  */ \
1998 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2000 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2001 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2002 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2003 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2004 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2005 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2006 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2007 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2008 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2009 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2010 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2011 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2012 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2013 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2014 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2015 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* SET_HPEL_FUNCS: fills one 4-entry row of a half-pel function table    \
 * (fullpel, x2, y2, xy2) for the given size/CPU.  */ \
2018 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2020 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2021 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2022 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2023 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* H264_QPEL_FUNCS: installs the put/avg H.264 qpel functions for one     \
 * (x, y) quarter-pel position and CPU, 16x16 in row 0 and 8x8 in row 1. */ \
2026 #define H264_QPEL_FUNCS(x, y, CPU) \
2028 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2029 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2030 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2031 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
/* H264_QPEL_FUNCS_10: 10-bit-depth variant of H264_QPEL_FUNCS, wiring    \
 * in the external-asm (ff_-prefixed) _10_ implementations.  */ \
2034 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2036 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2037 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2038 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2039 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
/* dsputil_init_mmx: installs the baseline MMX implementations into the
 * DSPContext.  8-bit-only functions are gated on bits_per_raw_sample.
 * NOTE(review): elided listing -- #if/#endif pairs and braces between
 * the numbered lines are missing, so the exact guard around each
 * assignment cannot be read from this view. */
2042 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2044 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2047 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2048 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2049 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2051 if (!high_bit_depth) {
2052 c->clear_block = clear_block_mmx;
2053 c->clear_blocks = clear_blocks_mmx;
2054 c->draw_edges = draw_edges_mmx;
/* Half-pel tables: rows 0 = 16-wide, 1 = 8-wide. */
2056 SET_HPEL_FUNCS(put, 0, 16, mmx);
2057 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2058 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2059 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2060 SET_HPEL_FUNCS(put, 1, 8, mmx);
2061 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2062 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2065 #if ARCH_X86_32 || !HAVE_YASM
2069 c->add_bytes = add_bytes_mmx;
2071 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2072 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2073 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2075 #endif /* HAVE_INLINE_ASM */
2078 if (!high_bit_depth && CONFIG_H264CHROMA) {
2079 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
2080 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2083 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT (SSE integer / "MMX2") function pointers, overriding the
 * plain-MMX versions installed earlier.  bit_depth selects between the
 * 8-bit and the dedicated 10-bit H.264 code paths. */
2088 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
2091 const int bit_depth = avctx->bits_per_raw_sample;
2092 const int high_bit_depth = bit_depth > 8;
/* Quarter-pel MC tables: index 0 = 16x16, index 1 = 8x8. */
2095 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
2096 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
2098 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
2099 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
2100 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
2101 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
2103 if (!high_bit_depth) {
2104 c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
2105 c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
2107 c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
2108 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
2109 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
2111 c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
2112 c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
2114 c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
2115 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
2116 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
/* The no-rounding variants are not bit-exact, so skip them when the
 * caller requested CODEC_FLAG_BITEXACT output. */
2119 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2120 if (!high_bit_depth) {
2121 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
2122 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
2123 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
2124 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2126 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2127 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
/* VP3/Theora require the exact (bit-accurate) no-rounding variants. */
2131 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2132 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2133 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2134 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2136 #endif /* HAVE_INLINE_ASM */
2138 #if HAVE_MMXEXT_EXTERNAL
/* H.264 quarter-pel MC: 8-bit tables, or the dedicated 10-bit versions. */
2139 if (CONFIG_H264QPEL) {
2140 if (!high_bit_depth) {
2141 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2142 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
2143 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
2144 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2145 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
2146 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
2147 } else if (bit_depth == 10) {
2149 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2150 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2151 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2152 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2154 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2155 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2159 if (!high_bit_depth && CONFIG_H264CHROMA) {
2160 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
2161 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
2162 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
2163 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
2165 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2166 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2167 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2168 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2169 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2172 /* slower than cmov version on AMD */
2173 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2174 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
2176 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
2177 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* Pick the non-rounding window variant for bit-exact mode. */
2179 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2180 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2182 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
2184 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install AMD 3DNow!-optimized function pointers (8-bit pixel paths only). */
2187 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2190 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2193 if (!high_bit_depth) {
2194 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2195 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2197 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2198 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2199 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2201 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2202 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2204 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2205 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2206 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
/* No-rounding variants are not bit-exact; skip under CODEC_FLAG_BITEXACT. */
2208 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2209 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2210 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2211 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2212 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2214 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2215 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
/* VP3/Theora need the exact no-rounding variants. */
2219 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2220 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2221 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2222 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2224 #endif /* HAVE_INLINE_ASM */
2227 if (!high_bit_depth && CONFIG_H264CHROMA) {
2228 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
2229 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2231 #endif /* HAVE_YASM */
/* Install SSE-optimized function pointers. */
2234 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2236 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2239 if (!high_bit_depth) {
/* The SSE clear routines need 16-byte-aligned blocks, which XvMC
 * cannot guarantee — keep the MMX versions in that case. */
2240 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2241 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2242 c->clear_block = clear_block_sse;
2243 c->clear_blocks = clear_blocks_sse;
2247 c->vector_clipf = vector_clipf_sse;
2248 #endif /* HAVE_INLINE_ASM */
2251 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
2254 #endif /* HAVE_YASM */
/* Install SSE2-optimized function pointers, including the SSE2 XviD IDCT
 * and the 10-bit H.264 MC paths. */
2257 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2260 const int bit_depth = avctx->bits_per_raw_sample;
2261 const int high_bit_depth = bit_depth > 8;
2263 #if HAVE_SSE2_INLINE
2264 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2265 c->idct_put = ff_idct_xvid_sse2_put;
2266 c->idct_add = ff_idct_xvid_sse2_add;
2267 c->idct = ff_idct_xvid_sse2;
2268 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2270 #endif /* HAVE_SSE2_INLINE */
2272 #if HAVE_SSE2_EXTERNAL
/* On SSE2SLOW CPUs the MMX versions installed earlier stay in place. */
2273 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2274 // these functions are slower than mmx on AMD, but faster on Intel
2275 if (!high_bit_depth) {
2276 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
2277 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2278 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
2279 if (CONFIG_H264QPEL)
2280 H264_QPEL_FUNCS(0, 0, sse2);
/* Remaining 8-bit H.264 qpel positions (installed even on SSE2SLOW). */
2284 if (!high_bit_depth && CONFIG_H264QPEL) {
2285 H264_QPEL_FUNCS(0, 1, sse2);
2286 H264_QPEL_FUNCS(0, 2, sse2);
2287 H264_QPEL_FUNCS(0, 3, sse2);
2288 H264_QPEL_FUNCS(1, 1, sse2);
2289 H264_QPEL_FUNCS(1, 2, sse2);
2290 H264_QPEL_FUNCS(1, 3, sse2);
2291 H264_QPEL_FUNCS(2, 1, sse2);
2292 H264_QPEL_FUNCS(2, 2, sse2);
2293 H264_QPEL_FUNCS(2, 3, sse2);
2294 H264_QPEL_FUNCS(3, 1, sse2);
2295 H264_QPEL_FUNCS(3, 2, sse2);
2296 H264_QPEL_FUNCS(3, 3, sse2);
/* Dedicated 10-bit H.264 MC implementations. */
2299 if (bit_depth == 10) {
2300 if (CONFIG_H264QPEL) {
2301 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2302 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2303 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2304 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2305 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2306 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2307 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2309 if (CONFIG_H264CHROMA) {
2310 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2311 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2315 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2316 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* Atom has a different fastest variant for the int32 clip. */
2317 if (mm_flags & AV_CPU_FLAG_ATOM) {
2318 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2320 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2322 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2323 c->apply_window_int16 = ff_apply_window_int16_sse2;
2324 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2325 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
2327 c->bswap_buf = ff_bswap32_buf_sse2;
2328 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3-optimized function pointers, overriding SSE2 where present. */
2331 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2334 #if HAVE_SSSE3_EXTERNAL
2335 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2336 const int bit_depth = avctx->bits_per_raw_sample;
/* 8-bit H.264 qpel positions with SSSE3 implementations
 * (column 0 of the 16x16 table keeps the SSE2 version). */
2338 if (!high_bit_depth && CONFIG_H264QPEL) {
2339 H264_QPEL_FUNCS(1, 0, ssse3);
2340 H264_QPEL_FUNCS(1, 1, ssse3);
2341 H264_QPEL_FUNCS(1, 2, ssse3);
2342 H264_QPEL_FUNCS(1, 3, ssse3);
2343 H264_QPEL_FUNCS(2, 0, ssse3);
2344 H264_QPEL_FUNCS(2, 1, ssse3);
2345 H264_QPEL_FUNCS(2, 2, ssse3);
2346 H264_QPEL_FUNCS(2, 3, ssse3);
2347 H264_QPEL_FUNCS(3, 0, ssse3);
2348 H264_QPEL_FUNCS(3, 1, ssse3);
2349 H264_QPEL_FUNCS(3, 2, ssse3);
2350 H264_QPEL_FUNCS(3, 3, ssse3);
2352 if (bit_depth == 10 && CONFIG_H264QPEL) {
2353 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2354 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2355 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2357 if (!high_bit_depth && CONFIG_H264CHROMA) {
2358 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2359 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2360 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2361 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2363 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2364 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2365 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2367 if (mm_flags & AV_CPU_FLAG_ATOM)
2368 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2370 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2371 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2372 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2373 c->bswap_buf = ff_bswap32_buf_ssse3;
2374 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install SSE4-optimized function pointers (only vector_clip_int32 here). */
2377 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2380 #if HAVE_SSE4_EXTERNAL
2381 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2382 #endif /* HAVE_SSE4_EXTERNAL */
/* Install AVX-optimized function pointers (10-bit H.264 MC only). */
2385 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2387 #if HAVE_AVX_EXTERNAL
2388 const int bit_depth = avctx->bits_per_raw_sample;
2390 if (bit_depth == 10) {
2391 // AVX implies !cache64.
2392 // TODO: Port cache(32|64) detection from x264.
/* Revert to the plain sse2 (non-cache64) qpel variants on AVX CPUs. */
2393 if (CONFIG_H264QPEL) {
2394 H264_QPEL_FUNCS_10(1, 0, sse2);
2395 H264_QPEL_FUNCS_10(2, 0, sse2);
2396 H264_QPEL_FUNCS_10(3, 0, sse2);
2399 if (CONFIG_H264CHROMA) {
2400 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2401 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2404 #endif /* HAVE_AVX_EXTERNAL */
/* Public entry point: detect CPU capabilities at runtime and chain through
 * the per-instruction-set init functions in increasing capability order,
 * so each later set overrides the slower pointers installed before it. */
2407 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2409 int mm_flags = av_get_cpu_flags();
2411 #if HAVE_7REGS && HAVE_INLINE_ASM
2412 if (mm_flags & AV_CPU_FLAG_CMOV)
2413 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2416 if (mm_flags & AV_CPU_FLAG_MMX) {
2418 const int idct_algo = avctx->idct_algo;
/* IDCT selection: only for full-resolution 8-bit decoding. */
2420 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2421 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
2422 c->idct_put = ff_simple_idct_put_mmx;
2423 c->idct_add = ff_simple_idct_add_mmx;
2424 c->idct = ff_simple_idct_mmx;
2425 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
2427 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
2428 if (mm_flags & AV_CPU_FLAG_MMX2) {
2429 c->idct_put = ff_libmpeg2mmx2_idct_put;
2430 c->idct_add = ff_libmpeg2mmx2_idct_add;
2431 c->idct = ff_mmxext_idct;
2433 c->idct_put = ff_libmpeg2mmx_idct_put;
2434 c->idct_add = ff_libmpeg2mmx_idct_add;
2435 c->idct = ff_mmx_idct;
2437 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
/* XviD IDCT: pick the best available of SSE2 > MMXEXT > MMX. */
2439 } else if (idct_algo == FF_IDCT_XVIDMMX) {
2440 if (mm_flags & AV_CPU_FLAG_SSE2) {
2441 c->idct_put = ff_idct_xvid_sse2_put;
2442 c->idct_add = ff_idct_xvid_sse2_add;
2443 c->idct = ff_idct_xvid_sse2;
2444 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2445 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
2446 c->idct_put = ff_idct_xvid_mmxext_put;
2447 c->idct_add = ff_idct_xvid_mmxext_add;
2448 c->idct = ff_idct_xvid_mmxext;
2450 c->idct_put = ff_idct_xvid_mmx_put;
2451 c->idct_add = ff_idct_xvid_mmx_add;
2452 c->idct = ff_idct_xvid_mmx;
2456 #endif /* HAVE_INLINE_ASM */
2458 dsputil_init_mmx(c, avctx, mm_flags);
/* Chain the remaining init functions; later ones override earlier ones. */
2461 if (mm_flags & AV_CPU_FLAG_MMXEXT)
2462 dsputil_init_mmxext(c, avctx, mm_flags);
2464 if (mm_flags & AV_CPU_FLAG_3DNOW)
2465 dsputil_init_3dnow(c, avctx, mm_flags);
2467 if (mm_flags & AV_CPU_FLAG_SSE)
2468 dsputil_init_sse(c, avctx, mm_flags);
2470 if (mm_flags & AV_CPU_FLAG_SSE2)
2471 dsputil_init_sse2(c, avctx, mm_flags);
2473 if (mm_flags & AV_CPU_FLAG_SSSE3)
2474 dsputil_init_ssse3(c, avctx, mm_flags);
2476 if (mm_flags & AV_CPU_FLAG_SSE4)
2477 dsputil_init_sse4(c, avctx, mm_flags);
2479 if (mm_flags & AV_CPU_FLAG_AVX)
2480 dsputil_init_avx(c, avctx, mm_flags);
2482 if (CONFIG_ENCODERS)
2483 ff_dsputilenc_init_mmx(c, avctx);