2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "dsputil_mmx.h"
32 #include "idct_xvid.h"
33 #include "diracdsp_mmx.h"
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 { 0x8000000080000000ULL, 0x8000000080000000ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
81 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
82 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
84 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
85 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
89 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
90 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
92 #define MOVQ_BFE(regd) \
94 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
95 "paddb %%"#regd", %%"#regd" \n\t" ::)
98 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
99 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
101 // for shared library it's better to use this way for accessing constants
103 #define MOVQ_BONE(regd) \
105 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
106 "psrlw $15, %%"#regd" \n\t" \
107 "packuswb %%"#regd", %%"#regd" \n\t" ::)
109 #define MOVQ_WTWO(regd) \
111 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
112 "psrlw $15, %%"#regd" \n\t" \
113 "psllw $1, %%"#regd" \n\t"::)
117 // using regr as temporary and for the output result
118 // first argument is unmodifed and second is trashed
119 // regfe is supposed to contain 0xfefefefefefefefe
120 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
121 "movq "#rega", "#regr" \n\t" \
122 "pand "#regb", "#regr" \n\t" \
123 "pxor "#rega", "#regb" \n\t" \
124 "pand "#regfe", "#regb" \n\t" \
125 "psrlq $1, "#regb" \n\t" \
126 "paddb "#regb", "#regr" \n\t"
128 #define PAVGB_MMX(rega, regb, regr, regfe) \
129 "movq "#rega", "#regr" \n\t" \
130 "por "#regb", "#regr" \n\t" \
131 "pxor "#rega", "#regb" \n\t" \
132 "pand "#regfe", "#regb" \n\t" \
133 "psrlq $1, "#regb" \n\t" \
134 "psubb "#regb", "#regr" \n\t"
136 // mm6 is supposed to contain 0xfefefefefefefefe
137 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
138 "movq "#rega", "#regr" \n\t" \
139 "movq "#regc", "#regp" \n\t" \
140 "pand "#regb", "#regr" \n\t" \
141 "pand "#regd", "#regp" \n\t" \
142 "pxor "#rega", "#regb" \n\t" \
143 "pxor "#regc", "#regd" \n\t" \
144 "pand %%mm6, "#regb" \n\t" \
145 "pand %%mm6, "#regd" \n\t" \
146 "psrlq $1, "#regb" \n\t" \
147 "psrlq $1, "#regd" \n\t" \
148 "paddb "#regb", "#regr" \n\t" \
149 "paddb "#regd", "#regp" \n\t"
151 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
152 "movq "#rega", "#regr" \n\t" \
153 "movq "#regc", "#regp" \n\t" \
154 "por "#regb", "#regr" \n\t" \
155 "por "#regd", "#regp" \n\t" \
156 "pxor "#rega", "#regb" \n\t" \
157 "pxor "#regc", "#regd" \n\t" \
158 "pand %%mm6, "#regb" \n\t" \
159 "pand %%mm6, "#regd" \n\t" \
160 "psrlq $1, "#regd" \n\t" \
161 "psrlq $1, "#regb" \n\t" \
162 "psubb "#regb", "#regr" \n\t" \
163 "psubb "#regd", "#regp" \n\t"
165 /***********************************/
166 /* MMX no rounding */
167 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
168 #define SET_RND MOVQ_WONE
169 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
170 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
171 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
173 #include "dsputil_rnd_template.c"
179 /***********************************/
182 #define DEF(x, y) x ## _ ## y ## _mmx
183 #define SET_RND MOVQ_WTWO
184 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
185 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
187 #include "dsputil_rnd_template.c"
195 /***********************************/
198 #define DEF(x) x ## _3dnow
199 #define PAVGB "pavgusb"
200 #define SKIP_FOR_3DNOW
202 #include "dsputil_avg_template.c"
206 #undef SKIP_FOR_3DNOW
208 /***********************************/
209 /* MMXEXT specific */
211 #define DEF(x) x ## _mmxext
213 /* Introduced only in MMXEXT set */
214 #define PAVGB "pavgb"
216 #include "dsputil_avg_template.c"
221 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
222 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
223 #define put_pixels16_mmxext put_pixels16_mmx
224 #define put_pixels8_mmxext put_pixels8_mmx
225 #define put_pixels4_mmxext put_pixels4_mmx
226 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
227 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
229 /***********************************/
232 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
238 /* read the pixels */
243 "movq (%3), %%mm0 \n\t"
244 "movq 8(%3), %%mm1 \n\t"
245 "movq 16(%3), %%mm2 \n\t"
246 "movq 24(%3), %%mm3 \n\t"
247 "movq 32(%3), %%mm4 \n\t"
248 "movq 40(%3), %%mm5 \n\t"
249 "movq 48(%3), %%mm6 \n\t"
250 "movq 56(%3), %%mm7 \n\t"
251 "packuswb %%mm1, %%mm0 \n\t"
252 "packuswb %%mm3, %%mm2 \n\t"
253 "packuswb %%mm5, %%mm4 \n\t"
254 "packuswb %%mm7, %%mm6 \n\t"
255 "movq %%mm0, (%0) \n\t"
256 "movq %%mm2, (%0, %1) \n\t"
257 "movq %%mm4, (%0, %1, 2) \n\t"
258 "movq %%mm6, (%0, %2) \n\t"
259 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
262 pix += line_size * 4;
265 // if here would be an exact copy of the code above
266 // compiler would generate some very strange code
269 "movq (%3), %%mm0 \n\t"
270 "movq 8(%3), %%mm1 \n\t"
271 "movq 16(%3), %%mm2 \n\t"
272 "movq 24(%3), %%mm3 \n\t"
273 "movq 32(%3), %%mm4 \n\t"
274 "movq 40(%3), %%mm5 \n\t"
275 "movq 48(%3), %%mm6 \n\t"
276 "movq 56(%3), %%mm7 \n\t"
277 "packuswb %%mm1, %%mm0 \n\t"
278 "packuswb %%mm3, %%mm2 \n\t"
279 "packuswb %%mm5, %%mm4 \n\t"
280 "packuswb %%mm7, %%mm6 \n\t"
281 "movq %%mm0, (%0) \n\t"
282 "movq %%mm2, (%0, %1) \n\t"
283 "movq %%mm4, (%0, %1, 2) \n\t"
284 "movq %%mm6, (%0, %2) \n\t"
285 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
289 #define put_signed_pixels_clamped_mmx_half(off) \
290 "movq "#off"(%2), %%mm1 \n\t" \
291 "movq 16 + "#off"(%2), %%mm2 \n\t" \
292 "movq 32 + "#off"(%2), %%mm3 \n\t" \
293 "movq 48 + "#off"(%2), %%mm4 \n\t" \
294 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
295 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
296 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
297 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
298 "paddb %%mm0, %%mm1 \n\t" \
299 "paddb %%mm0, %%mm2 \n\t" \
300 "paddb %%mm0, %%mm3 \n\t" \
301 "paddb %%mm0, %%mm4 \n\t" \
302 "movq %%mm1, (%0) \n\t" \
303 "movq %%mm2, (%0, %3) \n\t" \
304 "movq %%mm3, (%0, %3, 2) \n\t" \
305 "movq %%mm4, (%0, %1) \n\t"
307 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
310 x86_reg line_skip = line_size;
314 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
315 "lea (%3, %3, 2), %1 \n\t"
316 put_signed_pixels_clamped_mmx_half(0)
317 "lea (%0, %3, 4), %0 \n\t"
318 put_signed_pixels_clamped_mmx_half(64)
319 : "+&r"(pixels), "=&r"(line_skip3)
320 : "r"(block), "r"(line_skip)
324 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
331 /* read the pixels */
338 "movq (%2), %%mm0 \n\t"
339 "movq 8(%2), %%mm1 \n\t"
340 "movq 16(%2), %%mm2 \n\t"
341 "movq 24(%2), %%mm3 \n\t"
342 "movq %0, %%mm4 \n\t"
343 "movq %1, %%mm6 \n\t"
344 "movq %%mm4, %%mm5 \n\t"
345 "punpcklbw %%mm7, %%mm4 \n\t"
346 "punpckhbw %%mm7, %%mm5 \n\t"
347 "paddsw %%mm4, %%mm0 \n\t"
348 "paddsw %%mm5, %%mm1 \n\t"
349 "movq %%mm6, %%mm5 \n\t"
350 "punpcklbw %%mm7, %%mm6 \n\t"
351 "punpckhbw %%mm7, %%mm5 \n\t"
352 "paddsw %%mm6, %%mm2 \n\t"
353 "paddsw %%mm5, %%mm3 \n\t"
354 "packuswb %%mm1, %%mm0 \n\t"
355 "packuswb %%mm3, %%mm2 \n\t"
356 "movq %%mm0, %0 \n\t"
357 "movq %%mm2, %1 \n\t"
358 : "+m"(*pix), "+m"(*(pix + line_size))
361 pix += line_size * 2;
366 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
367 int line_size, int h)
370 "lea (%3, %3), %%"REG_a" \n\t"
373 "movq (%1 ), %%mm0 \n\t"
374 "movq (%1, %3), %%mm1 \n\t"
375 "movq %%mm0, (%2) \n\t"
376 "movq %%mm1, (%2, %3) \n\t"
377 "add %%"REG_a", %1 \n\t"
378 "add %%"REG_a", %2 \n\t"
379 "movq (%1 ), %%mm0 \n\t"
380 "movq (%1, %3), %%mm1 \n\t"
381 "movq %%mm0, (%2) \n\t"
382 "movq %%mm1, (%2, %3) \n\t"
383 "add %%"REG_a", %1 \n\t"
384 "add %%"REG_a", %2 \n\t"
387 : "+g"(h), "+r"(pixels), "+r"(block)
388 : "r"((x86_reg)line_size)
393 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
394 int line_size, int h)
397 "lea (%3, %3), %%"REG_a" \n\t"
400 "movq (%1 ), %%mm0 \n\t"
401 "movq 8(%1 ), %%mm4 \n\t"
402 "movq (%1, %3), %%mm1 \n\t"
403 "movq 8(%1, %3), %%mm5 \n\t"
404 "movq %%mm0, (%2) \n\t"
405 "movq %%mm4, 8(%2) \n\t"
406 "movq %%mm1, (%2, %3) \n\t"
407 "movq %%mm5, 8(%2, %3) \n\t"
408 "add %%"REG_a", %1 \n\t"
409 "add %%"REG_a", %2 \n\t"
410 "movq (%1 ), %%mm0 \n\t"
411 "movq 8(%1 ), %%mm4 \n\t"
412 "movq (%1, %3), %%mm1 \n\t"
413 "movq 8(%1, %3), %%mm5 \n\t"
414 "movq %%mm0, (%2) \n\t"
415 "movq %%mm4, 8(%2) \n\t"
416 "movq %%mm1, (%2, %3) \n\t"
417 "movq %%mm5, 8(%2, %3) \n\t"
418 "add %%"REG_a", %1 \n\t"
419 "add %%"REG_a", %2 \n\t"
422 : "+g"(h), "+r"(pixels), "+r"(block)
423 : "r"((x86_reg)line_size)
428 #define CLEAR_BLOCKS(name, n) \
429 static void name(DCTELEM *blocks) \
432 "pxor %%mm7, %%mm7 \n\t" \
433 "mov %1, %%"REG_a" \n\t" \
435 "movq %%mm7, (%0, %%"REG_a") \n\t" \
436 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
437 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
438 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
439 "add $32, %%"REG_a" \n\t" \
441 :: "r"(((uint8_t *)blocks) + 128 * n), \
446 CLEAR_BLOCKS(clear_blocks_mmx, 6)
447 CLEAR_BLOCKS(clear_block_mmx, 1)
449 static void clear_block_sse(DCTELEM *block)
452 "xorps %%xmm0, %%xmm0 \n"
453 "movaps %%xmm0, (%0) \n"
454 "movaps %%xmm0, 16(%0) \n"
455 "movaps %%xmm0, 32(%0) \n"
456 "movaps %%xmm0, 48(%0) \n"
457 "movaps %%xmm0, 64(%0) \n"
458 "movaps %%xmm0, 80(%0) \n"
459 "movaps %%xmm0, 96(%0) \n"
460 "movaps %%xmm0, 112(%0) \n"
466 static void clear_blocks_sse(DCTELEM *blocks)
469 "xorps %%xmm0, %%xmm0 \n"
470 "mov %1, %%"REG_a" \n"
472 "movaps %%xmm0, (%0, %%"REG_a") \n"
473 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
474 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
475 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
476 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
477 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
478 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
479 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
480 "add $128, %%"REG_a" \n"
482 :: "r"(((uint8_t *)blocks) + 128 * 6),
488 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
494 "movq (%1, %0), %%mm0 \n\t"
495 "movq (%2, %0), %%mm1 \n\t"
496 "paddb %%mm0, %%mm1 \n\t"
497 "movq %%mm1, (%2, %0) \n\t"
498 "movq 8(%1, %0), %%mm0 \n\t"
499 "movq 8(%2, %0), %%mm1 \n\t"
500 "paddb %%mm0, %%mm1 \n\t"
501 "movq %%mm1, 8(%2, %0) \n\t"
507 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
510 dst[i + 0] += src[i + 0];
514 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
515 const uint8_t *diff, int w,
516 int *left, int *left_top)
520 int l = *left & 0xff;
521 int tl = *left_top & 0xff;
526 "movzbl (%3, %4), %2 \n"
539 "add (%6, %4), %b0 \n"
540 "mov %b0, (%5, %4) \n"
543 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
544 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
551 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
552 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
553 "movd (%1), %%mm0 \n\t"
555 "movd (%1), %%mm1 \n\t"
556 "movd (%1,%3,1), %%mm2 \n\t"
557 "movd (%1,%3,2), %%mm3 \n\t"
558 "punpcklbw %%mm1, %%mm0 \n\t"
559 "punpcklbw %%mm3, %%mm2 \n\t"
560 "movq %%mm0, %%mm1 \n\t"
561 "punpcklwd %%mm2, %%mm0 \n\t"
562 "punpckhwd %%mm2, %%mm1 \n\t"
563 "movd %%mm0, (%0) \n\t"
565 "punpckhdq %%mm0, %%mm0 \n\t"
566 "movd %%mm0, (%0) \n\t"
567 "movd %%mm1, (%0,%2,1) \n\t"
568 "punpckhdq %%mm1, %%mm1 \n\t"
569 "movd %%mm1, (%0,%2,2) \n\t"
579 #define H263_LOOP_FILTER \
580 "pxor %%mm7, %%mm7 \n\t" \
581 "movq %0, %%mm0 \n\t" \
582 "movq %0, %%mm1 \n\t" \
583 "movq %3, %%mm2 \n\t" \
584 "movq %3, %%mm3 \n\t" \
585 "punpcklbw %%mm7, %%mm0 \n\t" \
586 "punpckhbw %%mm7, %%mm1 \n\t" \
587 "punpcklbw %%mm7, %%mm2 \n\t" \
588 "punpckhbw %%mm7, %%mm3 \n\t" \
589 "psubw %%mm2, %%mm0 \n\t" \
590 "psubw %%mm3, %%mm1 \n\t" \
591 "movq %1, %%mm2 \n\t" \
592 "movq %1, %%mm3 \n\t" \
593 "movq %2, %%mm4 \n\t" \
594 "movq %2, %%mm5 \n\t" \
595 "punpcklbw %%mm7, %%mm2 \n\t" \
596 "punpckhbw %%mm7, %%mm3 \n\t" \
597 "punpcklbw %%mm7, %%mm4 \n\t" \
598 "punpckhbw %%mm7, %%mm5 \n\t" \
599 "psubw %%mm2, %%mm4 \n\t" \
600 "psubw %%mm3, %%mm5 \n\t" \
601 "psllw $2, %%mm4 \n\t" \
602 "psllw $2, %%mm5 \n\t" \
603 "paddw %%mm0, %%mm4 \n\t" \
604 "paddw %%mm1, %%mm5 \n\t" \
605 "pxor %%mm6, %%mm6 \n\t" \
606 "pcmpgtw %%mm4, %%mm6 \n\t" \
607 "pcmpgtw %%mm5, %%mm7 \n\t" \
608 "pxor %%mm6, %%mm4 \n\t" \
609 "pxor %%mm7, %%mm5 \n\t" \
610 "psubw %%mm6, %%mm4 \n\t" \
611 "psubw %%mm7, %%mm5 \n\t" \
612 "psrlw $3, %%mm4 \n\t" \
613 "psrlw $3, %%mm5 \n\t" \
614 "packuswb %%mm5, %%mm4 \n\t" \
615 "packsswb %%mm7, %%mm6 \n\t" \
616 "pxor %%mm7, %%mm7 \n\t" \
617 "movd %4, %%mm2 \n\t" \
618 "punpcklbw %%mm2, %%mm2 \n\t" \
619 "punpcklbw %%mm2, %%mm2 \n\t" \
620 "punpcklbw %%mm2, %%mm2 \n\t" \
621 "psubusb %%mm4, %%mm2 \n\t" \
622 "movq %%mm2, %%mm3 \n\t" \
623 "psubusb %%mm4, %%mm3 \n\t" \
624 "psubb %%mm3, %%mm2 \n\t" \
625 "movq %1, %%mm3 \n\t" \
626 "movq %2, %%mm4 \n\t" \
627 "pxor %%mm6, %%mm3 \n\t" \
628 "pxor %%mm6, %%mm4 \n\t" \
629 "paddusb %%mm2, %%mm3 \n\t" \
630 "psubusb %%mm2, %%mm4 \n\t" \
631 "pxor %%mm6, %%mm3 \n\t" \
632 "pxor %%mm6, %%mm4 \n\t" \
633 "paddusb %%mm2, %%mm2 \n\t" \
634 "packsswb %%mm1, %%mm0 \n\t" \
635 "pcmpgtb %%mm0, %%mm7 \n\t" \
636 "pxor %%mm7, %%mm0 \n\t" \
637 "psubb %%mm7, %%mm0 \n\t" \
638 "movq %%mm0, %%mm1 \n\t" \
639 "psubusb %%mm2, %%mm0 \n\t" \
640 "psubb %%mm0, %%mm1 \n\t" \
641 "pand %5, %%mm1 \n\t" \
642 "psrlw $2, %%mm1 \n\t" \
643 "pxor %%mm7, %%mm1 \n\t" \
644 "psubb %%mm7, %%mm1 \n\t" \
645 "movq %0, %%mm5 \n\t" \
646 "movq %3, %%mm6 \n\t" \
647 "psubb %%mm1, %%mm5 \n\t" \
648 "paddb %%mm1, %%mm6 \n\t"
650 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
652 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
653 const int strength = ff_h263_loop_filter_strength[qscale];
658 "movq %%mm3, %1 \n\t"
659 "movq %%mm4, %2 \n\t"
660 "movq %%mm5, %0 \n\t"
661 "movq %%mm6, %3 \n\t"
662 : "+m"(*(uint64_t*)(src - 2 * stride)),
663 "+m"(*(uint64_t*)(src - 1 * stride)),
664 "+m"(*(uint64_t*)(src + 0 * stride)),
665 "+m"(*(uint64_t*)(src + 1 * stride))
666 : "g"(2 * strength), "m"(ff_pb_FC)
671 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
673 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
674 const int strength = ff_h263_loop_filter_strength[qscale];
675 DECLARE_ALIGNED(8, uint64_t, temp)[4];
676 uint8_t *btemp = (uint8_t*)temp;
680 transpose4x4(btemp, src, 8, stride);
681 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
683 H263_LOOP_FILTER // 5 3 4 6
689 : "g"(2 * strength), "m"(ff_pb_FC)
693 "movq %%mm5, %%mm1 \n\t"
694 "movq %%mm4, %%mm0 \n\t"
695 "punpcklbw %%mm3, %%mm5 \n\t"
696 "punpcklbw %%mm6, %%mm4 \n\t"
697 "punpckhbw %%mm3, %%mm1 \n\t"
698 "punpckhbw %%mm6, %%mm0 \n\t"
699 "movq %%mm5, %%mm3 \n\t"
700 "movq %%mm1, %%mm6 \n\t"
701 "punpcklwd %%mm4, %%mm5 \n\t"
702 "punpcklwd %%mm0, %%mm1 \n\t"
703 "punpckhwd %%mm4, %%mm3 \n\t"
704 "punpckhwd %%mm0, %%mm6 \n\t"
705 "movd %%mm5, (%0) \n\t"
706 "punpckhdq %%mm5, %%mm5 \n\t"
707 "movd %%mm5, (%0, %2) \n\t"
708 "movd %%mm3, (%0, %2, 2) \n\t"
709 "punpckhdq %%mm3, %%mm3 \n\t"
710 "movd %%mm3, (%0, %3) \n\t"
711 "movd %%mm1, (%1) \n\t"
712 "punpckhdq %%mm1, %%mm1 \n\t"
713 "movd %%mm1, (%1, %2) \n\t"
714 "movd %%mm6, (%1, %2, 2) \n\t"
715 "punpckhdq %%mm6, %%mm6 \n\t"
716 "movd %%mm6, (%1, %3) \n\t"
718 "r"(src + 4 * stride),
719 "r"((x86_reg)stride),
720 "r"((x86_reg)(3 * stride))
725 /* Draw the edges of width 'w' of an image of size width, height
726 * this MMX version can only handle w == 8 || w == 16. */
727 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
728 int w, int h, int sides)
730 uint8_t *ptr, *last_line;
733 last_line = buf + (height - 1) * wrap;
739 "movd (%0), %%mm0 \n\t"
740 "punpcklbw %%mm0, %%mm0 \n\t"
741 "punpcklwd %%mm0, %%mm0 \n\t"
742 "punpckldq %%mm0, %%mm0 \n\t"
743 "movq %%mm0, -8(%0) \n\t"
744 "movq -8(%0, %2), %%mm1 \n\t"
745 "punpckhbw %%mm1, %%mm1 \n\t"
746 "punpckhwd %%mm1, %%mm1 \n\t"
747 "punpckhdq %%mm1, %%mm1 \n\t"
748 "movq %%mm1, (%0, %2) \n\t"
753 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
758 "movd (%0), %%mm0 \n\t"
759 "punpcklbw %%mm0, %%mm0 \n\t"
760 "punpcklwd %%mm0, %%mm0 \n\t"
761 "punpckldq %%mm0, %%mm0 \n\t"
762 "movq %%mm0, -8(%0) \n\t"
763 "movq %%mm0, -16(%0) \n\t"
764 "movq -8(%0, %2), %%mm1 \n\t"
765 "punpckhbw %%mm1, %%mm1 \n\t"
766 "punpckhwd %%mm1, %%mm1 \n\t"
767 "punpckhdq %%mm1, %%mm1 \n\t"
768 "movq %%mm1, (%0, %2) \n\t"
769 "movq %%mm1, 8(%0, %2) \n\t"
774 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
780 "movd (%0), %%mm0 \n\t"
781 "punpcklbw %%mm0, %%mm0 \n\t"
782 "punpcklwd %%mm0, %%mm0 \n\t"
783 "movd %%mm0, -4(%0) \n\t"
784 "movd -4(%0, %2), %%mm1 \n\t"
785 "punpcklbw %%mm1, %%mm1 \n\t"
786 "punpckhwd %%mm1, %%mm1 \n\t"
787 "punpckhdq %%mm1, %%mm1 \n\t"
788 "movd %%mm1, (%0, %2) \n\t"
793 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
797 /* top and bottom (and hopefully also the corners) */
798 if (sides & EDGE_TOP) {
799 for (i = 0; i < h; i += 4) {
800 ptr = buf - (i + 1) * wrap - w;
803 "movq (%1, %0), %%mm0 \n\t"
804 "movq %%mm0, (%0) \n\t"
805 "movq %%mm0, (%0, %2) \n\t"
806 "movq %%mm0, (%0, %2, 2) \n\t"
807 "movq %%mm0, (%0, %3) \n\t"
812 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
813 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
818 if (sides & EDGE_BOTTOM) {
819 for (i = 0; i < h; i += 4) {
820 ptr = last_line + (i + 1) * wrap - w;
823 "movq (%1, %0), %%mm0 \n\t"
824 "movq %%mm0, (%0) \n\t"
825 "movq %%mm0, (%0, %2) \n\t"
826 "movq %%mm0, (%0, %2, 2) \n\t"
827 "movq %%mm0, (%0, %3) \n\t"
832 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
833 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
834 "r"(ptr + width + 2 * w)
840 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
841 in0, in1, in2, in7, out, OP) \
842 "paddw "#m4", "#m3" \n\t" /* x1 */ \
843 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
844 "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
845 "movq "#in7", "#m3" \n\t" /* d */ \
846 "movq "#in0", %%mm5 \n\t" /* D */ \
847 "paddw "#m3", %%mm5 \n\t" /* x4 */ \
848 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
849 "movq "#in1", %%mm5 \n\t" /* C */ \
850 "movq "#in2", %%mm6 \n\t" /* B */ \
851 "paddw "#m6", %%mm5 \n\t" /* x3 */ \
852 "paddw "#m5", %%mm6 \n\t" /* x2 */ \
853 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
854 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
855 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
856 "paddw "#rnd", %%mm4 \n\t" /* x2 */ \
857 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
858 "psraw $5, %%mm5 \n\t" \
859 "packuswb %%mm5, %%mm5 \n\t" \
860 OP(%%mm5, out, %%mm7, d)
862 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
863 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
872 "pxor %%mm7, %%mm7 \n\t" \
874 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
875 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
876 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
877 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
878 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
879 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
880 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
881 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
882 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
883 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
884 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
885 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
886 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
887 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
888 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
889 "paddw %%mm3, %%mm5 \n\t" /* b */ \
890 "paddw %%mm2, %%mm6 \n\t" /* c */ \
891 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
892 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
893 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
894 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
895 "paddw %%mm4, %%mm0 \n\t" /* a */ \
896 "paddw %%mm1, %%mm5 \n\t" /* d */ \
897 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
898 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
899 "paddw %6, %%mm6 \n\t" \
900 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
901 "psraw $5, %%mm0 \n\t" \
902 "movq %%mm0, %5 \n\t" \
903 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
905 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
906 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
907 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
908 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
909 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
910 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
911 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
912 "paddw %%mm0, %%mm2 \n\t" /* b */ \
913 "paddw %%mm5, %%mm3 \n\t" /* c */ \
914 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
915 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
916 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
917 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
918 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
919 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
920 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
921 "paddw %%mm2, %%mm1 \n\t" /* a */ \
922 "paddw %%mm6, %%mm4 \n\t" /* d */ \
923 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
924 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
925 "paddw %6, %%mm1 \n\t" \
926 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
927 "psraw $5, %%mm3 \n\t" \
928 "movq %5, %%mm1 \n\t" \
929 "packuswb %%mm3, %%mm1 \n\t" \
930 OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
931 /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
933 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
934 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
935 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
936 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
937 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
938 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
939 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
940 "paddw %%mm1, %%mm5 \n\t" /* b */ \
941 "paddw %%mm4, %%mm0 \n\t" /* c */ \
942 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
943 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
944 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
945 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
946 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
947 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
948 "paddw %%mm3, %%mm2 \n\t" /* d */ \
949 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
950 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
951 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
952 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
953 "paddw %%mm2, %%mm6 \n\t" /* a */ \
954 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
955 "paddw %6, %%mm0 \n\t" \
956 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
957 "psraw $5, %%mm0 \n\t" \
958 /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
959 /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
961 "paddw %%mm5, %%mm3 \n\t" /* a */ \
962 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
963 "paddw %%mm4, %%mm6 \n\t" /* b */ \
964 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
965 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
966 "paddw %%mm1, %%mm4 \n\t" /* c */ \
967 "paddw %%mm2, %%mm5 \n\t" /* d */ \
968 "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
969 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
970 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
971 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
972 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
973 "paddw %6, %%mm4 \n\t" \
974 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
975 "psraw $5, %%mm4 \n\t" \
976 "packuswb %%mm4, %%mm0 \n\t" \
977 OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
983 : "+a"(src), "+c"(dst), "+D"(h) \
984 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
985 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
990 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
997 "pxor %%mm7, %%mm7 \n\t" \
999 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
1000 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
1001 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
1002 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
1003 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
1004 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
1005 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
1006 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
1007 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
1008 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
1009 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
1010 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
1011 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
1012 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
1013 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
1014 "paddw %%mm3, %%mm5 \n\t" /* b */ \
1015 "paddw %%mm2, %%mm6 \n\t" /* c */ \
1016 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
1017 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
1018 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
1019 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
1020 "paddw %%mm4, %%mm0 \n\t" /* a */ \
1021 "paddw %%mm1, %%mm5 \n\t" /* d */ \
1022 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
1023 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
1024 "paddw %5, %%mm6 \n\t" \
1025 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
1026 "psraw $5, %%mm0 \n\t" \
1027 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
1029 "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
1030 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
1031 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
1032 "paddw %%mm5, %%mm1 \n\t" /* a */ \
1033 "paddw %%mm6, %%mm2 \n\t" /* b */ \
1034 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
1035 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
1036 "paddw %%mm6, %%mm3 \n\t" /* c */ \
1037 "paddw %%mm5, %%mm4 \n\t" /* d */ \
1038 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
1039 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
1040 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
1041 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
1042 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
1043 "paddw %5, %%mm1 \n\t" \
1044 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
1045 "psraw $5, %%mm3 \n\t" \
1046 "packuswb %%mm3, %%mm0 \n\t" \
1047 OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
1053 : "+a"(src), "+c"(dst), "+d"(h) \
1054 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
1055 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
1060 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1061 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
1066 uint64_t temp[17 * 4]; \
1067 uint64_t *temp_ptr = temp; \
1070 /* FIXME unroll */ \
1071 __asm__ volatile ( \
1072 "pxor %%mm7, %%mm7 \n\t" \
1074 "movq (%0), %%mm0 \n\t" \
1075 "movq (%0), %%mm1 \n\t" \
1076 "movq 8(%0), %%mm2 \n\t" \
1077 "movq 8(%0), %%mm3 \n\t" \
1078 "punpcklbw %%mm7, %%mm0 \n\t" \
1079 "punpckhbw %%mm7, %%mm1 \n\t" \
1080 "punpcklbw %%mm7, %%mm2 \n\t" \
1081 "punpckhbw %%mm7, %%mm3 \n\t" \
1082 "movq %%mm0, (%1) \n\t" \
1083 "movq %%mm1, 17 * 8(%1) \n\t" \
1084 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1085 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1090 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1091 : "r"((x86_reg)srcStride) \
1098 /* FIXME reorder for speed */ \
1099 __asm__ volatile ( \
1100 /* "pxor %%mm7, %%mm7 \n\t" */ \
1102 "movq (%0), %%mm0 \n\t" \
1103 "movq 8(%0), %%mm1 \n\t" \
1104 "movq 16(%0), %%mm2 \n\t" \
1105 "movq 24(%0), %%mm3 \n\t" \
1106 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1107 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1109 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1111 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1113 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1114 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1116 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1117 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1119 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1120 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1122 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1123 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1125 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
1127 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
1129 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1130 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1132 "add $136, %0 \n\t" \
1137 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1138 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1139 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1140 "g"(4 - 14 * (x86_reg)dstStride) \
1145 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
1150 uint64_t temp[9 * 2]; \
1151 uint64_t *temp_ptr = temp; \
1154 /* FIXME unroll */ \
1155 __asm__ volatile ( \
1156 "pxor %%mm7, %%mm7 \n\t" \
1158 "movq (%0), %%mm0 \n\t" \
1159 "movq (%0), %%mm1 \n\t" \
1160 "punpcklbw %%mm7, %%mm0 \n\t" \
1161 "punpckhbw %%mm7, %%mm1 \n\t" \
1162 "movq %%mm0, (%1) \n\t" \
1163 "movq %%mm1, 9*8(%1) \n\t" \
1168 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1169 : "r"((x86_reg)srcStride) \
1176 /* FIXME reorder for speed */ \
1177 __asm__ volatile ( \
1178 /* "pxor %%mm7, %%mm7 \n\t" */ \
1180 "movq (%0), %%mm0 \n\t" \
1181 "movq 8(%0), %%mm1 \n\t" \
1182 "movq 16(%0), %%mm2 \n\t" \
1183 "movq 24(%0), %%mm3 \n\t" \
1184 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1185 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1187 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1189 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1191 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1193 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1195 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1196 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1198 "add $72, %0 \n\t" \
1203 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1204 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1205 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1206 "g"(4 - 6 * (x86_reg)dstStride) \
1211 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1214 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
1217 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1221 uint8_t * const half = (uint8_t*)temp; \
1222 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1224 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1227 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1230 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1234 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1238 uint8_t * const half = (uint8_t*)temp; \
1239 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1241 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
1245 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1249 uint8_t * const half = (uint8_t*)temp; \
1250 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1251 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1254 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1257 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1260 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1264 uint8_t * const half = (uint8_t*)temp; \
1265 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1266 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
1270 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1273 uint64_t half[8 + 9]; \
1274 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1275 uint8_t * const halfHV = ((uint8_t*)half); \
1276 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1278 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1279 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1280 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1283 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1286 uint64_t half[8 + 9]; \
1287 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1288 uint8_t * const halfHV = ((uint8_t*)half); \
1289 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1291 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1293 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1294 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1297 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1300 uint64_t half[8 + 9]; \
1301 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1302 uint8_t * const halfHV = ((uint8_t*)half); \
1303 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1305 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1306 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1307 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1310 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1313 uint64_t half[8 + 9]; \
1314 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1315 uint8_t * const halfHV = ((uint8_t*)half); \
1316 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1318 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1320 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1321 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1324 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1327 uint64_t half[8 + 9]; \
1328 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1329 uint8_t * const halfHV = ((uint8_t*)half); \
1330 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1332 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1333 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1336 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1339 uint64_t half[8 + 9]; \
1340 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1341 uint8_t * const halfHV = ((uint8_t*)half); \
1342 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1344 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1345 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1348 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1351 uint64_t half[8 + 9]; \
1352 uint8_t * const halfH = ((uint8_t*)half); \
1353 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1355 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1356 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1359 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1362 uint64_t half[8 + 9]; \
1363 uint8_t * const halfH = ((uint8_t*)half); \
1364 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1366 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1368 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1371 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1375 uint8_t * const halfH = ((uint8_t*)half); \
1376 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1378 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1381 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1384 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1387 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1390 uint64_t temp[32]; \
1391 uint8_t * const half = (uint8_t*)temp; \
1392 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1394 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1397 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1400 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1401 stride, stride, 16); \
1404 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1407 uint64_t temp[32]; \
1408 uint8_t * const half = (uint8_t*)temp; \
1409 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1411 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1412 stride, stride, 16); \
1415 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1418 uint64_t temp[32]; \
1419 uint8_t * const half = (uint8_t*)temp; \
1420 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1422 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1425 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1428 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1431 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1434 uint64_t temp[32]; \
1435 uint8_t * const half = (uint8_t*)temp; \
1436 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1438 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1439 stride, stride, 16); \
1442 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1445 uint64_t half[16 * 2 + 17 * 2]; \
1446 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1447 uint8_t * const halfHV = ((uint8_t*)half); \
1448 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1450 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1452 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1454 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1457 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1460 uint64_t half[16 * 2 + 17 * 2]; \
1461 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1462 uint8_t * const halfHV = ((uint8_t*)half); \
1463 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1465 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1467 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1469 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1472 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1475 uint64_t half[16 * 2 + 17 * 2]; \
1476 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1477 uint8_t * const halfHV = ((uint8_t*)half); \
1478 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1480 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1482 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1484 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1488 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1491 uint64_t half[16 * 2 + 17 * 2]; \
1492 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1493 uint8_t * const halfHV = ((uint8_t*)half); \
1494 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1496 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1498 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1500 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1504 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1507 uint64_t half[16 * 2 + 17 * 2]; \
1508 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1509 uint8_t * const halfHV = ((uint8_t*)half); \
1510 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1512 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1514 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1517 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1520 uint64_t half[16 * 2 + 17 * 2]; \
1521 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1522 uint8_t * const halfHV = ((uint8_t*)half); \
1523 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1525 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1527 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1531 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1534 uint64_t half[17 * 2]; \
1535 uint8_t * const halfH = ((uint8_t*)half); \
1536 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1538 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1540 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1543 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1546 uint64_t half[17 * 2]; \
1547 uint8_t * const halfH = ((uint8_t*)half); \
1548 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1550 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1552 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1555 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1558 uint64_t half[17 * 2]; \
1559 uint8_t * const halfH = ((uint8_t*)half); \
1560 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1562 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1565 #define PUT_OP(a, b, temp, size) \
1566 "mov"#size" "#a", "#b" \n\t"
1568 #define AVG_MMXEXT_OP(a, b, temp, size) \
1569 "mov"#size" "#b", "#temp" \n\t" \
1570 "pavgb "#temp", "#a" \n\t" \
1571 "mov"#size" "#a", "#b" \n\t"
1573 QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
1574 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
1575 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1576 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1577 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1578 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1580 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1582 put_pixels8_xy2_mmx(dst, src, stride, 8);
1584 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1586 put_pixels16_xy2_mmx(dst, src, stride, 16);
1588 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1590 avg_pixels8_xy2_mmx(dst, src, stride, 8);
1592 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1594 avg_pixels16_xy2_mmx(dst, src, stride, 16);
1597 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1598 ptrdiff_t linesize, int block_w, int block_h,
1599 int src_x, int src_y, int w, int h);
1601 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1602 int stride, int h, int ox, int oy,
1603 int dxx, int dxy, int dyx, int dyy,
1604 int shift, int r, int width, int height,
1605 emulated_edge_mc_func *emu_edge_fn)
1608 const int ix = ox >> (16 + shift);
1609 const int iy = oy >> (16 + shift);
1610 const int oxs = ox >> 4;
1611 const int oys = oy >> 4;
1612 const int dxxs = dxx >> 4;
1613 const int dxys = dxy >> 4;
1614 const int dyxs = dyx >> 4;
1615 const int dyys = dyy >> 4;
1616 const uint16_t r4[4] = { r, r, r, r };
1617 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1618 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1619 const uint64_t shift2 = 2 * shift;
1620 #define MAX_STRIDE 4096U
1622 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
1625 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1626 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1627 const int dxh = dxy * (h - 1);
1628 const int dyw = dyx * (w - 1);
1629 int need_emu = (unsigned)ix >= width - w ||
1630 (unsigned)iy >= height - h;
1632 if ( // non-constant fullpel offset (3% of blocks)
1633 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1634 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1635 // uses more than 16 bits of subpel mv (only at huge resolution)
1636 || (dxx | dxy | dyx | dyy) & 15
1637 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1638 // FIXME could still use mmx for some of the rows
1639 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1640 shift, r, width, height);
1644 src += ix + iy * stride;
1646 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1651 "movd %0, %%mm6 \n\t"
1652 "pxor %%mm7, %%mm7 \n\t"
1653 "punpcklwd %%mm6, %%mm6 \n\t"
1654 "punpcklwd %%mm6, %%mm6 \n\t"
1658 for (x = 0; x < w; x += 4) {
1659 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1660 oxs - dxys + dxxs * (x + 1),
1661 oxs - dxys + dxxs * (x + 2),
1662 oxs - dxys + dxxs * (x + 3) };
1663 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1664 oys - dyys + dyxs * (x + 1),
1665 oys - dyys + dyxs * (x + 2),
1666 oys - dyys + dyxs * (x + 3) };
1668 for (y = 0; y < h; y++) {
1670 "movq %0, %%mm4 \n\t"
1671 "movq %1, %%mm5 \n\t"
1672 "paddw %2, %%mm4 \n\t"
1673 "paddw %3, %%mm5 \n\t"
1674 "movq %%mm4, %0 \n\t"
1675 "movq %%mm5, %1 \n\t"
1676 "psrlw $12, %%mm4 \n\t"
1677 "psrlw $12, %%mm5 \n\t"
1678 : "+m"(*dx4), "+m"(*dy4)
1679 : "m"(*dxy4), "m"(*dyy4)
1683 "movq %%mm6, %%mm2 \n\t"
1684 "movq %%mm6, %%mm1 \n\t"
1685 "psubw %%mm4, %%mm2 \n\t"
1686 "psubw %%mm5, %%mm1 \n\t"
1687 "movq %%mm2, %%mm0 \n\t"
1688 "movq %%mm4, %%mm3 \n\t"
1689 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1690 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1691 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1692 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1694 "movd %4, %%mm5 \n\t"
1695 "movd %3, %%mm4 \n\t"
1696 "punpcklbw %%mm7, %%mm5 \n\t"
1697 "punpcklbw %%mm7, %%mm4 \n\t"
1698 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1699 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1701 "movd %2, %%mm5 \n\t"
1702 "movd %1, %%mm4 \n\t"
1703 "punpcklbw %%mm7, %%mm5 \n\t"
1704 "punpcklbw %%mm7, %%mm4 \n\t"
1705 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1706 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1707 "paddw %5, %%mm1 \n\t"
1708 "paddw %%mm3, %%mm2 \n\t"
1709 "paddw %%mm1, %%mm0 \n\t"
1710 "paddw %%mm2, %%mm0 \n\t"
1712 "psrlw %6, %%mm0 \n\t"
1713 "packuswb %%mm0, %%mm0 \n\t"
1714 "movd %%mm0, %0 \n\t"
1716 : "=m"(dst[x + y * stride])
1717 : "m"(src[0]), "m"(src[1]),
1718 "m"(src[stride]), "m"(src[stride + 1]),
1719 "m"(*r4), "m"(shift2)
1723 src += 4 - h * stride;
1730 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1731 int stride, int h, int ox, int oy,
1732 int dxx, int dxy, int dyx, int dyy,
1733 int shift, int r, int width, int height)
1735 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1736 width, height, &ff_emulated_edge_mc_8);
1739 static void gmc_sse(uint8_t *dst, uint8_t *src,
1740 int stride, int h, int ox, int oy,
1741 int dxx, int dxy, int dyx, int dyy,
1742 int shift, int r, int width, int height)
1744 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1745 width, height, &ff_emulated_edge_mc_8);
1748 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1749 int stride, int h, int ox, int oy,
1750 int dxx, int dxy, int dyx, int dyy,
1751 int shift, int r, int width, int height)
1753 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1754 width, height, &ff_emulated_edge_mc_8);
1759 #endif /* HAVE_INLINE_ASM */
1761 #include "h264_qpel.c"
1763 void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
1764 int stride, int h, int x, int y);
1765 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1766 int stride, int h, int x, int y);
1767 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1768 int stride, int h, int x, int y);
1770 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1771 int stride, int h, int x, int y);
1772 void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
1773 int stride, int h, int x, int y);
1774 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1775 int stride, int h, int x, int y);
1777 void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1778 int stride, int h, int x, int y);
1779 void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1780 int stride, int h, int x, int y);
1782 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1783 int stride, int h, int x, int y);
1784 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1785 int stride, int h, int x, int y);
1787 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1788 int stride, int h, int x, int y);
1789 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1790 int stride, int h, int x, int y);
1792 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1793 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1794 (uint8_t *dst, uint8_t *src, \
1795 int stride, int h, int x, int y);
1797 CHROMA_MC(put, 2, 10, mmxext)
1798 CHROMA_MC(avg, 2, 10, mmxext)
1799 CHROMA_MC(put, 4, 10, mmxext)
1800 CHROMA_MC(avg, 4, 10, mmxext)
1801 CHROMA_MC(put, 8, 10, sse2)
1802 CHROMA_MC(avg, 8, 10, sse2)
1803 CHROMA_MC(put, 8, 10, avx)
1804 CHROMA_MC(avg, 8, 10, avx)
1809 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1811 put_pixels8_mmx(dst, src, stride, 8);
1814 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1816 avg_pixels8_mmx(dst, src, stride, 8);
1819 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1821 put_pixels16_mmx(dst, src, stride, 16);
1824 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1826 avg_pixels16_mmx(dst, src, stride, 16);
1830 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1831 int stride, int rnd)
1833 put_pixels8_mmx(dst, src, stride, 8);
1836 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1837 int stride, int rnd)
1839 avg_pixels8_mmxext(dst, src, stride, 8);
1842 #if CONFIG_DIRAC_DECODER
1843 #define DIRAC_PIXOP(OPNAME, EXT)\
1844 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1846 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1848 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1850 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1852 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1854 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1855 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
1858 DIRAC_PIXOP(put, mmx)
1859 DIRAC_PIXOP(avg, mmx)
1860 DIRAC_PIXOP(avg, mmxext)
1863 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1865 ff_put_pixels16_sse2(dst, src[0], stride, h);
1867 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1869 ff_avg_pixels16_sse2(dst, src[0], stride, h);
1871 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1873 ff_put_pixels16_sse2(dst , src[0] , stride, h);
1874 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1876 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1878 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1879 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1884 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
1887 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1891 ff_put_pixels_clamped_mmx(block, dest, line_size);
1894 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1898 ff_add_pixels_clamped_mmx(block, dest, line_size);
1901 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1904 ff_mmxext_idct(block);
1905 ff_put_pixels_clamped_mmx(block, dest, line_size);
1908 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1911 ff_mmxext_idct(block);
1912 ff_add_pixels_clamped_mmx(block, dest, line_size);
1916 static void vector_clipf_sse(float *dst, const float *src,
1917 float min, float max, int len)
1919 x86_reg i = (len - 16) * 4;
1921 "movss %3, %%xmm4 \n\t"
1922 "movss %4, %%xmm5 \n\t"
1923 "shufps $0, %%xmm4, %%xmm4 \n\t"
1924 "shufps $0, %%xmm5, %%xmm5 \n\t"
1926 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1927 "movaps 16(%2, %0), %%xmm1 \n\t"
1928 "movaps 32(%2, %0), %%xmm2 \n\t"
1929 "movaps 48(%2, %0), %%xmm3 \n\t"
1930 "maxps %%xmm4, %%xmm0 \n\t"
1931 "maxps %%xmm4, %%xmm1 \n\t"
1932 "maxps %%xmm4, %%xmm2 \n\t"
1933 "maxps %%xmm4, %%xmm3 \n\t"
1934 "minps %%xmm5, %%xmm0 \n\t"
1935 "minps %%xmm5, %%xmm1 \n\t"
1936 "minps %%xmm5, %%xmm2 \n\t"
1937 "minps %%xmm5, %%xmm3 \n\t"
1938 "movaps %%xmm0, (%1, %0) \n\t"
1939 "movaps %%xmm1, 16(%1, %0) \n\t"
1940 "movaps %%xmm2, 32(%1, %0) \n\t"
1941 "movaps %%xmm3, 48(%1, %0) \n\t"
1945 : "r"(dst), "r"(src), "m"(min), "m"(max)
1950 #endif /* HAVE_INLINE_ASM */
1952 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1954 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1956 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1958 int order, int mul);
1959 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1961 int order, int mul);
1962 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1964 int order, int mul);
1966 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1967 const int16_t *window, unsigned int len);
1968 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1969 const int16_t *window, unsigned int len);
1970 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1971 const int16_t *window, unsigned int len);
1972 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1973 const int16_t *window, unsigned int len);
1974 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1975 const int16_t *window, unsigned int len);
1976 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1977 const int16_t *window, unsigned int len);
1979 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1980 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1982 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1983 const uint8_t *diff, int w,
1984 int *left, int *left_top);
1985 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1987 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1990 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1991 int32_t min, int32_t max, unsigned int len);
1992 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1993 int32_t min, int32_t max, unsigned int len);
1994 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1995 int32_t min, int32_t max, unsigned int len);
1996 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1997 int32_t min, int32_t max, unsigned int len);
1999 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2001 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2002 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2003 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2004 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2005 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2006 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2007 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2008 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2009 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2010 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2011 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2012 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2013 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2014 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2015 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2016 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2019 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2021 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2022 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2023 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2024 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2027 #define H264_QPEL_FUNCS(x, y, CPU) \
2029 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2030 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2031 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2032 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2035 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2037 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2038 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2039 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2040 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2043 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2045 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2048 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2049 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2050 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2052 if (!high_bit_depth) {
2053 c->clear_block = clear_block_mmx;
2054 c->clear_blocks = clear_blocks_mmx;
2055 c->draw_edges = draw_edges_mmx;
2057 SET_HPEL_FUNCS(put, 0, 16, mmx);
2058 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2059 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2060 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2061 SET_HPEL_FUNCS(put, 1, 8, mmx);
2062 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2063 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2064 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2067 #if ARCH_X86_32 || !HAVE_YASM
2071 c->add_bytes = add_bytes_mmx;
2073 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2074 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2075 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2077 #endif /* HAVE_INLINE_ASM */
2080 if (!high_bit_depth && CONFIG_H264CHROMA) {
2081 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
2082 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2085 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2090 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
2093 const int bit_depth = avctx->bits_per_raw_sample;
2094 const int high_bit_depth = bit_depth > 8;
2097 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
2098 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
2100 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
2101 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
2102 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
2103 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
2105 if (!high_bit_depth) {
2106 c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
2107 c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
2109 c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
2110 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
2111 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
2113 c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
2114 c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
2116 c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
2117 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
2118 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
2121 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2122 if (!high_bit_depth) {
2123 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
2124 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
2125 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
2126 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2128 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2129 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
2133 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2134 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2135 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2136 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2138 #endif /* HAVE_INLINE_ASM */
2140 #if HAVE_MMXEXT_EXTERNAL
2141 if (CONFIG_H264QPEL) {
2142 if (!high_bit_depth) {
2143 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2144 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
2145 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
2146 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2147 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
2148 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
2149 } else if (bit_depth == 10) {
2151 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2152 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2153 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2154 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2156 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2157 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2161 if (!high_bit_depth && CONFIG_H264CHROMA) {
2162 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
2163 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
2164 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
2165 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
2167 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2168 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2169 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2170 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2171 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2174 /* slower than cmov version on AMD */
2175 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2176 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
2178 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
2179 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
2181 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2182 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2184 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
2186 #endif /* HAVE_MMXEXT_EXTERNAL */
2189 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2192 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2195 if (!high_bit_depth) {
2196 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2197 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2199 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2200 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2201 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2203 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2204 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2206 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2207 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2208 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2210 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2211 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2212 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2213 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2214 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2216 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2217 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2221 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2222 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2223 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2224 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2226 #endif /* HAVE_INLINE_ASM */
2229 if (!high_bit_depth && CONFIG_H264CHROMA) {
2230 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
2231 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2233 #endif /* HAVE_YASM */
2236 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2238 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2241 if (!high_bit_depth) {
2242 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2243 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2244 c->clear_block = clear_block_sse;
2245 c->clear_blocks = clear_blocks_sse;
2249 c->vector_clipf = vector_clipf_sse;
2250 #endif /* HAVE_INLINE_ASM */
2253 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
2256 #endif /* HAVE_YASM */
2259 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2262 const int bit_depth = avctx->bits_per_raw_sample;
2263 const int high_bit_depth = bit_depth > 8;
2265 #if HAVE_SSE2_INLINE
2266 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2267 c->idct_put = ff_idct_xvid_sse2_put;
2268 c->idct_add = ff_idct_xvid_sse2_add;
2269 c->idct = ff_idct_xvid_sse2;
2270 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2272 #endif /* HAVE_SSE2_INLINE */
2274 #if HAVE_SSE2_EXTERNAL
2275 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2276 // these functions are slower than mmx on AMD, but faster on Intel
2277 if (!high_bit_depth) {
2278 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
2279 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2280 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
2281 if (CONFIG_H264QPEL)
2282 H264_QPEL_FUNCS(0, 0, sse2);
2286 if (!high_bit_depth && CONFIG_H264QPEL) {
2287 H264_QPEL_FUNCS(0, 1, sse2);
2288 H264_QPEL_FUNCS(0, 2, sse2);
2289 H264_QPEL_FUNCS(0, 3, sse2);
2290 H264_QPEL_FUNCS(1, 1, sse2);
2291 H264_QPEL_FUNCS(1, 2, sse2);
2292 H264_QPEL_FUNCS(1, 3, sse2);
2293 H264_QPEL_FUNCS(2, 1, sse2);
2294 H264_QPEL_FUNCS(2, 2, sse2);
2295 H264_QPEL_FUNCS(2, 3, sse2);
2296 H264_QPEL_FUNCS(3, 1, sse2);
2297 H264_QPEL_FUNCS(3, 2, sse2);
2298 H264_QPEL_FUNCS(3, 3, sse2);
2301 if (bit_depth == 10) {
2302 if (CONFIG_H264QPEL) {
2303 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2304 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2305 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2306 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2307 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2308 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2309 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2311 if (CONFIG_H264CHROMA) {
2312 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2313 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2317 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2318 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2319 if (mm_flags & AV_CPU_FLAG_ATOM) {
2320 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2322 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2324 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2325 c->apply_window_int16 = ff_apply_window_int16_sse2;
2326 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2327 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
2329 c->bswap_buf = ff_bswap32_buf_sse2;
2330 #endif /* HAVE_SSE2_EXTERNAL */
2333 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2336 #if HAVE_SSSE3_EXTERNAL
2337 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2338 const int bit_depth = avctx->bits_per_raw_sample;
2340 if (!high_bit_depth && CONFIG_H264QPEL) {
2341 H264_QPEL_FUNCS(1, 0, ssse3);
2342 H264_QPEL_FUNCS(1, 1, ssse3);
2343 H264_QPEL_FUNCS(1, 2, ssse3);
2344 H264_QPEL_FUNCS(1, 3, ssse3);
2345 H264_QPEL_FUNCS(2, 0, ssse3);
2346 H264_QPEL_FUNCS(2, 1, ssse3);
2347 H264_QPEL_FUNCS(2, 2, ssse3);
2348 H264_QPEL_FUNCS(2, 3, ssse3);
2349 H264_QPEL_FUNCS(3, 0, ssse3);
2350 H264_QPEL_FUNCS(3, 1, ssse3);
2351 H264_QPEL_FUNCS(3, 2, ssse3);
2352 H264_QPEL_FUNCS(3, 3, ssse3);
2354 if (bit_depth == 10 && CONFIG_H264QPEL) {
2355 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2356 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2357 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2359 if (!high_bit_depth && CONFIG_H264CHROMA) {
2360 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2361 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2362 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2363 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2365 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2366 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2367 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2369 if (mm_flags & AV_CPU_FLAG_ATOM)
2370 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2372 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2373 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2374 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2375 c->bswap_buf = ff_bswap32_buf_ssse3;
2376 #endif /* HAVE_SSSE3_EXTERNAL */
2379 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2382 #if HAVE_SSE4_EXTERNAL
2383 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2384 #endif /* HAVE_SSE4_EXTERNAL */
2387 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2389 #if HAVE_AVX_EXTERNAL
2390 const int bit_depth = avctx->bits_per_raw_sample;
2392 if (bit_depth == 10) {
2393 // AVX implies !cache64.
2394 // TODO: Port cache(32|64) detection from x264.
2395 if (CONFIG_H264QPEL) {
2396 H264_QPEL_FUNCS_10(1, 0, sse2);
2397 H264_QPEL_FUNCS_10(2, 0, sse2);
2398 H264_QPEL_FUNCS_10(3, 0, sse2);
2401 if (CONFIG_H264CHROMA) {
2402 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2403 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2406 #endif /* HAVE_AVX_EXTERNAL */
2409 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2411 int mm_flags = av_get_cpu_flags();
2413 #if HAVE_7REGS && HAVE_INLINE_ASM
2414 if (mm_flags & AV_CPU_FLAG_CMOV)
2415 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2418 if (mm_flags & AV_CPU_FLAG_MMX) {
2420 const int idct_algo = avctx->idct_algo;
2422 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2423 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
2424 c->idct_put = ff_simple_idct_put_mmx;
2425 c->idct_add = ff_simple_idct_add_mmx;
2426 c->idct = ff_simple_idct_mmx;
2427 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
2429 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
2430 if (mm_flags & AV_CPU_FLAG_MMX2) {
2431 c->idct_put = ff_libmpeg2mmx2_idct_put;
2432 c->idct_add = ff_libmpeg2mmx2_idct_add;
2433 c->idct = ff_mmxext_idct;
2435 c->idct_put = ff_libmpeg2mmx_idct_put;
2436 c->idct_add = ff_libmpeg2mmx_idct_add;
2437 c->idct = ff_mmx_idct;
2439 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2441 } else if (idct_algo == FF_IDCT_XVIDMMX) {
2442 if (mm_flags & AV_CPU_FLAG_SSE2) {
2443 c->idct_put = ff_idct_xvid_sse2_put;
2444 c->idct_add = ff_idct_xvid_sse2_add;
2445 c->idct = ff_idct_xvid_sse2;
2446 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2447 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
2448 c->idct_put = ff_idct_xvid_mmxext_put;
2449 c->idct_add = ff_idct_xvid_mmxext_add;
2450 c->idct = ff_idct_xvid_mmxext;
2452 c->idct_put = ff_idct_xvid_mmx_put;
2453 c->idct_add = ff_idct_xvid_mmx_add;
2454 c->idct = ff_idct_xvid_mmx;
2458 #endif /* HAVE_INLINE_ASM */
2460 dsputil_init_mmx(c, avctx, mm_flags);
2463 if (mm_flags & AV_CPU_FLAG_MMXEXT)
2464 dsputil_init_mmxext(c, avctx, mm_flags);
2466 if (mm_flags & AV_CPU_FLAG_3DNOW)
2467 dsputil_init_3dnow(c, avctx, mm_flags);
2469 if (mm_flags & AV_CPU_FLAG_SSE)
2470 dsputil_init_sse(c, avctx, mm_flags);
2472 if (mm_flags & AV_CPU_FLAG_SSE2)
2473 dsputil_init_sse2(c, avctx, mm_flags);
2475 if (mm_flags & AV_CPU_FLAG_SSSE3)
2476 dsputil_init_ssse3(c, avctx, mm_flags);
2478 if (mm_flags & AV_CPU_FLAG_SSE4)
2479 dsputil_init_sse4(c, avctx, mm_flags);
2481 if (mm_flags & AV_CPU_FLAG_AVX)
2482 dsputil_init_avx(c, avctx, mm_flags);
2484 if (CONFIG_ENCODERS)
2485 ff_dsputilenc_init_mmx(c, avctx);