/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd" \n\t"             \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to access constants this way
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd" \n\t"             \
        "psrlw $15, %%"#regd" \n\t"                     \
        "packuswb %%"#regd", %%"#regd" \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd" \n\t"             \
        "psrlw $15, %%"#regd" \n\t"                     \
        "psllw $1, %%"#regd" \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq "#rega", "#regr"  \n\t"                       \
    "pand "#regb", "#regr"  \n\t"                       \
    "pxor "#rega", "#regb"  \n\t"                       \
    "pand "#regfe", "#regb" \n\t"                       \
    "psrlq $1, "#regb"      \n\t"                       \
    "paddb "#regb", "#regr" \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq "#rega", "#regr"  \n\t"                       \
    "por "#regb", "#regr"   \n\t"                       \
    "pxor "#rega", "#regb"  \n\t"                       \
    "pand "#regfe", "#regb" \n\t"                       \
    "psrlq $1, "#regb"      \n\t"                       \
    "psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq "#rega", "#regr"  \n\t"                               \
    "movq "#regc", "#regp"  \n\t"                               \
    "pand "#regb", "#regr"  \n\t"                               \
    "pand "#regd", "#regp"  \n\t"                               \
    "pxor "#rega", "#regb"  \n\t"                               \
    "pxor "#regc", "#regd"  \n\t"                               \
    "pand %%mm6, "#regb"    \n\t"                               \
    "pand %%mm6, "#regd"    \n\t"                               \
    "psrlq $1, "#regb"      \n\t"                               \
    "psrlq $1, "#regd"      \n\t"                               \
    "paddb "#regb", "#regr" \n\t"                               \
    "paddb "#regd", "#regp" \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq "#rega", "#regr"  \n\t"                               \
    "movq "#regc", "#regp"  \n\t"                               \
    "por  "#regb", "#regr"  \n\t"                               \
    "por  "#regd", "#regp"  \n\t"                               \
    "pxor "#rega", "#regb"  \n\t"                               \
    "pxor "#regc", "#regd"  \n\t"                               \
    "pand %%mm6, "#regb"    \n\t"                               \
    "pand %%mm6, "#regd"    \n\t"                               \
    "psrlq $1, "#regd"      \n\t"                               \
    "psrlq $1, "#regb"      \n\t"                               \
    "psubb "#regb", "#regr" \n\t"                               \
    "psubb "#regd", "#regp" \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define SKIP_FOR_3DNOW

#include "dsputil_avg_template.c"

#undef SKIP_FOR_3DNOW

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

/* Introduced only in the MMXEXT set */
#define PAVGB "pavgb"

#include "dsputil_avg_template.c"

#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
/***********************************/

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
    /* read the pixels */
        "movq (%3), %%mm0          \n\t"
        "movq 8(%3), %%mm1         \n\t"
        "movq 16(%3), %%mm2        \n\t"
        "movq 24(%3), %%mm3        \n\t"
        "movq 32(%3), %%mm4        \n\t"
        "movq 40(%3), %%mm5        \n\t"
        "movq 48(%3), %%mm6        \n\t"
        "movq 56(%3), %%mm7        \n\t"
        "packuswb %%mm1, %%mm0     \n\t"
        "packuswb %%mm3, %%mm2     \n\t"
        "packuswb %%mm5, %%mm4     \n\t"
        "packuswb %%mm7, %%mm6     \n\t"
        "movq %%mm0, (%0)          \n\t"
        "movq %%mm2, (%0, %1)      \n\t"
        "movq %%mm4, (%0, %1, 2)   \n\t"
        "movq %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
    pix += line_size * 4;
    // If this were an exact copy of the code above, the compiler
    // would generate some very strange code.
269 "movq (%3), %%mm0 \n\t"
270 "movq 8(%3), %%mm1 \n\t"
271 "movq 16(%3), %%mm2 \n\t"
272 "movq 24(%3), %%mm3 \n\t"
273 "movq 32(%3), %%mm4 \n\t"
274 "movq 40(%3), %%mm5 \n\t"
275 "movq 48(%3), %%mm6 \n\t"
276 "movq 56(%3), %%mm7 \n\t"
277 "packuswb %%mm1, %%mm0 \n\t"
278 "packuswb %%mm3, %%mm2 \n\t"
279 "packuswb %%mm5, %%mm4 \n\t"
280 "packuswb %%mm7, %%mm6 \n\t"
281 "movq %%mm0, (%0) \n\t"
282 "movq %%mm2, (%0, %1) \n\t"
283 "movq %%mm4, (%0, %1, 2) \n\t"
284 "movq %%mm6, (%0, %2) \n\t"
285 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
#define put_signed_pixels_clamped_mmx_half(off)         \
    "movq "#off"(%2), %%mm1        \n\t"                \
    "movq 16 + "#off"(%2), %%mm2   \n\t"                \
    "movq 32 + "#off"(%2), %%mm3   \n\t"                \
    "movq 48 + "#off"(%2), %%mm4   \n\t"                \
    "packsswb 8 + "#off"(%2), %%mm1  \n\t"              \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t"              \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t"              \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t"              \
    "paddb %%mm0, %%mm1            \n\t"                \
    "paddb %%mm0, %%mm2            \n\t"                \
    "paddb %%mm0, %%mm3            \n\t"                \
    "paddb %%mm0, %%mm4            \n\t"                \
    "movq %%mm1, (%0)              \n\t"                \
    "movq %%mm2, (%0, %3)          \n\t"                \
    "movq %%mm3, (%0, %3, 2)       \n\t"                \
    "movq %%mm4, (%0, %1)          \n\t"
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1            \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0            \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
    /* read the pixels */
        "movq (%2), %%mm0          \n\t"
        "movq 8(%2), %%mm1         \n\t"
        "movq 16(%2), %%mm2        \n\t"
        "movq 24(%2), %%mm3        \n\t"
        "movq %0, %%mm4            \n\t"
        "movq %1, %%mm6            \n\t"
        "movq %%mm4, %%mm5         \n\t"
        "punpcklbw %%mm7, %%mm4    \n\t"
        "punpckhbw %%mm7, %%mm5    \n\t"
        "paddsw %%mm4, %%mm0       \n\t"
        "paddsw %%mm5, %%mm1       \n\t"
        "movq %%mm6, %%mm5         \n\t"
        "punpcklbw %%mm7, %%mm6    \n\t"
        "punpckhbw %%mm7, %%mm5    \n\t"
        "paddsw %%mm6, %%mm2       \n\t"
        "paddsw %%mm5, %%mm3       \n\t"
        "packuswb %%mm1, %%mm0     \n\t"
        "packuswb %%mm3, %%mm2     \n\t"
        "movq %%mm0, %0            \n\t"
        "movq %%mm2, %1            \n\t"
        : "+m"(*pix), "+m"(*(pix + line_size))
    pix += line_size * 2;
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
        "lea (%3, %3), %%"REG_a"   \n\t"
        "movq (%1), %%mm0          \n\t"
        "movq (%1, %3), %%mm1      \n\t"
        "movq %%mm0, (%2)          \n\t"
        "movq %%mm1, (%2, %3)      \n\t"
        "add %%"REG_a", %1         \n\t"
        "add %%"REG_a", %2         \n\t"
        "movq (%1), %%mm0          \n\t"
        "movq (%1, %3), %%mm1      \n\t"
        "movq %%mm0, (%2)          \n\t"
        "movq %%mm1, (%2, %3)      \n\t"
        "add %%"REG_a", %1         \n\t"
        "add %%"REG_a", %2         \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
        "lea (%3, %3), %%"REG_a"   \n\t"
        "movq (%1), %%mm0          \n\t"
        "movq 8(%1), %%mm4         \n\t"
        "movq (%1, %3), %%mm1      \n\t"
        "movq 8(%1, %3), %%mm5     \n\t"
        "movq %%mm0, (%2)          \n\t"
        "movq %%mm4, 8(%2)         \n\t"
        "movq %%mm1, (%2, %3)      \n\t"
        "movq %%mm5, 8(%2, %3)     \n\t"
        "add %%"REG_a", %1         \n\t"
        "add %%"REG_a", %2         \n\t"
        "movq (%1), %%mm0          \n\t"
        "movq 8(%1), %%mm4         \n\t"
        "movq (%1, %3), %%mm1      \n\t"
        "movq 8(%1, %3), %%mm5     \n\t"
        "movq %%mm0, (%2)          \n\t"
        "movq %%mm4, 8(%2)         \n\t"
        "movq %%mm1, (%2, %3)      \n\t"
        "movq %%mm5, 8(%2, %3)     \n\t"
        "add %%"REG_a", %1         \n\t"
        "add %%"REG_a", %2         \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
        "pxor %%mm7, %%mm7                \n\t"         \
        "mov %1, %%"REG_a"                \n\t"         \
        "movq %%mm7, (%0, %%"REG_a")      \n\t"         \
        "movq %%mm7, 8(%0, %%"REG_a")     \n\t"         \
        "movq %%mm7, 16(%0, %%"REG_a")    \n\t"         \
        "movq %%mm7, 24(%0, %%"REG_a")    \n\t"         \
        "add $32, %%"REG_a"               \n\t"         \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \

CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
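/* A scalar sketch (illustration only, not used below): each DCT block is
 * 64 DCTELEMs, i.e. 128 bytes, so the MMX loop above amounts to zeroing
 * n consecutive blocks: */
static inline void clear_blocks_ref(DCTELEM *blocks, int n)
{
    memset(blocks, 0, n * 64 * sizeof(DCTELEM));
}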
static void clear_block_sse(DCTELEM *block)
        "xorps %%xmm0, %%xmm0      \n"
        "movaps %%xmm0, (%0)       \n"
        "movaps %%xmm0, 16(%0)     \n"
        "movaps %%xmm0, 32(%0)     \n"
        "movaps %%xmm0, 48(%0)     \n"
        "movaps %%xmm0, 64(%0)     \n"
        "movaps %%xmm0, 80(%0)     \n"
        "movaps %%xmm0, 96(%0)     \n"
        "movaps %%xmm0, 112(%0)    \n"
static void clear_blocks_sse(DCTELEM *blocks)
        "xorps %%xmm0, %%xmm0              \n"
        "mov %1, %%"REG_a"                 \n"
        "movaps %%xmm0, (%0, %%"REG_a")    \n"
        "movaps %%xmm0, 16(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 32(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 48(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 64(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 80(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a"               \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
        "movq (%1, %0), %%mm0      \n\t"
        "movq (%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1        \n\t"
        "movq %%mm1, (%2, %0)      \n\t"
        "movq 8(%1, %0), %%mm0     \n\t"
        "movq 8(%2, %0), %%mm1     \n\t"
        "paddb %%mm0, %%mm1        \n\t"
        "movq %%mm1, 8(%2, %0)     \n\t"
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        dst[i + 0] += src[i + 0];
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;

        "movzbl (%3, %4), %2       \n"
        "add (%6, %4), %b0         \n"
        "mov %b0, (%5, %4)         \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd (%1), %%mm0          \n\t"
        "movd (%1), %%mm1          \n\t"
        "movd (%1, %3, 1), %%mm2   \n\t"
        "movd (%1, %3, 2), %%mm3   \n\t"
        "punpcklbw %%mm1, %%mm0    \n\t"
        "punpcklbw %%mm3, %%mm2    \n\t"
        "movq %%mm0, %%mm1         \n\t"
        "punpcklwd %%mm2, %%mm0    \n\t"
        "punpckhwd %%mm2, %%mm1    \n\t"
        "movd %%mm0, (%0)          \n\t"
        "punpckhdq %%mm0, %%mm0    \n\t"
        "movd %%mm0, (%0)          \n\t"
        "movd %%mm1, (%0, %2, 1)   \n\t"
        "punpckhdq %%mm1, %%mm1    \n\t"
        "movd %%mm1, (%0, %2, 2)   \n\t"
#define H263_LOOP_FILTER                        \
    "pxor %%mm7, %%mm7          \n\t"           \
    "movq %0, %%mm0             \n\t"           \
    "movq %0, %%mm1             \n\t"           \
    "movq %3, %%mm2             \n\t"           \
    "movq %3, %%mm3             \n\t"           \
    "punpcklbw %%mm7, %%mm0     \n\t"           \
    "punpckhbw %%mm7, %%mm1     \n\t"           \
    "punpcklbw %%mm7, %%mm2     \n\t"           \
    "punpckhbw %%mm7, %%mm3     \n\t"           \
    "psubw %%mm2, %%mm0         \n\t"           \
    "psubw %%mm3, %%mm1         \n\t"           \
    "movq %1, %%mm2             \n\t"           \
    "movq %1, %%mm3             \n\t"           \
    "movq %2, %%mm4             \n\t"           \
    "movq %2, %%mm5             \n\t"           \
    "punpcklbw %%mm7, %%mm2     \n\t"           \
    "punpckhbw %%mm7, %%mm3     \n\t"           \
    "punpcklbw %%mm7, %%mm4     \n\t"           \
    "punpckhbw %%mm7, %%mm5     \n\t"           \
    "psubw %%mm2, %%mm4         \n\t"           \
    "psubw %%mm3, %%mm5         \n\t"           \
    "psllw $2, %%mm4            \n\t"           \
    "psllw $2, %%mm5            \n\t"           \
    "paddw %%mm0, %%mm4         \n\t"           \
    "paddw %%mm1, %%mm5         \n\t"           \
    "pxor %%mm6, %%mm6          \n\t"           \
    "pcmpgtw %%mm4, %%mm6       \n\t"           \
    "pcmpgtw %%mm5, %%mm7       \n\t"           \
    "pxor %%mm6, %%mm4          \n\t"           \
    "pxor %%mm7, %%mm5          \n\t"           \
    "psubw %%mm6, %%mm4         \n\t"           \
    "psubw %%mm7, %%mm5         \n\t"           \
    "psrlw $3, %%mm4            \n\t"           \
    "psrlw $3, %%mm5            \n\t"           \
    "packuswb %%mm5, %%mm4      \n\t"           \
    "packsswb %%mm7, %%mm6      \n\t"           \
    "pxor %%mm7, %%mm7          \n\t"           \
    "movd %4, %%mm2             \n\t"           \
    "punpcklbw %%mm2, %%mm2     \n\t"           \
    "punpcklbw %%mm2, %%mm2     \n\t"           \
    "punpcklbw %%mm2, %%mm2     \n\t"           \
    "psubusb %%mm4, %%mm2       \n\t"           \
    "movq %%mm2, %%mm3          \n\t"           \
    "psubusb %%mm4, %%mm3       \n\t"           \
    "psubb %%mm3, %%mm2         \n\t"           \
    "movq %1, %%mm3             \n\t"           \
    "movq %2, %%mm4             \n\t"           \
    "pxor %%mm6, %%mm3          \n\t"           \
    "pxor %%mm6, %%mm4          \n\t"           \
    "paddusb %%mm2, %%mm3       \n\t"           \
    "psubusb %%mm2, %%mm4       \n\t"           \
    "pxor %%mm6, %%mm3          \n\t"           \
    "pxor %%mm6, %%mm4          \n\t"           \
    "paddusb %%mm2, %%mm2       \n\t"           \
    "packsswb %%mm1, %%mm0      \n\t"           \
    "pcmpgtb %%mm0, %%mm7       \n\t"           \
    "pxor %%mm7, %%mm0          \n\t"           \
    "psubb %%mm7, %%mm0         \n\t"           \
    "movq %%mm0, %%mm1          \n\t"           \
    "psubusb %%mm2, %%mm0       \n\t"           \
    "psubb %%mm0, %%mm1         \n\t"           \
    "pand %5, %%mm1             \n\t"           \
    "psrlw $2, %%mm1            \n\t"           \
    "pxor %%mm7, %%mm1          \n\t"           \
    "psubb %%mm7, %%mm1         \n\t"           \
    "movq %0, %%mm5             \n\t"           \
    "movq %3, %%mm6             \n\t"           \
    "psubb %%mm1, %%mm5         \n\t"           \
    "paddb %%mm1, %%mm6         \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

            "movq %%mm3, %1    \n\t"
            "movq %%mm4, %2    \n\t"
            "movq %%mm5, %0    \n\t"
            "movq %%mm6, %3    \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        transpose4x4(btemp,     src,              8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
            H263_LOOP_FILTER // 5 3 4 6
            : "g"(2 * strength), "m"(ff_pb_FC)

            "movq %%mm5, %%mm1         \n\t"
            "movq %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5    \n\t"
            "punpcklbw %%mm6, %%mm4    \n\t"
            "punpckhbw %%mm3, %%mm1    \n\t"
            "punpckhbw %%mm6, %%mm0    \n\t"
            "movq %%mm5, %%mm3         \n\t"
            "movq %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5    \n\t"
            "punpcklwd %%mm0, %%mm1    \n\t"
            "punpckhwd %%mm4, %%mm3    \n\t"
            "punpckhwd %%mm0, %%mm6    \n\t"
            "movd %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5    \n\t"
            "movd %%mm5, (%0, %2)      \n\t"
            "movd %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3    \n\t"
            "movd %%mm3, (%0, %3)      \n\t"
            "movd %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1    \n\t"
            "movd %%mm1, (%1, %2)      \n\t"
            "movd %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6    \n\t"
            "movd %%mm6, (%1, %3)      \n\t"
              "r"(src + 4 * stride),
              "r"((x86_reg)stride),
              "r"((x86_reg)(3 * stride))
/* Draw the edges of width 'w' of an image of size width x height;
 * this MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
    uint8_t *ptr, *last_line;

    last_line = buf + (height - 1) * wrap;

            "movd (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0    \n\t"
            "punpcklwd %%mm0, %%mm0    \n\t"
            "punpckldq %%mm0, %%mm0    \n\t"
            "movq %%mm0, -8(%0)        \n\t"
            "movq -8(%0, %2), %%mm1    \n\t"
            "punpckhbw %%mm1, %%mm1    \n\t"
            "punpckhwd %%mm1, %%mm1    \n\t"
            "punpckhdq %%mm1, %%mm1    \n\t"
            "movq %%mm1, (%0, %2)      \n\t"
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)

            "movd (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0    \n\t"
            "punpcklwd %%mm0, %%mm0    \n\t"
            "punpckldq %%mm0, %%mm0    \n\t"
            "movq %%mm0, -8(%0)        \n\t"
            "movq %%mm0, -16(%0)       \n\t"
            "movq -8(%0, %2), %%mm1    \n\t"
            "punpckhbw %%mm1, %%mm1    \n\t"
            "punpckhwd %%mm1, %%mm1    \n\t"
            "punpckhdq %%mm1, %%mm1    \n\t"
            "movq %%mm1, (%0, %2)      \n\t"
            "movq %%mm1, 8(%0, %2)     \n\t"
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)

            "movd (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0    \n\t"
            "punpcklwd %%mm0, %%mm0    \n\t"
            "movd %%mm0, -4(%0)        \n\t"
            "movd -4(%0, %2), %%mm1    \n\t"
            "punpcklbw %%mm1, %%mm1    \n\t"
            "punpckhwd %%mm1, %%mm1    \n\t"
            "punpckhdq %%mm1, %%mm1    \n\t"
            "movd %%mm1, (%0, %2)      \n\t"
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0      \n\t"
                "movq %%mm0, (%0)          \n\t"
                "movq %%mm0, (%0, %2)      \n\t"
                "movq %%mm0, (%0, %2, 2)   \n\t"
                "movq %%mm0, (%0, %3)      \n\t"
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0      \n\t"
                "movq %%mm0, (%0)          \n\t"
                "movq %%mm0, (%0, %2)      \n\t"
                "movq %%mm0, (%0, %2, 2)   \n\t"
                "movq %%mm0, (%0, %3)      \n\t"
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw "#m4", "#m3"               \n\t" /* x1 */                      \
    "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                      \
    "pmullw "#m3", %%mm4              \n\t" /* 20x1 */                    \
    "movq "#in7", "#m3"               \n\t" /* d */                       \
    "movq "#in0", %%mm5               \n\t" /* D */                       \
    "paddw "#m3", %%mm5               \n\t" /* x4 */                      \
    "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */               \
    "movq "#in1", %%mm5               \n\t" /* C */                       \
    "movq "#in2", %%mm6               \n\t" /* B */                       \
    "paddw "#m6", %%mm5               \n\t" /* x3 */                      \
    "paddw "#m5", %%mm6               \n\t" /* x2 */                      \
    "paddw %%mm6, %%mm6               \n\t" /* 2x2 */                     \
    "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */               \
    "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */              \
    "paddw "#rnd", %%mm4              \n\t" /* x2 */                      \
    "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */   \
    "psraw $5, %%mm5                  \n\t"                               \
    "packuswb %%mm5, %%mm5            \n\t"                               \
    OP(%%mm5, out, %%mm7, d)
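/* The tap computed by QPEL_V_LOW, in scalar form (illustration only): x1..x4
 * are the symmetric pair sums of the eight input samples, i.e. the MPEG-4
 * quarter-pel filter (-1, 3, -6, 20, 20, -6, 3, -1) normalized by 32; the
 * packuswb above then clamps the result to 8 bits. */
static inline int mpeg4_qpel_tap_ref(int x1, int x2, int x3, int x4, int rnd)
{
    return (20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5;
}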
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                        \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,         \
        "pxor %%mm7, %%mm7                \n\t"                           \
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */            \
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */            \
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */            \
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */            \
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */            \
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */            \
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */            \
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */            \
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */            \
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */            \
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */            \
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */            \
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */            \
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */            \
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */            \
        "paddw %%mm3, %%mm5               \n\t" /* b */                   \
        "paddw %%mm2, %%mm6               \n\t" /* c */                   \
        "paddw %%mm5, %%mm5               \n\t" /* 2b */                  \
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */              \
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */            \
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */             \
        "paddw %%mm4, %%mm0               \n\t" /* a */                   \
        "paddw %%mm1, %%mm5               \n\t" /* d */                   \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */                 \
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */             \
        "paddw %6, %%mm6                  \n\t"                           \
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */   \
        "psraw $5, %%mm0                  \n\t"                           \
        "movq %%mm0, %5                   \n\t"                           \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */            \
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */            \
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */            \
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */            \
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */            \
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */            \
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */            \
        "paddw %%mm0, %%mm2               \n\t" /* b */                   \
        "paddw %%mm5, %%mm3               \n\t" /* c */                   \
        "paddw %%mm2, %%mm2               \n\t" /* 2b */                  \
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */              \
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */            \
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */            \
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */            \
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */            \
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */             \
        "paddw %%mm2, %%mm1               \n\t" /* a */                   \
        "paddw %%mm6, %%mm4               \n\t" /* d */                   \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */                 \
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */        \
        "paddw %6, %%mm1                  \n\t"                           \
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */    \
        "psraw $5, %%mm3                  \n\t"                           \
        "movq %5, %%mm1                   \n\t"                           \
        "packuswb %%mm3, %%mm1            \n\t"                           \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                  \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */            \
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */            \
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */            \
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */            \
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */            \
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */            \
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */            \
        "paddw %%mm1, %%mm5               \n\t" /* b */                   \
        "paddw %%mm4, %%mm0               \n\t" /* c */                   \
        "paddw %%mm5, %%mm5               \n\t" /* 2b */                  \
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */              \
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */            \
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */            \
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */             \
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */            \
        "paddw %%mm3, %%mm2               \n\t" /* d */                   \
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */        \
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */            \
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */            \
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */            \
        "paddw %%mm2, %%mm6               \n\t" /* a */                   \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */                 \
        "paddw %6, %%mm0                  \n\t"                           \
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */   \
        "psraw $5, %%mm0                  \n\t"                           \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
        "paddw %%mm5, %%mm3               \n\t" /* a */                   \
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */            \
        "paddw %%mm4, %%mm6               \n\t" /* b */                   \
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */            \
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */            \
        "paddw %%mm1, %%mm4               \n\t" /* c */                   \
        "paddw %%mm2, %%mm5               \n\t" /* d */                   \
        "paddw %%mm6, %%mm6               \n\t" /* 2b */                  \
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */              \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */                 \
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */             \
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */        \
        "paddw %6, %%mm4                  \n\t"                           \
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */   \
        "psraw $5, %%mm4                  \n\t"                           \
        "packuswb %%mm4, %%mm0            \n\t"                           \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                 \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,          \
        "pxor %%mm7, %%mm7                \n\t"                           \
        "movq (%0), %%mm0                 \n\t" /* ABCDEFGH */            \
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */            \
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */            \
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */            \
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */            \
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */            \
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */            \
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */            \
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */            \
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */            \
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */            \
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */            \
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */            \
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */            \
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */            \
        "paddw %%mm3, %%mm5               \n\t" /* b */                   \
        "paddw %%mm2, %%mm6               \n\t" /* c */                   \
        "paddw %%mm5, %%mm5               \n\t" /* 2b */                  \
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */              \
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */            \
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */             \
        "paddw %%mm4, %%mm0               \n\t" /* a */                   \
        "paddw %%mm1, %%mm5               \n\t" /* d */                   \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */                 \
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */             \
        "paddw %5, %%mm6                  \n\t"                           \
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */   \
        "psraw $5, %%mm0                  \n\t"                           \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
        "movd 5(%0), %%mm5                \n\t" /* FGHI */                \
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */            \
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */            \
        "paddw %%mm5, %%mm1               \n\t" /* a */                   \
        "paddw %%mm6, %%mm2               \n\t" /* b */                   \
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */            \
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */            \
        "paddw %%mm6, %%mm3               \n\t" /* c */                   \
        "paddw %%mm5, %%mm4               \n\t" /* d */                   \
        "paddw %%mm2, %%mm2               \n\t" /* 2b */                  \
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */              \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */                 \
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */             \
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */        \
        "paddw %5, %%mm1                  \n\t"                           \
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */   \
        "psraw $5, %%mm3                  \n\t"                           \
        "packuswb %%mm3, %%mm0            \n\t"                           \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                  \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                            \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,        \
    uint64_t temp[17 * 4];                                                \
    uint64_t *temp_ptr = temp;                                            \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7            \n\t"                               \
        "movq (%0), %%mm0             \n\t"                               \
        "movq (%0), %%mm1             \n\t"                               \
        "movq 8(%0), %%mm2            \n\t"                               \
        "movq 8(%0), %%mm3            \n\t"                               \
        "punpcklbw %%mm7, %%mm0       \n\t"                               \
        "punpckhbw %%mm7, %%mm1       \n\t"                               \
        "punpcklbw %%mm7, %%mm2       \n\t"                               \
        "punpckhbw %%mm7, %%mm3       \n\t"                               \
        "movq %%mm0, (%1)             \n\t"                               \
        "movq %%mm1, 17 * 8(%1)       \n\t"                               \
        "movq %%mm2, 2 * 17 * 8(%1)   \n\t"                               \
        "movq %%mm3, 3 * 17 * 8(%1)   \n\t"                               \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor %%mm7, %%mm7         \n\t" */                            \
        "movq (%0), %%mm0             \n\t"                               \
        "movq 8(%0), %%mm1            \n\t"                               \
        "movq 16(%0), %%mm2           \n\t"                               \
        "movq 24(%0), %%mm3           \n\t"                               \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0),  32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0),  40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0),  48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0),  56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0),  64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0),  72(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0),  80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0),  88(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1),    OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0                 \n\t"                               \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 14 * (x86_reg)dstStride)                                \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,         \
    uint64_t temp[9 * 2];                                                 \
    uint64_t *temp_ptr = temp;                                            \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7            \n\t"                               \
        "movq (%0), %%mm0             \n\t"                               \
        "movq (%0), %%mm1             \n\t"                               \
        "punpcklbw %%mm7, %%mm0       \n\t"                               \
        "punpckhbw %%mm7, %%mm1       \n\t"                               \
        "movq %%mm0, (%1)             \n\t"                               \
        "movq %%mm1, 9*8(%1)          \n\t"                               \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor %%mm7, %%mm7         \n\t" */                            \
        "movq (%0), %%mm0             \n\t"                               \
        "movq 8(%0), %%mm1            \n\t"                               \
        "movq 16(%0), %%mm2           \n\t"                               \
        "movq 24(%0), %%mm3           \n\t"                               \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0                  \n\t"                               \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 6 * (x86_reg)dstStride)                                 \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,     \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                       \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,      \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,             \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,              \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,      \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);    \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,         \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,      \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,    \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                     \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,     \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                    \
                                             stride, stride, 16);         \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                     \
                                  stride, stride, 16);                    \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,     \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);   \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,                \
                                  stride, stride, 16);                    \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,     \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
#define PUT_OP(a, b, temp, size)                \
    "mov"#size" "#a", "#b"    \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size)         \
    "mov"#size" "#b", "#temp" \n\t"             \
    "pavgb "#temp", "#a"      \n\t"             \
    "mov"#size" "#a", "#b"    \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
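/* Orientation note (added): the two macro families above expand to function
 * families such as put_mpeg4_qpel16_h_lowpass_mmxext() and the per-position
 * wrappers put_qpel16_mc00_mmxext() ... avg_qpel8_mc33_mmxext(), which the
 * SET_QPEL_FUNCS() macro at the end of this file installs into the dsputil
 * function tables at init time. */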
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    put_pixels8_xy2_mmx(dst, src, stride, 8);

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    put_pixels16_xy2_mmx(dst, src, stride, 16);

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels8_xy2_mmx(dst, src, stride, 8);

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    const int oxs  = ox >> 4;
    const int oys  = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H      8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu  = (unsigned)ix >= width  - w ||
                    (unsigned)iy >= height - h;
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);

    src += ix + iy * stride;
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1651 "movd %0, %%mm6 \n\t"
1652 "pxor %%mm7, %%mm7 \n\t"
1653 "punpcklwd %%mm6, %%mm6 \n\t"
1654 "punpcklwd %%mm6, %%mm6 \n\t"
1658 for (x = 0; x < w; x += 4) {
1659 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1660 oxs - dxys + dxxs * (x + 1),
1661 oxs - dxys + dxxs * (x + 2),
1662 oxs - dxys + dxxs * (x + 3) };
1663 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1664 oys - dyys + dyxs * (x + 1),
1665 oys - dyys + dyxs * (x + 2),
1666 oys - dyys + dyxs * (x + 3) };
1668 for (y = 0; y < h; y++) {
1670 "movq %0, %%mm4 \n\t"
1671 "movq %1, %%mm5 \n\t"
1672 "paddw %2, %%mm4 \n\t"
1673 "paddw %3, %%mm5 \n\t"
1674 "movq %%mm4, %0 \n\t"
1675 "movq %%mm5, %1 \n\t"
1676 "psrlw $12, %%mm4 \n\t"
1677 "psrlw $12, %%mm5 \n\t"
1678 : "+m"(*dx4), "+m"(*dy4)
1679 : "m"(*dxy4), "m"(*dyy4)
1683 "movq %%mm6, %%mm2 \n\t"
1684 "movq %%mm6, %%mm1 \n\t"
1685 "psubw %%mm4, %%mm2 \n\t"
1686 "psubw %%mm5, %%mm1 \n\t"
1687 "movq %%mm2, %%mm0 \n\t"
1688 "movq %%mm4, %%mm3 \n\t"
1689 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1690 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1691 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1692 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1694 "movd %4, %%mm5 \n\t"
1695 "movd %3, %%mm4 \n\t"
1696 "punpcklbw %%mm7, %%mm5 \n\t"
1697 "punpcklbw %%mm7, %%mm4 \n\t"
1698 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1699 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1701 "movd %2, %%mm5 \n\t"
1702 "movd %1, %%mm4 \n\t"
1703 "punpcklbw %%mm7, %%mm5 \n\t"
1704 "punpcklbw %%mm7, %%mm4 \n\t"
1705 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1706 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1707 "paddw %5, %%mm1 \n\t"
1708 "paddw %%mm3, %%mm2 \n\t"
1709 "paddw %%mm1, %%mm0 \n\t"
1710 "paddw %%mm2, %%mm0 \n\t"
1712 "psrlw %6, %%mm0 \n\t"
1713 "packuswb %%mm0, %%mm0 \n\t"
1714 "movd %%mm0, %0 \n\t"
1716 : "=m"(dst[x + y * stride])
1717 : "m"(src[0]), "m"(src[1]),
1718 "m"(src[stride]), "m"(src[stride + 1]),
1719 "m"(*r4), "m"(shift2)
1723 src += 4 - h * stride;
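/* Scalar form of the bilinear blend computed in the inner loop above
 * (illustration only): dx and dy are the 4-bit fractions left by the
 * psrlw $12, s is the scale broadcast into %%mm6 (assumed to be 1 << shift),
 * r is the rounding constant and shift2 == 2 * shift: */
static inline int gmc_blend_ref(const uint8_t *src, int stride,
                                int dx, int dy, int s, int r, int shift2)
{
    return (src[0]          * (s - dx) * (s - dy) +
            src[1]          *  dx      * (s - dy) +
            src[stride]     * (s - dx) *  dy      +
            src[stride + 1] *  dx      *  dy      + r) >> shift2;
}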
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
#endif /* HAVE_INLINE_ASM */

#include "h264_qpel.c"

void ff_put_h264_chroma_mc8_rnd_mmx   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    put_pixels8_mmx(dst, src, stride, 8);

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels8_mmx(dst, src, stride, 8);

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    put_pixels16_mmx(dst, src, stride, 16);

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels16_mmx(dst, src, stride, 16);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
    put_pixels8_mmx(dst, src, stride, 8);

void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
    avg_pixels8_mmxext(dst, src, stride, 8);
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)                                              \
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);                       \
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);                      \
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels16_ ## EXT(dst,      src[0],      stride, h);            \
    OPNAME ## _pixels16_ ## EXT(dst + 16, src[0] + 16, stride, h);            \

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmxext)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_put_pixels16_sse2(dst, src[0], stride, h);

void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_avg_pixels16_sse2(dst, src[0], stride, h);

void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_put_pixels16_sse2(dst,      src[0],      stride, h);
    ff_put_pixels16_sse2(dst + 16, src[0] + 16, stride, h);

void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_avg_pixels16_sse2(dst,      src[0],      stride, h);
    ff_avg_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted to the same format. */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    DCTELEM *block)
    ff_put_pixels_clamped_mmx(block, dest, line_size);

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    DCTELEM *block)
    ff_add_pixels_clamped_mmx(block, dest, line_size);

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     DCTELEM *block)
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     DCTELEM *block)
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}

#endif /* HAVE_INLINE_ASM */

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
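
/* In effect: ff_scalarproduct_int16_* return the dot product
 * sum(v1[i] * v2[i]) over order elements, while the _and_madd_ variants
 * return the same sum and simultaneously update v1[i] += mul * v3[i],
 * as used by the adaptive predictors of some lossless audio codecs. */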

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
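
/* All apply_window_int16 variants compute, in effect, a Q15 windowing of the
 * input, output[i] = (input[i] * window[i]) >> 15 with rounding. The _round_
 * names use a faster rounding multiply whose result can differ slightly from
 * the bit-exact C reference, which is why the init code below installs the
 * plain versions when CODEC_FLAG_BITEXACT is set. */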

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
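
/* Roughly, the median predictor above computes (a scalar sketch of the C
 * reference in dsputil.c; the real code works on uint8_t with wraparound):
 *
 *     l = *left; tl = *left_top;
 *     for (i = 0; i < w; i++) {
 *         l      = mid_pred(l, top[i], (l + top[i] - tl) & 0xFF) + diff[i];
 *         tl     = top[i];
 *         dst[i] = l;
 *     }
 *     *left = l; *left_top = tl;
 *
 * The left-prediction variants instead add a running left neighbour and
 * return its final value. */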

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
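
/* All ff_vector_clip_int32_* variants implement, in effect,
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = av_clip(src[i], min, max);
 *
 * differing only in instruction set and inner-loop width; the _int_ flavor is
 * the one picked for Atom in the init code below. */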

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                              \
    do {                                                                         \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
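
/* Example: SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, ) fills
 * c->put_h264_qpel_pixels_tab[1][0..15] with put_h264_qpel8_mc00_mmxext
 * through put_h264_qpel8_mc33_mmxext, one entry per quarter-pel (x, y) phase
 * at index x + y * 4; PREFIX is ff_ for the external (yasm) implementations
 * and empty for the inline-asm ones. */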

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                             \
    do {                                                                \
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;       \
        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;    \
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;    \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;   \
    } while (0)
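
/* Example: SET_HPEL_FUNCS(avg, 1, 8, mmx) sets c->avg_pixels_tab[1][0..3] to
 * avg_pixels8_mmx, avg_pixels8_x2_mmx, avg_pixels8_y2_mmx and
 * avg_pixels8_xy2_mmx, i.e. the four half-pel interpolation phases. */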

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)
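
/* Example: H264_QPEL_FUNCS(2, 3, ssse3) fills the 16x16 and 8x8 put/avg table
 * entries for quarter-pel phase (x = 2, y = 3), i.e. index 2 + 3 * 4 == 14,
 * with put/avg_h264_qpel16_mc23_ssse3 and put/avg_h264_qpel8_mc23_ssse3. */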

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    } while (0)
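
/* Same indexing as H264_QPEL_FUNCS, but installing the external, ff_-prefixed
 * 10-bit functions; e.g. H264_QPEL_FUNCS_10(1, 0, sse2_cache64) selects
 * ff_put_h264_qpel16_mc10_10_sse2_cache64 for the 16x16 put entry. */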

static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_H264QPEL) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
        }
    }

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}

static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    c->scalarproduct_float = ff_scalarproduct_float_sse;

#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = gmc_sse;
#endif
#endif /* HAVE_YASM */
}

static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }

    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }

    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX_EXTERNAL
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
#endif /* HAVE_AVX_EXTERNAL */
}

void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif /* CONFIG_GPL */
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmxext_put;
                    c->idct_add = ff_idct_xvid_mmxext_add;
                    c->idct     = ff_idct_xvid_mmxext;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}
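
/* Sketch of the call path: ff_dsputil_init() in libavcodec/dsputil.c fills
 * the DSPContext with the C implementations first, then calls
 * ff_dsputil_init_mmx() on x86, so each per-ISA helper above only overrides
 * the function pointers its instruction set can accelerate. */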