/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to access constants this way
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq   "#rega", "#regr"            \n\t"           \
    "pand   "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq   "#rega", "#regr"            \n\t"           \
    "por    "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "psubb  "#regb", "#regr"            \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6,  "#regb"             \n\t"                   \
    "pand   %%mm6,  "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6,  "#regb"             \n\t"                   \
    "pand   %%mm6,  "#regd"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y)                x ## _no_rnd_ ## y ## _mmx
#define SET_RND                  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* MMX rounding */

#define DEF(x, y)                x ## _ ## y ## _mmx
#define SET_RND                  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define SKIP_FOR_3DNOW

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef SKIP_FOR_3DNOW

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

/* Introduced only in MMXEXT set */
#define PAVGB "pavgb"

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx     put_pixels16_mmx
#define put_no_rnd_pixels8_mmx      put_pixels8_mmx
#define put_pixels16_mmxext         put_pixels16_mmx
#define put_pixels8_mmxext          put_pixels8_mmx
#define put_pixels4_mmxext          put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext   put_no_rnd_pixels8_mmx
/***********************************/

void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // An exact copy of the code above would make the compiler generate
    // some very strange code, so keep this second copy separate.
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
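
/* A scalar sketch (hypothetical helper, illustrative only) of what the two
 * asm blocks above implement: clamp each 16-bit coefficient to 0..255 and
 * store it as a byte, 8 rows of 8 pixels; packuswb does the clamping. */
#if 0
static void put_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}
#endif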
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1    \n\t"               \
    "movq     16 + "#off"(%2), %%mm2    \n\t"               \
    "movq     32 + "#off"(%2), %%mm3    \n\t"               \
    "movq     48 + "#off"(%2), %%mm4    \n\t"               \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"               \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"               \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"               \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"               \
    "paddb              %%mm0, %%mm1    \n\t"               \
    "paddb              %%mm0, %%mm2    \n\t"               \
    "paddb              %%mm0, %%mm3    \n\t"               \
    "paddb              %%mm0, %%mm4    \n\t"               \
    "movq               %%mm1, (%0)     \n\t"               \
    "movq               %%mm2, (%0, %3) \n\t"               \
    "movq               %%mm3, (%0, %3, 2) \n\t"            \
    "movq               %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
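
/* Scalar sketch of the signed variant (illustrative only): packsswb clamps
 * the coefficients to -128..127, then the ff_pb_80 bias flips them into
 * 0..255, i.e. the same as clamping block[i] + 128 to the byte range. */
#if 0
static void put_signed_pixels_clamped_ref(const int16_t *block,
                                          uint8_t *pixels, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            v = v < -128 ? -128 : v > 127 ? 127 : v;
            pixels[i * line_size + j] = v + 128;
        }
    }
}
#endif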
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
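
/* Scalar sketch of the loop above (illustrative only): add each coefficient
 * to the existing pixel and clamp the sum to 0..255 (paddsw + packuswb). */
#if 0
static void add_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = pixels[i * line_size + j] + block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}
#endif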
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq      %%mm0, (%2)          \n\t"
        "movq      %%mm1, (%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq      %%mm0, (%2)          \n\t"
        "movq      %%mm1, (%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq      %%mm0,  (%2)         \n\t"
        "movq      %%mm4, 8(%2)         \n\t"
        "movq      %%mm1,  (%2, %3)     \n\t"
        "movq      %%mm5, 8(%2, %3)     \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq      %%mm0,  (%2)         \n\t"
        "movq      %%mm4, 8(%2)         \n\t"
        "movq      %%mm1,  (%2, %3)     \n\t"
        "movq      %%mm5, 8(%2, %3)     \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}
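
/* Both functions above are plain block copies; a scalar sketch (illustrative
 * only, assuming memcpy from <string.h>), with the width as a parameter: */
#if 0
static void put_pixels_ref(uint8_t *block, const uint8_t *pixels,
                           int line_size, int w, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memcpy(block, pixels, w);   /* w = 8 or 16 */
        block  += line_size;
        pixels += line_size;
    }
}
#endif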
#define CLEAR_BLOCKS(name, n)                           \
static void name(int16_t *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a);                                    \
}

CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
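
/* CLEAR_BLOCKS walks REG_a from -128*n up to 0, zeroing 32 bytes per
 * iteration; functionally it is just the following (illustrative only): */
#if 0
static void clear_blocks_ref(int16_t *blocks, int n)
{
    memset(blocks, 0, n * 64 * sizeof(int16_t));   /* n 8x8 blocks */
}
#endif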
static void clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory");
}

static void clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1,         %%"REG_a"   \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128,         %%"REG_a"   \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a);
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15));
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
        "movzbl (%3, %4), %2            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
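
/* The cmov sequence implements the HuffYUV median predictor; scalar sketch
 * (illustrative only): each output byte is diff plus the median of the
 * left, top and left + top - topleft predictors. */
#if 0
static int mid_pred_ref(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }
    if (b > c) b = c;
    return a > b ? a : b;   /* median of the three */
}

static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
                                           const uint8_t *diff, int w,
                                           int *left, int *left_top)
{
    int i, l = *left, tl = *left_top;

    for (i = 0; i < w; i++) {
        dst[i] = diff[i] + mid_pred_ref(l, top[i], l + top[i] - tl);
        l  = dst[i];
        tl = top[i];
    }
    *left     = l;
    *left_top = tl;
}
#endif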
static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
    __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd      (%1), %%mm0          \n\t"
        "add         %3, %1             \n\t"
        "movd      (%1), %%mm1          \n\t"
        "movd (%1,%3,1), %%mm2          \n\t"
        "movd (%1,%3,2), %%mm3          \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq      %%mm0, %%mm1         \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "add         %2, %0             \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "movd      %%mm1, (%0,%2,1)     \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd      %%mm1, (%0,%2,2)     \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC));
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp,     src,              8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC));

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq           %%mm0, -16(%0)  \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1,  (%0, %2) \n\t"
            "movq           %%mm1, 8(%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else /* w == 4 */ {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "movd           %%mm0, -4(%0)   \n\t"
            "movd      -4(%0, %2), %%mm1    \n\t"
            "punpcklbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movd           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add       $8, %0               \n\t"
                "cmp       %4, %0               \n\t"
                "jb        1b                   \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add       $8, %0               \n\t"
                "cmp       %4, %0               \n\t"
                "jb        1b                   \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
    "movq               "#in7", "#m3"   \n\t" /* d */                     \
    "movq               "#in0", %%mm5   \n\t" /* D */                     \
    "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
    "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
    "movq               "#in1", %%mm5   \n\t" /* C */                     \
    "movq               "#in2", %%mm6   \n\t" /* B */                     \
    "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
    "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
    "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
    "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
    "pmullw "MANGLE(ff_pw_3)",  %%mm5   \n\t" /* -6x2 + 3x3 */            \
    "paddw              "#rnd", %%mm4   \n\t" /* x2 */                    \
    "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                  $5, %%mm5   \n\t"                             \
    "packuswb            %%mm5, %%mm5   \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
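
/* QPEL_V_LOW evaluates one output row of the MPEG-4 quarter-pel 8-tap
 * half-pel filter [-1 3 -6 20 20 -6 3 -1] / 32.  With the symmetric tap
 * pairs summed as x1..x4 this is, as a scalar sketch (illustrative only): */
#if 0
static uint8_t qpel_tap_ref(int x1, int x2, int x3, int x4, int rounder)
{
    int v = (20 * x1 - 6 * x2 + 3 * x3 - x4 + rounder) >> 5;
    return av_clip_uint8(v);   /* packuswb does the clamping */
}
#endif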
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                        \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,         \
                                                    uint8_t *src,         \
                                                    int dstStride,        \
                                                    int srcStride,        \
                                                    int h)                \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %6, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        "movq      %%mm0, %5                \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
        "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
        "paddw        %6, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
        "psraw        $5, %%mm3             \n\t"                         \
        "movq         %5, %%mm1             \n\t"                         \
        "packuswb  %%mm3, %%mm1             \n\t"                         \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                  \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw        %6, %%mm0             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw     %%mm5, %%mm3             \n\t" /* a */                 \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0O0P0Q0Q */          \
        "paddw     %%mm4, %%mm6             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm4      \n\t" /* 0P0Q0Q0P */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0Q0Q0P0O */          \
        "paddw     %%mm1, %%mm4             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm5             \n\t" /* d */                 \
        "paddw     %%mm6, %%mm6             \n\t" /* 2b */                \
        "psubw     %%mm6, %%mm4             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)",  %%mm4   \n\t" /* 3c - 6b */           \
        "psubw     %%mm5, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %6, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm4             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm4             \n\t"                         \
        "packuswb  %%mm4, %%mm0             \n\t"                         \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                 \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,          \
                                                   uint8_t *src,          \
                                                   int dstStride,         \
                                                   int srcStride,         \
                                                   int h)                 \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %5, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
        "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
        "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)",  %%mm3   \n\t" /* 3c - 6b */           \
        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %5, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm3             \n\t"                         \
        "packuswb  %%mm3, %%mm0             \n\t"                         \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                  \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory");                                                      \
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
                                                     uint8_t *src,      \
                                                     int dstStride,     \
                                                     int srcStride)     \
{                                                                       \
    uint64_t temp[17 * 4];                                              \
    uint64_t *temp_ptr = temp;                                          \
    int count = 17;                                                     \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "movq      8(%0), %%mm2         \n\t"                           \
        "movq      8(%0), %%mm3         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 17 * 8(%1)    \n\t"                           \
        "movq      %%mm2, 2 * 17 * 8(%1) \n\t"                          \
        "movq      %%mm3, 3 * 17 * 8(%1) \n\t"                          \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory");                                                    \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 4;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq   8(%0), %%mm1            \n\t"                           \
        "movq  16(%0), %%mm2            \n\t"                           \
        "movq  24(%0), %%mm3            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0                   \n\t"                           \
        "add   %6, %1                   \n\t"                           \
        "decl  %2                       \n\t"                           \
        "jnz   1b                       \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 14 * (x86_reg)dstStride)                              \
        : "memory");                                                    \
}                                                                       \
                                                                        \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
                                                    uint8_t *src,       \
                                                    int dstStride,      \
                                                    int srcStride)      \
{                                                                       \
    uint64_t temp[9 * 2];                                               \
    uint64_t *temp_ptr = temp;                                          \
    int count = 9;                                                      \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 9*8(%1)       \n\t"                           \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory");                                                    \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 2;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq   8(%0), %%mm1            \n\t"                           \
        "movq  16(%0), %%mm2            \n\t"                           \
        "movq  24(%0), %%mm3            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0                    \n\t"                           \
        "add  %6, %1                    \n\t"                           \
        "decl %2                        \n\t"                           \
        "jnz  1b                        \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 6 * (x86_reg)dstStride)                               \
        : "memory");                                                    \
}                                                                       \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
                                            stride, 8);                 \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
                                             stride, stride, 16);       \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,                \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size)         \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
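
/* ff_pw_16 vs. ff_pw_15 selects the rounding of the final >>5 in the qpel
 * filter: (sum + 16) >> 5 rounds to nearest, (sum + 15) >> 5 is the
 * "no_rnd" variant used for the no-rounding averaging paths.  Illustrative
 * only: */
#if 0
static int qpel_round_ref(int sum, int no_rnd)
{
    return (sum + (no_rnd ? 15 : 16)) >> 5;
}
#endif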
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H      8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu  = (unsigned)ix >= width  - w ||
                    (unsigned)iy >= height - h;

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if (need_emu) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1 << shift));

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4));

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
                : "memory");
            src += stride;
        }
        src += 4 - h * stride;
    }
}
#if HAVE_YASM
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}

static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
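
/* Per pixel this is the bilinear blend also done by the ff_gmc_c fallback
 * above; scalar sketch (illustrative only), with s = 1 << shift and
 * frac_x/frac_y the subpel fractions of the current position: */
#if 0
static uint8_t gmc_blend_ref(const uint8_t *src, int stride,
                             int frac_x, int frac_y, int s, int r, int shift)
{
    int v = (src[0]      * (s - frac_x) + src[1]          * frac_x) * (s - frac_y) +
            (src[stride] * (s - frac_x) + src[stride + 1] * frac_x) * frac_y + r;
    return av_clip_uint8(v >> (2 * shift));   /* packuswb saturates in the asm */
}
#endif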
#endif /* HAVE_INLINE_ASM */
#include "h264_qpel.c"

void ff_put_h264_chroma_mc8_rnd_mmx   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
{
    avg_pixels8_mmxext(dst, src, stride, 8);
}
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmxext)

#if HAVE_YASM
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
    ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
    ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
#endif
#endif
/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted to the new type. */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    int16_t *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    int16_t *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     int16_t *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     int16_t *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;

    __asm__ volatile (
        "movss          %3, %%xmm4 \n\t"
        "movss          %4, %%xmm5 \n\t"
        "shufps $0, %%xmm4, %%xmm4 \n\t"
        "shufps $0, %%xmm5, %%xmm5 \n\t"
        "1:                        \n\t"
        "movaps   (%2, %0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1 \n\t"
        "movaps 32(%2, %0), %%xmm2 \n\t"
        "movaps 48(%2, %0), %%xmm3 \n\t"
        "maxps      %%xmm4, %%xmm0 \n\t"
        "maxps      %%xmm4, %%xmm1 \n\t"
        "maxps      %%xmm4, %%xmm2 \n\t"
        "maxps      %%xmm4, %%xmm3 \n\t"
        "minps      %%xmm5, %%xmm0 \n\t"
        "minps      %%xmm5, %%xmm1 \n\t"
        "minps      %%xmm5, %%xmm2 \n\t"
        "minps      %%xmm5, %%xmm3 \n\t"
        "movaps     %%xmm0,   (%1, %0) \n\t"
        "movaps     %%xmm1, 16(%1, %0) \n\t"
        "movaps     %%xmm2, 32(%1, %0) \n\t"
        "movaps     %%xmm3, 48(%1, %0) \n\t"
        "sub           $64, %0     \n\t"
        "jge            1b         \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
#endif /* HAVE_INLINE_ASM */

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
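
/* Semantics of the and_madd variants, as a scalar sketch mirroring the C
 * reference in dsputil.c: return the dot product of v1 and v2 while
 * updating v1 in place.
 *
 *     int32_t res = 0;
 *     while (order--) {
 *         res   += *v1 * *v2++;
 *         *v1++ += mul * *v3++;
 *     }
 *     return res;
 */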

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
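
/* For reference, SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, ) expands to
 *
 *     c->put_h264_qpel_pixels_tab[0][ 0] = put_h264_qpel16_mc00_mmxext;
 *     ...
 *     c->put_h264_qpel_pixels_tab[0][15] = put_h264_qpel16_mc33_mmxext;
 *
 * filling all 16 quarter-pel positions for one block size/CPU pair. */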

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                    \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
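
/* E.g. SET_HPEL_FUNCS(put, 0, 16, mmx) wires up the four half-pel variants:
 *
 *     c->put_pixels_tab[0][0] = put_pixels16_mmx;     // full-pel
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;  // horizontal half-pel
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;  // vertical half-pel
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; // diagonal half-pel
 */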

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
    } while (0)
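
/* The table index is x + y * 4, so e.g. H264_QPEL_FUNCS(2, 3, sse2) installs
 * the (x = 2, y = 3) quarter-pel functions at index 14 for both the 16x16
 * ([0]) and 8x8 ([1]) tables; H264_QPEL_FUNCS_10 below does the same for the
 * 10-bit functions. */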

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
    } while (0)

static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_H264QPEL) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
        }
    }

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    /* Slower than the cmov version on AMD. */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}

static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = gmc_sse;
#endif
#endif /* HAVE_YASM */
}

static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }

    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }

    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX_EXTERNAL
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
#endif /* HAVE_AVX_EXTERNAL */
}

void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmxext_put;
                    c->idct_add = ff_idct_xvid_mmxext_add;
                    c->idct     = ff_idct_xvid_mmxext;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}