/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to generate these constants in a
// register instead of loading them from memory
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
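/* How the in-register constants are built (a short derivation, for
 * reference): pcmpeqd sets every bit, so each 16-bit word is 0xFFFF;
 * psrlw $15 turns that into 0x0001 per word. packuswb then packs the
 * 0x0001 words into 0x01 bytes, giving 0x0101010101010101 (ff_bone),
 * while psllw $1 instead doubles each word to 0x0002, giving
 * 0x0002000200020002 (ff_wtwo). */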
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq  "#rega", "#regr"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq  "#rega", "#regr"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "psubb "#regb", "#regr"             \n\t"
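/* For reference, what the two kernels above compute per byte, using the
 * identity a + b = (a ^ b) + 2 * (a & b) (an illustrative scalar sketch,
 * not used by the code; the 0xFE mask makes the per-byte shift safe
 * inside a 64-bit psrlq):
 *
 *     static inline uint8_t avg_no_rnd(uint8_t a, uint8_t b)
 *     {
 *         return (a & b) + (((a ^ b) & 0xFE) >> 1);   // (a + b)     >> 1
 *     }
 *
 *     static inline uint8_t avg_rnd(uint8_t a, uint8_t b)
 *     {
 *         return (a | b) - (((a ^ b) & 0xFE) >> 1);   // (a + b + 1) >> 1
 *     }
 */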
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6,  "#regb"             \n\t"                   \
    "pand   %%mm6,  "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6,  "#regb"             \n\t"                   \
    "pand   %%mm6,  "#regd"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define SKIP_FOR_3DNOW

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef SKIP_FOR_3DNOW

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

/* Introduced only in the MMXEXT instruction set */
#define PAVGB "pavgb"

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
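/* The aliases above work because a plain copy involves no interpolation:
 * rounding never kicks in, so the no-rnd variants are identical to the
 * normal ones, and MMXEXT's pavgb cannot speed up a straight copy, so the
 * MMX versions are reused as-is. */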
/***********************************/

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq        (%3), %%mm0        \n\t"
        "movq       8(%3), %%mm1        \n\t"
        "movq      16(%3), %%mm2        \n\t"
        "movq      24(%3), %%mm3        \n\t"
        "movq      32(%3), %%mm4        \n\t"
        "movq      40(%3), %%mm5        \n\t"
        "movq      48(%3), %%mm6        \n\t"
        "movq      56(%3), %%mm7        \n\t"
        "packuswb   %%mm1, %%mm0        \n\t"
        "packuswb   %%mm3, %%mm2        \n\t"
        "packuswb   %%mm5, %%mm4        \n\t"
        "packuswb   %%mm7, %%mm6        \n\t"
        "movq       %%mm0, (%0)         \n\t"
        "movq       %%mm2, (%0, %1)     \n\t"
        "movq       %%mm4, (%0, %1, 2)  \n\t"
        "movq       %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus the operands are kept in
    // explicit "r" constraints.
    __asm__ volatile (
        "movq        (%3), %%mm0        \n\t"
        "movq       8(%3), %%mm1        \n\t"
        "movq      16(%3), %%mm2        \n\t"
        "movq      24(%3), %%mm3        \n\t"
        "movq      32(%3), %%mm4        \n\t"
        "movq      40(%3), %%mm5        \n\t"
        "movq      48(%3), %%mm6        \n\t"
        "movq      56(%3), %%mm7        \n\t"
        "packuswb   %%mm1, %%mm0        \n\t"
        "packuswb   %%mm3, %%mm2        \n\t"
        "packuswb   %%mm5, %%mm4        \n\t"
        "packuswb   %%mm7, %%mm6        \n\t"
        "movq       %%mm0, (%0)         \n\t"
        "movq       %%mm2, (%0, %1)     \n\t"
        "movq       %%mm4, (%0, %1, 2)  \n\t"
        "movq       %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off)         \
    "movq          "#off"(%2), %%mm1    \n\t"           \
    "movq     16 + "#off"(%2), %%mm2    \n\t"           \
    "movq     32 + "#off"(%2), %%mm3    \n\t"           \
    "movq     48 + "#off"(%2), %%mm4    \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"           \
    "paddb %%mm0, %%mm1                 \n\t"           \
    "paddb %%mm0, %%mm2                 \n\t"           \
    "paddb %%mm0, %%mm3                 \n\t"           \
    "paddb %%mm0, %%mm4                 \n\t"           \
    "movq  %%mm1, (%0)                  \n\t"           \
    "movq  %%mm2, (%0, %3)              \n\t"           \
    "movq  %%mm3, (%0, %3, 2)           \n\t"           \
    "movq  %%mm4, (%0, %1)              \n\t"
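/* Per row of 8 coefficients the macro above is roughly equivalent to the
 * scalar code below (an illustrative sketch): packsswb saturates each
 * 16-bit coefficient to [-128, 127] and paddb with 0x80 then flips the
 * result into the unsigned range:
 *
 *     for (i = 0; i < 8; i++)
 *         pixels[i] = av_clip(block[i], -128, 127) + 128;
 */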
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea         (%3, %3, 2), %1    \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0    \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
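/* Per pixel this is roughly (an illustrative sketch):
 *
 *     pixels[i] = av_clip_uint8(pixels[i] + block[i]);
 *
 * paddsw saturates the 16-bit sum and packuswb then clamps to 0..255. */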
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory");
}
static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
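/* A DCT block is 64 DCTELEMs, i.e. 128 bytes, so the two functions above
 * are simply wide-store versions of (an illustrative sketch):
 *
 *     memset(block,  0,     64 * sizeof(DCTELEM));   // clear_block
 *     memset(blocks, 0, 6 * 64 * sizeof(DCTELEM));   // clear_blocks
 */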
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = dst[w - 1];
    *left_top = top[w - 1];
}
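/* This is the HuffYUV "lazy median" predictor without branches: per byte,
 * roughly (an illustrative sketch of the scalar equivalent):
 *
 *     pred   = mid_pred(l, t, l + t - tl);   // median of the three
 *     dst[i] = pred + diff[i];
 *     tl = t;  l = dst[i];
 *
 * where l/t/tl are the left, top and top-left neighbours; the cmov
 * sequence above selects the median via min/max comparisons. */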
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd       (%1), %%mm0         \n\t"
        "add          %3, %1            \n\t"
        "movd       (%1), %%mm1         \n\t"
        "movd  (%1,%3,1), %%mm2         \n\t"
        "movd  (%1,%3,2), %%mm3         \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq      %%mm0, %%mm1         \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "add          %2, %0            \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "movd      %%mm1, (%0,%2,1)     \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd      %%mm1, (%0,%2,2)     \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
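/* Scalar equivalent of the 4x4 byte transpose above (an illustrative
 * sketch):
 *
 *     for (y = 0; y < 4; y++)
 *         for (x = 0; x < 4; x++)
 *             dst[y * dst_stride + x] = src[x * src_stride + y];
 */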
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
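/* What the macro computes, per column of the four rows p0 p1 | p2 p3
 * straddling the block edge (a sketch of the scalar equivalent, cf. the
 * C h263 loop filter):
 *
 *     d  = (p0 - p3 + 4 * (p2 - p1)) / 8;
 *     d1 = d clipped against the qscale-dependent strength ramp;
 *     p1 += d1;  p2 -= d1;                          // main correction
 *     d2 = av_clip((p0 - p3) / 4, -|d1|/2, |d1|/2);
 *     p0 -= d2;  p3 += d2;                          // gentler outer correction
 */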
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width x height;
 * this MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0, -8(%0)        \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1, (%0, %2)      \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0,  -8(%0)       \n\t"
            "movq      %%mm0, -16(%0)       \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1,  (%0, %2)     \n\t"
            "movq      %%mm1, 8(%0, %2)     \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "movd      %%mm0, -4(%0)        \n\t"
            "movd -4(%0, %2), %%mm1         \n\t"
            "punpcklbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%0, %2)      \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw  "#m4", "#m3"                \n\t" /* x1 */                    \
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */                    \
    "pmullw "#m3", %%mm4                \n\t" /* 20x1 */                  \
    "movq "#in7", "#m3"                 \n\t" /* d */                     \
    "movq "#in0", %%mm5                 \n\t" /* D */                     \
    "paddw "#m3", %%mm5                 \n\t" /* x4 */                    \
    "psubw %%mm5, %%mm4                 \n\t" /* 20x1 - x4 */             \
    "movq "#in1", %%mm5                 \n\t" /* C */                     \
    "movq "#in2", %%mm6                 \n\t" /* B */                     \
    "paddw "#m6", %%mm5                 \n\t" /* x3 */                    \
    "paddw "#m5", %%mm6                 \n\t" /* x2 */                    \
    "paddw %%mm6, %%mm6                 \n\t" /* 2x2 */                   \
    "psubw %%mm6, %%mm5                 \n\t" /* -2x2 + x3 */             \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */            \
    "paddw "#rnd", %%mm4                \n\t" /* 20x1 - x4 + rounder */   \
    "paddw %%mm4, %%mm5                 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw $5, %%mm5                    \n\t"                             \
    "packuswb %%mm5, %%mm5              \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
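/* One row of the MPEG-4 quarter-pel lowpass: with x1..x4 the mirrored
 * pixel-pair sums, each output word is
 *
 *     out = clip_uint8((20 * x1 - 6 * x2 + 3 * x3 - x4 + rounder) >> 5)
 *
 * i.e. the symmetric 8-tap filter (-1, 3, -6, 20, 20, -6, 3, -1) / 32. */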
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                        \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,         \
                                                    uint8_t *src,         \
                                                    int dstStride,        \
                                                    int srcStride,        \
                                                    int h)                \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7                  \n\t"                         \
        "1:                                 \n\t"                         \
        "movq  (%0), %%mm0                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm1                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm2                  \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */          \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */          \
        "movq %%mm2, %%mm3                  \n\t" /* ABCDEFGH */          \
        "movq %%mm2, %%mm4                  \n\t" /* ABCDEFGH */          \
        "psllq  $8, %%mm2                   \n\t" /* 0ABCDEFG */          \
        "psllq $16, %%mm3                   \n\t" /* 00ABCDEF */          \
        "psllq $24, %%mm4                   \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw %%mm3, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm2, %%mm6                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm6                 \n\t" /* c - 2b */            \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw %%mm4, %%mm0                 \n\t" /* a */                 \
        "paddw %%mm1, %%mm5                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw %%mm5, %%mm0                 \n\t" /* 20a - d */           \
        "paddw %6, %%mm6                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        "movq %%mm0, %5                     \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq 5(%0), %%mm0                  \n\t" /* FGHIJKLM */          \
        "movq %%mm0, %%mm5                  \n\t" /* FGHIJKLM */          \
        "movq %%mm0, %%mm6                  \n\t" /* FGHIJKLM */          \
        "psrlq  $8, %%mm0                   \n\t" /* GHIJKLM0 */          \
        "psrlq $16, %%mm5                   \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw %%mm0, %%mm2                 \n\t" /* b */                 \
        "paddw %%mm5, %%mm3                 \n\t" /* c */                 \
        "paddw %%mm2, %%mm2                 \n\t" /* 2b */                \
        "psubw %%mm2, %%mm3                 \n\t" /* c - 2b */            \
        "movq %%mm6, %%mm2                  \n\t" /* FGHIJKLM */          \
        "psrlq $24, %%mm6                   \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw %%mm2, %%mm1                 \n\t" /* a */                 \
        "paddw %%mm6, %%mm4                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw %%mm4, %%mm3                 \n\t" /* - 6b +3c - d */      \
        "paddw %6, %%mm1                    \n\t"                         \
        "paddw %%mm1, %%mm3                 \n\t" /* 20a - 6b +3c - d */  \
        "psraw $5, %%mm3                    \n\t"                         \
        "movq %5, %%mm1                     \n\t"                         \
        "packuswb %%mm3, %%mm1              \n\t"                         \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                  \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq 9(%0), %%mm1                  \n\t" /* JKLMNOPQ */          \
        "movq %%mm1, %%mm4                  \n\t" /* JKLMNOPQ */          \
        "movq %%mm1, %%mm3                  \n\t" /* JKLMNOPQ */          \
        "psrlq  $8, %%mm1                   \n\t" /* KLMNOPQ0 */          \
        "psrlq $16, %%mm4                   \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw %%mm1, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm4, %%mm0                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm0                 \n\t" /* c - 2b */            \
        "movq %%mm3, %%mm5                  \n\t" /* JKLMNOPQ */          \
        "psrlq $24, %%mm3                   \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw %%mm3, %%mm2                 \n\t" /* d */                 \
        "psubw %%mm2, %%mm0                 \n\t" /* -6b + 3c - d */      \
        "movq %%mm5, %%mm2                  \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw %%mm2, %%mm6                 \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw %6, %%mm0                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw %%mm5, %%mm3                 \n\t" /* a */                 \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0O0P0Q0Q */          \
        "paddw %%mm4, %%mm6                 \n\t" /* b */                 \
        "pshufw $0xBE, %%mm5, %%mm4         \n\t" /* 0P0Q0Q0P */          \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0Q0Q0P0O */          \
        "paddw %%mm1, %%mm4                 \n\t" /* c */                 \
        "paddw %%mm2, %%mm5                 \n\t" /* d */                 \
        "paddw %%mm6, %%mm6                 \n\t" /* 2b */                \
        "psubw %%mm6, %%mm4                 \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */           \
        "psubw %%mm5, %%mm3                 \n\t" /* -6b + 3c - d */      \
        "paddw %6, %%mm4                    \n\t"                         \
        "paddw %%mm3, %%mm4                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm4                    \n\t"                         \
        "packuswb %%mm4, %%mm0              \n\t"                         \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                 \
        "add %3, %0                         \n\t"                         \
        "add %4, %1                         \n\t"                         \
        "decl %2                            \n\t"                         \
        "jnz 1b                             \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,          \
                                                   uint8_t *src,          \
                                                   int dstStride,         \
                                                   int srcStride,         \
                                                   int h)                 \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7                  \n\t"                         \
        "1:                                 \n\t"                         \
        "movq  (%0), %%mm0                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm1                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm2                  \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */          \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */          \
        "movq %%mm2, %%mm3                  \n\t" /* ABCDEFGH */          \
        "movq %%mm2, %%mm4                  \n\t" /* ABCDEFGH */          \
        "psllq  $8, %%mm2                   \n\t" /* 0ABCDEFG */          \
        "psllq $16, %%mm3                   \n\t" /* 00ABCDEF */          \
        "psllq $24, %%mm4                   \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw %%mm3, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm2, %%mm6                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm6                 \n\t" /* c - 2b */            \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw %%mm4, %%mm0                 \n\t" /* a */                 \
        "paddw %%mm1, %%mm5                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw %%mm5, %%mm0                 \n\t" /* 20a - d */           \
        "paddw %5, %%mm6                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd 5(%0), %%mm5                  \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0G0H0I0I */          \
        "paddw %%mm5, %%mm1                 \n\t" /* a */                 \
        "paddw %%mm6, %%mm2                 \n\t" /* b */                 \
        "pshufw $0xBE, %%mm5, %%mm6         \n\t" /* 0H0I0I0H */          \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0I0I0H0G */          \
        "paddw %%mm6, %%mm3                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm4                 \n\t" /* d */                 \
        "paddw %%mm2, %%mm2                 \n\t" /* 2b */                \
        "psubw %%mm2, %%mm3                 \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "psubw %%mm4, %%mm3                 \n\t" /* -6b + 3c - d */      \
        "paddw %5, %%mm1                    \n\t"                         \
        "paddw %%mm1, %%mm3                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm3                    \n\t"                         \
        "packuswb %%mm3, %%mm0              \n\t"                         \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                  \
        "add %3, %0                         \n\t"                         \
        "add %4, %1                         \n\t"                         \
        "decl %2                            \n\t"                         \
        "jnz 1b                             \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory"                                                        \
        );                                                                \
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
                                                     uint8_t *src,      \
                                                     int dstStride,     \
                                                     int srcStride)     \
{                                                                       \
    uint64_t temp[17 * 4];                                              \
    uint64_t *temp_ptr = temp;                                          \
    int count = 17;                                                     \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7              \n\t"                           \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq    (%0), %%mm1            \n\t"                           \
        "movq   8(%0), %%mm2            \n\t"                           \
        "movq   8(%0), %%mm3            \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "movq %%mm0,          (%1)      \n\t"                           \
        "movq %%mm1,     17 * 8(%1)     \n\t"                           \
        "movq %%mm2, 2 * 17 * 8(%1)     \n\t"                           \
        "movq %%mm3, 3 * 17 * 8(%1)     \n\t"                           \
        "add $8, %1                     \n\t"                           \
        "add %3, %0                     \n\t"                           \
        "decl %2                        \n\t"                           \
        "jnz 1b                         \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 4;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7 \n\t" */                                 \
        "1:                             \n\t"                           \
        "movq   (%0), %%mm0             \n\t"                           \
        "movq  8(%0), %%mm1             \n\t"                           \
        "movq 16(%0), %%mm2             \n\t"                           \
        "movq 24(%0), %%mm3             \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0),  32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0),  40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0),  48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0),  56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0),  64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0),  72(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0),  80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0),  88(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
                                                                        \
        "add $136, %0                   \n\t"                           \
        "add %6, %1                     \n\t"                           \
        "decl %2                        \n\t"                           \
        "jnz 1b                         \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 14 * (x86_reg)dstStride)                              \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
                                                    uint8_t *src,       \
                                                    int dstStride,      \
                                                    int srcStride)      \
{                                                                       \
    uint64_t temp[9 * 2];                                               \
    uint64_t *temp_ptr = temp;                                          \
    int count = 9;                                                      \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7              \n\t"                           \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq    (%0), %%mm1            \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "movq %%mm0,    (%1)            \n\t"                           \
        "movq %%mm1, 9*8(%1)            \n\t"                           \
        "add $8, %1                     \n\t"                           \
        "add %3, %0                     \n\t"                           \
        "decl %2                        \n\t"                           \
        "jnz 1b                         \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 2;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7 \n\t" */                                 \
        "1:                             \n\t"                           \
        "movq   (%0), %%mm0             \n\t"                           \
        "movq  8(%0), %%mm1             \n\t"                           \
        "movq 16(%0), %%mm2             \n\t"                           \
        "movq 24(%0), %%mm3             \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
                                                                        \
        "add $72, %0                    \n\t"                           \
        "add %6, %1                     \n\t"                           \
        "decl %2                        \n\t"                           \
        "jnz 1b                         \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 6 * (x86_reg)dstStride)                               \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
                                            stride, 8);                 \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
                                             stride, stride, 16);       \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,              \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size)         \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                              \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                              \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
                                               S1, S2);                        \
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                     \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                         \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                         \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                            \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                             \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                        \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                        \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,   \
                                                         uint8_t *src,   \
                                                         int stride)     \
{                                                                        \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
}                                                                        \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,   \
                                                         uint8_t *src,   \
                                                         int stride)     \
{                                                                        \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,           \
                                            stride, SIZE);               \
}                                                                        \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)              \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)             \

QPEL_2TAP(put_, 16, mmxext)
QPEL_2TAP(avg_, 16, mmxext)
QPEL_2TAP(put_,  8, mmxext)
QPEL_2TAP(avg_,  8, mmxext)
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              x86_reg linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src      -= src_y * linesize;
        src_y_add = h - 1;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src      -= src_y * linesize;
        src_y_add = 1 - block_h;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src  += w - 1 - src_x;
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += 1 - block_w - src_x;
        src_x = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
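/* Semantically this is the usual edge emulation: the block is copied from
 * the frame with out-of-frame samples replaced by the nearest edge sample,
 * roughly (an illustrative sketch of the C equivalent):
 *
 *     for (y = 0; y < block_h; y++)
 *         for (x = 0; x < block_w; x++)
 *             buf[y * linesize + x] =
 *                 frame[av_clip(src_y + y, 0, h - 1) * linesize +
 *                       av_clip(src_x + x, 0, w - 1)];
 *
 * The clamping above only moves src so that the core function always sees
 * at least one valid row/column to replicate from. */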
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             x86_reg linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             x86_reg linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
#if HAVE_INLINE_ASM

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
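/* Global motion compensation: each destination pixel (x, y) is fetched
 * from an affine source position, roughly
 *
 *     sx = (ox + dxx * x + dyx * y) >> (16 + shift)
 *     sy = (oy + dxy * x + dyy * y) >> (16 + shift)
 *
 * with the remaining fractional bits used as bilinear weights between the
 * four neighbouring source pixels, as in ff_gmc_c(); the MMX loop below
 * does this four pixels at a time in reduced fixed-point precision. */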
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H      8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu  = (unsigned)ix >= width  - w ||
                    (unsigned)iy >= height - h;
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if (need_emu) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }
    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1 << shift)
        );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
                );
            __asm__ volatile (
                "movq   %%mm6, %%mm2    \n\t"
                "movq   %%mm6, %%mm1    \n\t"
                "psubw  %%mm4, %%mm2    \n\t"
                "psubw  %%mm5, %%mm1    \n\t"
                "movq   %%mm2, %%mm0    \n\t"
                "movq   %%mm4, %%mm3    \n\t"
                "pmullw %%mm1, %%mm0    \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3    \n\t" // dx * dy
                "pmullw %%mm5, %%mm2    \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1    \n\t" // dx * (s - dy)

                "movd      %4, %%mm5    \n\t"
                "movd      %3, %%mm4    \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3    \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2    \n\t" // src[0, 1] * (s - dx) * dy

                "movd      %2, %%mm5    \n\t"
                "movd      %1, %%mm4    \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1    \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0    \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw     %5, %%mm1    \n\t"
                "paddw  %%mm3, %%mm2    \n\t"
                "paddw  %%mm1, %%mm0    \n\t"
                "paddw  %%mm2, %%mm0    \n\t"

                "psrlw     %6, %%mm0    \n\t"
                "packuswb %%mm0, %%mm0  \n\t"
                "movd   %%mm0, %0       \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
                : "memory");
            src += stride;
        }
        src += 4 - h * stride;
    }
}
#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif

static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
#define PREFETCH(name, op)                      \
static void name(void *mem, int stride, int h)  \
{                                               \
    const uint8_t *p = mem;                     \
    do {                                        \
        __asm__ volatile (#op" %0" :: "m"(*p)); \
        p += stride;                            \
    } while (--h);                              \
}

PREFETCH(prefetch_mmxext, prefetcht0)
PREFETCH(prefetch_3dnow,  prefetch)
#undef PREFETCH

#endif /* HAVE_INLINE_ASM */
#include "h264_qpel.c"

void ff_put_h264_chroma_mc8_rnd_mmx   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
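
/* For reference, CHROMA_MC(put, 4, 10, mmxext) declares:
 *
 *     void ff_put_h264_chroma_mc4_10_mmxext(uint8_t *dst, uint8_t *src,
 *                                           int stride, int h, int x, int y);
 */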

#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
{
    avg_pixels8_mmxext(dst, src, stride, 8);
}

/* only used in VP3/5/6 */
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
//    START_TIMER
    MOVQ_BFE(mm6);
    __asm__ volatile (
        "1:                             \n\t"
        "movq      (%1), %%mm0          \n\t"
        "movq      (%2), %%mm1          \n\t"
        "movq   (%1,%4), %%mm2          \n\t"
        "movq   (%2,%4), %%mm3          \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq     %%mm4, (%3)           \n\t"
        "movq     %%mm5, (%3,%4)        \n\t"

        "movq (%1,%4,2), %%mm0          \n\t"
        "movq (%2,%4,2), %%mm1          \n\t"
        "movq   (%1,%5), %%mm2          \n\t"
        "movq   (%2,%5), %%mm3          \n\t"
        "lea  (%1,%4,4), %1             \n\t"
        "lea  (%2,%4,4), %2             \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq     %%mm4, (%3,%4,2)      \n\t"
        "movq     %%mm5, (%3,%5)        \n\t"
        "lea  (%3,%4,4), %3             \n\t"

        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
        :"memory");
//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}

static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_vp_no_rnd_pixels8_l2_mmx(dst,     a,     b,     stride, h);
    put_vp_no_rnd_pixels8_l2_mmx(dst + 8, a + 8, b + 8, stride, h);
}
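
/* Per byte, the _l2 functions above compute the truncating (no-rounding)
 * average dst[i] = (a[i] + b[i]) >> 1 that VP3-style averaging requires.
 * A plain-C sketch (hypothetical name, illustration only): */
#if 0
static void put_vp_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a,
                                       const uint8_t *b, int stride, int h)
{
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < 8; i++)
            dst[i] = (a[i] + b[i]) >> 1;
        a   += stride;
        b   += stride;
        dst += stride;
    }
}
#endif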

#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmxext)
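
/* For reference, DIRAC_PIXOP(put, mmx) defines ff_put_dirac_pixels8_mmx,
 * ff_put_dirac_pixels16_mmx and ff_put_dirac_pixels32_mmx, each forwarding
 * to the matching put_pixels*_mmx with src[0] as the source; the remaining
 * src[] entries are not used by these block-copy cases. */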

#if HAVE_YASM
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
    ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
    ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
#endif /* HAVE_YASM */
#endif /* CONFIG_DIRAC_DECODER */

/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif /* CONFIG_GPL */

static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0   \n\t"
            "movq       %1, %%mm1   \n\t"
            "movq    %%mm0, %%mm2   \n\t"
            "movq    %%mm1, %%mm3   \n\t"
            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1   \n\t"
            "movq    %%mm3, %%mm4   \n\t"
            "pand    %%mm1, %%mm3   \n\t"
            "pandn   %%mm1, %%mm4   \n\t"
            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movq    %%mm3, %1      \n\t"
            "movq    %%mm0, %0      \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}

static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps %0, %%xmm5          \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}

#if HAVE_6REGS
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j] * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j] * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n"
        "femms                          \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}

static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j] * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j] * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS */
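
/* Scalar reference for the windowed overlap-add above (cf. the C version in
 * dsputil.c); both SIMD loops walk i forward and j backward over the same
 * window: */
#if 0
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i, j;

    dst  += len;
    win  += len;
    src0 += len;

    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];

        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}
#endif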

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}

#endif /* HAVE_INLINE_ASM */

int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
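
/* Reference semantics of the scalarproduct_and_madd functions (cf. the C
 * version in dsputil.c): return the dot product of v1 and v2 while updating
 * v1 in place, i.e. per element
 *
 *     res   += v1[i] * v2[i];
 *     v1[i] += mul * v3[i];
 */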

void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                         const float *src1, int len);
void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                         const float *src1, int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
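
/* For reference, SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ) wires up all 16
 * quarter-pel positions, assigning put_qpel16_mc00_mmxext through
 * put_qpel16_mc33_mmxext to c->put_qpel_pixels_tab[0][0..15]. */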

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                    \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
    } while (0)

static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    c->put_no_rnd_pixels_l2[0] = put_vp_no_rnd_pixels16_l2_mmx;
    c->put_no_rnd_pixels_l2[1] = put_vp_no_rnd_pixels8_l2_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_mmxext;

    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(avg_2tap_qpel,   0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_2tap_qpel,   1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_2tap_qpel,   0, 16, mmxext, );
    SET_QPEL_FUNCS(put_2tap_qpel,   1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (CONFIG_H264QPEL) {
        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmxext, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
        } else if (bit_depth == 10) {
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
        }
    }

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}

static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}

static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
                                  int mm_flags)
{
#if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_3dnowext;
#endif
}

static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;

#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_sse;
#endif

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
    c->vector_fmul_add     = ff_vector_fmul_add_sse;

    c->scalarproduct_float          = ff_scalarproduct_float_sse;
    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_sse;
#if HAVE_INLINE_ASM
    c->gmc = gmc_sse;
#endif
#endif /* HAVE_YASM */
}

static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }

    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}

static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}

static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}

static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX_EXTERNAL
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    c->vector_fmul_reverse          = ff_vector_fmul_reverse_avx;
    c->vector_fmul_add              = ff_vector_fmul_add_avx;
#endif /* HAVE_AVX_EXTERNAL */
}

void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmxext_put;
                    c->idct_add = ff_idct_xvid_mmxext_add;
                    c->idct     = ff_idct_xvid_mmxext;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
        dsputil_init_3dnowext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}