/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"

/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for a shared library it is better to access constants this way
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "packuswb %%"#regd", %%"#regd" \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "psllw $1, %%"#regd" \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "paddb "#regb", "#regr" \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pand "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "paddb "#regb", "#regr" \n\t" \
    "paddb "#regd", "#regp" \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "por "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"

/***********************************/
/* MMX no rounding */

#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"
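
/*
 * The included template picks up the macros above; with this DEF, e.g.
 * DEF(put, pixels8) expands to put_no_rnd_pixels8_mmx.
 */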
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define SKIP_FOR_3DNOW

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef SKIP_FOR_3DNOW

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

/* Introduced only in the MMXEXT instruction set */
#define PAVGB "pavgb"

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
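
/*
 * For full-pel copies no interpolation happens, so rounding is a no-op:
 * the no-rounding variants and the MMXEXT versions can simply alias the
 * plain MMX routines.
 */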
#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx

/***********************************/
/* standard MMX */
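
/*
 * Illustrative scalar equivalent of ff_put_pixels_clamped_mmx() (assumed
 * semantics, not part of the build): for each of the 8 rows of the block,
 *     pixels[x] = av_clip_uint8(block[x]);
 * the packuswb instructions below perform the unsigned saturating clamp.
 */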
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;
    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code
    __asm__ volatile (
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t" \
    "movq 16 + "#off"(%2), %%mm2 \n\t" \
    "movq 32 + "#off"(%2), %%mm3 \n\t" \
    "movq 48 + "#off"(%2), %%mm4 \n\t" \
    "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
    "paddb %%mm0, %%mm1 \n\t" \
    "paddb %%mm0, %%mm2 \n\t" \
    "paddb %%mm0, %%mm3 \n\t" \
    "paddb %%mm0, %%mm4 \n\t" \
    "movq %%mm1, (%0) \n\t" \
    "movq %%mm2, (%0, %3) \n\t" \
    "movq %%mm3, (%0, %3, 2) \n\t" \
    "movq %%mm4, (%0, %1) \n\t"
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
    x86_reg line_skip = line_size;
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)

void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
    /* read the pixels */
        "movq (%2), %%mm0 \n\t"
        "movq 8(%2), %%mm1 \n\t"
        "movq 16(%2), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        "movq %0, %%mm4 \n\t"
        "movq %1, %%mm6 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm4, %%mm0 \n\t"
        "paddsw %%mm5, %%mm1 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm6, %%mm2 \n\t"
        "paddsw %%mm5, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, %0 \n\t"
        "movq %%mm2, %1 \n\t"
        : "+m"(*pix), "+m"(*(pix + line_size))
    pix += line_size * 2;

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
        "lea (%3, %3), %%"REG_a" \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq 8(%1 ), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq 8(%1 ), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)

#define CLEAR_BLOCKS(name, n) \
static void name(int16_t *blocks) \
        "pxor %%mm7, %%mm7 \n\t" \
        "mov %1, %%"REG_a" \n\t" \
        "movq %%mm7, (%0, %%"REG_a") \n\t" \
        "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
        "add $32, %%"REG_a" \n\t" \
        :: "r"(((uint8_t *)blocks) + 128 * n), \

CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(int16_t *block)
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"

static void clear_blocks_sse(int16_t *blocks)
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        dst[i + 0] += src[i + 0];
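
/*
 * HuffYUV median prediction (the cmov-based loop below): for each byte the
 * predictor is mid_pred(left, top, left + top - top_left), and the output
 * is predictor + diff[i] (descriptive summary, not part of the build).
 */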
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
        "movzbl (%3, %4), %2 \n"
        "add (%6, %4), %b0 \n"
        "mov %b0, (%5, %4) \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)

static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd (%1), %%mm0 \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1,%3,1), %%mm2 \n\t"
        "movd (%1,%3,2), %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        "movd %%mm0, (%0) \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%0) \n\t"
        "movd %%mm1, (%0,%2,1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%0,%2,2) \n\t"

#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t" \
    "movq %0, %%mm0 \n\t" \
    "movq %0, %%mm1 \n\t" \
    "movq %3, %%mm2 \n\t" \
    "movq %3, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpckhbw %%mm7, %%mm1 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "psubw %%mm2, %%mm0 \n\t" \
    "psubw %%mm3, %%mm1 \n\t" \
    "movq %1, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "movq %2, %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm5 \n\t" \
    "psubw %%mm2, %%mm4 \n\t" \
    "psubw %%mm3, %%mm5 \n\t" \
    "psllw $2, %%mm4 \n\t" \
    "psllw $2, %%mm5 \n\t" \
    "paddw %%mm0, %%mm4 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "pxor %%mm6, %%mm6 \n\t" \
    "pcmpgtw %%mm4, %%mm6 \n\t" \
    "pcmpgtw %%mm5, %%mm7 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "pxor %%mm7, %%mm5 \n\t" \
    "psubw %%mm6, %%mm4 \n\t" \
    "psubw %%mm7, %%mm5 \n\t" \
    "psrlw $3, %%mm4 \n\t" \
    "psrlw $3, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm4 \n\t" \
    "packsswb %%mm7, %%mm6 \n\t" \
    "pxor %%mm7, %%mm7 \n\t" \
    "movd %4, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "psubusb %%mm4, %%mm2 \n\t" \
    "movq %%mm2, %%mm3 \n\t" \
    "psubusb %%mm4, %%mm3 \n\t" \
    "psubb %%mm3, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm3 \n\t" \
    "psubusb %%mm2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm2 \n\t" \
    "packsswb %%mm1, %%mm0 \n\t" \
    "pcmpgtb %%mm0, %%mm7 \n\t" \
    "pxor %%mm7, %%mm0 \n\t" \
    "psubb %%mm7, %%mm0 \n\t" \
    "movq %%mm0, %%mm1 \n\t" \
    "psubusb %%mm2, %%mm0 \n\t" \
    "psubb %%mm0, %%mm1 \n\t" \
    "pand %5, %%mm1 \n\t" \
    "psrlw $2, %%mm1 \n\t" \
    "pxor %%mm7, %%mm1 \n\t" \
    "psubb %%mm7, %%mm1 \n\t" \
    "movq %0, %%mm5 \n\t" \
    "movq %3, %%mm6 \n\t" \
    "psubb %%mm1, %%mm5 \n\t" \
    "paddb %%mm1, %%mm6 \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
            "movq %%mm3, %1 \n\t"
            "movq %%mm4, %2 \n\t"
            "movq %%mm5, %0 \n\t"
            "movq %%mm6, %3 \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
            H263_LOOP_FILTER // 5 3 4 6
            : "g"(2 * strength), "m"(ff_pb_FC)

            "movq %%mm5, %%mm1 \n\t"
            "movq %%mm4, %%mm0 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpcklbw %%mm6, %%mm4 \n\t"
            "punpckhbw %%mm3, %%mm1 \n\t"
            "punpckhbw %%mm6, %%mm0 \n\t"
            "movq %%mm5, %%mm3 \n\t"
            "movq %%mm1, %%mm6 \n\t"
            "punpcklwd %%mm4, %%mm5 \n\t"
            "punpcklwd %%mm0, %%mm1 \n\t"
            "punpckhwd %%mm4, %%mm3 \n\t"
            "punpckhwd %%mm0, %%mm6 \n\t"
            "movd %%mm5, (%0) \n\t"
            "punpckhdq %%mm5, %%mm5 \n\t"
            "movd %%mm5, (%0, %2) \n\t"
            "movd %%mm3, (%0, %2, 2) \n\t"
            "punpckhdq %%mm3, %%mm3 \n\t"
            "movd %%mm3, (%0, %3) \n\t"
            "movd %%mm1, (%1) \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%1, %2) \n\t"
            "movd %%mm6, (%1, %2, 2) \n\t"
            "punpckhdq %%mm6, %%mm6 \n\t"
            "movd %%mm6, (%1, %3) \n\t"
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16. */
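/*
 * The movd + punpcklbw/punpcklwd/punpckldq sequence below broadcasts the
 * leftmost pixel across a whole qword, i.e. it behaves like
 * memset(ptr - 8, ptr[0], 8) for the left edge (sketch, assumed
 * semantics); the punpckhbw/punpckhwd/punpckhdq sequence does the same
 * with the rightmost pixel for the right edge.
 */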
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
    uint8_t *ptr, *last_line;

    last_line = buf + (height - 1) * wrap;
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)

            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)

            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)

#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
                   in0, in1, in2, in7, out, OP) \
    "paddw "#m4", "#m3" \n\t" /* x1 */ \
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
    "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
    "movq "#in7", "#m3" \n\t" /* d */ \
    "movq "#in0", %%mm5 \n\t" /* D */ \
    "paddw "#m3", %%mm5 \n\t" /* x4 */ \
    "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
    "movq "#in1", %%mm5 \n\t" /* C */ \
    "movq "#in2", %%mm6 \n\t" /* B */ \
    "paddw "#m6", %%mm5 \n\t" /* x3 */ \
    "paddw "#m5", %%mm6 \n\t" /* x2 */ \
    "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
    "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
    "paddw "#rnd", %%mm4 \n\t" /* x2 */ \
    "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw $5, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm5 \n\t" \
    OP(%%mm5, out, %%mm7, d)
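
/*
 * One output pixel of the MPEG-4 quarter-pel lowpass filter, as computed
 * by QPEL_V_LOW above (sketch): with symmetric tap sums x1..x4 (x1 being
 * the innermost pair),
 *     out = av_clip_uint8((20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5)
 * i.e. the 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1) / 32.
 */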
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
        "pxor %%mm7, %%mm7 \n\t" \
        "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
        "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
        "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
        "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
        "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
        "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
        "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
        "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
        "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
        "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
        "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
        "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
        "paddw %%mm3, %%mm5 \n\t" /* b */ \
        "paddw %%mm2, %%mm6 \n\t" /* c */ \
        "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
        "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
        "paddw %%mm4, %%mm0 \n\t" /* a */ \
        "paddw %%mm1, %%mm5 \n\t" /* d */ \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
        "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
        "paddw %6, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0 \n\t" \
        "movq %%mm0, %5 \n\t" \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
        "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
        "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
        "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
        "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
        "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
        "paddw %%mm0, %%mm2 \n\t" /* b */ \
        "paddw %%mm5, %%mm3 \n\t" /* c */ \
        "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
        "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
        "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
        "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
        "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
        "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
        "paddw %%mm2, %%mm1 \n\t" /* a */ \
        "paddw %%mm6, %%mm4 \n\t" /* d */ \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
        "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
        "paddw %6, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
        "psraw $5, %%mm3 \n\t" \
        "movq %5, %%mm1 \n\t" \
        "packuswb %%mm3, %%mm1 \n\t" \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
        "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
        "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
        "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
        "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
        "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
        "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
        "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
        "paddw %%mm1, %%mm5 \n\t" /* b */ \
        "paddw %%mm4, %%mm0 \n\t" /* c */ \
        "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
        "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
        "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
        "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
        "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
        "paddw %%mm3, %%mm2 \n\t" /* d */ \
        "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
        "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
        "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
        "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
        "paddw %%mm2, %%mm6 \n\t" /* a */ \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
        "paddw %6, %%mm0 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0 \n\t" \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
        "paddw %%mm5, %%mm3 \n\t" /* a */ \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
        "paddw %%mm4, %%mm6 \n\t" /* b */ \
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
        "paddw %%mm1, %%mm4 \n\t" /* c */ \
        "paddw %%mm2, %%mm5 \n\t" /* d */ \
        "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
        "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
        "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
        "paddw %6, %%mm4 \n\t" \
        "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm4 \n\t" \
        "packuswb %%mm4, %%mm0 \n\t" \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
        : "+a"(src), "+c"(dst), "+D"(h) \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
        "pxor %%mm7, %%mm7 \n\t" \
        "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
        "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
        "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
        "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
        "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
        "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
        "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
        "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
        "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
        "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
        "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
        "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
        "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
        "paddw %%mm3, %%mm5 \n\t" /* b */ \
        "paddw %%mm2, %%mm6 \n\t" /* c */ \
        "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
        "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
        "paddw %%mm4, %%mm0 \n\t" /* a */ \
        "paddw %%mm1, %%mm5 \n\t" /* d */ \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
        "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
        "paddw %5, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0 \n\t" \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
        "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
        "paddw %%mm5, %%mm1 \n\t" /* a */ \
        "paddw %%mm6, %%mm2 \n\t" /* b */ \
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
        "paddw %%mm6, %%mm3 \n\t" /* c */ \
        "paddw %%mm5, %%mm4 \n\t" /* d */ \
        "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
        "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
        "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
        "paddw %5, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm3 \n\t" \
        "packuswb %%mm3, %%mm0 \n\t" \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
        : "+a"(src), "+c"(dst), "+d"(h) \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
    uint64_t temp[17 * 4]; \
    uint64_t *temp_ptr = temp; \
    /* FIXME unroll */ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq (%0), %%mm1 \n\t" \
        "movq 8(%0), %%mm2 \n\t" \
        "movq 8(%0), %%mm3 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "movq %%mm0, (%1) \n\t" \
        "movq %%mm1, 17 * 8(%1) \n\t" \
        "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
        "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
        : "+r"(src), "+r"(temp_ptr), "+r"(count) \
        : "r"((x86_reg)srcStride) \
    /* FIXME reorder for speed */ \
    __asm__ volatile ( \
        /* "pxor %%mm7, %%mm7 \n\t" */ \
        "movq (%0), %%mm0 \n\t" \
        "movq 8(%0), %%mm1 \n\t" \
        "movq 16(%0), %%mm2 \n\t" \
        "movq 24(%0), %%mm3 \n\t" \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0 \n\t" \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
          "g"(4 - 14 * (x86_reg)dstStride) \
\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
    uint64_t temp[9 * 2]; \
    uint64_t *temp_ptr = temp; \
    /* FIXME unroll */ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq (%0), %%mm1 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "movq %%mm0, (%1) \n\t" \
        "movq %%mm1, 9*8(%1) \n\t" \
        : "+r"(src), "+r"(temp_ptr), "+r"(count) \
        : "r"((x86_reg)srcStride) \
    /* FIXME reorder for speed */ \
    __asm__ volatile ( \
        /* "pxor %%mm7, %%mm7 \n\t" */ \
        "movq (%0), %%mm0 \n\t" \
        "movq 8(%0), %%mm1 \n\t" \
        "movq 16(%0), %%mm2 \n\t" \
        "movq 24(%0), %%mm3 \n\t" \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0 \n\t" \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
          "g"(4 - 6 * (x86_reg)dstStride) \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half) + 64; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[8 + 9]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint8_t * const halfH = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
                                             stride, stride, 16); \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
                                  stride, stride, 16); \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t temp[32]; \
    uint8_t * const half = (uint8_t*)temp; \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half, \
                                  stride, stride, 16); \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[16 * 2 + 17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half) + 256; \
    uint8_t * const halfHV = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
    uint64_t half[17 * 2]; \
    uint8_t * const halfH = ((uint8_t*)half); \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \

#define PUT_OP(a, b, temp, size) \
    "mov"#size" "#a", "#b" \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size) \
    "mov"#size" "#b", "#temp" \n\t" \
    "pavgb "#temp", "#a" \n\t" \
    "mov"#size" "#a", "#b" \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    put_pixels8_xy2_mmx(dst, src, stride, 8);

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    put_pixels16_xy2_mmx(dst, src, stride, 16);

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels8_xy2_mmx(dst, src, stride, 8);

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels16_xy2_mmx(dst, src, stride, 16);

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
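
/*
 * Bilinear global motion compensation: for each output pixel the inner
 * loop below computes (sketch of the arithmetic, with s being the
 * interpolation scale broadcast into %%mm6):
 *   dst = (src[0,0] * (s - dx) * (s - dy) + src[1,0] * dx * (s - dy) +
 *          src[0,1] * (s - dx) * dy       + src[1,1] * dx * dy + r)
 *         >> (2 * shift)
 * with dx, dy advanced by the affine parameters per column and row.
 */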
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    const int oxs = ox >> 4;
    const int oys = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4] = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu = (unsigned)ix >= width  - w ||
                   (unsigned)iy >= height - h;

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);

    src += ix + iy * stride;
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);

        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)

                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)

        src += 4 - h * stride;
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

#endif /* HAVE_INLINE_ASM */

void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h);

void ff_put_h264_chroma_mc8_rnd_mmx   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext    (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_rnd_ssse3 (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
                                      (uint8_t *dst, uint8_t *src, \
                                       int stride, int h, int x, int y);
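
/* E.g. CHROMA_MC(put, 4, 10, mmxext) declares
 * ff_put_h264_chroma_mc4_10_mmxext(). */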
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)

void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    put_pixels8_mmx(dst, src, stride, 8);

void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels8_mmx(dst, src, stride, 8);

void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    put_pixels16_mmx(dst, src, stride, 16);

void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
    avg_pixels16_mmx(dst, src, stride, 16);

void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
    put_pixels8_mmx(dst, src, stride, 8);

void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
    avg_pixels8_mmxext(dst, src, stride, 8);

#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT) \
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h); \
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h); \
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h) \
    OPNAME ## _pixels16_ ## EXT(dst,      src[0],      stride, h); \
    OPNAME ## _pixels16_ ## EXT(dst + 16, src[0] + 16, stride, h); \

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmxext)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_put_pixels16_sse2(dst, src[0], stride, h);

void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_avg_pixels16_sse2(dst, src[0], stride, h);

void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_put_pixels16_sse2(dst,      src[0],      stride, h);
    ff_put_pixels16_sse2(dst + 16, src[0] + 16, stride, h);

void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_avg_pixels16_sse2(dst,      src[0],      stride, h);
    ff_avg_pixels16_sse2(dst + 16, src[0] + 16, stride, h);
/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    int16_t *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    int16_t *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     int16_t *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     int16_t *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif /* CONFIG_GPL */
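
/* Clamp every float in src to [min, max], 16 floats per iteration,
 * walking backwards from the end of the buffer. movaps requires both
 * buffers to be 16-byte aligned, and len must be a multiple of 16. */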
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;

    __asm__ volatile (
        "movss      %3, %%xmm4          \n\t"
        "movss      %4, %%xmm5          \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t" // broadcast min to all 4 lanes
        "shufps $0, %%xmm5, %%xmm5      \n\t" // broadcast max to all 4 lanes
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps %%xmm0,   (%1, %0)      \n\t"
        "movaps %%xmm1, 16(%1, %0)      \n\t"
        "movaps %%xmm2, 32(%1, %0)      \n\t"
        "movaps %%xmm3, 48(%1, %0)      \n\t"
        "sub       $64, %0              \n\t"
        "jge        1b                  \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",)
          "memory"
    );
}
#endif /* HAVE_INLINE_ASM */
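
/* Prototypes for functions implemented in external (yasm) assembly. */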
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
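
/* SET_QPEL_FUNCS() fills one row of a qpel function-pointer table;
 * entry mcXY handles a quarter-pel offset of X/4 horizontally and
 * Y/4 vertically, with mc00 being the full-pel case. */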
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
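
/* SET_HPEL_FUNCS() fills one half-pel table: [0] = full-pel copy,
 * [1] = x+1/2, [2] = y+1/2, [3] = x+1/2 and y+1/2. IDX is pasted in
 * verbatim, so it may be a subscript like [0] or empty for the
 * one-dimensional avg_no_rnd table. */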
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                  \
    do {                                                                     \
    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _     ## CPU;  \
    c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU;  \
    c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU;  \
    c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;  \
    } while (0)
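
/* Baseline MMX initialization; the more specialized initializers below
 * overwrite these pointers when the matching CPU flag is present. */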
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        [0], 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
        SET_HPEL_FUNCS(avg,        [0], 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
        SET_HPEL_FUNCS(put,        [1],  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
        SET_HPEL_FUNCS(avg,        [1],  8, mmx);
    }

#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}
static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );

    SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
#endif /* HAVE_MMXEXT_EXTERNAL */
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = gmc_sse;
#endif
#endif /* HAVE_YASM */
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put              = ff_idct_xvid_sse2_put;
        c->idct_add              = ff_idct_xvid_sse2_add;
        c->idct                  = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
    }
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
        }
    }

    if (bit_depth == 10) {
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }

    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3_EXTERNAL
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }

    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;

    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;

    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX_EXTERNAL
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
#endif /* HAVE_AVX_EXTERNAL */
}
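
/* Entry point: query the CPU flags once, select the IDCT, then run the
 * per-instruction-set initializers from oldest to newest so that the
 * best available implementation wins each table slot. */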
void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif /* CONFIG_GPL */
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmxext_put;
                    c->idct_add = ff_idct_xvid_mmxext_add;
                    c->idct     = ff_idct_xvid_mmxext;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);
    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}