2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "dsputil_mmx.h"
32 #include "idct_xvid.h"
37 /* pixel operations */
/*
 * Packed constants used as operands by the inline assembly in this file.
 * As the initializers below show:
 *   ff_pw_N   holds the decimal value N in every 16-bit word,
 *   ff_pb_XX  holds the hexadecimal byte 0xXX in every byte,
 *   ff_pd_N   holds the double N in both array elements.
 * Entries declared with 8-byte alignment are a single 64-bit value; entries
 * declared with 16-byte alignment are initialized with two identical 64-bit
 * halves.
 */
/* ff_bone: every byte is 0x01; ff_wtwo: every 16-bit word is 0x0002. */
38 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
39 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* Every 32-bit dword is 0x80000000. */
41 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
42 { 0x8000000080000000ULL, 0x8000000080000000ULL };
/* 16-bit word constants (name suffix is the decimal word value). */
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
66 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
/* Byte constants (name suffix is the hexadecimal byte value). */
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
77 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
80 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
81 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* Double-precision constants. */
83 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
84 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Emit a ".p2align 3" assembler directive at the point of use. */
88 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear register regd by XORing it with itself. */
89 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/* Fill regd with 0xFE bytes without touching memory: pcmpeqd of a register
 * with itself sets every bit, then paddb adds each byte to itself
 * (0xFF + 0xFF wraps to 0xFE). */
91 #define MOVQ_BFE(regd) \
93 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
94 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Memory-operand variants: load ff_bone (0x01 bytes) / ff_wtwo (0x0002 words)
 * into regd.
 * NOTE(review): the preprocessor conditional that chooses between these and
 * the register-only variants below is not visible in this view. */
97 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
98 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
100 // for shared library it's better to use this way for accessing constants
/* Register-only MOVQ_BONE: all-ones, shift each word right by 15 (every word
 * becomes 1), then pack words to bytes so every byte becomes 0x01. */
102 #define MOVQ_BONE(regd) \
104 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
105 "psrlw $15, %%"#regd" \n\t" \
106 "packuswb %%"#regd", %%"#regd" \n\t" ::)
/* Register-only MOVQ_WTWO: all-ones, shift each word right by 15 (word = 1),
 * then shift left by 1 so every word becomes 2. */
108 #define MOVQ_WTWO(regd) \
110 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
111 "psrlw $15, %%"#regd" \n\t" \
112 "psllw $1, %%"#regd" \n\t"::)
116 // using regr as temporary and for the output result
117 // first argument is unmodified and second is trashed
118 // regfe is supposed to contain 0xfefefefefefefefe
// Per byte this computes regr = (rega & regb) + (((rega ^ regb) & 0xfe) >> 1),
// i.e. the average of the two inputs rounded down. The 0xfe mask clears the
// low bit of every byte so that the 64-bit psrlq cannot shift a bit into the
// neighbouring byte.
119 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
120 "movq "#rega", "#regr" \n\t" \
121 "pand "#regb", "#regr" \n\t" \
122 "pxor "#rega", "#regb" \n\t" \
123 "pand "#regfe", "#regb" \n\t" \
124 "psrlq $1, "#regb" \n\t" \
125 "paddb "#regb", "#regr" \n\t"
// Byte-wise average rounded up: per byte this computes
// regr = (rega | regb) - (((rega ^ regb) & 0xfe) >> 1).
// rega is only read, regb is overwritten, regr receives the result and
// regfe must hold 0xfe in every byte (it masks the bits that the 64-bit
// psrlq would otherwise shift across byte boundaries).
127 #define PAVGB_MMX(rega, regb, regr, regfe) \
128 "movq "#rega", "#regr" \n\t" \
129 "por "#regb", "#regr" \n\t" \
130 "pxor "#rega", "#regb" \n\t" \
131 "pand "#regfe", "#regb" \n\t" \
132 "psrlq $1, "#regb" \n\t" \
133 "psubb "#regb", "#regr" \n\t"
135 // mm6 is supposed to contain 0xfefefefefefefefe
// Two interleaved round-down byte averages:
//   regr = (rega & regb) + (((rega ^ regb) & mm6) >> 1)
//   regp = (regc & regd) + (((regc ^ regd) & mm6) >> 1)
// rega and regc are only read; regb and regd are overwritten.
136 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
137 "movq "#rega", "#regr" \n\t" \
138 "movq "#regc", "#regp" \n\t" \
139 "pand "#regb", "#regr" \n\t" \
140 "pand "#regd", "#regp" \n\t" \
141 "pxor "#rega", "#regb" \n\t" \
142 "pxor "#regc", "#regd" \n\t" \
143 "pand %%mm6, "#regb" \n\t" \
144 "pand %%mm6, "#regd" \n\t" \
145 "psrlq $1, "#regb" \n\t" \
146 "psrlq $1, "#regd" \n\t" \
147 "paddb "#regb", "#regr" \n\t" \
148 "paddb "#regd", "#regp" \n\t"
// Two interleaved round-up byte averages:
//   regr = (rega | regb) - (((rega ^ regb) & mm6) >> 1)
//   regp = (regc | regd) - (((regc ^ regd) & mm6) >> 1)
// rega and regc are only read; regb and regd are overwritten.
// mm6 is used as the mask and is expected to hold 0xfe in every byte.
150 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
151 "movq "#rega", "#regr" \n\t" \
152 "movq "#regc", "#regp" \n\t" \
153 "por "#regb", "#regr" \n\t" \
154 "por "#regd", "#regp" \n\t" \
155 "pxor "#rega", "#regb" \n\t" \
156 "pxor "#regc", "#regd" \n\t" \
157 "pand %%mm6, "#regb" \n\t" \
158 "pand %%mm6, "#regd" \n\t" \
159 "psrlq $1, "#regd" \n\t" \
160 "psrlq $1, "#regb" \n\t" \
161 "psubb "#regb", "#regr" \n\t" \
162 "psubb "#regd", "#regp" \n\t"
164 /***********************************/
165 /* MMX no rounding */
/* DEF() builds names of the form <x>_no_rnd_<y>_mmx. PAVGBP and PAVGB are
 * bound to the round-down helpers; OP_AVG stays bound to the round-up
 * PAVGB_MMX. The shared template is then included with these bindings.
 * NOTE(review): MOVQ_WONE is not defined in the visible part of this file,
 * and the #undef lines that must separate these sections are not visible
 * in this view either. */
166 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
167 #define SET_RND MOVQ_WONE
168 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
169 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
170 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
172 #include "dsputil_rnd_template.c"
178 /***********************************/
/* Second inclusion of the same template: DEF() builds <x>_<y>_mmx and the
 * averaging helpers are bound to the round-up variants. */
181 #define DEF(x, y) x ## _ ## y ## _mmx
182 #define SET_RND MOVQ_WTWO
183 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
184 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
186 #include "dsputil_rnd_template.c"
194 /***********************************/
/* 3DNow! instantiation of the averaging template: names get the _3dnow
 * suffix and PAVGB expands to the "pavgusb" mnemonic. SKIP_FOR_3DNOW is
 * defined only for the duration of this inclusion. */
197 #define DEF(x) x ## _3dnow
198 #define PAVGB "pavgusb"
199 #define SKIP_FOR_3DNOW
201 #include "dsputil_avg_template.c"
205 #undef SKIP_FOR_3DNOW
207 /***********************************/
208 /* MMXEXT specific */
210 #define DEF(x) x ## _mmxext
212 /* Introduced only in MMXEXT set */
213 #define PAVGB "pavgb"
215 #include "dsputil_avg_template.c"
/* Name aliases: the no-rounding and mmxext "put" names below resolve to the
 * plain MMX put_pixels functions.
 * NOTE(review): put_pixels4_mmx is not defined in the visible part of this
 * file. */
220 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
221 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
222 #define put_pixels16_mmxext put_pixels16_mmx
223 #define put_pixels8_mmxext put_pixels8_mmx
224 #define put_pixels4_mmxext put_pixels4_mmx
225 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
226 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
228 /***********************************/
/*
 * Write a block of 16-bit coefficients to 8-bit pixels with unsigned
 * saturation, four 8-pixel rows per asm statement (two statements visible).
 *
 * Each statement loads 64 bytes from (%3) as eight quadwords, narrows each
 * pair of quadwords to eight bytes with packuswb (signed words saturated to
 * 0..255) and stores the four rows at (%0), (%0, %1), (%0, %1, 2) and
 * (%0, %2), where %0 = pix, %1 = line_size and %2 = 3 * line_size.
 *
 * NOTE(review): the rest of the signature, the local declarations, the
 * initialization of pix and p, and the last operand of the first statement
 * are not visible in this view.
 */
231 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
237 /* read the pixels */
/* rows 0-3: load 32 coefficients */
242 "movq (%3), %%mm0 \n\t"
243 "movq 8(%3), %%mm1 \n\t"
244 "movq 16(%3), %%mm2 \n\t"
245 "movq 24(%3), %%mm3 \n\t"
246 "movq 32(%3), %%mm4 \n\t"
247 "movq 40(%3), %%mm5 \n\t"
248 "movq 48(%3), %%mm6 \n\t"
249 "movq 56(%3), %%mm7 \n\t"
/* saturate to unsigned bytes: one register per 8-pixel row */
250 "packuswb %%mm1, %%mm0 \n\t"
251 "packuswb %%mm3, %%mm2 \n\t"
252 "packuswb %%mm5, %%mm4 \n\t"
253 "packuswb %%mm7, %%mm6 \n\t"
/* store the four rows */
254 "movq %%mm0, (%0) \n\t"
255 "movq %%mm2, (%0, %1) \n\t"
256 "movq %%mm4, (%0, %1, 2) \n\t"
257 "movq %%mm6, (%0, %2) \n\t"
258 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
/* advance the destination by the four rows just written */
261 pix += line_size * 4;
264 // if an exact copy of the code above were placed here,
265 // the compiler would generate some very strange code
/* next four rows: same sequence as above */
268 "movq (%3), %%mm0 \n\t"
269 "movq 8(%3), %%mm1 \n\t"
270 "movq 16(%3), %%mm2 \n\t"
271 "movq 24(%3), %%mm3 \n\t"
272 "movq 32(%3), %%mm4 \n\t"
273 "movq 40(%3), %%mm5 \n\t"
274 "movq 48(%3), %%mm6 \n\t"
275 "movq 56(%3), %%mm7 \n\t"
276 "packuswb %%mm1, %%mm0 \n\t"
277 "packuswb %%mm3, %%mm2 \n\t"
278 "packuswb %%mm5, %%mm4 \n\t"
279 "packuswb %%mm7, %%mm6 \n\t"
280 "movq %%mm0, (%0) \n\t"
281 "movq %%mm2, (%0, %1) \n\t"
282 "movq %%mm4, (%0, %1, 2) \n\t"
283 "movq %%mm6, (%0, %2) \n\t"
284 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/*
 * Asm fragment that converts four rows of eight 16-bit coefficients, starting
 * at byte offset `off` from (%2), into four rows of eight pixels.
 * packsswb saturates each row to signed bytes, then paddb adds mm0 to every
 * byte (the visible caller loads mm0 with ff_pb_80, which maps the signed
 * range onto 0..255). Rows are stored at (%0), (%0, %3), (%0, %3, 2) and
 * (%0, %1); operand numbers refer to the enclosing asm statement, where the
 * visible caller passes %3 = line stride and computes %1 = 3 * stride.
 */
288 #define put_signed_pixels_clamped_mmx_half(off) \
289 "movq "#off"(%2), %%mm1 \n\t" \
290 "movq 16 + "#off"(%2), %%mm2 \n\t" \
291 "movq 32 + "#off"(%2), %%mm3 \n\t" \
292 "movq 48 + "#off"(%2), %%mm4 \n\t" \
293 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
294 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
295 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
296 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
297 "paddb %%mm0, %%mm1 \n\t" \
298 "paddb %%mm0, %%mm2 \n\t" \
299 "paddb %%mm0, %%mm3 \n\t" \
300 "paddb %%mm0, %%mm4 \n\t" \
301 "movq %%mm1, (%0) \n\t" \
302 "movq %%mm2, (%0, %3) \n\t" \
303 "movq %%mm3, (%0, %3, 2) \n\t" \
304 "movq %%mm4, (%0, %1) \n\t"
/*
 * Write an 8x8 block of signed 16-bit coefficients to pixels: each value is
 * saturated to a signed byte and biased by 0x80 (see
 * put_signed_pixels_clamped_mmx_half). The block is processed as two halves
 * of four rows: coefficient byte offsets 0 and 64.
 * Operands: %0 = pixels (advanced inside the asm), %1 = line_skip3 (scratch,
 * set to 3 * stride), %2 = block, %3 = line_skip.
 * NOTE(review): the rest of the signature, the declaration of line_skip3 and
 * the clobber list are not visible in this view.
 */
306 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
309 x86_reg line_skip = line_size;
/* mm0 = 0x80 in every byte: the bias added after signed saturation */
313 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
/* %1 = 3 * stride, used to address the fourth row of each half */
314 "lea (%3, %3, 2), %1 \n\t"
315 put_signed_pixels_clamped_mmx_half(0)
/* advance the destination by four rows */
316 "lea (%0, %3, 4), %0 \n\t"
317 put_signed_pixels_clamped_mmx_half(64)
318 : "+&r"(pixels), "=&r"(line_skip3)
319 : "r"(block), "r"(line_skip)
/*
 * Add 16-bit coefficients to existing 8-bit pixels with saturation, two
 * 8-pixel rows per asm statement: the pixels at *pix and *(pix + line_size)
 * are widened to words, the coefficients read from (%2) are added with signed
 * saturation (paddsw), and the sums are packed back to bytes with unsigned
 * saturation (packuswb) and stored in place.
 * NOTE(review): mm7 is used as the zero register for the byte-to-word unpack;
 * its initialization, the surrounding loop, the %2 operand (presumably the
 * coefficient pointer) and the rest of the signature are not visible in this
 * view.
 */
323 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
330 /* read the pixels */
/* 16 coefficients: two rows of eight words */
337 "movq (%2), %%mm0 \n\t"
338 "movq 8(%2), %%mm1 \n\t"
339 "movq 16(%2), %%mm2 \n\t"
340 "movq 24(%2), %%mm3 \n\t"
/* the two pixel rows */
341 "movq %0, %%mm4 \n\t"
342 "movq %1, %%mm6 \n\t"
/* first row: widen bytes to words and add */
343 "movq %%mm4, %%mm5 \n\t"
344 "punpcklbw %%mm7, %%mm4 \n\t"
345 "punpckhbw %%mm7, %%mm5 \n\t"
346 "paddsw %%mm4, %%mm0 \n\t"
347 "paddsw %%mm5, %%mm1 \n\t"
/* second row: widen bytes to words and add */
348 "movq %%mm6, %%mm5 \n\t"
349 "punpcklbw %%mm7, %%mm6 \n\t"
350 "punpckhbw %%mm7, %%mm5 \n\t"
351 "paddsw %%mm6, %%mm2 \n\t"
352 "paddsw %%mm5, %%mm3 \n\t"
/* saturate back to unsigned bytes and store in place */
353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2 \n\t"
355 "movq %%mm0, %0 \n\t"
356 "movq %%mm2, %1 \n\t"
357 : "+m"(*pix), "+m"(*(pix + line_size))
/* advance by the two rows just processed */
360 pix += line_size * 2;
/*
 * Copy 8-byte-wide rows from pixels to block; source and destination use the
 * same stride (line_size). The visible instruction sequence copies four rows:
 * two rows, advance both pointers by 2 * line_size, two more rows, advance
 * again.
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a holds 2 * line_size.
 * NOTE(review): the loop label, the update of h, the branch and the clobber
 * list are not visible in this view.
 */
365 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
366 int line_size, int h)
/* REG_a = 2 * line_size */
369 "lea (%3, %3), %%"REG_a" \n\t"
/* rows 0 and 1 */
372 "movq (%1 ), %%mm0 \n\t"
373 "movq (%1, %3), %%mm1 \n\t"
374 "movq %%mm0, (%2) \n\t"
375 "movq %%mm1, (%2, %3) \n\t"
376 "add %%"REG_a", %1 \n\t"
377 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
378 "movq (%1 ), %%mm0 \n\t"
379 "movq (%1, %3), %%mm1 \n\t"
380 "movq %%mm0, (%2) \n\t"
381 "movq %%mm1, (%2, %3) \n\t"
382 "add %%"REG_a", %1 \n\t"
383 "add %%"REG_a", %2 \n\t"
386 : "+g"(h), "+r"(pixels), "+r"(block)
387 : "r"((x86_reg)line_size)
/*
 * Copy 16-byte-wide rows from pixels to block (two quadwords per row);
 * source and destination use the same stride (line_size). The visible
 * instruction sequence copies four rows in two groups of two, advancing both
 * pointers by 2 * line_size after each group.
 * Operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size;
 * REG_a holds 2 * line_size.
 * NOTE(review): the loop label, the update of h, the branch and the clobber
 * list are not visible in this view.
 */
392 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
393 int line_size, int h)
/* REG_a = 2 * line_size */
396 "lea (%3, %3), %%"REG_a" \n\t"
/* rows 0 and 1 */
399 "movq (%1 ), %%mm0 \n\t"
400 "movq 8(%1 ), %%mm4 \n\t"
401 "movq (%1, %3), %%mm1 \n\t"
402 "movq 8(%1, %3), %%mm5 \n\t"
403 "movq %%mm0, (%2) \n\t"
404 "movq %%mm4, 8(%2) \n\t"
405 "movq %%mm1, (%2, %3) \n\t"
406 "movq %%mm5, 8(%2, %3) \n\t"
407 "add %%"REG_a", %1 \n\t"
408 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
409 "movq (%1 ), %%mm0 \n\t"
410 "movq 8(%1 ), %%mm4 \n\t"
411 "movq (%1, %3), %%mm1 \n\t"
412 "movq 8(%1, %3), %%mm5 \n\t"
413 "movq %%mm0, (%2) \n\t"
414 "movq %%mm4, 8(%2) \n\t"
415 "movq %%mm1, (%2, %3) \n\t"
416 "movq %%mm5, 8(%2, %3) \n\t"
417 "add %%"REG_a", %1 \n\t"
418 "add %%"REG_a", %2 \n\t"
421 : "+g"(h), "+r"(pixels), "+r"(block)
422 : "r"((x86_reg)line_size)
/*
 * CLEAR_BLOCKS(name, n) defines `static void name(DCTELEM *blocks)`, which
 * stores zeros with MMX quadword writes, 32 bytes per step. The base operand
 * %0 is the address 128 * n bytes past `blocks`, and REG_a is an index that
 * is loaded from %1 and advanced by 32 per step.
 * NOTE(review): operand %1 (presumably the negative byte count, so the index
 * runs up to zero), the loop label and the branch are not visible in this
 * view.
 * Instantiated below for 6 blocks (clear_blocks_mmx) and 1 block
 * (clear_block_mmx).
 */
427 #define CLEAR_BLOCKS(name, n) \
428 static void name(DCTELEM *blocks) \
431 "pxor %%mm7, %%mm7 \n\t" \
432 "mov %1, %%"REG_a" \n\t" \
434 "movq %%mm7, (%0, %%"REG_a") \n\t" \
435 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
436 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
437 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
438 "add $32, %%"REG_a" \n\t" \
440 :: "r"(((uint8_t *)blocks) + 128 * n), \
445 CLEAR_BLOCKS(clear_blocks_mmx, 6)
446 CLEAR_BLOCKS(clear_block_mmx, 1)
/*
 * Zero 128 bytes starting at (%0) with eight 16-byte movaps stores from a
 * cleared xmm0. movaps requires its memory operand to be 16-byte aligned.
 * NOTE(review): the operand list is not visible in this view; %0 is
 * presumably `block`.
 */
448 static void clear_block_sse(DCTELEM *block)
/* xmm0 = 0 */
451 "xorps %%xmm0, %%xmm0 \n"
452 "movaps %%xmm0, (%0) \n"
453 "movaps %%xmm0, 16(%0) \n"
454 "movaps %%xmm0, 32(%0) \n"
455 "movaps %%xmm0, 48(%0) \n"
456 "movaps %%xmm0, 64(%0) \n"
457 "movaps %%xmm0, 80(%0) \n"
458 "movaps %%xmm0, 96(%0) \n"
459 "movaps %%xmm0, 112(%0) \n"
/*
 * Zero memory with 16-byte movaps stores, 128 bytes per step. The base
 * operand %0 is the address 128 * 6 bytes past `blocks`; REG_a is an index
 * loaded from %1 and advanced by 128 per step.
 * NOTE(review): operand %1 (presumably the negative byte count, so the index
 * runs up to zero), the loop label and the branch are not visible in this
 * view.
 */
465 static void clear_blocks_sse(DCTELEM *blocks)
/* xmm0 = 0 */
468 "xorps %%xmm0, %%xmm0 \n"
469 "mov %1, %%"REG_a" \n"
471 "movaps %%xmm0, (%0, %%"REG_a") \n"
472 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
473 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
474 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
475 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
476 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
477 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
478 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
479 "add $128, %%"REG_a" \n"
481 :: "r"(((uint8_t *)blocks) + 128 * 6),
/*
 * Byte-wise dst[i] += src[i] with wrap-around (paddb). The visible asm adds
 * 16 bytes per step: two quadwords at index %0 and %0 + 8. The visible scalar
 * statement performs the same addition for a single index i.
 * Operands: %1 = src, %2 = dst, %3 = w - 15 (the bound for the 16-byte steps).
 * NOTE(review): operand %0 (presumably the running index i), the loop control
 * inside the asm and the C loop around the scalar statement are not visible
 * in this view.
 */
487 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
/* bytes i .. i+7 */
493 "movq (%1, %0), %%mm0 \n\t"
494 "movq (%2, %0), %%mm1 \n\t"
495 "paddb %%mm0, %%mm1 \n\t"
496 "movq %%mm1, (%2, %0) \n\t"
/* bytes i+8 .. i+15 */
497 "movq 8(%1, %0), %%mm0 \n\t"
498 "movq 8(%2, %0), %%mm1 \n\t"
499 "paddb %%mm0, %%mm1 \n\t"
500 "movq %%mm1, 8(%2, %0) \n\t"
506 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
/* scalar form of the same addition */
509 dst[i + 0] += src[i + 0];
/*
 * NOTE(review): most of this function's body is not visible in this view, so
 * only the visible lines are described.
 * l and tl start as the low 8 bits of *left and *left_top.
 * Asm operands: %0 = l, %1 = tl, %2 = t, %3 = x, %4 = w2,
 *               %5 = dst + w, %6 = diff + w, %7 = top + w.
 * The three arrays are addressed from their ends with %4 as the index
 * (presumably a negative count running up to zero - confirm against the full
 * source).
 */
513 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
514 const uint8_t *diff, int w,
515 int *left, int *left_top)
519 int l = *left & 0xff;
520 int tl = *left_top & 0xff;
/* t = byte at (%3 + %4), zero-extended */
525 "movzbl (%3, %4), %2 \n"
/* add the diff byte to the low byte of l and store it to dst */
538 "add (%6, %4), %b0 \n"
539 "mov %b0, (%5, %4) \n"
542 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
543 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
/*
 * Transpose a 4x4 block of bytes: four 4-byte rows are loaded with movd,
 * interleaved first byte-wise and then word-wise so that each resulting dword
 * holds one source column, and written out as four 4-byte rows.
 * NOTE(review): the operand list and the pointer-advance instructions between
 * the first and second load/store are not visible in this view; from the
 * addressing used, %1 is presumably src with %3 = src_stride and %0 is
 * presumably dst with %2 = dst_stride.
 */
550 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
551 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
/* load the four source rows */
552 "movd (%1), %%mm0 \n\t"
554 "movd (%1), %%mm1 \n\t"
555 "movd (%1,%3,1), %%mm2 \n\t"
556 "movd (%1,%3,2), %%mm3 \n\t"
/* interleave bytes, then words */
557 "punpcklbw %%mm1, %%mm0 \n\t"
558 "punpcklbw %%mm3, %%mm2 \n\t"
559 "movq %%mm0, %%mm1 \n\t"
560 "punpcklwd %%mm2, %%mm0 \n\t"
561 "punpckhwd %%mm2, %%mm1 \n\t"
/* store: low dword, then high dword (moved down with punpckhdq) */
562 "movd %%mm0, (%0) \n\t"
564 "punpckhdq %%mm0, %%mm0 \n\t"
565 "movd %%mm0, (%0) \n\t"
566 "movd %%mm1, (%0,%2,1) \n\t"
567 "punpckhdq %%mm1, %%mm1 \n\t"
568 "movd %%mm1, (%0,%2,2) \n\t"
/*
 * Asm fragment shared by the H.263 loop filters. Operands of the enclosing
 * asm statement:
 *   %0 .. %3  four consecutive 8-byte lines across the edge (p0, p1, p2, p3)
 *   %4        strength value; only its low byte is used, broadcast to all
 *             bytes (the visible callers pass 2 * strength)
 *   %5        byte mask applied before the final shift by 2 (the visible
 *             callers pass ff_pb_FC)
 * Outline of the visible instruction sequence:
 *   1. d = ((p0 - p3) + 4 * (p2 - p1)), computed in 16-bit words; its
 *      magnitude >> 3 is packed to bytes in mm4, its sign mask in mm6.
 *   2. A correction is derived from that magnitude and the strength with
 *      saturating subtractions (mm2), then applied with the sign of d to the
 *      inner lines: added to p1 and subtracted from p2, using saturating byte
 *      arithmetic.
 *   3. A second correction, the magnitude of (p0 - p3) limited to twice the
 *      first correction and divided by 4, is subtracted from p0 and added to
 *      p3 with the sign of (p0 - p3).
 * Results are left in registers: mm5 = new p0, mm3 = new p1, mm4 = new p2,
 * mm6 = new p3. All of mm0-mm7 are overwritten.
 */
578 #define H263_LOOP_FILTER \
579 "pxor %%mm7, %%mm7 \n\t" \
580 "movq %0, %%mm0 \n\t" \
581 "movq %0, %%mm1 \n\t" \
582 "movq %3, %%mm2 \n\t" \
583 "movq %3, %%mm3 \n\t" \
584 "punpcklbw %%mm7, %%mm0 \n\t" \
585 "punpckhbw %%mm7, %%mm1 \n\t" \
586 "punpcklbw %%mm7, %%mm2 \n\t" \
587 "punpckhbw %%mm7, %%mm3 \n\t" \
588 "psubw %%mm2, %%mm0 \n\t" \
589 "psubw %%mm3, %%mm1 \n\t" \
590 "movq %1, %%mm2 \n\t" \
591 "movq %1, %%mm3 \n\t" \
592 "movq %2, %%mm4 \n\t" \
593 "movq %2, %%mm5 \n\t" \
594 "punpcklbw %%mm7, %%mm2 \n\t" \
595 "punpckhbw %%mm7, %%mm3 \n\t" \
596 "punpcklbw %%mm7, %%mm4 \n\t" \
597 "punpckhbw %%mm7, %%mm5 \n\t" \
598 "psubw %%mm2, %%mm4 \n\t" \
599 "psubw %%mm3, %%mm5 \n\t" \
600 "psllw $2, %%mm4 \n\t" \
601 "psllw $2, %%mm5 \n\t" \
602 "paddw %%mm0, %%mm4 \n\t" \
603 "paddw %%mm1, %%mm5 \n\t" \
604 "pxor %%mm6, %%mm6 \n\t" \
605 "pcmpgtw %%mm4, %%mm6 \n\t" \
606 "pcmpgtw %%mm5, %%mm7 \n\t" \
607 "pxor %%mm6, %%mm4 \n\t" \
608 "pxor %%mm7, %%mm5 \n\t" \
609 "psubw %%mm6, %%mm4 \n\t" \
610 "psubw %%mm7, %%mm5 \n\t" \
611 "psrlw $3, %%mm4 \n\t" \
612 "psrlw $3, %%mm5 \n\t" \
613 "packuswb %%mm5, %%mm4 \n\t" \
614 "packsswb %%mm7, %%mm6 \n\t" \
615 "pxor %%mm7, %%mm7 \n\t" \
616 "movd %4, %%mm2 \n\t" \
617 "punpcklbw %%mm2, %%mm2 \n\t" \
618 "punpcklbw %%mm2, %%mm2 \n\t" \
619 "punpcklbw %%mm2, %%mm2 \n\t" \
620 "psubusb %%mm4, %%mm2 \n\t" \
621 "movq %%mm2, %%mm3 \n\t" \
622 "psubusb %%mm4, %%mm3 \n\t" \
623 "psubb %%mm3, %%mm2 \n\t" \
624 "movq %1, %%mm3 \n\t" \
625 "movq %2, %%mm4 \n\t" \
626 "pxor %%mm6, %%mm3 \n\t" \
627 "pxor %%mm6, %%mm4 \n\t" \
628 "paddusb %%mm2, %%mm3 \n\t" \
629 "psubusb %%mm2, %%mm4 \n\t" \
630 "pxor %%mm6, %%mm3 \n\t" \
631 "pxor %%mm6, %%mm4 \n\t" \
632 "paddusb %%mm2, %%mm2 \n\t" \
633 "packsswb %%mm1, %%mm0 \n\t" \
634 "pcmpgtb %%mm0, %%mm7 \n\t" \
635 "pxor %%mm7, %%mm0 \n\t" \
636 "psubb %%mm7, %%mm0 \n\t" \
637 "movq %%mm0, %%mm1 \n\t" \
638 "psubusb %%mm2, %%mm0 \n\t" \
639 "psubb %%mm0, %%mm1 \n\t" \
640 "pand %5, %%mm1 \n\t" \
641 "psrlw $2, %%mm1 \n\t" \
642 "pxor %%mm7, %%mm1 \n\t" \
643 "psubb %%mm7, %%mm1 \n\t" \
644 "movq %0, %%mm5 \n\t" \
645 "movq %3, %%mm6 \n\t" \
646 "psubb %%mm1, %%mm5 \n\t" \
647 "paddb %%mm1, %%mm6 \n\t"
/*
 * H.263 loop filter over the four lines src - 2 * stride .. src + 1 * stride,
 * eight bytes wide. The lines are passed as read-write memory operands
 * %0..%3; after the filter fragment has run, its results are written back
 * from mm5, mm3, mm4 and mm6.
 * The filter strength is looked up in ff_h263_loop_filter_strength[qscale]
 * and passed as 2 * strength; ff_pb_FC is the mask operand %5.
 * The body is compiled only when the H.263 decoder or encoder is enabled.
 * NOTE(review): the line invoking the filter fragment inside the asm
 * statement is not visible in this view.
 */
649 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
651 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
652 const int strength = ff_h263_loop_filter_strength[qscale];
/* write the filtered lines back */
657 "movq %%mm3, %1 \n\t"
658 "movq %%mm4, %2 \n\t"
659 "movq %%mm5, %0 \n\t"
660 "movq %%mm6, %3 \n\t"
661 : "+m"(*(uint64_t*)(src - 2 * stride)),
662 "+m"(*(uint64_t*)(src - 1 * stride)),
663 "+m"(*(uint64_t*)(src + 0 * stride)),
664 "+m"(*(uint64_t*)(src + 1 * stride))
665 : "g"(2 * strength), "m"(ff_pb_FC)
/*
 * H.263 loop filter in the other direction: two 4x4 byte blocks (rows 0-3 and
 * rows 4-7 of src) are transposed into the aligned 32-byte scratch buffer
 * `temp` (row pitch 8), H263_LOOP_FILTER runs on that buffer, and the four
 * result registers (mm5, mm3, mm4, mm6) are transposed back with punpck
 * instructions and stored as 4-byte pieces into eight rows: four rows
 * addressed from %0 and four from %1 = src + 4 * stride, at offsets
 * 0, stride, 2 * stride and 3 * stride (%2 = stride, %3 = 3 * stride).
 * The body is compiled only when the H.263 decoder or encoder is enabled.
 * NOTE(review): any adjustment of src before the transposes, the operands of
 * the first asm statement and operand %0 of the second (presumably src) are
 * not visible in this view.
 */
670 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
672 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
673 const int strength = ff_h263_loop_filter_strength[qscale];
674 DECLARE_ALIGNED(8, uint64_t, temp)[4];
675 uint8_t *btemp = (uint8_t*)temp;
/* gather 8 rows x 4 bytes into temp as 4 rows x 8 bytes */
679 transpose4x4(btemp, src, 8, stride);
680 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
682 H263_LOOP_FILTER // 5 3 4 6
688 : "g"(2 * strength), "m"(ff_pb_FC)
/* transpose the four result registers back: bytes, then words */
692 "movq %%mm5, %%mm1 \n\t"
693 "movq %%mm4, %%mm0 \n\t"
694 "punpcklbw %%mm3, %%mm5 \n\t"
695 "punpcklbw %%mm6, %%mm4 \n\t"
696 "punpckhbw %%mm3, %%mm1 \n\t"
697 "punpckhbw %%mm6, %%mm0 \n\t"
698 "movq %%mm5, %%mm3 \n\t"
699 "movq %%mm1, %%mm6 \n\t"
700 "punpcklwd %%mm4, %%mm5 \n\t"
701 "punpcklwd %%mm0, %%mm1 \n\t"
702 "punpckhwd %%mm4, %%mm3 \n\t"
703 "punpckhwd %%mm0, %%mm6 \n\t"
/* four rows addressed from %0 */
704 "movd %%mm5, (%0) \n\t"
705 "punpckhdq %%mm5, %%mm5 \n\t"
706 "movd %%mm5, (%0, %2) \n\t"
707 "movd %%mm3, (%0, %2, 2) \n\t"
708 "punpckhdq %%mm3, %%mm3 \n\t"
709 "movd %%mm3, (%0, %3) \n\t"
/* four rows addressed from %1 */
710 "movd %%mm1, (%1) \n\t"
711 "punpckhdq %%mm1, %%mm1 \n\t"
712 "movd %%mm1, (%1, %2) \n\t"
713 "movd %%mm6, (%1, %2, 2) \n\t"
714 "punpckhdq %%mm6, %%mm6 \n\t"
715 "movd %%mm6, (%1, %3) \n\t"
717 "r"(src + 4 * stride),
718 "r"((x86_reg)stride),
719 "r"((x86_reg)(3 * stride))
724 /* Draw the edges of width 'w' of an image of size width, height
725 * this MMX version can only handle w == 8 || w == 16. */
/*
 * buf points at the top-left pixel of the image and wrap is the line pitch.
 *
 * Left/right: for each line, the first pixel is replicated into the bytes
 * just before the line and the last pixel into the bytes just after it.
 * Two asm variants are visible, one writing 8 bytes per side and one writing
 * 16; in both, %0 = ptr (advanced per line), %2 = width and the end bound is
 * ptr + wrap * height.
 * NOTE(review): the condition selecting between the two variants (presumably
 * w == 8 versus w == 16), the initialization of ptr and the per-line
 * add/compare/branch are not visible in this view.
 *
 * Top/bottom: when the matching bit is set in `sides`, the first (top) or
 * last (bottom) image line - including the left/right borders, since copying
 * starts w bytes before the line - is copied 8 bytes at a time into four
 * border lines per outer iteration, until ptr reaches ptr + width + 2 * w.
 */
726 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
727 int w, int h, int sides)
729 uint8_t *ptr, *last_line;
732 last_line = buf + (height - 1) * wrap;
/* 8-byte variant: replicate the first pixel to the left border ... */
738 "movd (%0), %%mm0 \n\t"
739 "punpcklbw %%mm0, %%mm0 \n\t"
740 "punpcklwd %%mm0, %%mm0 \n\t"
741 "punpckldq %%mm0, %%mm0 \n\t"
742 "movq %%mm0, -8(%0) \n\t"
/* ... and the last pixel to the right border */
743 "movq -8(%0, %2), %%mm1 \n\t"
744 "punpckhbw %%mm1, %%mm1 \n\t"
745 "punpckhwd %%mm1, %%mm1 \n\t"
746 "punpckhdq %%mm1, %%mm1 \n\t"
747 "movq %%mm1, (%0, %2) \n\t"
752 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* 16-byte variant: same replication, two quadword stores per side */
757 "movd (%0), %%mm0 \n\t"
758 "punpcklbw %%mm0, %%mm0 \n\t"
759 "punpcklwd %%mm0, %%mm0 \n\t"
760 "punpckldq %%mm0, %%mm0 \n\t"
761 "movq %%mm0, -8(%0) \n\t"
762 "movq %%mm0, -16(%0) \n\t"
763 "movq -8(%0, %2), %%mm1 \n\t"
764 "punpckhbw %%mm1, %%mm1 \n\t"
765 "punpckhwd %%mm1, %%mm1 \n\t"
766 "punpckhdq %%mm1, %%mm1 \n\t"
767 "movq %%mm1, (%0, %2) \n\t"
768 "movq %%mm1, 8(%0, %2) \n\t"
773 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
777 /* top and bottom (and hopefully also the corners) */
778 if (sides & EDGE_TOP) {
779 for (i = 0; i < h; i += 4) {
/* ptr = start (including left border) of border line -(i + 1) */
780 ptr = buf - (i + 1) * wrap - w;
/* %1 + %0 addresses the same column in the first image line; the four
 * stores go to ptr, ptr - wrap, ptr - 2 * wrap and ptr - 3 * wrap */
783 "movq (%1, %0), %%mm0 \n\t"
784 "movq %%mm0, (%0) \n\t"
785 "movq %%mm0, (%0, %2) \n\t"
786 "movq %%mm0, (%0, %2, 2) \n\t"
787 "movq %%mm0, (%0, %3) \n\t"
792 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
793 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
798 if (sides & EDGE_BOTTOM) {
799 for (i = 0; i < h; i += 4) {
/* ptr = start (including left border) of the (i + 1)-th line below the image */
800 ptr = last_line + (i + 1) * wrap - w;
/* %1 + %0 addresses the same column in the last image line; the four
 * stores go to ptr, ptr + wrap, ptr + 2 * wrap and ptr + 3 * wrap */
803 "movq (%1, %0), %%mm0 \n\t"
804 "movq %%mm0, (%0) \n\t"
805 "movq %%mm0, (%0, %2) \n\t"
806 "movq %%mm0, (%0, %2, 2) \n\t"
807 "movq %%mm0, (%0, %3) \n\t"
812 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
813 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
814 "r"(ptr + width + 2 * w)
/*
 * Asm fragment producing one group of four output pixels of the vertical
 * quarter-pel low-pass filter from 16-bit words:
 *   x1 = m3 + m4,  x2 = in2 + m5,  x3 = in1 + m6,  x4 = in0 + in7
 *   result = (20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5
 * The result is packed to bytes with unsigned saturation and handed to
 * OP(%%mm5, out, %%mm7, d).
 * Side effects: m3 is overwritten with the value loaded from in7, and mm4,
 * mm5 and mm6 are used as scratch registers.
 * The pw_20 and pw_3 parameters are not referenced in the body; the constants
 * are reached through MANGLE(ff_pw_20) and MANGLE(ff_pw_3) instead.
 */
820 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
821 in0, in1, in2, in7, out, OP) \
822 "paddw "#m4", "#m3" \n\t" /* x1 */ \
823 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
824 "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
825 "movq "#in7", "#m3" \n\t" /* d */ \
826 "movq "#in0", %%mm5 \n\t" /* D */ \
827 "paddw "#m3", %%mm5 \n\t" /* x4 */ \
828 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
829 "movq "#in1", %%mm5 \n\t" /* C */ \
830 "movq "#in2", %%mm6 \n\t" /* B */ \
831 "paddw "#m6", %%mm5 \n\t" /* x3 */ \
832 "paddw "#m5", %%mm6 \n\t" /* x2 */ \
833 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
834 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
835 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
836 "paddw "#rnd", %%mm4 \n\t" /* 20x1 - x4 + rnd */ \
837 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
838 "psraw $5, %%mm5 \n\t" \
839 "packuswb %%mm5, %%mm5 \n\t" \
840 OP(%%mm5, out, %%mm7, d)
/*
 * QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) defines the horizontal
 * quarter-pel low-pass functions OPNAME##mpeg4_qpel16_h_lowpass_mmxext and
 * OPNAME##mpeg4_qpel8_h_lowpass_mmxext.
 *
 * Every output pixel is (20 * a - 6 * b + 3 * c - d + ROUNDER) >> 5, where
 * a, b, c and d are sums of the two source pixels at increasing distance from
 * the output position; at both ends of the row the pshufw patterns reuse
 * pixels from inside the row (see the letter diagrams in the per-line
 * comments). Results are packed to bytes with unsigned saturation and passed
 * to OP_MMXEXT together with the destination address.
 *
 * Operands of the 16-wide version: %0 = src, %1 = dst, %2 = h,
 * %3 = srcStride, %4 = dstStride, %5 = temp (64-bit scratch that holds the
 * first four results until the next four are ready), %6 = ROUNDER.
 * Operands of the 8-wide version: %0 = src, %1 = dst, %2 = h,
 * %3 = srcStride, %4 = dstStride, %5 = ROUNDER.
 * mm7 is kept at zero for the byte-to-word unpacks.
 *
 * NOTE(review): the remaining signature lines, the loop labels, the
 * per-row pointer updates and branches, and the clobber lists are not
 * visible in this view; neither is any use of the RND parameter.
 */
842 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
843 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
852 "pxor %%mm7, %%mm7 \n\t" \
854 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
855 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
856 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
857 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
858 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
859 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
860 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
861 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
862 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
863 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
864 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
865 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
866 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
867 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
868 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
869 "paddw %%mm3, %%mm5 \n\t" /* b */ \
870 "paddw %%mm2, %%mm6 \n\t" /* c */ \
871 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
872 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
873 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
874 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
875 "paddw %%mm4, %%mm0 \n\t" /* a */ \
876 "paddw %%mm1, %%mm5 \n\t" /* d */ \
877 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
878 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
879 "paddw %6, %%mm6 \n\t" \
880 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
881 "psraw $5, %%mm0 \n\t" \
882 "movq %%mm0, %5 \n\t" \
883 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
885 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
886 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
887 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
888 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
889 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
890 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
891 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
892 "paddw %%mm0, %%mm2 \n\t" /* b */ \
893 "paddw %%mm5, %%mm3 \n\t" /* c */ \
894 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
895 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
896 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
897 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
898 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
899 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
900 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
901 "paddw %%mm2, %%mm1 \n\t" /* a */ \
902 "paddw %%mm6, %%mm4 \n\t" /* d */ \
903 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
904 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
905 "paddw %6, %%mm1 \n\t" \
906 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
907 "psraw $5, %%mm3 \n\t" \
908 "movq %5, %%mm1 \n\t" \
909 "packuswb %%mm3, %%mm1 \n\t" \
910 OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
911 /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
913 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
914 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
915 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
916 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
917 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
918 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
919 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
920 "paddw %%mm1, %%mm5 \n\t" /* b */ \
921 "paddw %%mm4, %%mm0 \n\t" /* c */ \
922 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
923 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
924 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
925 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
926 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
927 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
928 "paddw %%mm3, %%mm2 \n\t" /* d */ \
929 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
930 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
931 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
932 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
933 "paddw %%mm2, %%mm6 \n\t" /* a */ \
934 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
935 "paddw %6, %%mm0 \n\t" \
936 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
937 "psraw $5, %%mm0 \n\t" \
938 /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
939 /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
941 "paddw %%mm5, %%mm3 \n\t" /* a */ \
942 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
943 "paddw %%mm4, %%mm6 \n\t" /* b */ \
944 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
945 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
946 "paddw %%mm1, %%mm4 \n\t" /* c */ \
947 "paddw %%mm2, %%mm5 \n\t" /* d */ \
948 "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
949 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
950 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
951 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
952 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
953 "paddw %6, %%mm4 \n\t" \
954 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
955 "psraw $5, %%mm4 \n\t" \
956 "packuswb %%mm4, %%mm0 \n\t" \
957 OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
963 : "+a"(src), "+c"(dst), "+D"(h) \
964 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
965 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
/* 8-pixel-wide variant: one 8-byte load plus a 4-byte load at offset 5 */ \
/* supply all inputs; the eight results go out in a single OP_MMXEXT.   */ \
970 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
977 "pxor %%mm7, %%mm7 \n\t" \
979 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
980 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
981 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
982 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
983 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
984 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
985 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
986 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
987 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
988 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
989 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
990 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
991 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
992 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
993 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
994 "paddw %%mm3, %%mm5 \n\t" /* b */ \
995 "paddw %%mm2, %%mm6 \n\t" /* c */ \
996 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
997 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
998 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
999 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
1000 "paddw %%mm4, %%mm0 \n\t" /* a */ \
1001 "paddw %%mm1, %%mm5 \n\t" /* d */ \
1002 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
1003 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
1004 "paddw %5, %%mm6 \n\t" \
1005 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
1006 "psraw $5, %%mm0 \n\t" \
1007 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
1009 "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
1010 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
1011 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
1012 "paddw %%mm5, %%mm1 \n\t" /* a */ \
1013 "paddw %%mm6, %%mm2 \n\t" /* b */ \
1014 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
1015 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
1016 "paddw %%mm6, %%mm3 \n\t" /* c */ \
1017 "paddw %%mm5, %%mm4 \n\t" /* d */ \
1018 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
1019 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
1020 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
1021 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
1022 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
1023 "paddw %5, %%mm1 \n\t" \
1024 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
1025 "psraw $5, %%mm3 \n\t" \
1026 "packuswb %%mm3, %%mm0 \n\t" \
1027 OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
1033 : "+a"(src), "+c"(dst), "+d"(h) \
1034 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
1035 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
/*
 * QPEL_OP expands to a family of static functions whose names are built from
 * OPNAME (prefix) and MMX (suffix) around "mpeg4_qpel..." / "qpel..._mcXY".
 *  - ROUNDER is handed to the inline asm as an "m" operand,
 *  - RND is pasted into the names of the put-style helpers that get called,
 *  - OP is forwarded to QPEL_V_LOW as its last argument.
 * NOTE(review): QPEL_V_LOW is defined outside this section; presumably OP is
 * the store/average step applied to each filtered row -- confirm there.
 */
1040 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1041 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, /* 16-wide variant: an unpack pass followed by a filter pass */ \
1046 uint64_t temp[17 * 4]; /* scratch: 4 regions of 17 quadwords each (see the 17 * 8 byte offsets below) */ \
1047 uint64_t *temp_ptr = temp; \
1050 /* FIXME unroll */ \
1051 __asm__ volatile ( /* pass 1: widen 16 source bytes to 16-bit words and store them into temp */ \
1052 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */ \
1054 "movq (%0), %%mm0 \n\t" /* bytes 0..7, loaded twice */ \
1055 "movq (%0), %%mm1 \n\t" \
1056 "movq 8(%0), %%mm2 \n\t" /* bytes 8..15, loaded twice */ \
1057 "movq 8(%0), %%mm3 \n\t" \
1058 "punpcklbw %%mm7, %%mm0 \n\t" /* zero-extend bytes 0..3 */ \
1059 "punpckhbw %%mm7, %%mm1 \n\t" /* zero-extend bytes 4..7 */ \
1060 "punpcklbw %%mm7, %%mm2 \n\t" /* zero-extend bytes 8..11 */ \
1061 "punpckhbw %%mm7, %%mm3 \n\t" /* zero-extend bytes 12..15 */ \
1062 "movq %%mm0, (%1) \n\t" /* each group of 4 words goes to its own 17-quadword region */ \
1063 "movq %%mm1, 17 * 8(%1) \n\t" \
1064 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1065 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1070 : "+r"(src), "+r"(temp_ptr), "+r"(count) /* %0 = src, %1 = temp_ptr, %2 = count */ \
1071 : "r"((x86_reg)srcStride) /* %3 = srcStride */ \
1078 /* FIXME reorder for speed */ \
1079 __asm__ volatile ( /* pass 2: QPEL_V_LOW over the words in temp; (%1) and (%1, %3) are dst and dst + dstStride */ \
1080 /* "pxor %%mm7, %%mm7 \n\t" */ /* NOTE(review): disabled, so mm7 is presumably still zero from pass 1 -- confirm */ \
1082 "movq (%0), %%mm0 \n\t" /* preload the first four quadwords of the current temp region */ \
1083 "movq 8(%0), %%mm1 \n\t" \
1084 "movq 16(%0), %%mm2 \n\t" \
1085 "movq 24(%0), %%mm3 \n\t" \
1086 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* 16 invocations follow; the four register arguments rotate each time */ \
1087 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1089 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1091 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1093 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1094 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1096 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1097 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1099 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1100 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1102 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1103 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1105 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
1107 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) /* from here the last temp offset steps back (128, 120, 112) -- presumably edge mirroring */ \
1109 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1110 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1112 "add $136, %0 \n\t" /* 136 = 17 * 8 bytes: advance temp_ptr to the next 17-quadword region */ \
1117 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0 = temp_ptr, %1 = dst, %2 = count */ \
1118 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), /* %3 = dstStride, %4 = 2 * dstStride */ \
1119 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), /* %5 = ROUNDER; the two constant operands are disabled */ \
1120 "g"(4 - 14 * (x86_reg)dstStride) /* %6: +4 bytes, -14 rows as one pointer delta; presumably added to dst per region (not visible here) */ \
1125 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, /* 8-wide variant: an unpack pass followed by a filter pass */ \
1130 uint64_t temp[9 * 2]; /* scratch: 2 regions of 9 quadwords each (see the 9*8 byte offset below) */ \
1131 uint64_t *temp_ptr = temp; \
1134 /* FIXME unroll */ \
1135 __asm__ volatile ( /* pass 1: widen 8 source bytes to 16-bit words and store them into temp */ \
1136 "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0 */ \
1138 "movq (%0), %%mm0 \n\t" /* bytes 0..7, loaded twice */ \
1139 "movq (%0), %%mm1 \n\t" \
1140 "punpcklbw %%mm7, %%mm0 \n\t" /* zero-extend bytes 0..3 */ \
1141 "punpckhbw %%mm7, %%mm1 \n\t" /* zero-extend bytes 4..7 */ \
1142 "movq %%mm0, (%1) \n\t" /* low half goes to the first region ... */ \
1143 "movq %%mm1, 9*8(%1) \n\t" /* ... high half to the second, 9 quadwords further */ \
1148 : "+r"(src), "+r"(temp_ptr), "+r"(count) /* %0 = src, %1 = temp_ptr, %2 = count */ \
1149 : "r"((x86_reg)srcStride) /* %3 = srcStride */ \
1156 /* FIXME reorder for speed */ \
1157 __asm__ volatile ( /* pass 2: QPEL_V_LOW over the words in temp; (%1) and (%1, %3) are dst and dst + dstStride */ \
1158 /* "pxor %%mm7, %%mm7 \n\t" */ /* NOTE(review): disabled, so mm7 is presumably still zero from pass 1 -- confirm */ \
1160 "movq (%0), %%mm0 \n\t" /* preload the first four quadwords of the current temp region */ \
1161 "movq 8(%0), %%mm1 \n\t" \
1162 "movq 16(%0), %%mm2 \n\t" \
1163 "movq 24(%0), %%mm3 \n\t" \
1164 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) /* 8 invocations follow; the four register arguments rotate each time */ \
1165 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1167 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1169 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1171 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1173 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) /* from here the last temp offset steps back (64, 56, 48) -- presumably edge mirroring */ \
1175 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1176 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1178 "add $72, %0 \n\t" /* 72 = 9 * 8 bytes: advance temp_ptr to the next 9-quadword region */ \
1183 : "+r"(temp_ptr), "+r"(dst), "+g"(count) /* %0 = temp_ptr, %1 = dst, %2 = count */ \
1184 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), /* %3 = dstStride, %4 = 2 * dstStride */ \
1185 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), /* %5 = ROUNDER; the two constant operands are disabled */ \
1186 "g"(4 - 6 * (x86_reg)dstStride) /* %6: +4 bytes, -6 rows as one pointer delta; presumably added to dst per region (not visible here) */ \
1191 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, /* mc00: no lowpass call, only the OPNAME pixels8 routine */ \
1194 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); /* last argument 8, presumably the row count */ \
1197 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, /* mc10: h_lowpass result combined with src */ \
1201 uint8_t * const half = (uint8_t*)temp; /* byte view of temp */ \
1202 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, /* put-style h_lowpass of src, written to half */ \
1204 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); /* dst = OPNAME pixels8_l2 of src and half */ \
1207 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, /* mc20: OPNAME h_lowpass straight from src into dst */ \
1210 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1214 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, /* mc30: like mc10, but combined with src + 1 */ \
1218 uint8_t * const half = (uint8_t*)temp; /* byte view of temp */ \
1219 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, /* put-style h_lowpass of src, written to half */ \
1221 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, /* dst = OPNAME pixels8_l2 of (src + 1) and half */ \
1225 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, /* mc01: v_lowpass result combined with src */ \
1229 uint8_t * const half = (uint8_t*)temp; /* byte view of temp */ \
1230 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* half = put-style v_lowpass of src */ \
1231 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); /* dst = OPNAME pixels8_l2 of src and half */ \
1234 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, /* mc02: OPNAME v_lowpass straight from src into dst */ \
1237 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1240 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, /* mc03: like mc01, but combined with src + stride */ \
1244 uint8_t * const half = (uint8_t*)temp; /* byte view of temp */ \
1245 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); /* half = put-style v_lowpass of src */ \
1246 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, /* dst = OPNAME pixels8_l2 of (src + stride) and half */ \
1250 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, /* mc11: h_lowpass -> l2 with src -> v_lowpass -> l2 into dst */ \
1253 uint64_t half[8 + 9]; /* 8 quadwords for halfHV followed by 9 for halfH */ \
1254 uint8_t * const halfH = ((uint8_t*)half) + 64; /* 64 bytes = past the first 8 quadwords */ \
1255 uint8_t * const halfHV = ((uint8_t*)half); \
1256 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1258 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); /* in place: halfH = l2(src, halfH), last argument 9 */ \
1259 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* halfHV = v_lowpass(halfH) */ \
1260 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); /* dst = OPNAME l2(halfH, halfHV) */ \
1263 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, /* mc31: like mc11, but the first l2 uses src + 1 */ \
1266 uint64_t half[8 + 9]; /* 8 quadwords for halfHV followed by 9 for halfH */ \
1267 uint8_t * const halfH = ((uint8_t*)half) + 64; /* 64 bytes = past the first 8 quadwords */ \
1268 uint8_t * const halfHV = ((uint8_t*)half); \
1269 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1271 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, /* in place: halfH = l2(src + 1, halfH) */ \
1273 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* halfHV = v_lowpass(halfH) */ \
1274 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); /* dst = OPNAME l2(halfH, halfHV) */ \
1277 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, /* mc13: like mc11, but the final l2 reads halfH + 8 */ \
1280 uint64_t half[8 + 9]; /* 8 quadwords for halfHV followed by 9 for halfH */ \
1281 uint8_t * const halfH = ((uint8_t*)half) + 64; /* 64 bytes = past the first 8 quadwords */ \
1282 uint8_t * const halfHV = ((uint8_t*)half); \
1283 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1285 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); /* in place: halfH = l2(src, halfH), last argument 9 */ \
1286 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* halfHV = v_lowpass(halfH) */ \
1287 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); /* halfH + 8: 8 bytes into the halfH buffer */ \
1290 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, /* mc33: first l2 uses src + 1, final l2 reads halfH + 8 */ \
1293 uint64_t half[8 + 9]; /* 8 quadwords for halfHV followed by 9 for halfH */ \
1294 uint8_t * const halfH = ((uint8_t*)half) + 64; /* 64 bytes = past the first 8 quadwords */ \
1295 uint8_t * const halfHV = ((uint8_t*)half); \
1296 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1298 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, /* in place: halfH = l2(src + 1, halfH) */ \
1300 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* halfHV = v_lowpass(halfH) */ \
1301 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); /* halfH + 8: 8 bytes into the halfH buffer */ \
1304 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, /* mc21: h_lowpass -> v_lowpass -> l2 into dst (no l2 with src) */ \
1307 uint64_t half[8 + 9]; /* 8 quadwords for halfHV followed by 9 for halfH */ \
1308 uint8_t * const halfH = ((uint8_t*)half) + 64; /* 64 bytes = past the first 8 quadwords */ \
1309 uint8_t * const halfHV = ((uint8_t*)half); \
1310 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1312 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* halfHV = v_lowpass(halfH) */ \
1313 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); /* dst = OPNAME l2(halfH, halfHV) */ \
1316 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, /* mc23: like mc21, but the final l2 reads halfH + 8 */ \
1319 uint64_t half[8 + 9]; /* 8 quadwords for halfHV followed by 9 for halfH */ \
1320 uint8_t * const halfH = ((uint8_t*)half) + 64; /* 64 bytes = past the first 8 quadwords */ \
1321 uint8_t * const halfHV = ((uint8_t*)half); \
1322 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1324 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); /* halfHV = v_lowpass(halfH) */ \
1325 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); /* halfH + 8: 8 bytes into the halfH buffer */ \
1328 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, /* mc12: h_lowpass -> l2 with src -> OPNAME v_lowpass into dst */ \
1331 uint64_t half[8 + 9]; /* only the halfH view of this buffer is used here */ \
1332 uint8_t * const halfH = ((uint8_t*)half); \
1333 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1335 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); /* in place: halfH = l2(src, halfH), last argument 9 */ \
1336 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); /* dst = OPNAME v_lowpass(halfH) */ \
1339 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, /* mc32: like mc12, but the l2 uses src + 1 */ \
1342 uint64_t half[8 + 9]; /* only the halfH view of this buffer is used here */ \
1343 uint8_t * const halfH = ((uint8_t*)half); \
1344 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1346 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, /* in place: halfH = l2(src + 1, halfH) */ \
1348 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); /* dst = OPNAME v_lowpass(halfH) */ \
1351 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, /* mc22: h_lowpass followed directly by OPNAME v_lowpass into dst */ \
1355 uint8_t * const halfH = ((uint8_t*)half); /* byte view of half */ \
1356 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, /* put-style h_lowpass of src, written to halfH */ \
1358 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); /* dst = OPNAME v_lowpass(halfH) */ \
1361 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, /* mc00: no lowpass call, only the OPNAME pixels16 routine */ \
1364 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); /* last argument 16, presumably the row count */ \
1367 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, /* mc10: h_lowpass result combined with src */ \
1370 uint64_t temp[32]; /* 32 quadwords = 256 bytes of scratch */ \
1371 uint8_t * const half = (uint8_t*)temp; \
1372 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, /* put-style h_lowpass of src, written to half */ \
1374 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); /* dst = OPNAME pixels16_l2 of src and half */ \
1377 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, /* mc20: OPNAME h_lowpass straight from src into dst */ \
1380 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1381 stride, stride, 16); \
1384 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, /* mc30: like mc10, but combined with src + 1 */ \
1387 uint64_t temp[32]; /* 32 quadwords = 256 bytes of scratch */ \
1388 uint8_t * const half = (uint8_t*)temp; \
1389 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, /* put-style h_lowpass of src, written to half */ \
1391 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, /* dst = OPNAME pixels16_l2 of (src + 1) and half */ \
1392 stride, stride, 16); \
1395 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, /* mc01: v_lowpass result combined with src */ \
1398 uint64_t temp[32]; /* 32 quadwords = 256 bytes of scratch */ \
1399 uint8_t * const half = (uint8_t*)temp; \
1400 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, /* put-style v_lowpass of src, written to half */ \
1402 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); /* dst = OPNAME pixels16_l2 of src and half */ \
1405 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, /* mc02: OPNAME v_lowpass straight from src into dst */ \
1408 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1411 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, /* mc03: like mc01, but combined with src + stride */ \
1414 uint64_t temp[32]; /* 32 quadwords = 256 bytes of scratch */ \
1415 uint8_t * const half = (uint8_t*)temp; \
1416 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, /* put-style v_lowpass of src, written to half */ \
1418 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, /* dst = OPNAME pixels16_l2 of (src + stride) and half */ \
1419 stride, stride, 16); \
1422 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, /* mc11: h_lowpass -> l2 with src -> v_lowpass -> l2 into dst */ \
1425 uint64_t half[16 * 2 + 17 * 2]; /* 16 * 2 quadwords for halfHV followed by 17 * 2 for halfH */ \
1426 uint8_t * const halfH = ((uint8_t*)half) + 256; /* 256 bytes = past the first 16 * 2 quadwords */ \
1427 uint8_t * const halfHV = ((uint8_t*)half); \
1428 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1430 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, /* in place: halfH = l2(src, halfH) */ \
1432 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, /* halfHV = v_lowpass(halfH) */ \
1434 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); /* dst = OPNAME l2(halfH, halfHV) */ \
1437 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, /* mc31: like mc11, but the first l2 uses src + 1 */ \
1440 uint64_t half[16 * 2 + 17 * 2]; /* 16 * 2 quadwords for halfHV followed by 17 * 2 for halfH */ \
1441 uint8_t * const halfH = ((uint8_t*)half) + 256; /* 256 bytes = past the first 16 * 2 quadwords */ \
1442 uint8_t * const halfHV = ((uint8_t*)half); \
1443 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1445 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, /* in place: halfH = l2(src + 1, halfH) */ \
1447 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, /* halfHV = v_lowpass(halfH) */ \
1449 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); /* dst = OPNAME l2(halfH, halfHV) */ \
1452 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, /* mc13: like mc11, but the final l2 reads halfH + 16 */ \
1455 uint64_t half[16 * 2 + 17 * 2]; /* 16 * 2 quadwords for halfHV followed by 17 * 2 for halfH */ \
1456 uint8_t * const halfH = ((uint8_t*)half) + 256; /* 256 bytes = past the first 16 * 2 quadwords */ \
1457 uint8_t * const halfHV = ((uint8_t*)half); \
1458 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1460 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, /* in place: halfH = l2(src, halfH) */ \
1462 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, /* halfHV = v_lowpass(halfH) */ \
1464 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, /* halfH + 16: 16 bytes into the halfH buffer */ \
1468 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, /* mc33: first l2 uses src + 1, final l2 reads halfH + 16 */ \
1471 uint64_t half[16 * 2 + 17 * 2]; /* 16 * 2 quadwords for halfHV followed by 17 * 2 for halfH */ \
1472 uint8_t * const halfH = ((uint8_t*)half) + 256; /* 256 bytes = past the first 16 * 2 quadwords */ \
1473 uint8_t * const halfHV = ((uint8_t*)half); \
1474 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1476 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, /* in place: halfH = l2(src + 1, halfH) */ \
1478 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, /* halfHV = v_lowpass(halfH) */ \
1480 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, /* halfH + 16: 16 bytes into the halfH buffer */ \
1484 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, /* mc21: h_lowpass -> v_lowpass -> l2 into dst (no l2 with src) */ \
1487 uint64_t half[16 * 2 + 17 * 2]; /* 16 * 2 quadwords for halfHV followed by 17 * 2 for halfH */ \
1488 uint8_t * const halfH = ((uint8_t*)half) + 256; /* 256 bytes = past the first 16 * 2 quadwords */ \
1489 uint8_t * const halfHV = ((uint8_t*)half); \
1490 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1492 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, /* halfHV = v_lowpass(halfH) */ \
1494 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); /* dst = OPNAME l2(halfH, halfHV) */ \
1497 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, /* mc23: like mc21, but the final l2 reads halfH + 16 */ \
1500 uint64_t half[16 * 2 + 17 * 2]; /* 16 * 2 quadwords for halfHV followed by 17 * 2 for halfH */ \
1501 uint8_t * const halfH = ((uint8_t*)half) + 256; /* 256 bytes = past the first 16 * 2 quadwords */ \
1502 uint8_t * const halfHV = ((uint8_t*)half); \
1503 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1505 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, /* halfHV = v_lowpass(halfH) */ \
1507 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, /* halfH + 16: 16 bytes into the halfH buffer */ \
1511 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, /* mc12: h_lowpass -> l2 with src -> OPNAME v_lowpass into dst */ \
1514 uint64_t half[17 * 2]; /* 17 * 2 quadwords of scratch, viewed as bytes through halfH */ \
1515 uint8_t * const halfH = ((uint8_t*)half); \
1516 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1518 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, /* in place: halfH = l2(src, halfH) */ \
1520 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); /* dst = OPNAME v_lowpass(halfH) */ \
1523 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, /* mc32: like mc12, but the l2 uses src + 1 */ \
1526 uint64_t half[17 * 2]; /* 17 * 2 quadwords of scratch, viewed as bytes through halfH */ \
1527 uint8_t * const halfH = ((uint8_t*)half); \
1528 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1530 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, /* in place: halfH = l2(src + 1, halfH) */ \
1532 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); /* dst = OPNAME v_lowpass(halfH) */ \
1535 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, /* mc22: h_lowpass followed directly by OPNAME v_lowpass into dst */ \
1538 uint64_t half[17 * 2]; /* 17 * 2 quadwords of scratch, viewed as bytes through halfH */ \
1539 uint8_t * const halfH = ((uint8_t*)half); \
1540 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, /* put-style h_lowpass of src, written to halfH */ \
1542 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); /* dst = OPNAME v_lowpass(halfH) */ \
/*
 * PUT_OP: expands to one asm text line, "mov<size> <a>, <b>", i.e. a plain
 * move of a into b.  The temp argument is accepted but not used.
 */
1545 #define PUT_OP(a, b, temp, size) \
1546 "mov"#size" "#a", "#b" \n\t"
/*
 * AVG_MMXEXT_OP: expands to three asm text lines that load b into temp,
 * pavgb temp into a (byte-wise average), and move a back to b.
 */
1548 #define AVG_MMXEXT_OP(a, b, temp, size) \
1549 "mov"#size" "#b", "#temp" \n\t" \
1550 "pavgb "#temp", "#a" \n\t" \
1551 "mov"#size" "#a", "#b" \n\t"
/*
 * Instantiate the qpel function families for three name prefixes:
 *   put_        with rounder ff_pw_16 and PUT_OP,
 *   avg_        with rounder ff_pw_16 and AVG_MMXEXT_OP,
 *   put_no_rnd_ with rounder ff_pw_15 and PUT_OP.
 * The QPEL_OP instances use the suffix mmxext.
 * NOTE(review): QPEL_BASE is defined outside this section.
 */
1553 QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
1554 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
1555 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1556 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1557 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1558 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
/* RV40 qpel8 mc33 (put): forwards to put_pixels8_xy2_mmx, passing 8 as the last argument. */
1560 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1562 put_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 qpel16 mc33 (put): forwards to put_pixels16_xy2_mmx, passing 16 as the last argument. */
1564 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1566 put_pixels16_xy2_mmx(dst, src, stride, 16);
/* RV40 qpel8 mc33 (avg): forwards to avg_pixels8_xy2_mmx, passing 8 as the last argument. */
1568 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1570 avg_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 qpel16 mc33 (avg): forwards to avg_pixels16_xy2_mmx, passing 16 as the last argument. */
1572 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1574 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/**
 * MMX global motion compensation using bilinear interpolation.
 *
 * Four output pixels are produced per inner asm statement.  For each of
 * them the asm forms, on 16-bit lanes,
 *   src[0]      * (s - dx) * (s - dy) + src[1]          * dx * (s - dy) +
 *   src[stride] * (s - dx) * dy       + src[stride + 1] * dx * dy       + r
 * shifts the sum right by 2 * shift and stores the packed bytes at
 * dst[x + y * stride].  dx and dy are the top 4 bits of the running 16-bit
 * position words dx4 / dy4.
 *
 * Inputs that do not fit this fast path (see the condition below) are
 * handed unchanged to ff_gmc_c().
 *
 * NOTE(review): the declarations of w, x and y, the operand that supplies s
 * to the first asm statement and the per-row advance of src are not visible
 * in this section.
 */
1577 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1578 int stride, int h, int ox, int oy,
1579 int dxx, int dxy, int dyx, int dyy,
1580 int shift, int r, int width, int height)
// full-pel part of the start offset
1583 const int ix = ox >> (16 + shift);
1584 const int iy = oy >> (16 + shift);
// offset and increments with 4 fewer fractional bits; they are stored in
// the 16-bit lanes below
1585 const int oxs = ox >> 4;
1586 const int oys = oy >> 4;
1587 const int dxxs = dxx >> 4;
1588 const int dxys = dxy >> 4;
1589 const int dyxs = dyx >> 4;
1590 const int dyys = dyy >> 4;
// r and the per-row increments replicated into four 16-bit lanes
1591 const uint16_t r4[4] = { r, r, r, r };
1592 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1593 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
// shift count for the final psrlw, passed as a memory operand
1594 const uint64_t shift2 = 2 * shift;
// total change of the offset across the block, used by the fullpel test below
1597 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1598 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1599 const int dxh = dxy * (h - 1);
1600 const int dyw = dyx * (w - 1);
1601 if ( // non-constant fullpel offset (3% of blocks)
1602 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1603 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1604 // uses more than 16 bits of subpel mv (only at huge resolution)
1605 || (dxx | dxy | dyx | dyy) & 15 ||
// the unsigned casts make a negative ix / iy fail these bounds as well
1606 (unsigned)ix >= width - w ||
1607 (unsigned)iy >= height - h) {
1608 // FIXME could still use mmx for some of the rows
1609 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1610 shift, r, width, height);
// fast path: start at the full-pel source position
1614 src += ix + iy * stride;
1617 "movd %0, %%mm6 \n\t" // low 32 bits of operand %0
1618 "pxor %%mm7, %%mm7 \n\t" // mm7 = 0, used to widen bytes to words
1619 "punpcklwd %%mm6, %%mm6 \n\t" // the two punpcklwd replicate the low word
1620 "punpcklwd %%mm6, %%mm6 \n\t" // into all four lanes: mm6 = s
1624 for (x = 0; x < w; x += 4) {
// starting positions for columns x .. x + 3; one row step (dxys / dyys) is
// subtracted because the row loop adds it before the first use
1625 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1626 oxs - dxys + dxxs * (x + 1),
1627 oxs - dxys + dxxs * (x + 2),
1628 oxs - dxys + dxxs * (x + 3) };
1629 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1630 oys - dyys + dyxs * (x + 1),
1631 oys - dyys + dyxs * (x + 2),
1632 oys - dyys + dyxs * (x + 3) };
1634 for (y = 0; y < h; y++) {
1636 "movq %0, %%mm4 \n\t" // mm4 = dx4
1637 "movq %1, %%mm5 \n\t" // mm5 = dy4
1638 "paddw %2, %%mm4 \n\t" // step to this row
1639 "paddw %3, %%mm5 \n\t"
1640 "movq %%mm4, %0 \n\t" // write the stepped positions back
1641 "movq %%mm5, %1 \n\t"
1642 "psrlw $12, %%mm4 \n\t" // keep the top 4 bits of each word: dx
1643 "psrlw $12, %%mm5 \n\t" // keep the top 4 bits of each word: dy
1644 : "+m"(*dx4), "+m"(*dy4)
1645 : "m"(*dxy4), "m"(*dyy4)
1649 "movq %%mm6, %%mm2 \n\t"
1650 "movq %%mm6, %%mm1 \n\t"
1651 "psubw %%mm4, %%mm2 \n\t" // s - dx
1652 "psubw %%mm5, %%mm1 \n\t" // s - dy
1653 "movq %%mm2, %%mm0 \n\t"
1654 "movq %%mm4, %%mm3 \n\t"
1655 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1656 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1657 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1658 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1660 "movd %4, %%mm5 \n\t" // 4 bytes at src[stride + 1]
1661 "movd %3, %%mm4 \n\t" // 4 bytes at src[stride]
1662 "punpcklbw %%mm7, %%mm5 \n\t" // widen to words
1663 "punpcklbw %%mm7, %%mm4 \n\t"
1664 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1665 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1667 "movd %2, %%mm5 \n\t" // 4 bytes at src[1]
1668 "movd %1, %%mm4 \n\t" // 4 bytes at src[0]
1669 "punpcklbw %%mm7, %%mm5 \n\t" // widen to words
1670 "punpcklbw %%mm7, %%mm4 \n\t"
1671 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1672 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1673 "paddw %5, %%mm1 \n\t" // add r4
1674 "paddw %%mm3, %%mm2 \n\t" // sum the four products
1675 "paddw %%mm1, %%mm0 \n\t"
1676 "paddw %%mm2, %%mm0 \n\t"
1678 "psrlw %6, %%mm0 \n\t" // >> shift2
1679 "packuswb %%mm0, %%mm0 \n\t" // words -> unsigned saturated bytes
1680 "movd %%mm0, %0 \n\t" // store 4 output pixels
1682 : "=m"(dst[x + y * stride])
1683 : "m"(src[0]), "m"(src[1]),
1684 "m"(src[stride]), "m"(src[stride + 1]),
1685 "m"(*r4), "m"(shift2)
// move 4 pixels right and h rows back; presumably undoes a per-row advance
// of src that is not visible here
1689 src += 4 - h * stride;
1692 #endif /* HAVE_INLINE_ASM */
1694 #include "h264_qpel.c"
1696 void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
1697 int stride, int h, int x, int y);
1698 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1699 int stride, int h, int x, int y);
1700 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1701 int stride, int h, int x, int y);
1703 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1704 int stride, int h, int x, int y);
1705 void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
1706 int stride, int h, int x, int y);
1707 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1708 int stride, int h, int x, int y);
1710 void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1711 int stride, int h, int x, int y);
1712 void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1713 int stride, int h, int x, int y);
1715 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1716 int stride, int h, int x, int y);
1717 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1718 int stride, int h, int x, int y);
1720 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1721 int stride, int h, int x, int y);
1722 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1723 int stride, int h, int x, int y);
/*
 * CHROMA_MC: declares (prototype only) a function named
 * ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT> taking
 * (dst, src, stride, h, x, y).
 */
1725 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1726 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1727 (uint8_t *dst, uint8_t *src, \
1728 int stride, int h, int x, int y);
1730 CHROMA_MC(put, 2, 10, mmxext)
1731 CHROMA_MC(avg, 2, 10, mmxext)
1732 CHROMA_MC(put, 4, 10, mmxext)
1733 CHROMA_MC(avg, 4, 10, mmxext)
1734 CHROMA_MC(put, 8, 10, sse2)
1735 CHROMA_MC(avg, 8, 10, sse2)
1736 CHROMA_MC(put, 8, 10, avx)
1737 CHROMA_MC(avg, 8, 10, avx)
/* CAVS qpel8 mc00 (put): forwards to put_pixels8_mmx (the plain MMX routine), last argument 8. */
1742 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1744 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel8 mc00 (avg): forwards to avg_pixels8_mmx (the plain MMX routine), last argument 8. */
1747 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1749 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel16 mc00 (put): forwards to put_pixels16_mmx (the plain MMX routine), last argument 16. */
1752 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1754 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS qpel16 mc00 (avg): forwards to avg_pixels16_mmx (the plain MMX routine), last argument 16. */
1757 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1759 avg_pixels16_mmx(dst, src, stride, 16);
/*
 * VC-1 mspel mc00 (put): forwards to put_pixels8_mmx with last argument 8.
 * rnd is not used by the visible call.
 */
1763 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1764 int stride, int rnd)
1766 put_pixels8_mmx(dst, src, stride, 8);
/*
 * VC-1 mspel mc00 (avg): forwards to avg_pixels8_mmxext with last argument 8.
 * rnd is not used by the visible call.
 */
1769 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1770 int stride, int rnd)
1772 avg_pixels8_mmxext(dst, src, stride, 8);
/**
 * Clamp floats read from src to the range [min, max] and store them to dst,
 * using SSE.
 *
 * The visible asm body handles 16 floats (four XMM registers) at byte
 * offset i from both pointers; i starts at (len - 16) * 4, the offset of
 * the last group of 16 floats.
 *
 * All loads and stores use movaps, so dst and src must be 16-byte aligned.
 *
 * NOTE(review): loop control and the output operand list are not visible in
 * this section; the start value of i suggests len is expected to be a
 * multiple of 16 and at least 16 -- confirm with callers.
 */
1775 static void vector_clipf_sse(float *dst, const float *src,
1776 float min, float max, int len)
1778 x86_reg i = (len - 16) * 4;
1780 "movss %3, %%xmm4 \n\t" // xmm4[0] = min
1781 "movss %4, %%xmm5 \n\t" // xmm5[0] = max
1782 "shufps $0, %%xmm4, %%xmm4 \n\t" // broadcast min to all four lanes
1783 "shufps $0, %%xmm5, %%xmm5 \n\t" // broadcast max to all four lanes
1785 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1786 "movaps 16(%2, %0), %%xmm1 \n\t"
1787 "movaps 32(%2, %0), %%xmm2 \n\t"
1788 "movaps 48(%2, %0), %%xmm3 \n\t"
1789 "maxps %%xmm4, %%xmm0 \n\t" // raise values below min to min
1790 "maxps %%xmm4, %%xmm1 \n\t"
1791 "maxps %%xmm4, %%xmm2 \n\t"
1792 "maxps %%xmm4, %%xmm3 \n\t"
1793 "minps %%xmm5, %%xmm0 \n\t" // lower values above max to max
1794 "minps %%xmm5, %%xmm1 \n\t"
1795 "minps %%xmm5, %%xmm2 \n\t"
1796 "minps %%xmm5, %%xmm3 \n\t"
1797 "movaps %%xmm0, (%1, %0) \n\t" // store the 16 clamped floats
1798 "movaps %%xmm1, 16(%1, %0) \n\t"
1799 "movaps %%xmm2, 32(%1, %0) \n\t"
1800 "movaps %%xmm3, 48(%1, %0) \n\t"
// %1 = dst, %2 = src, %3 = min, %4 = max; %0 is presumably i
1804 : "r"(dst), "r"(src), "m"(min), "m"(max)
1809 #endif /* HAVE_INLINE_ASM */
1811 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1813 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1815 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1817 int order, int mul);
1818 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1820 int order, int mul);
1821 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1823 int order, int mul);
1825 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1826 const int16_t *window, unsigned int len);
1827 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1828 const int16_t *window, unsigned int len);
1829 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1830 const int16_t *window, unsigned int len);
1831 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1832 const int16_t *window, unsigned int len);
1833 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1834 const int16_t *window, unsigned int len);
1835 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1836 const int16_t *window, unsigned int len);
1838 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1839 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1841 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1842 const uint8_t *diff, int w,
1843 int *left, int *left_top);
1844 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1846 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1849 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1850 int32_t min, int32_t max, unsigned int len);
1851 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1852 int32_t min, int32_t max, unsigned int len);
1853 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1854 int32_t min, int32_t max, unsigned int len);
1855 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1856 int32_t min, int32_t max, unsigned int len);
/*
 * SET_QPEL_FUNCS: fill the 16 entries of c->PFX_pixels_tab[IDX] with the
 * functions named <PREFIX><PFX><SIZE>_mc<x><y>_<CPU>; entry x + 4 * y
 * receives the mc<x><y> function.  Expects a variable c in scope.
 */
1858 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1860 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1861 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1862 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1863 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1864 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1865 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1866 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1867 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1868 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1869 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1870 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1871 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1872 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1873 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1874 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1875 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/*
 * SET_HPEL_FUNCS: fill entries 0..3 of c->PFX_pixels_tab[IDX] with the
 * plain, _x2_, _y2_ and _xy2_ variants of <PFX>_pixels<SIZE>_<CPU>.
 * Expects a variable c in scope.
 */
1878 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1880 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1881 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1882 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1883 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/*
 * H264_QPEL_FUNCS: install the put/avg H.264 qpel functions for position
 * (x, y) at index x + y * 4; table row 0 gets the qpel16 variant and row 1
 * the qpel8 variant.  Expects a variable c in scope.
 */
1886 #define H264_QPEL_FUNCS(x, y, CPU) \
1888 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
1889 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
1890 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
1891 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
/*
 * H264_QPEL_FUNCS_10: same table layout as H264_QPEL_FUNCS, but installs
 * the ff_-prefixed functions whose names carry the _10_ tag.
 * Expects a variable c in scope.
 */
1894 #define H264_QPEL_FUNCS_10(x, y, CPU) \
1896 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
1897 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
1898 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
1899 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
/**
 * Install the plain MMX implementations into the DSPContext function
 * pointers.
 *
 * @param c        context whose function pointers are filled in
 * @param avctx    read for bits_per_raw_sample and idct_algo
 * @param mm_flags not referenced by the visible statements
 *
 * NOTE(review): several preprocessor guards and braces of this function are
 * not visible in this section, so the exact nesting of the statements below
 * cannot be confirmed from here.
 */
1902 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
/* more than 8 bits per sample: the 8-bit-only routines below are skipped */
1904 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1907 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1908 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1909 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1911 if (!high_bit_depth) {
1912 c->clear_block = clear_block_mmx;
1913 c->clear_blocks = clear_blocks_mmx;
1914 c->draw_edges = draw_edges_mmx;
/* half-pel tables: row 0 holds the 16-wide, row 1 the 8-wide functions */
1916 SET_HPEL_FUNCS(put, 0, 16, mmx);
1917 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
1918 SET_HPEL_FUNCS(avg, 0, 16, mmx);
1919 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
1920 SET_HPEL_FUNCS(put, 1, 8, mmx);
1921 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
1922 SET_HPEL_FUNCS(avg, 1, 8, mmx);
1923 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
/* IDCT selection by the requested algorithm */
1925 switch (avctx->idct_algo) {
1927 case FF_IDCT_SIMPLEMMX:
1928 c->idct_put = ff_simple_idct_put_mmx;
1929 c->idct_add = ff_simple_idct_add_mmx;
1930 c->idct = ff_simple_idct_mmx;
1931 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1933 case FF_IDCT_XVIDMMX:
1934 c->idct_put = ff_idct_xvid_mmx_put;
1935 c->idct_add = ff_idct_xvid_mmx_add;
1936 c->idct = ff_idct_xvid_mmx;
1943 c->add_bytes = add_bytes_mmx;
/* H.263 loop filters only when the H.263 decoder or encoder is configured */
1945 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1946 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
1947 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
1949 #endif /* HAVE_INLINE_ASM */
/* 8-bit H.264 chroma put functions for table entries 0 (mc8) and 1 (mc4) */
1952 if (!high_bit_depth && CONFIG_H264CHROMA) {
1953 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
1954 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
1957 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/**
 * Install the MMXEXT implementations into the DSPContext function pointers.
 * Reads bits_per_raw_sample, flags, idct_algo and codec_id from avctx, and
 * tests mm_flags for AV_CPU_FLAG_3DNOW.
 *
 * NOTE(review): part of the parameter list, several preprocessor guards and
 * braces are not visible in this section, so the exact nesting of the
 * statements below cannot be confirmed from here.
 */
1962 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1965 const int bit_depth = avctx->bits_per_raw_sample;
1966 const int high_bit_depth = bit_depth > 8;
/* qpel tables: row 0 holds the 16-wide, row 1 the 8-wide functions */
1969 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1970 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1972 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1973 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1974 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1975 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
/* 8-bit only: replace selected half-pel table entries with MMXEXT versions */
1977 if (!high_bit_depth) {
1978 c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
1979 c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
1981 c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
1982 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
1983 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
1985 c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
1986 c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
1988 c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
1989 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
1990 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
/* installed only when bit-exact output is not requested; presumably these
 * variants are not bit-exact -- confirm */
1993 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1994 if (!high_bit_depth) {
1995 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
1996 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
1997 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
1998 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2000 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2001 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
/* MMXEXT version of the Xvid IDCT, for 8-bit content only */
2005 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2006 c->idct_put = ff_idct_xvid_mmxext_put;
2007 c->idct_add = ff_idct_xvid_mmxext_add;
2008 c->idct = ff_idct_xvid_mmxext;
/* VP3 / Theora get the "exact" no_rnd 8-wide x2 / y2 variants */
2011 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2012 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2013 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2014 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2016 #endif /* HAVE_INLINE_ASM */
2018 #if HAVE_MMXEXT_EXTERNAL
/* H.264 qpel: 8-bit functions, or the ff_..._10_mmxext ones when bit_depth == 10 */
2019 if (CONFIG_H264QPEL) {
2020 if (!high_bit_depth) {
2021 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2022 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
2023 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
2024 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2025 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
2026 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
2027 } else if (bit_depth == 10) {
2029 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2030 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2031 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2032 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2034 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2035 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
/* H.264 chroma, 8-bit: avg for mc8 / mc4 / mc2 and put for mc2 */
2039 if (!high_bit_depth && CONFIG_H264CHROMA) {
2040 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
2041 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
2042 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
2043 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
/* H.264 chroma, 10-bit: put and avg for mc2 and mc4 */
2045 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2046 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2047 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2048 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2049 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2052 /* slower than cmov version on AMD */
2053 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2054 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
2056 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
2057 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* bit-exact mode uses the non-rounding window function; the _round_
 * assignment below is presumably the else branch (not visible here) */
2059 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2060 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2062 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
2064 #endif /* HAVE_MMXEXT_EXTERNAL */
/*
 * Install the 3DNow! routines into the function tables of the DSPContext c.
 * avctx supplies bits_per_raw_sample, flags and codec_id, which decide
 * which of the assignments below are made.
 */
2067 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2070 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/*
 * The pixel put/avg routines are only installed for content with at most
 * 8 bits per raw sample. Going by the routine names, row 0 of each table
 * takes the 16-pixel-wide variants and row 1 the 8-pixel-wide ones, while
 * columns 0/1/2/3 take the plain/x2/y2/xy2 variants.
 */
2073 if (!high_bit_depth) {
2074 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2075 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2077 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2078 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2079 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2081 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2082 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2084 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2085 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2086 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
/* Guarded by CODEC_FLAG_BITEXACT being unset in avctx->flags. */
2088 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2089 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2090 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2091 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2092 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2094 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2095 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
/*
 * For VP3 and Theora (when the VP3 decoder is built in) the 8-wide
 * no-rounding x2/y2 slots receive the "_exact" routines; being assigned
 * later, these replace any value stored in the same slots above.
 */
2099 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2100 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2101 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2102 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2104 #endif /* HAVE_INLINE_ASM */
/* H.264 chroma averaging (ff_-prefixed routines), 8-bit content only. */
2107 if (!high_bit_depth && CONFIG_H264CHROMA) {
2108 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
2109 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2111 #endif /* HAVE_YASM */
/*
 * Install the SSE routines into the DSPContext c: block clearing and
 * float vector clipping. avctx supplies bits_per_raw_sample and
 * xvmc_acceleration, which gate the block-clearing assignments.
 */
2114 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2116 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/* Block clearing is only replaced for content with at most 8 bits per sample. */
2119 if (!high_bit_depth) {
/* Skipped when the XvMC decoder is built in and xvmc_acceleration > 1. */
2120 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2121 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2122 c->clear_block = clear_block_sse;
2123 c->clear_blocks = clear_blocks_sse;
2127 c->vector_clipf = vector_clipf_sse;
2128 #endif /* HAVE_INLINE_ASM */
/*
 * Install the SSE2 routines into the DSPContext c: Xvid IDCT, 16-wide
 * pixel copy/average, H.264 qpel and chroma motion compensation, int16
 * scalar products, int32 clipping, int16 windowing and 32-bit byte swap.
 * avctx supplies bits_per_raw_sample, idct_algo and flags; mm_flags is
 * tested for the SSE2SLOW and ATOM CPU flags.
 */
2131 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2134 const int bit_depth = avctx->bits_per_raw_sample;
2135 const int high_bit_depth = bit_depth > 8;
2137 #if HAVE_SSE2_INLINE
/*
 * Xvid IDCT: chosen only for 8-bit content when idct_algo is
 * FF_IDCT_XVIDMMX. The permutation type is set together with the three
 * IDCT entry points.
 */
2138 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2139 c->idct_put = ff_idct_xvid_sse2_put;
2140 c->idct_add = ff_idct_xvid_sse2_add;
2141 c->idct = ff_idct_xvid_sse2;
2142 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2144 #endif /* HAVE_SSE2_INLINE */
2146 #if HAVE_SSE2_EXTERNAL
/* Only on CPUs that do not report AV_CPU_FLAG_SSE2SLOW. */
2147 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2148 // these functions are slower than mmx on AMD, but faster on Intel
2149 if (!high_bit_depth) {
2150 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
/* The no-rounding slot is given the same routine as put_pixels_tab[0][0]. */
2151 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2152 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
2153 if (CONFIG_H264QPEL)
2154 H264_QPEL_FUNCS(0, 0, sse2);
/* Remaining 8-bit H.264 qpel entries; (1,0), (2,0) and (3,0) are not set here. */
2158 if (!high_bit_depth && CONFIG_H264QPEL) {
2159 H264_QPEL_FUNCS(0, 1, sse2);
2160 H264_QPEL_FUNCS(0, 2, sse2);
2161 H264_QPEL_FUNCS(0, 3, sse2);
2162 H264_QPEL_FUNCS(1, 1, sse2);
2163 H264_QPEL_FUNCS(1, 2, sse2);
2164 H264_QPEL_FUNCS(1, 3, sse2);
2165 H264_QPEL_FUNCS(2, 1, sse2);
2166 H264_QPEL_FUNCS(2, 2, sse2);
2167 H264_QPEL_FUNCS(2, 3, sse2);
2168 H264_QPEL_FUNCS(3, 1, sse2);
2169 H264_QPEL_FUNCS(3, 2, sse2);
2170 H264_QPEL_FUNCS(3, 3, sse2);
/* 10-bit content: separate qpel and chroma routines with a "10" suffix. */
2173 if (bit_depth == 10) {
2174 if (CONFIG_H264QPEL) {
2175 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2176 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2177 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2178 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
/* The (1,0), (2,0) and (3,0) entries use the sse2_cache64 variants. */
2179 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2180 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2181 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2183 if (CONFIG_H264CHROMA) {
2184 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2185 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2189 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2190 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* The vector_clip_int32 variant is picked according to AV_CPU_FLAG_ATOM. */
2191 if (mm_flags & AV_CPU_FLAG_ATOM) {
2192 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2194 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/*
 * apply_window_int16: the non-"round" routine when CODEC_FLAG_BITEXACT is
 * set; otherwise the "round" routine, and only if the CPU does not report
 * AV_CPU_FLAG_SSE2SLOW.
 */
2196 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2197 c->apply_window_int16 = ff_apply_window_int16_sse2;
2198 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2199 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
2201 c->bswap_buf = ff_bswap32_buf_sse2;
2202 #endif /* HAVE_SSE2_EXTERNAL */
/*
 * Install the SSSE3 routines into the DSPContext c: H.264 qpel and chroma
 * motion compensation, HuffYUV left prediction, int16 windowing, the
 * combined scalar product / multiply-add and 32-bit byte swap.
 * All assignments are compiled only when HAVE_SSSE3_EXTERNAL is set.
 */
2205 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2208 #if HAVE_SSSE3_EXTERNAL
2209 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2210 const int bit_depth = avctx->bits_per_raw_sample;
/* 8-bit H.264 qpel: every entry with a non-zero first index. */
2212 if (!high_bit_depth && CONFIG_H264QPEL) {
2213 H264_QPEL_FUNCS(1, 0, ssse3);
2214 H264_QPEL_FUNCS(1, 1, ssse3);
2215 H264_QPEL_FUNCS(1, 2, ssse3);
2216 H264_QPEL_FUNCS(1, 3, ssse3);
2217 H264_QPEL_FUNCS(2, 0, ssse3);
2218 H264_QPEL_FUNCS(2, 1, ssse3);
2219 H264_QPEL_FUNCS(2, 2, ssse3);
2220 H264_QPEL_FUNCS(2, 3, ssse3);
2221 H264_QPEL_FUNCS(3, 0, ssse3);
2222 H264_QPEL_FUNCS(3, 1, ssse3);
2223 H264_QPEL_FUNCS(3, 2, ssse3);
2224 H264_QPEL_FUNCS(3, 3, ssse3);
/* 10-bit H.264 qpel: the (1,0), (2,0) and (3,0) entries, ssse3_cache64 variants. */
2226 if (bit_depth == 10 && CONFIG_H264QPEL) {
2227 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2228 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2229 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
/* 8-bit H.264 chroma: slots 0 and 1 of both the put and avg tables. */
2231 if (!high_bit_depth && CONFIG_H264CHROMA) {
2232 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2233 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2234 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2235 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
/* The ssse3 left prediction is the default; the sse4-named one replaces it
 * when the CPU reports AV_CPU_FLAG_SSE4. */
2237 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2238 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2239 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
/* The apply_window_int16 variant is picked according to AV_CPU_FLAG_ATOM. */
2241 if (mm_flags & AV_CPU_FLAG_ATOM)
2242 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2244 c->apply_window_int16 = ff_apply_window_int16_ssse3;
/* Not installed when the CPU reports either SSE4.2 or 3DNow!. */
2245 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2246 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2247 c->bswap_buf = ff_bswap32_buf_ssse3;
2248 #endif /* HAVE_SSSE3_EXTERNAL */
/*
 * Install the SSE4 routine into the DSPContext c. The only assignment is
 * vector_clip_int32, compiled when HAVE_SSE4_EXTERNAL is set.
 */
2251 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2254 #if HAVE_SSE4_EXTERNAL
2255 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2256 #endif /* HAVE_SSE4_EXTERNAL */
/*
 * Install the routines selected for AVX-capable CPUs into the DSPContext c.
 * Only 10-bit content (bits_per_raw_sample == 10) is affected, and the
 * assignments are compiled only when HAVE_AVX_EXTERNAL is set.
 */
2259 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2261 #if HAVE_AVX_EXTERNAL
2262 const int bit_depth = avctx->bits_per_raw_sample;
2264 if (bit_depth == 10) {
2265 // AVX implies !cache64.
2266 // TODO: Port cache(32|64) detection from x264.
/* The qpel (1,0), (2,0) and (3,0) entries are set to the plain sse2
 * variants here, not to AVX-specific ones. */
2267 if (CONFIG_H264QPEL) {
2268 H264_QPEL_FUNCS_10(1, 0, sse2);
2269 H264_QPEL_FUNCS_10(2, 0, sse2);
2270 H264_QPEL_FUNCS_10(3, 0, sse2);
/* 10-bit chroma slot 0 gets the avx routines. */
2273 if (CONFIG_H264CHROMA) {
2274 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2275 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2278 #endif /* HAVE_AVX_EXTERNAL */
/*
 * x86 entry point for DSPContext initialisation.
 *
 * Reads the CPU feature flags with av_get_cpu_flags() and calls one init
 * function per instruction-set extension whose flag is set, in the order
 * MMX, MMXEXT, 3DNow!, SSE, SSE2, SSSE3, SSE4, AVX. Because the calls run
 * in sequence on the same context, a later one may overwrite a pointer
 * stored by an earlier one (e.g. vector_clip_int32 is assigned by both
 * the SSE2 and the SSE4 init).
 */
2281 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2283 int mm_flags = av_get_cpu_flags();
/* The cmov-based median prediction needs 7 registers and inline asm in the
 * build, plus the CMOV flag at run time. */
2285 #if HAVE_7REGS && HAVE_INLINE_ASM
2286 if (mm_flags & AV_CPU_FLAG_CMOV)
2287 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2290 if (mm_flags & AV_CPU_FLAG_MMX)
2291 dsputil_init_mmx(c, avctx, mm_flags);
2293 if (mm_flags & AV_CPU_FLAG_MMXEXT)
2294 dsputil_init_mmxext(c, avctx, mm_flags);
2296 if (mm_flags & AV_CPU_FLAG_3DNOW)
2297 dsputil_init_3dnow(c, avctx, mm_flags);
2299 if (mm_flags & AV_CPU_FLAG_SSE)
2300 dsputil_init_sse(c, avctx, mm_flags);
2302 if (mm_flags & AV_CPU_FLAG_SSE2)
2303 dsputil_init_sse2(c, avctx, mm_flags);
2305 if (mm_flags & AV_CPU_FLAG_SSSE3)
2306 dsputil_init_ssse3(c, avctx, mm_flags);
2308 if (mm_flags & AV_CPU_FLAG_SSE4)
2309 dsputil_init_sse4(c, avctx, mm_flags);
2311 if (mm_flags & AV_CPU_FLAG_AVX)
2312 dsputil_init_avx(c, avctx, mm_flags);
/* Encoder-side setup is delegated to ff_dsputilenc_init_mmx() when
 * encoders are built in. */
2314 if (CONFIG_ENCODERS)
2315 ff_dsputilenc_init_mmx(c, avctx);