/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to generate these constants
// in registers than to load them from memory
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)

#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq  "#rega", "#regr"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq  "#rega", "#regr"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "psubb "#regb", "#regr"             \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND   MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND   MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB  "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

/* Introduced only in MMXEXT set */
#define PAVGB  "pavgb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow         put_pixels16_mmx
#define put_pixels8_3dnow          put_pixels8_mmx
#define put_pixels4_3dnow          put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow   put_no_rnd_pixels8_mmx
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled 4 times */
    __asm__ volatile (
        "movq        (%3), %%mm0        \n\t"
        "movq       8(%3), %%mm1        \n\t"
        "movq      16(%3), %%mm2        \n\t"
        "movq      24(%3), %%mm3        \n\t"
        "movq      32(%3), %%mm4        \n\t"
        "movq      40(%3), %%mm5        \n\t"
        "movq      48(%3), %%mm6        \n\t"
        "movq      56(%3), %%mm7        \n\t"
        "packuswb   %%mm1, %%mm0        \n\t"
        "packuswb   %%mm3, %%mm2        \n\t"
        "packuswb   %%mm5, %%mm4        \n\t"
        "packuswb   %%mm7, %%mm6        \n\t"
        "movq       %%mm0, (%0)         \n\t"
        "movq       %%mm2, (%0, %1)     \n\t"
        "movq       %%mm4, (%0, %1, 2)  \n\t"
        "movq       %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 16;
    // if this were an exact copy of the code above, the compiler
    // would generate some very strange code, thus we use "r" here
    __asm__ volatile (
        "movq        (%3), %%mm0        \n\t"
        "movq       8(%3), %%mm1        \n\t"
        "movq      16(%3), %%mm2        \n\t"
        "movq      24(%3), %%mm3        \n\t"
        "movq      32(%3), %%mm4        \n\t"
        "movq      40(%3), %%mm5        \n\t"
        "movq      48(%3), %%mm6        \n\t"
        "movq      56(%3), %%mm7        \n\t"
        "packuswb   %%mm1, %%mm0        \n\t"
        "packuswb   %%mm3, %%mm2        \n\t"
        "packuswb   %%mm5, %%mm4        \n\t"
        "packuswb   %%mm7, %%mm6        \n\t"
        "movq       %%mm0, (%0)         \n\t"
        "movq       %%mm2, (%0, %1)     \n\t"
        "movq       %%mm4, (%0, %1, 2)  \n\t"
        "movq       %%mm6, (%0, %2)     \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off)         \
    "movq          "#off"(%2), %%mm1    \n\t"           \
    "movq     16 + "#off"(%2), %%mm2    \n\t"           \
    "movq     32 + "#off"(%2), %%mm3    \n\t"           \
    "movq     48 + "#off"(%2), %%mm4    \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"           \
    "paddb %%mm0, %%mm1                 \n\t"           \
    "paddb %%mm0, %%mm2                 \n\t"           \
    "paddb %%mm0, %%mm3                 \n\t"           \
    "paddb %%mm0, %%mm4                 \n\t"           \
    "movq  %%mm1, (%0)                  \n\t"           \
    "movq  %%mm2, (%0, %3)              \n\t"           \
    "movq  %%mm3, (%0, %3, 2)           \n\t"           \
    "movq  %%mm4, (%0, %1)              \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
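/* The scalar equivalent of the loop above, for reference (modulo the
 * intermediate 16-bit saturation done by paddsw):
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             pixels[i * line_size + j] =
 *                 av_clip_uint8(pixels[i * line_size + j] + block[i * 8 + j]);
 */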
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                             \n\t"
        "movdqu (%1        ), %%xmm0    \n\t"
        "movdqu (%1, %3    ), %%xmm1    \n\t"
        "movdqu (%1, %3, 2 ), %%xmm2    \n\t"
        "movdqu (%1, %4    ), %%xmm3    \n\t"
        "lea     (%1, %3, 4), %1        \n\t"
        "movdqa %%xmm0,    (%2)         \n\t"
        "movdqa %%xmm1,    (%2, %3)     \n\t"
        "movdqa %%xmm2,    (%2, %3, 2)  \n\t"
        "movdqa %%xmm3,    (%2, %4)     \n\t"
        "subl            $4, %0         \n\t"
        "lea     (%2, %3, 4), %2        \n\t"
        "jnz             1b             \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                             \n\t"
        "movdqu (%1        ), %%xmm0    \n\t"
        "movdqu (%1, %3    ), %%xmm1    \n\t"
        "movdqu (%1, %3, 2 ), %%xmm2    \n\t"
        "movdqu (%1, %4    ), %%xmm3    \n\t"
        "lea     (%1, %3, 4), %1        \n\t"
        "pavgb  (%2        ), %%xmm0    \n\t"
        "pavgb  (%2, %3    ), %%xmm1    \n\t"
        "pavgb  (%2, %3, 2 ), %%xmm2    \n\t"
        "pavgb  (%2, %4    ), %%xmm3    \n\t"
        "movdqa %%xmm0,    (%2)         \n\t"
        "movdqa %%xmm1,    (%2, %3)     \n\t"
        "movdqa %%xmm2,    (%2, %3, 2)  \n\t"
        "movdqa %%xmm3,    (%2, %4)     \n\t"
        "subl            $4, %0         \n\t"
        "lea     (%2, %3, 4), %2        \n\t"
        "jnz             1b             \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
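/* Functionally this is just memset(blocks, 0, n * 128): each DCT block is
 * 64 DCTELEMs (128 bytes), zeroed 32 bytes per iteration with a negative
 * index that counts up toward zero. */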
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
        );
}
static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;
    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
#if HAVE_7REGS
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = dst[w - 1];
    *left_top = top[w - 1];
}
#endif
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd        (%1), %%mm0        \n\t"
        "add           %3, %1           \n\t"
        "movd        (%1), %%mm1        \n\t"
        "movd  (%1, %3, 1), %%mm2       \n\t"
        "movd  (%1, %3, 2), %%mm3       \n\t"
        "punpcklbw  %%mm1, %%mm0        \n\t"
        "punpcklbw  %%mm3, %%mm2        \n\t"
        "movq       %%mm0, %%mm1        \n\t"
        "punpcklwd  %%mm2, %%mm0        \n\t"
        "punpckhwd  %%mm2, %%mm1        \n\t"
        "movd       %%mm0, (%0)         \n\t"
        "add           %2, %0           \n\t"
        "punpckhdq  %%mm0, %%mm0        \n\t"
        "movd       %%mm0, (%0)         \n\t"
        "movd       %%mm1, (%0, %2, 1)  \n\t"
        "punpckhdq  %%mm1, %%mm1        \n\t"
        "movd       %%mm1, (%0, %2, 2)  \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER

            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6

            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
841 "movd (%0), %%mm0 \n\t"
842 "punpcklbw %%mm0, %%mm0 \n\t"
843 "punpcklwd %%mm0, %%mm0 \n\t"
844 "punpckldq %%mm0, %%mm0 \n\t"
845 "movq %%mm0, -8(%0) \n\t"
846 "movq %%mm0, -16(%0) \n\t"
847 "movq -8(%0, %2), %%mm1 \n\t"
848 "punpckhbw %%mm1, %%mm1 \n\t"
849 "punpckhwd %%mm1, %%mm1 \n\t"
850 "punpckhdq %%mm1, %%mm1 \n\t"
851 "movq %%mm1, (%0, %2) \n\t"
852 "movq %%mm1, 8(%0, %2) \n\t"
857 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }
    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                    \
                   in0, in1, in2, in7, out, OP)                         \
    "paddw               "#m4", "#m3"   \n\t" /* x1 */                  \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                  \
    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                \
    "movq               "#in7", "#m3"   \n\t" /* d */                   \
    "movq               "#in0", %%mm5   \n\t" /* D */                   \
    "paddw               "#m3", %%mm5   \n\t" /* x4 */                  \
    "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */           \
    "movq               "#in1", %%mm5   \n\t" /* C */                   \
    "movq               "#in2", %%mm6   \n\t" /* B */                   \
    "paddw               "#m6", %%mm5   \n\t" /* x3 */                  \
    "paddw               "#m5", %%mm6   \n\t" /* x2 */                  \
    "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                 \
    "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */           \
    "pmullw "MANGLE(ff_pw_3)",  %%mm5   \n\t" /* -6x2 + 3x3 */          \
    "paddw              "#rnd", %%mm4   \n\t" /* x2 */                  \
    "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                  $5, %%mm5   \n\t"                           \
    "packuswb            %%mm5, %%mm5   \n\t"                           \
    OP(%%mm5, out, %%mm7, d)
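/* For reference, QPEL_V_LOW produces one row of the MPEG-4 quarter-pel
 * 8-tap filter (cf. the scalar fallback further below): with a, b, c, d
 * the four symmetric pair sums, each output is
 *
 *     out = av_clip_uint8((20 * a - 6 * b + 3 * c - d + rounder) >> 5);
 */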
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT, OP_3DNOW)            \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,       \
                                                    uint8_t *src,       \
                                                    int dstStride,      \
                                                    int srcStride,      \
                                                    int h)              \
{                                                                       \
    uint64_t temp;                                                      \
                                                                        \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7                  \n\t"                       \
        "1:                                 \n\t"                       \
        "movq        (%0), %%mm0            \n\t" /* ABCDEFGH */        \
        "movq       %%mm0, %%mm1            \n\t" /* ABCDEFGH */        \
        "movq       %%mm0, %%mm2            \n\t" /* ABCDEFGH */        \
        "punpcklbw  %%mm7, %%mm0            \n\t" /* 0A0B0C0D */        \
        "punpckhbw  %%mm7, %%mm1            \n\t" /* 0E0F0G0H */        \
        "pshufw     $0x90, %%mm0, %%mm5     \n\t" /* 0A0A0B0C */        \
        "pshufw     $0x41, %%mm0, %%mm6     \n\t" /* 0B0A0A0B */        \
        "movq       %%mm2, %%mm3            \n\t" /* ABCDEFGH */        \
        "movq       %%mm2, %%mm4            \n\t" /* ABCDEFGH */        \
        "psllq         $8, %%mm2            \n\t" /* 0ABCDEFG */        \
        "psllq        $16, %%mm3            \n\t" /* 00ABCDEF */        \
        "psllq        $24, %%mm4            \n\t" /* 000ABCDE */        \
        "punpckhbw  %%mm7, %%mm2            \n\t" /* 0D0E0F0G */        \
        "punpckhbw  %%mm7, %%mm3            \n\t" /* 0C0D0E0F */        \
        "punpckhbw  %%mm7, %%mm4            \n\t" /* 0B0C0D0E */        \
        "paddw      %%mm3, %%mm5            \n\t" /* b */               \
        "paddw      %%mm2, %%mm6            \n\t" /* c */               \
        "paddw      %%mm5, %%mm5            \n\t" /* 2b */              \
        "psubw      %%mm5, %%mm6            \n\t" /* c - 2b */          \
        "pshufw     $0x06, %%mm0, %%mm5     \n\t" /* 0C0B0A0A */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */         \
        "paddw      %%mm4, %%mm0            \n\t" /* a */               \
        "paddw      %%mm1, %%mm5            \n\t" /* d */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */             \
        "psubw      %%mm5, %%mm0            \n\t" /* 20a - d */         \
        "paddw         %6, %%mm6            \n\t"                       \
        "paddw      %%mm6, %%mm0            \n\t" /* 20a - 6b + 3c - d */ \
        "psraw         $5, %%mm0            \n\t"                       \
        "movq       %%mm0, %5               \n\t"                       \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */   \
                                                                        \
        "movq       5(%0), %%mm0            \n\t" /* FGHIJKLM */        \
        "movq       %%mm0, %%mm5            \n\t" /* FGHIJKLM */        \
        "movq       %%mm0, %%mm6            \n\t" /* FGHIJKLM */        \
        "psrlq         $8, %%mm0            \n\t" /* GHIJKLM0 */        \
        "psrlq        $16, %%mm5            \n\t" /* HIJKLM00 */        \
        "punpcklbw  %%mm7, %%mm0            \n\t" /* 0G0H0I0J */        \
        "punpcklbw  %%mm7, %%mm5            \n\t" /* 0H0I0J0K */        \
        "paddw      %%mm0, %%mm2            \n\t" /* b */               \
        "paddw      %%mm5, %%mm3            \n\t" /* c */               \
        "paddw      %%mm2, %%mm2            \n\t" /* 2b */              \
        "psubw      %%mm2, %%mm3            \n\t" /* c - 2b */          \
        "movq       %%mm6, %%mm2            \n\t" /* FGHIJKLM */        \
        "psrlq        $24, %%mm6            \n\t" /* IJKLM000 */        \
        "punpcklbw  %%mm7, %%mm2            \n\t" /* 0F0G0H0I */        \
        "punpcklbw  %%mm7, %%mm6            \n\t" /* 0I0J0K0L */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */         \
        "paddw      %%mm2, %%mm1            \n\t" /* a */               \
        "paddw      %%mm6, %%mm4            \n\t" /* d */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */             \
        "psubw      %%mm4, %%mm3            \n\t" /* - 6b +3c - d */    \
        "paddw         %6, %%mm1            \n\t"                       \
        "paddw      %%mm1, %%mm3            \n\t" /* 20a - 6b +3c - d */ \
        "psraw         $5, %%mm3            \n\t"                       \
        "movq          %5, %%mm1            \n\t"                       \
        "packuswb   %%mm3, %%mm1            \n\t"                       \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */   \
                                                                        \
        "movq       9(%0), %%mm1            \n\t" /* JKLMNOPQ */        \
        "movq       %%mm1, %%mm4            \n\t" /* JKLMNOPQ */        \
        "movq       %%mm1, %%mm3            \n\t" /* JKLMNOPQ */        \
        "psrlq         $8, %%mm1            \n\t" /* KLMNOPQ0 */        \
        "psrlq        $16, %%mm4            \n\t" /* LMNOPQ00 */        \
        "punpcklbw  %%mm7, %%mm1            \n\t" /* 0K0L0M0N */        \
        "punpcklbw  %%mm7, %%mm4            \n\t" /* 0L0M0N0O */        \
        "paddw      %%mm1, %%mm5            \n\t" /* b */               \
        "paddw      %%mm4, %%mm0            \n\t" /* c */               \
        "paddw      %%mm5, %%mm5            \n\t" /* 2b */              \
        "psubw      %%mm5, %%mm0            \n\t" /* c - 2b */          \
        "movq       %%mm3, %%mm5            \n\t" /* JKLMNOPQ */        \
        "psrlq        $24, %%mm3            \n\t" /* MNOPQ000 */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */         \
        "punpcklbw  %%mm7, %%mm3            \n\t" /* 0M0N0O0P */        \
        "paddw      %%mm3, %%mm2            \n\t" /* d */               \
        "psubw      %%mm2, %%mm0            \n\t" /* -6b + 3c - d */    \
        "movq       %%mm5, %%mm2            \n\t" /* JKLMNOPQ */        \
        "punpcklbw  %%mm7, %%mm2            \n\t" /* 0J0K0L0M */        \
        "punpckhbw  %%mm7, %%mm5            \n\t" /* 0N0O0P0Q */        \
        "paddw      %%mm2, %%mm6            \n\t" /* a */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */             \
        "paddw         %6, %%mm0            \n\t"                       \
        "paddw      %%mm6, %%mm0            \n\t" /* 20a - 6b + 3c - d */ \
        "psraw         $5, %%mm0            \n\t"                       \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                       \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                            \
                                                                        \
        "paddw      %%mm5, %%mm3            \n\t" /* a */               \
        "pshufw     $0xF9, %%mm5, %%mm6     \n\t" /* 0O0P0Q0Q */        \
        "paddw      %%mm4, %%mm6            \n\t" /* b */               \
        "pshufw     $0xBE, %%mm5, %%mm4     \n\t" /* 0P0Q0Q0P */        \
        "pshufw     $0x6F, %%mm5, %%mm5     \n\t" /* 0Q0Q0P0O */        \
        "paddw      %%mm1, %%mm4            \n\t" /* c */               \
        "paddw      %%mm2, %%mm5            \n\t" /* d */               \
        "paddw      %%mm6, %%mm6            \n\t" /* 2b */              \
        "psubw      %%mm6, %%mm4            \n\t" /* c - 2b */          \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */             \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */         \
        "psubw      %%mm5, %%mm3            \n\t" /* -6b + 3c - d */    \
        "paddw         %6, %%mm4            \n\t"                       \
        "paddw      %%mm3, %%mm4            \n\t" /* 20a - 6b + 3c - d */ \
        "psraw         $5, %%mm4            \n\t"                       \
        "packuswb   %%mm4, %%mm0            \n\t"                       \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                               \
        "add           %3, %0               \n\t"                       \
        "add           %4, %1               \n\t"                       \
        "decl          %2                   \n\t"                       \
        "jnz           1b                   \n\t"                       \
        : "+a"(src), "+c"(dst), "+D"(h)                                 \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),             \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)    \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,        \
                                                   uint8_t *src,        \
                                                   int dstStride,       \
                                                   int srcStride,       \
                                                   int h)               \
{                                                                       \
    int i;                                                              \
    int16_t temp[16];                                                   \
    /* quick HACK, XXX FIXME MUST be optimized */                       \
    for (i = 0; i < h; i++) {                                           \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);      \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);      \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);      \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);      \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);      \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);      \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);      \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);      \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);      \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);      \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);      \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);      \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);      \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
                   (src[11] + src[16]) *  3 - (src[10] + src[16]);      \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
                   (src[12] + src[16]) *  3 - (src[11] + src[15]);      \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
                   (src[13] + src[15]) *  3 - (src[12] + src[14]);      \
        __asm__ volatile (                                              \
            "movq      (%0), %%mm0      \n\t"                           \
            "movq     8(%0), %%mm1      \n\t"                           \
            "paddw       %2, %%mm0      \n\t"                           \
            "paddw       %2, %%mm1      \n\t"                           \
            "psraw       $5, %%mm0      \n\t"                           \
            "psraw       $5, %%mm1      \n\t"                           \
            "packuswb %%mm1, %%mm0      \n\t"                           \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                             \
            "movq    16(%0), %%mm0      \n\t"                           \
            "movq    24(%0), %%mm1      \n\t"                           \
            "paddw       %2, %%mm0      \n\t"                           \
            "paddw       %2, %%mm1      \n\t"                           \
            "psraw       $5, %%mm0      \n\t"                           \
            "psraw       $5, %%mm1      \n\t"                           \
            "packuswb %%mm1, %%mm0      \n\t"                           \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                            \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                        \
            : "memory"                                                  \
            );                                                          \
        dst += dstStride;                                               \
        src += srcStride;                                               \
    }                                                                   \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,        \
                                                   uint8_t *src,        \
                                                   int dstStride,       \
                                                   int srcStride,       \
                                                   int h)               \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7                  \n\t"                       \
        "1:                                 \n\t"                       \
        "movq        (%0), %%mm0            \n\t" /* ABCDEFGH */        \
        "movq       %%mm0, %%mm1            \n\t" /* ABCDEFGH */        \
        "movq       %%mm0, %%mm2            \n\t" /* ABCDEFGH */        \
        "punpcklbw  %%mm7, %%mm0            \n\t" /* 0A0B0C0D */        \
        "punpckhbw  %%mm7, %%mm1            \n\t" /* 0E0F0G0H */        \
        "pshufw     $0x90, %%mm0, %%mm5     \n\t" /* 0A0A0B0C */        \
        "pshufw     $0x41, %%mm0, %%mm6     \n\t" /* 0B0A0A0B */        \
        "movq       %%mm2, %%mm3            \n\t" /* ABCDEFGH */        \
        "movq       %%mm2, %%mm4            \n\t" /* ABCDEFGH */        \
        "psllq         $8, %%mm2            \n\t" /* 0ABCDEFG */        \
        "psllq        $16, %%mm3            \n\t" /* 00ABCDEF */        \
        "psllq        $24, %%mm4            \n\t" /* 000ABCDE */        \
        "punpckhbw  %%mm7, %%mm2            \n\t" /* 0D0E0F0G */        \
        "punpckhbw  %%mm7, %%mm3            \n\t" /* 0C0D0E0F */        \
        "punpckhbw  %%mm7, %%mm4            \n\t" /* 0B0C0D0E */        \
        "paddw      %%mm3, %%mm5            \n\t" /* b */               \
        "paddw      %%mm2, %%mm6            \n\t" /* c */               \
        "paddw      %%mm5, %%mm5            \n\t" /* 2b */              \
        "psubw      %%mm5, %%mm6            \n\t" /* c - 2b */          \
        "pshufw     $0x06, %%mm0, %%mm5     \n\t" /* 0C0B0A0A */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */         \
        "paddw      %%mm4, %%mm0            \n\t" /* a */               \
        "paddw      %%mm1, %%mm5            \n\t" /* d */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */             \
        "psubw      %%mm5, %%mm0            \n\t" /* 20a - d */         \
        "paddw         %5, %%mm6            \n\t"                       \
        "paddw      %%mm6, %%mm0            \n\t" /* 20a - 6b + 3c - d */ \
        "psraw         $5, %%mm0            \n\t"                       \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */   \
                                                                        \
        "movd       5(%0), %%mm5            \n\t" /* FGHI */            \
        "punpcklbw  %%mm7, %%mm5            \n\t" /* 0F0G0H0I */        \
        "pshufw     $0xF9, %%mm5, %%mm6     \n\t" /* 0G0H0I0I */        \
        "paddw      %%mm5, %%mm1            \n\t" /* a */               \
        "paddw      %%mm6, %%mm2            \n\t" /* b */               \
        "pshufw     $0xBE, %%mm5, %%mm6     \n\t" /* 0H0I0I0H */        \
        "pshufw     $0x6F, %%mm5, %%mm5     \n\t" /* 0I0I0H0G */        \
        "paddw      %%mm6, %%mm3            \n\t" /* c */               \
        "paddw      %%mm5, %%mm4            \n\t" /* d */               \
        "paddw      %%mm2, %%mm2            \n\t" /* 2b */              \
        "psubw      %%mm2, %%mm3            \n\t" /* c - 2b */          \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */             \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */         \
        "psubw      %%mm4, %%mm3            \n\t" /* -6b + 3c - d */    \
        "paddw         %5, %%mm1            \n\t"                       \
        "paddw      %%mm1, %%mm3            \n\t" /* 20a - 6b + 3c - d */ \
        "psraw         $5, %%mm3            \n\t"                       \
        "packuswb   %%mm3, %%mm0            \n\t"                       \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                \
        "add           %3, %0               \n\t"                       \
        "add           %4, %1               \n\t"                       \
        "decl          %2                   \n\t"                       \
        "jnz           1b                   \n\t"                       \
        : "+a"(src), "+c"(dst), "+d"(h)                                 \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),             \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)               \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,         \
                                                  uint8_t *src,         \
                                                  int dstStride,        \
                                                  int srcStride,        \
                                                  int h)                \
{                                                                       \
    int i;                                                              \
    int16_t temp[8];                                                    \
    /* quick HACK, XXX FIXME MUST be optimized */                       \
    for (i = 0; i < h; i++) {                                           \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +      \
                  (src[1] + src[3]) *  3 - (src[2] + src[4]);           \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +      \
                  (src[0] + src[4]) *  3 - (src[1] + src[5]);           \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +      \
                  (src[0] + src[5]) *  3 - (src[0] + src[6]);           \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +      \
                  (src[1] + src[6]) *  3 - (src[0] + src[7]);           \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +      \
                  (src[2] + src[7]) *  3 - (src[1] + src[8]);           \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +      \
                  (src[3] + src[8]) *  3 - (src[2] + src[8]);           \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +      \
                  (src[4] + src[8]) *  3 - (src[3] + src[7]);           \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +      \
                  (src[5] + src[7]) *  3 - (src[4] + src[6]);           \
        __asm__ volatile (                                              \
            "movq      (%0), %%mm0      \n\t"                           \
            "movq     8(%0), %%mm1      \n\t"                           \
            "paddw       %2, %%mm0      \n\t"                           \
            "paddw       %2, %%mm1      \n\t"                           \
            "psraw       $5, %%mm0      \n\t"                           \
            "psraw       $5, %%mm1      \n\t"                           \
            "packuswb %%mm1, %%mm0      \n\t"                           \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                             \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                        \
            : "memory"                                                  \
            );                                                          \
        dst += dstStride;                                               \
        src += srcStride;                                               \
    }                                                                   \
}                                                                       \
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
                                                     uint8_t *src,      \
                                                     int dstStride,     \
                                                     int srcStride)     \
{                                                                       \
    uint64_t temp[17 * 4];                                              \
    uint64_t *temp_ptr = temp;                                          \
    int count = 17;                                                     \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "movq      8(%0), %%mm2         \n\t"                           \
        "movq      8(%0), %%mm3         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 17 * 8(%1)    \n\t"                           \
        "movq      %%mm2, 2 * 17 * 8(%1) \n\t"                          \
        "movq      %%mm3, 3 * 17 * 8(%1) \n\t"                          \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 4;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq      8(%0), %%mm1         \n\t"                           \
        "movq     16(%0), %%mm2         \n\t"                           \
        "movq     24(%0), %%mm3         \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add        $136, %0            \n\t"                           \
        "add          %6, %1            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 14 * (x86_reg)dstStride)                              \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
                                                    uint8_t *src,       \
                                                    int dstStride,      \
                                                    int srcStride)      \
{                                                                       \
    uint64_t temp[9 * 2];                                               \
    uint64_t *temp_ptr = temp;                                          \
    int count = 9;                                                      \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 9*8(%1)       \n\t"                           \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 2;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq      8(%0), %%mm1         \n\t"                           \
        "movq     16(%0), %%mm2         \n\t"                           \
        "movq     24(%0), %%mm3         \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add          %4, %1            \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add         $72, %0            \n\t"                           \
        "add          %6, %1            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 6 * (x86_reg)dstStride)                               \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
                                            stride, 8);                 \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
                                             stride, stride, 16);       \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,                \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgusb        "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMXEXT_OP(a, b, temp, size)         \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,        PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,        PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP,  3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
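/* The ROUNDER argument selects the rounding behaviour of the psraw $5 step:
 * with ff_pw_16 the filters compute (x + 16) >> 5 (round to nearest), with
 * ff_pw_15 they compute (x + 15) >> 5, giving the "no_rnd" variants used by
 * the averaging halfpel/qpel combinations. */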
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)  \
{                                                                             \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                 \
}                                                                             \
                                                                              \
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)  \
{                                                                             \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,   \
                                               S1, S2);                       \
}                                                                             \
                                                                              \
#define QPEL_2TAP(OPNAME, SIZE, MMX)                                          \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                              \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                              \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                                 \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =      \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                  \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                             \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,        \
                                                         uint8_t *src,       \
                                                         int stride)         \
{                                                                             \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);      \
}                                                                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,        \
                                                         uint8_t *src,       \
                                                         int stride)         \
{                                                                             \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,                \
                                            stride, SIZE);                    \
}                                                                             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)
QPEL_2TAP(put_, 16, mmxext)
QPEL_2TAP(avg_, 16, mmxext)
QPEL_2TAP(put_,  8, mmxext)
QPEL_2TAP(avg_,  8, mmxext)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
1851 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
1852 x86_reg linesize,
1853 int block_w, int block_h,
1854 int src_x, int src_y,
1855 int w, int h,
1856 emu_edge_core_func *core_fn)
1857 {
1858 int start_y, start_x, end_y, end_x, src_y_add = 0;
1860 if (src_y >= h) {
1861 src_y_add = h - 1 - src_y;
1862 src_y = h - 1;
1863 } else if (src_y <= -block_h) {
1864 src_y_add = 1 - block_h - src_y;
1865 src_y = 1 - block_h;
1866 }
1867 if (src_x >= w) {
1868 src += w - 1 - src_x;
1869 src_x = w - 1;
1870 } else if (src_x <= -block_w) {
1871 src += 1 - block_w - src_x;
1872 src_x = 1 - block_w;
1873 }
1875 start_y = FFMAX(0, -src_y);
1876 start_x = FFMAX(0, -src_x);
1877 end_y = FFMIN(block_h, h - src_y);
1878 end_x = FFMIN(block_w, w - src_x);
1879 assert(start_x < end_x && block_w > 0);
1880 assert(start_y < end_y && block_h > 0);
1882 // fill in the to-be-copied part plus all above/below
1883 src += (src_y_add + start_y) * linesize + start_x;
1884 buf += start_x;
1885 core_fn(buf, src, linesize, start_y, end_y,
1886 block_h, start_x, end_x, block_w);
1887 }
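/* What the asm core produces, in plain C (illustrative sketch; it assumes
 * 'pic' points at the top-left sample of the w x h picture, whereas the
 * real function is handed a pointer already offset to the block):
 *
 * static void emulated_edge_mc_c(uint8_t *buf, const uint8_t *pic,
 *                                int linesize, int block_w, int block_h,
 *                                int src_x, int src_y, int w, int h)
 * {
 *     for (int y = 0; y < block_h; y++) {
 *         int sy = av_clip(src_y + y, 0, h - 1);
 *         for (int x = 0; x < block_w; x++) {
 *             int sx = av_clip(src_x + x, 0, w - 1);
 *             buf[y * linesize + x] = pic[sy * linesize + sx]; // edge replication
 *         }
 *     }
 * }
 */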
1890 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1891 x86_reg linesize,
1892 int block_w, int block_h,
1893 int src_x, int src_y, int w, int h)
1894 {
1895 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1896 w, h, &ff_emu_edge_core_mmx);
1897 }
1900 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1901 x86_reg linesize,
1902 int block_w, int block_h,
1903 int src_x, int src_y, int w, int h)
1904 {
1905 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1906 w, h, &ff_emu_edge_core_sse);
1907 }
1908 #endif /* HAVE_YASM */
1910 #if HAVE_INLINE_ASM
1912 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1913 int stride, int h, int ox, int oy,
1914 int dxx, int dxy, int dyx, int dyy,
1915 int shift, int r, int width, int height)
1916 {
1917 const int w = 8;
1918 const int ix = ox >> (16 + shift);
1919 const int iy = oy >> (16 + shift);
1920 const int oxs = ox >> 4;
1921 const int oys = oy >> 4;
1922 const int dxxs = dxx >> 4;
1923 const int dxys = dxy >> 4;
1924 const int dyxs = dyx >> 4;
1925 const int dyys = dyy >> 4;
1926 const uint16_t r4[4] = { r, r, r, r };
1927 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1928 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1929 const uint64_t shift2 = 2 * shift;
1930 int x, y;
1932 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1933 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1934 const int dxh = dxy * (h - 1);
1935 const int dyw = dyx * (w - 1);
1936 if ( // non-constant fullpel offset (3% of blocks)
1937 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1938 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1939 // uses more than 16 bits of subpel mv (only at huge resolution)
1940 || (dxx | dxy | dyx | dyy) & 15 ||
1941 (unsigned)ix >= width - w ||
1942 (unsigned)iy >= height - h) {
1943 // FIXME could still use mmx for some of the rows
1944 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1945 shift, r, width, height);
1946 return;
1947 }
1949 src += ix + iy * stride;
1951 __asm__ volatile (
1952 "movd %0, %%mm6 \n\t"
1953 "pxor %%mm7, %%mm7 \n\t"
1954 "punpcklwd %%mm6, %%mm6 \n\t"
1955 "punpcklwd %%mm6, %%mm6 \n\t"
1959 for (x = 0; x < w; x += 4) {
1960 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1961 oxs - dxys + dxxs * (x + 1),
1962 oxs - dxys + dxxs * (x + 2),
1963 oxs - dxys + dxxs * (x + 3) };
1964 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1965 oys - dyys + dyxs * (x + 1),
1966 oys - dyys + dyxs * (x + 2),
1967 oys - dyys + dyxs * (x + 3) };
1969 for (y = 0; y < h; y++) {
1970 __asm__ volatile (
1971 "movq %0, %%mm4 \n\t"
1972 "movq %1, %%mm5 \n\t"
1973 "paddw %2, %%mm4 \n\t"
1974 "paddw %3, %%mm5 \n\t"
1975 "movq %%mm4, %0 \n\t"
1976 "movq %%mm5, %1 \n\t"
1977 "psrlw $12, %%mm4 \n\t"
1978 "psrlw $12, %%mm5 \n\t"
1979 : "+m"(*dx4), "+m"(*dy4)
1980 : "m"(*dxy4), "m"(*dyy4)
1984 "movq %%mm6, %%mm2 \n\t"
1985 "movq %%mm6, %%mm1 \n\t"
1986 "psubw %%mm4, %%mm2 \n\t"
1987 "psubw %%mm5, %%mm1 \n\t"
1988 "movq %%mm2, %%mm0 \n\t"
1989 "movq %%mm4, %%mm3 \n\t"
1990 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1991 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1992 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1993 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1995 "movd %4, %%mm5 \n\t"
1996 "movd %3, %%mm4 \n\t"
1997 "punpcklbw %%mm7, %%mm5 \n\t"
1998 "punpcklbw %%mm7, %%mm4 \n\t"
1999 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
2000 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
2002 "movd %2, %%mm5 \n\t"
2003 "movd %1, %%mm4 \n\t"
2004 "punpcklbw %%mm7, %%mm5 \n\t"
2005 "punpcklbw %%mm7, %%mm4 \n\t"
2006 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
2007 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
2008 "paddw %5, %%mm1 \n\t"
2009 "paddw %%mm3, %%mm2 \n\t"
2010 "paddw %%mm1, %%mm0 \n\t"
2011 "paddw %%mm2, %%mm0 \n\t"
2013 "psrlw %6, %%mm0 \n\t"
2014 "packuswb %%mm0, %%mm0 \n\t"
2015 "movd %%mm0, %0 \n\t"
2017 : "=m"(dst[x + y * stride])
2018 : "m"(src[0]), "m"(src[1]),
2019 "m"(src[stride]), "m"(src[stride + 1]),
2020 "m"(*r4), "m"(shift2)
2024 src += 4 - h * stride;
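/* Per pixel, the loop above performs a bilinear blend with s = 1 << shift
 * and rounding constant r, then shifts by 2 * shift. A scalar sketch
 * (illustrative only; dx and dy are the per-pixel fractional offsets the
 * paddw/psrlw $12 sequence keeps in mm4/mm5):
 *
 * static inline int gmc_bilin(const uint8_t *p, int stride,
 *                             int dx, int dy, int shift, int r)
 * {
 *     const int s = 1 << shift;
 *     return (p[0]          * (s - dx) * (s - dy) +
 *             p[1]          *      dx  * (s - dy) +
 *             p[stride]     * (s - dx) *      dy  +
 *             p[stride + 1] *      dx  *      dy  + r) >> (2 * shift);
 * }
 */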
2028 #define PREFETCH(name, op) \
2029 static void name(void *mem, int stride, int h) \
2030 { \
2031 const uint8_t *p = mem; \
2032 do { \
2033 __asm__ volatile (#op" %0" :: "m"(*p)); \
2034 p += stride; \
2035 } while (--h); \
2036 }
2038 PREFETCH(prefetch_mmxext, prefetcht0)
2039 PREFETCH(prefetch_3dnow, prefetch)
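/* For reference, PREFETCH(prefetch_mmxext, prefetcht0) expands to a helper
 * that touches one cache line per row ahead of the actual loads:
 *
 * static void prefetch_mmxext(void *mem, int stride, int h)
 * {
 *     const uint8_t *p = mem;
 *     do {
 *         __asm__ volatile ("prefetcht0 %0" :: "m"(*p));
 *         p += stride;
 *     } while (--h);
 * }
 */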
2042 #endif /* HAVE_INLINE_ASM */
2044 #include "h264_qpel.c"
2046 void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
2047 int stride, int h, int x, int y);
2048 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
2049 int stride, int h, int x, int y);
2050 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
2051 int stride, int h, int x, int y);
2053 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
2054 int stride, int h, int x, int y);
2055 void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
2056 int stride, int h, int x, int y);
2057 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
2058 int stride, int h, int x, int y);
2060 void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
2061 int stride, int h, int x, int y);
2062 void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
2063 int stride, int h, int x, int y);
2065 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
2066 int stride, int h, int x, int y);
2067 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2068 int stride, int h, int x, int y);
2070 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
2071 int stride, int h, int x, int y);
2072 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2073 int stride, int h, int x, int y);
2075 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
2076 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
2077 (uint8_t *dst, uint8_t *src, \
2078 int stride, int h, int x, int y);
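/* E.g. CHROMA_MC(put, 2, 10, mmxext) below declares:
 *
 * void ff_put_h264_chroma_mc2_10_mmxext(uint8_t *dst, uint8_t *src,
 *                                       int stride, int h, int x, int y);
 */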
2080 CHROMA_MC(put, 2, 10, mmxext)
2081 CHROMA_MC(avg, 2, 10, mmxext)
2082 CHROMA_MC(put, 4, 10, mmxext)
2083 CHROMA_MC(avg, 4, 10, mmxext)
2084 CHROMA_MC(put, 8, 10, sse2)
2085 CHROMA_MC(avg, 8, 10, sse2)
2086 CHROMA_MC(put, 8, 10, avx)
2087 CHROMA_MC(avg, 8, 10, avx)
2092 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
2093 {
2094 put_pixels8_mmx(dst, src, stride, 8);
2095 }
2097 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
2098 {
2099 avg_pixels8_mmx(dst, src, stride, 8);
2100 }
2102 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
2103 {
2104 put_pixels16_mmx(dst, src, stride, 16);
2105 }
2107 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
2108 {
2109 avg_pixels16_mmx(dst, src, stride, 16);
2110 }
2113 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
2114 int stride, int rnd)
2115 {
2116 put_pixels8_mmx(dst, src, stride, 8);
2117 }
2119 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
2120 int stride, int rnd)
2121 {
2122 avg_pixels8_mmxext(dst, src, stride, 8);
2123 }
2125 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2126 {
2127 int i;
2128 __asm__ volatile ("pxor %%mm7, %%mm7":);
2129 for (i = 0; i < blocksize; i += 2) {
2130 __asm__ volatile (
2131 "movq %0, %%mm0 \n\t"
2132 "movq %1, %%mm1 \n\t"
2133 "movq %%mm0, %%mm2 \n\t"
2134 "movq %%mm1, %%mm3 \n\t"
2135 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2136 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2137 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2138 "pxor %%mm2, %%mm1 \n\t"
2139 "movq %%mm3, %%mm4 \n\t"
2140 "pand %%mm1, %%mm3 \n\t"
2141 "pandn %%mm1, %%mm4 \n\t"
2142 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2143 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2144 "movq %%mm3, %1 \n\t"
2145 "movq %%mm0, %0 \n\t"
2146 : "+m"(mag[i]), "+m"(ang[i])
2150 __asm__ volatile ("femms");
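/* A scalar equivalent of the sign-based coupling both SIMD loops implement
 * (illustrative sketch): the angle channel is rewritten from magnitude and
 * angle, with the branch structure the pand/pandn masks encode:
 *
 * static void vorbis_inverse_coupling_c(float *mag, float *ang, int n)
 * {
 *     for (int i = 0; i < n; i++) {
 *         float m = mag[i], a = ang[i];
 *         if (m > 0.0f) {
 *             if (a > 0.0f) { ang[i] = m - a; }
 *             else          { ang[i] = m; mag[i] = m + a; }
 *         } else {
 *             if (a > 0.0f) { ang[i] = m + a; }
 *             else          { ang[i] = m; mag[i] = m - a; }
 *         }
 *     }
 * }
 */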
2153 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2154 {
2155 int i;
2157 __asm__ volatile (
2158 "movaps %0, %%xmm5 \n\t"
2159 :: "m"(ff_pdw_80000000[0])
2161 for (i = 0; i < blocksize; i += 4) {
2163 "movaps %0, %%xmm0 \n\t"
2164 "movaps %1, %%xmm1 \n\t"
2165 "xorps %%xmm2, %%xmm2 \n\t"
2166 "xorps %%xmm3, %%xmm3 \n\t"
2167 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2168 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2169 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2170 "xorps %%xmm2, %%xmm1 \n\t"
2171 "movaps %%xmm3, %%xmm4 \n\t"
2172 "andps %%xmm1, %%xmm3 \n\t"
2173 "andnps %%xmm1, %%xmm4 \n\t"
2174 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2175 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2176 "movaps %%xmm3, %1 \n\t"
2177 "movaps %%xmm0, %0 \n\t"
2178 : "+m"(mag[i]), "+m"(ang[i])
2185 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
2186 const float *src1, const float *win,
2187 int len)
2188 {
2189 x86_reg i = -len * 4;
2190 x86_reg j = len * 4 - 8;
2191 __asm__ volatile (
2192 "1: \n"
2193 "pswapd (%5, %1), %%mm1 \n"
2194 "movq (%5, %0), %%mm0 \n"
2195 "pswapd (%4, %1), %%mm5 \n"
2196 "movq (%3, %0), %%mm4 \n"
2197 "movq %%mm0, %%mm2 \n"
2198 "movq %%mm1, %%mm3 \n"
2199 "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
2200 "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
2201 "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
2202 "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
2203 "pfadd %%mm3, %%mm2 \n"
2204 "pfsub %%mm0, %%mm1 \n"
2205 "pswapd %%mm2, %%mm2 \n"
2206 "movq %%mm1, (%2, %0) \n"
2207 "movq %%mm2, (%2, %1) \n"
2213 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2217 static void vector_fmul_window_sse(float *dst, const float *src0,
2218 const float *src1, const float *win, int len)
2219 {
2220 x86_reg i = -len * 4;
2221 x86_reg j = len * 4 - 16;
2222 __asm__ volatile (
2223 "1: \n"
2224 "movaps (%5, %1), %%xmm1 \n"
2225 "movaps (%5, %0), %%xmm0 \n"
2226 "movaps (%4, %1), %%xmm5 \n"
2227 "movaps (%3, %0), %%xmm4 \n"
2228 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2229 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2230 "movaps %%xmm0, %%xmm2 \n"
2231 "movaps %%xmm1, %%xmm3 \n"
2232 "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
2233 "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
2234 "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
2235 "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
2236 "addps %%xmm3, %%xmm2 \n"
2237 "subps %%xmm0, %%xmm1 \n"
2238 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2239 "movaps %%xmm1, (%2, %0) \n"
2240 "movaps %%xmm2, (%2, %1) \n"
2245 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2248 #endif /* HAVE_6REGS */
2250 static void vector_clipf_sse(float *dst, const float *src,
2251 float min, float max, int len)
2252 {
2253 x86_reg i = (len - 16) * 4;
2254 __asm__ volatile (
2255 "movss %3, %%xmm4 \n\t"
2256 "movss %4, %%xmm5 \n\t"
2257 "shufps $0, %%xmm4, %%xmm4 \n\t"
2258 "shufps $0, %%xmm5, %%xmm5 \n\t"
2260 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
2261 "movaps 16(%2, %0), %%xmm1 \n\t"
2262 "movaps 32(%2, %0), %%xmm2 \n\t"
2263 "movaps 48(%2, %0), %%xmm3 \n\t"
2264 "maxps %%xmm4, %%xmm0 \n\t"
2265 "maxps %%xmm4, %%xmm1 \n\t"
2266 "maxps %%xmm4, %%xmm2 \n\t"
2267 "maxps %%xmm4, %%xmm3 \n\t"
2268 "minps %%xmm5, %%xmm0 \n\t"
2269 "minps %%xmm5, %%xmm1 \n\t"
2270 "minps %%xmm5, %%xmm2 \n\t"
2271 "minps %%xmm5, %%xmm3 \n\t"
2272 "movaps %%xmm0, (%1, %0) \n\t"
2273 "movaps %%xmm1, 16(%1, %0) \n\t"
2274 "movaps %%xmm2, 32(%1, %0) \n\t"
2275 "movaps %%xmm3, 48(%1, %0) \n\t"
2279 : "r"(dst), "r"(src), "m"(min), "m"(max)
2284 #endif /* HAVE_INLINE_ASM */
2286 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
2287 int order);
2288 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2289 int order);
2290 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
2291 const int16_t *v3,
2292 int order, int mul);
2293 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2294 const int16_t *v3,
2295 int order, int mul);
2296 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2297 const int16_t *v3,
2298 int order, int mul);
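/* Scalar equivalents of the two primitives (illustrative; the asm versions
 * require 'order' to be a multiple of their vector width):
 *
 * static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
 *                                      int order)
 * {
 *     int32_t res = 0;
 *     while (order--)
 *         res += *v1++ * *v2++;
 *     return res;
 * }
 *
 * // the _and_madd variant additionally updates v1 in place
 * static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
 *                                               const int16_t *v3, int order,
 *                                               int mul)
 * {
 *     int32_t res = 0;
 *     for (int i = 0; i < order; i++) {
 *         res   += v1[i] * v2[i];
 *         v1[i] += mul * v3[i];
 *     }
 *     return res;
 * }
 */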
2300 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2301 const int16_t *window, unsigned int len);
2302 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2303 const int16_t *window, unsigned int len);
2304 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2305 const int16_t *window, unsigned int len);
2306 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2307 const int16_t *window, unsigned int len);
2308 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2309 const int16_t *window, unsigned int len);
2310 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2311 const int16_t *window, unsigned int len);
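/* All apply_window_int16 variants compute the same Q15 rounded multiply per
 * sample; plain-C sketch (illustrative; how the symmetric window is stored
 * and traversed in the real routines may differ):
 *
 * static void apply_window_int16_c(int16_t *output, const int16_t *input,
 *                                  const int16_t *window, unsigned int len)
 * {
 *     for (unsigned int i = 0; i < len; i++)
 *         output[i] = (input[i] * window[i] + (1 << 14)) >> 15;
 * }
 */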
2313 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2314 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2316 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
2317 const uint8_t *diff, int w,
2318 int *left, int *left_top);
2319 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2320 int w, int left);
2321 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2322 int w, int left);
2324 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2326 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2327 const float *src1, int len);
2328 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2329 const float *src1, int len);
2331 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2332 const float *src2, int len);
2333 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2334 const float *src2, int len);
2336 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2337 int32_t min, int32_t max, unsigned int len);
2338 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2339 int32_t min, int32_t max, unsigned int len);
2340 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2341 int32_t min, int32_t max, unsigned int len);
2342 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2343 int32_t min, int32_t max, unsigned int len);
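/* Scalar equivalent (illustrative):
 *
 * static void vector_clip_int32_c(int32_t *dst, const int32_t *src,
 *                                 int32_t min, int32_t max, unsigned int len)
 * {
 *     do {
 *         *dst++ = av_clip(*src++, min, max);
 *     } while (--len);
 * }
 */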
2345 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2346 const float *src1, int len);
2347 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2348 const float *src1, int len);
2350 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2351 do { \
2352 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2353 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2354 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2355 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2356 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2357 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2358 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2359 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2360 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2361 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2362 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2363 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2364 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2365 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2366 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2367 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2368 } while (0)
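/* The 16 slots are indexed by quarter-pel position: slot = x + 4 * y with
 * x, y in 0..3, so _mc21_ (x = 2, y = 1) lands in slot 6. A hypothetical
 * lookup helper (illustrative, not part of the API):
 *
 * static inline qpel_mc_func get_put_h264_qpel(DSPContext *c, int list,
 *                                              int x, int y)
 * {
 *     return c->put_h264_qpel_pixels_tab[list][x + y * 4];
 * }
 */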
2370 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2371 do { \
2372 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2373 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2374 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2375 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2376 } while (0)
2378 #define H264_QPEL_FUNCS(x, y, CPU) \
2379 do { \
2380 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2381 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2382 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2383 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2384 } while (0)
2386 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2387 do { \
2388 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2389 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2390 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2391 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2392 } while (0)
2394 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2395 {
2396 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2398 #if HAVE_INLINE_ASM
2399 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2400 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2401 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2403 if (!high_bit_depth) {
2404 c->clear_block = clear_block_mmx;
2405 c->clear_blocks = clear_blocks_mmx;
2406 c->draw_edges = draw_edges_mmx;
2408 SET_HPEL_FUNCS(put, 0, 16, mmx);
2409 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2410 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2411 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2412 SET_HPEL_FUNCS(put, 1, 8, mmx);
2413 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2414 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2415 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2416 }
2417 switch (avctx->idct_algo) {
2418 case FF_IDCT_AUTO:
2419 case FF_IDCT_SIMPLEMMX:
2420 c->idct_put = ff_simple_idct_put_mmx;
2421 c->idct_add = ff_simple_idct_add_mmx;
2422 c->idct = ff_simple_idct_mmx;
2423 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
2424 break;
2425 case FF_IDCT_XVIDMMX:
2426 c->idct_put = ff_idct_xvid_mmx_put;
2427 c->idct_add = ff_idct_xvid_mmx_add;
2428 c->idct = ff_idct_xvid_mmx;
2429 break;
2430 }
2433 c->gmc = gmc_mmx;
2435 c->add_bytes = add_bytes_mmx;
2437 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2438 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2439 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2440 }
2441 #endif /* HAVE_INLINE_ASM */
2443 #if HAVE_YASM
2445 if (!high_bit_depth)
2446 c->emulated_edge_mc = emulated_edge_mc_mmx;
2449 if (!high_bit_depth && CONFIG_H264CHROMA) {
2450 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
2451 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2454 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2455 #endif /* HAVE_YASM */
2456 }
2459 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
2460 int mm_flags)
2461 {
2462 const int bit_depth = avctx->bits_per_raw_sample;
2463 const int high_bit_depth = bit_depth > 8;
2465 #if HAVE_INLINE_ASM
2466 c->prefetch = prefetch_mmxext;
2468 if (!high_bit_depth) {
2469 c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
2470 c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
2472 c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
2473 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
2474 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
2476 c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
2477 c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
2479 c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
2480 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
2481 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
2482 }
2484 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2485 if (!high_bit_depth) {
2486 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
2487 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
2488 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
2489 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2491 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2492 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
2493 }
2494 }
2496 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2497 c->idct_put = ff_idct_xvid_mmxext_put;
2498 c->idct_add = ff_idct_xvid_mmxext_add;
2499 c->idct = ff_idct_xvid_mmxext;
2500 }
2502 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2503 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2504 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2505 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2506 }
2507 #endif /* HAVE_INLINE_ASM */
2509 if (CONFIG_H264QPEL) {
2510 #if HAVE_INLINE_ASM
2511 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
2512 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
2513 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
2514 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
2515 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
2516 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
2517 #endif /* HAVE_INLINE_ASM */
2519 if (!high_bit_depth) {
2520 #if HAVE_INLINE_ASM
2521 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2522 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
2523 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
2524 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2525 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
2526 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
2527 #endif /* HAVE_INLINE_ASM */
2528 } else if (bit_depth == 10) {
2530 #if HAVE_YASM
2531 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2532 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2533 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2534 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2536 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2537 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2538 #endif /* HAVE_YASM */
2539 }
2541 #if HAVE_INLINE_ASM
2542 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
2543 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
2544 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
2545 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
2546 #endif /* HAVE_INLINE_ASM */
2547 }
2549 #if HAVE_YASM
2550 if (!high_bit_depth && CONFIG_H264CHROMA) {
2551 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
2552 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
2553 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
2554 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
2555 }
2556 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2557 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2558 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2559 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2560 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2561 }
2563 /* slower than cmov version on AMD */
2564 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2565 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
2567 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
2568 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
2570 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2571 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2572 } else {
2573 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2574 }
2575 #endif /* HAVE_YASM */
2576 }
2578 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2579 int mm_flags)
2580 {
2581 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2583 #if HAVE_INLINE_ASM
2584 c->prefetch = prefetch_3dnow;
2586 if (!high_bit_depth) {
2587 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2588 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2590 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2591 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2592 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2594 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2595 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2597 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2598 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2599 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2601 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2602 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2603 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2604 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2605 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2607 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2608 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2609 }
2610 }
2612 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2613 avctx->codec_id == AV_CODEC_ID_THEORA)) {
2614 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2615 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2616 }
2618 if (CONFIG_H264QPEL) {
2619 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2620 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2621 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2622 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2623 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2624 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2626 if (!high_bit_depth) {
2627 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2628 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2629 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2630 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2631 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2632 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2633 }
2635 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2636 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2637 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2638 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2639 }
2641 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2642 #endif /* HAVE_INLINE_ASM */
2644 #if HAVE_YASM
2645 if (!high_bit_depth && CONFIG_H264CHROMA) {
2646 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
2647 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2648 }
2649 #endif /* HAVE_YASM */
2650 }
2652 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
2653 int mm_flags)
2654 {
2655 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
2656 c->vector_fmul_window = vector_fmul_window_3dnowext;
2657 #endif
2658 }
2660 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2661 {
2662 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2664 #if HAVE_INLINE_ASM
2665 if (!high_bit_depth) {
2666 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2667 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2668 c->clear_block = clear_block_sse;
2669 c->clear_blocks = clear_blocks_sse;
2670 }
2671 }
2673 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2675 #if HAVE_6REGS
2676 c->vector_fmul_window = vector_fmul_window_sse;
2677 #endif
2679 c->vector_clipf = vector_clipf_sse;
2680 #endif /* HAVE_INLINE_ASM */
2682 #if HAVE_YASM
2683 c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
2684 c->vector_fmul_add = ff_vector_fmul_add_sse;
2686 c->scalarproduct_float = ff_scalarproduct_float_sse;
2687 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2689 if (!high_bit_depth)
2690 c->emulated_edge_mc = emulated_edge_mc_sse;
2691 #endif /* HAVE_YASM */
2692 }
2694 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2695 int mm_flags)
2696 {
2697 const int bit_depth = avctx->bits_per_raw_sample;
2699 #if HAVE_INLINE_ASM
2700 const int high_bit_depth = bit_depth > 8;
2702 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2703 // these functions are slower than mmx on AMD, but faster on Intel
2704 if (!high_bit_depth) {
2705 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2706 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2707 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2708 if (CONFIG_H264QPEL)
2709 H264_QPEL_FUNCS(0, 0, sse2);
2710 }
2713 if (!high_bit_depth && CONFIG_H264QPEL) {
2714 H264_QPEL_FUNCS(0, 1, sse2);
2715 H264_QPEL_FUNCS(0, 2, sse2);
2716 H264_QPEL_FUNCS(0, 3, sse2);
2717 H264_QPEL_FUNCS(1, 1, sse2);
2718 H264_QPEL_FUNCS(1, 2, sse2);
2719 H264_QPEL_FUNCS(1, 3, sse2);
2720 H264_QPEL_FUNCS(2, 1, sse2);
2721 H264_QPEL_FUNCS(2, 2, sse2);
2722 H264_QPEL_FUNCS(2, 3, sse2);
2723 H264_QPEL_FUNCS(3, 1, sse2);
2724 H264_QPEL_FUNCS(3, 2, sse2);
2725 H264_QPEL_FUNCS(3, 3, sse2);
2726 }
2728 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2729 c->idct_put = ff_idct_xvid_sse2_put;
2730 c->idct_add = ff_idct_xvid_sse2_add;
2731 c->idct = ff_idct_xvid_sse2;
2732 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
2733 }
2734 #endif /* HAVE_INLINE_ASM */
2736 #if HAVE_YASM
2737 if (bit_depth == 10) {
2738 if (CONFIG_H264QPEL) {
2739 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2740 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2741 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2742 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2743 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2744 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2745 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2746 }
2747 if (CONFIG_H264CHROMA) {
2748 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2749 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2750 }
2751 }
2753 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2754 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2755 if (mm_flags & AV_CPU_FLAG_ATOM) {
2756 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2757 } else {
2758 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2759 }
2760 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2761 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2762 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2763 c->apply_window_int16 = ff_apply_window_int16_sse2;
2764 }
2765 c->bswap_buf = ff_bswap32_buf_sse2;
2766 #endif /* HAVE_YASM */
2767 }
2769 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2770 int mm_flags)
2771 {
2772 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2773 const int bit_depth = avctx->bits_per_raw_sample;
2775 #if HAVE_SSSE3_INLINE
2776 if (!high_bit_depth && CONFIG_H264QPEL) {
2777 H264_QPEL_FUNCS(1, 0, ssse3);
2778 H264_QPEL_FUNCS(1, 1, ssse3);
2779 H264_QPEL_FUNCS(1, 2, ssse3);
2780 H264_QPEL_FUNCS(1, 3, ssse3);
2781 H264_QPEL_FUNCS(2, 0, ssse3);
2782 H264_QPEL_FUNCS(2, 1, ssse3);
2783 H264_QPEL_FUNCS(2, 2, ssse3);
2784 H264_QPEL_FUNCS(2, 3, ssse3);
2785 H264_QPEL_FUNCS(3, 0, ssse3);
2786 H264_QPEL_FUNCS(3, 1, ssse3);
2787 H264_QPEL_FUNCS(3, 2, ssse3);
2788 H264_QPEL_FUNCS(3, 3, ssse3);
2789 }
2790 #endif /* HAVE_SSSE3_INLINE */
2792 #if HAVE_SSSE3_EXTERNAL
2793 if (bit_depth == 10 && CONFIG_H264QPEL) {
2794 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2795 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2796 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2797 }
2798 if (!high_bit_depth && CONFIG_H264CHROMA) {
2799 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
2800 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
2801 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2802 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2803 }
2804 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2805 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2806 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2808 if (mm_flags & AV_CPU_FLAG_ATOM)
2809 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2810 else
2811 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2812 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2813 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2814 c->bswap_buf = ff_bswap32_buf_ssse3;
2815 #endif /* HAVE_SSSE3_EXTERNAL */
2816 }
2818 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2819 int mm_flags)
2820 {
2821 #if HAVE_SSE4_EXTERNAL
2822 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
2823 #endif /* HAVE_SSE4_EXTERNAL */
2824 }
2826 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2827 {
2828 #if HAVE_AVX_EXTERNAL
2829 const int bit_depth = avctx->bits_per_raw_sample;
2831 if (bit_depth == 10) {
2832 // AVX implies !cache64.
2833 // TODO: Port cache(32|64) detection from x264.
2834 if (CONFIG_H264QPEL) {
2835 H264_QPEL_FUNCS_10(1, 0, sse2);
2836 H264_QPEL_FUNCS_10(2, 0, sse2);
2837 H264_QPEL_FUNCS_10(3, 0, sse2);
2838 }
2840 if (CONFIG_H264CHROMA) {
2841 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2842 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2843 }
2844 }
2845 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2846 c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
2847 c->vector_fmul_add = ff_vector_fmul_add_avx;
2848 #endif /* HAVE_AVX_EXTERNAL */
2849 }
2851 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2852 {
2853 int mm_flags = av_get_cpu_flags();
2855 #if HAVE_7REGS && HAVE_INLINE_ASM
2856 if (mm_flags & AV_CPU_FLAG_CMOV)
2857 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2858 #endif
2860 if (mm_flags & AV_CPU_FLAG_MMX)
2861 dsputil_init_mmx(c, avctx, mm_flags);
2863 if (mm_flags & AV_CPU_FLAG_MMXEXT)
2864 dsputil_init_mmxext(c, avctx, mm_flags);
2866 if (mm_flags & AV_CPU_FLAG_3DNOW)
2867 dsputil_init_3dnow(c, avctx, mm_flags);
2869 if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
2870 dsputil_init_3dnowext(c, avctx, mm_flags);
2872 if (mm_flags & AV_CPU_FLAG_SSE)
2873 dsputil_init_sse(c, avctx, mm_flags);
2875 if (mm_flags & AV_CPU_FLAG_SSE2)
2876 dsputil_init_sse2(c, avctx, mm_flags);
2878 if (mm_flags & AV_CPU_FLAG_SSSE3)
2879 dsputil_init_ssse3(c, avctx, mm_flags);
2881 if (mm_flags & AV_CPU_FLAG_SSE4)
2882 dsputil_init_sse4(c, avctx, mm_flags);
2884 if (mm_flags & AV_CPU_FLAG_AVX)
2885 dsputil_init_avx(c, avctx, mm_flags);
2887 if (CONFIG_ENCODERS)
2888 ff_dsputilenc_init_mmx(c, avctx);
2889 }
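/* Typical use (sketch): the generic ff_dsputil_init() fills DSPContext with
 * C implementations first and then calls into this file, so each init
 * function above only overrides the entries its CPU generation can
 * accelerate. Callers never pick a CPU path by hand:
 *
 * DSPContext dsp;
 * ff_dsputil_init(&dsp, avctx);                   // C plus best x86 overrides
 * dsp.put_pixels_tab[0][0](dst, src, stride, 16); // fastest 16x16 copy
 */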