/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
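
/* Naming convention for the constants above: ff_pw_* hold packed 16-bit
 * words, ff_pb_* packed bytes and ff_pd_* packed doubles, with the decimal
 * or hex suffix giving the value replicated into every lane. */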
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to synthesize these constants in
// registers than to load them through the GOT
// pcmpeqd -> -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq   "#rega", "#regr"            \n\t"           \
    "pand   "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq   "#rega", "#regr"            \n\t"           \
    "por    "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "psubb  "#regb", "#regr"            \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq     $1, "#regb"              \n\t"                   \
    "psrlq     $1, "#regd"              \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq     $1, "#regd"              \n\t"                   \
    "psrlq     $1, "#regb"              \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y)                x ## _no_rnd_ ## y ## _mmx
#define SET_RND                  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX rounding */

#define DEF(x, y)                x ## _ ## y ## _mmx
#define SET_RND                  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
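
/* Each block above configures one instantiation of
 * dsputil_mmx_rnd_template.c: DEF() controls the generated names (for
 * example put_no_rnd_pixels8_x2_mmx vs. put_pixels8_x2_mmx), SET_RND picks
 * the rounding constant, and PAVGB/PAVGBP select the averaging flavour;
 * the macros are #undef'd after each include so they can be redefined for
 * the next instantiation. */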
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx
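
/* Full-pel copies involve no interpolation and hence nothing to round, so
 * the no-rounding put functions can simply alias the normal ones, and the
 * MMX2/3DNow! put variants reuse the plain MMX implementation. */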
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq        %3, %%mm0          \n\t"
        "movq       8%3, %%mm1          \n\t"
        "movq      16%3, %%mm2          \n\t"
        "movq      24%3, %%mm3          \n\t"
        "movq      32%3, %%mm4          \n\t"
        "movq      40%3, %%mm5          \n\t"
        "movq      48%3, %%mm6          \n\t"
        "movq      56%3, %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "m"(*p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, so pass the block pointer through
    // an "r" constraint instead of "m" here.
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off)         \
    "movq          "#off"(%2), %%mm1    \n\t"           \
    "movq     16 + "#off"(%2), %%mm2    \n\t"           \
    "movq     32 + "#off"(%2), %%mm3    \n\t"           \
    "movq     48 + "#off"(%2), %%mm4    \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"           \
    "paddb              %%mm0, %%mm1    \n\t"           \
    "paddb              %%mm0, %%mm2    \n\t"           \
    "paddb              %%mm0, %%mm3    \n\t"           \
    "paddb              %%mm0, %%mm4    \n\t"           \
    "movq               %%mm1, (%0)     \n\t"           \
    "movq               %%mm2, (%0, %3) \n\t"           \
    "movq               %%mm3, (%0, %3, 2) \n\t"        \
    "movq               %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa      %%xmm0, (%2)        \n\t"
        "movdqa      %%xmm1, (%2, %3)    \n\t"
        "movdqa      %%xmm2, (%2, %3, 2) \n\t"
        "movdqa      %%xmm3, (%2, %4)    \n\t"
        "subl            $4, %0          \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz             1b              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2       ), %%xmm0      \n\t"
        "pavgb  (%2, %3   ), %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4   ), %%xmm3      \n\t"
        "movdqa      %%xmm0, (%2)        \n\t"
        "movdqa      %%xmm1, (%2, %3)    \n\t"
        "movdqa      %%xmm2, (%2, %3, 2) \n\t"
        "movdqa      %%xmm3, (%2, %4)    \n\t"
        "subl            $4, %0          \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz             1b              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
        );
}
static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        /* median(t, l, t + l - tl) selected branchlessly with cmov */
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        : "memory");
    *left     = l;
    *left_top = tl;
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER

            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp,     src,              8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6

            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width, height
 * this MMX version can only handle w == 8 || w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                                 \n\t"
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jb                1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                    \
                   in0, in1, in2, in7, out, OP)                         \
    "paddw               "#m4", "#m3"   \n\t" /* x1 */                  \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                  \
    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                \
    "movq               "#in7", "#m3"   \n\t" /* d */                   \
    "movq               "#in0", %%mm5   \n\t" /* D */                   \
    "paddw               "#m3", %%mm5   \n\t" /* x4 */                  \
    "psubw              %%mm5, %%mm4    \n\t" /* 20x1 - x4 */           \
    "movq               "#in1", %%mm5   \n\t" /* C */                   \
    "movq               "#in2", %%mm6   \n\t" /* B */                   \
    "paddw               "#m6", %%mm5   \n\t" /* x3 */                  \
    "paddw               "#m5", %%mm6   \n\t" /* x2 */                  \
    "paddw              %%mm6, %%mm6    \n\t" /* 2x2 */                 \
    "psubw              %%mm6, %%mm5    \n\t" /* -2x2 + x3 */           \
    "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */          \
    "paddw              "#rnd", %%mm4   \n\t" /* 20x1 - x4 + rnd */     \
    "paddw              %%mm4, %%mm5    \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                 $5, %%mm5    \n\t"                           \
    "packuswb           %%mm5, %%mm5    \n\t"                           \
    OP(%%mm5, out, %%mm7, d)
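
/* QPEL_V_LOW evaluates four pixels of the MPEG-4 quarter-pel FIR column:
 *     out = clip_uint8((20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5)
 * where x1..x4 are sums of symmetric tap pairs and rnd is 16 for the
 * rounding variant or 15 for the no-rounding one (see ROUNDER below). */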
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)              \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,         \
                                                  uint8_t *src,         \
                                                  int dstStride,        \
                                                  int srcStride,        \
                                                  int h)                \
{                                                                       \
    uint64_t temp;                                                      \
                                                                        \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7             \n\t"                       \
        "1:                                 \n\t"                       \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */        \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */        \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */        \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */        \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */        \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */        \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */        \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */        \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */        \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */        \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */        \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */        \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */        \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */        \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */        \
        "paddw     %%mm3, %%mm5             \n\t" /* b */               \
        "paddw     %%mm2, %%mm6             \n\t" /* c */               \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */              \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */          \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */         \
        "paddw     %%mm4, %%mm0             \n\t" /* a */               \
        "paddw     %%mm1, %%mm5             \n\t" /* d */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */             \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */         \
        "paddw        %6, %%mm6             \n\t"                       \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                       \
        "movq      %%mm0, %5                \n\t"                       \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */   \
                                                                        \
        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */        \
        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */        \
        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */        \
        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */        \
        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */        \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */        \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */        \
        "paddw     %%mm0, %%mm2             \n\t" /* b */               \
        "paddw     %%mm5, %%mm3             \n\t" /* c */               \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */              \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */          \
        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */        \
        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */        \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */        \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */         \
        "paddw     %%mm2, %%mm1             \n\t" /* a */               \
        "paddw     %%mm6, %%mm4             \n\t" /* d */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */             \
        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */    \
        "paddw        %6, %%mm1             \n\t"                       \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */ \
        "psraw        $5, %%mm3             \n\t"                       \
        "movq         %5, %%mm1             \n\t"                       \
        "packuswb  %%mm3, %%mm1             \n\t"                       \
        OP_MMX2(%%mm1, (%1), %%mm4, q)                                  \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */   \
                                                                        \
        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */        \
        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */        \
        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */        \
        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */        \
        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */        \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */        \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */        \
        "paddw     %%mm1, %%mm5             \n\t" /* b */               \
        "paddw     %%mm4, %%mm0             \n\t" /* c */               \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */              \
        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */          \
        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */        \
        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */         \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */        \
        "paddw     %%mm3, %%mm2             \n\t" /* d */               \
        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */    \
        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */        \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */        \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */        \
        "paddw     %%mm2, %%mm6             \n\t" /* a */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */             \
        "paddw        %6, %%mm0             \n\t"                       \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                       \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                       \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                            \
                                                                        \
        "paddw     %%mm5, %%mm3             \n\t" /* a */               \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0O0P0Q0Q */        \
        "paddw     %%mm4, %%mm6             \n\t" /* b */               \
        "pshufw $0xBE, %%mm5, %%mm4         \n\t" /* 0P0Q0Q0P */        \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0Q0Q0P0O */        \
        "paddw     %%mm1, %%mm4             \n\t" /* c */               \
        "paddw     %%mm2, %%mm5             \n\t" /* d */               \
        "paddw     %%mm6, %%mm6             \n\t" /* 2b */              \
        "psubw     %%mm6, %%mm4             \n\t" /* c - 2b */          \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */             \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */         \
        "psubw     %%mm5, %%mm3             \n\t" /* -6b + 3c - d */    \
        "paddw        %6, %%mm4             \n\t"                       \
        "paddw     %%mm3, %%mm4             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm4             \n\t"                       \
        "packuswb  %%mm4, %%mm0             \n\t"                       \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                 \
        "add          %3, %0                \n\t"                       \
        "add          %4, %1                \n\t"                       \
        "decl         %2                    \n\t"                       \
        "jnz          1b                    \n\t"                       \
        : "+a"(src), "+c"(dst), "+D"(h)                                 \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),             \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)    \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,        \
                                                   uint8_t *src,        \
                                                   int dstStride,       \
                                                   int srcStride,       \
                                                   int h)               \
{                                                                       \
    int i;                                                              \
    int16_t temp[16];                                                   \
    /* quick HACK, XXX FIXME MUST be optimized */                       \
    for (i = 0; i < h; i++) {                                           \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);      \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);      \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);      \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);      \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);      \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);      \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);      \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);      \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);      \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);      \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);      \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);      \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);      \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
                   (src[11] + src[16]) *  3 - (src[10] + src[16]);      \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
                   (src[12] + src[16]) *  3 - (src[11] + src[15]);      \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
                   (src[13] + src[15]) *  3 - (src[12] + src[14]);      \
        __asm__ volatile (                                              \
            "movq      (%0), %%mm0          \n\t"                       \
            "movq     8(%0), %%mm1          \n\t"                       \
            "paddw       %2, %%mm0          \n\t"                       \
            "paddw       %2, %%mm1          \n\t"                       \
            "psraw       $5, %%mm0          \n\t"                       \
            "psraw       $5, %%mm1          \n\t"                       \
            "packuswb %%mm1, %%mm0          \n\t"                       \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                             \
            "movq    16(%0), %%mm0          \n\t"                       \
            "movq    24(%0), %%mm1          \n\t"                       \
            "paddw       %2, %%mm0          \n\t"                       \
            "paddw       %2, %%mm1          \n\t"                       \
            "psraw       $5, %%mm0          \n\t"                       \
            "psraw       $5, %%mm1          \n\t"                       \
            "packuswb %%mm1, %%mm0          \n\t"                       \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                            \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                        \
            : "memory"                                                  \
            );                                                          \
        dst += dstStride;                                               \
        src += srcStride;                                               \
    }                                                                   \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,          \
                                                 uint8_t *src,          \
                                                 int dstStride,         \
                                                 int srcStride,         \
                                                 int h)                 \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7             \n\t"                       \
        "1:                                 \n\t"                       \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */        \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */        \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */        \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */        \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */        \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */        \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */        \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */        \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */        \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */        \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */        \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */        \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */        \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */        \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */        \
        "paddw     %%mm3, %%mm5             \n\t" /* b */               \
        "paddw     %%mm2, %%mm6             \n\t" /* c */               \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */              \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */          \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */        \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */         \
        "paddw     %%mm4, %%mm0             \n\t" /* a */               \
        "paddw     %%mm1, %%mm5             \n\t" /* d */               \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */             \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */         \
        "paddw        %5, %%mm6             \n\t"                       \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                       \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */   \
                                                                        \
        "movd      5(%0), %%mm5             \n\t" /* FGHI */            \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */        \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0G0H0I0I */        \
        "paddw     %%mm5, %%mm1             \n\t" /* a */               \
        "paddw     %%mm6, %%mm2             \n\t" /* b */               \
        "pshufw $0xBE, %%mm5, %%mm6         \n\t" /* 0H0I0I0H */        \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0I0I0H0G */        \
        "paddw     %%mm6, %%mm3             \n\t" /* c */               \
        "paddw     %%mm5, %%mm4             \n\t" /* d */               \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */              \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */          \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */             \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */         \
        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */    \
        "paddw        %5, %%mm1             \n\t"                       \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm3             \n\t"                       \
        "packuswb  %%mm3, %%mm0             \n\t"                       \
        OP_MMX2(%%mm0, (%1), %%mm4, q)                                  \
        "add          %3, %0                \n\t"                       \
        "add          %4, %1                \n\t"                       \
        "decl         %2                    \n\t"                       \
        "jnz          1b                    \n\t"                       \
        : "+a"(src), "+c"(dst), "+d"(h)                                 \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),             \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)               \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,         \
                                                  uint8_t *src,         \
                                                  int dstStride,        \
                                                  int srcStride,        \
                                                  int h)                \
{                                                                       \
    int i;                                                              \
    int16_t temp[8];                                                    \
    /* quick HACK, XXX FIXME MUST be optimized */                       \
    for (i = 0; i < h; i++) {                                           \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +      \
                  (src[1] + src[3]) *  3 - (src[2] + src[4]);           \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +      \
                  (src[0] + src[4]) *  3 - (src[1] + src[5]);           \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +      \
                  (src[0] + src[5]) *  3 - (src[0] + src[6]);           \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +      \
                  (src[1] + src[6]) *  3 - (src[0] + src[7]);           \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +      \
                  (src[2] + src[7]) *  3 - (src[1] + src[8]);           \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +      \
                  (src[3] + src[8]) *  3 - (src[2] + src[8]);           \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +      \
                  (src[4] + src[8]) *  3 - (src[3] + src[7]);           \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +      \
                  (src[5] + src[7]) *  3 - (src[4] + src[6]);           \
        __asm__ volatile (                                              \
            "movq      (%0), %%mm0          \n\t"                       \
            "movq     8(%0), %%mm1          \n\t"                       \
            "paddw       %2, %%mm0          \n\t"                       \
            "paddw       %2, %%mm1          \n\t"                       \
            "psraw       $5, %%mm0          \n\t"                       \
            "psraw       $5, %%mm1          \n\t"                       \
            "packuswb %%mm1, %%mm0          \n\t"                       \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                             \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                        \
            : "memory"                                                  \
            );                                                          \
        dst += dstStride;                                               \
        src += srcStride;                                               \
    }                                                                   \
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
                                                     uint8_t *src,      \
                                                     int dstStride,     \
                                                     int srcStride)     \
{                                                                       \
    uint64_t temp[17 * 4];                                              \
    uint64_t *temp_ptr = temp;                                          \
    int count = 17;                                                     \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "movq      8(%0), %%mm2         \n\t"                           \
        "movq      8(%0), %%mm3         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 17 * 8(%1)    \n\t"                           \
        "movq      %%mm2, 2 * 17 * 8(%1) \n\t"                          \
        "movq      %%mm3, 3 * 17 * 8(%1) \n\t"                          \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 4;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq   8(%0), %%mm1            \n\t"                           \
        "movq  16(%0), %%mm2            \n\t"                           \
        "movq  24(%0), %%mm3            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1), OP)     \
                                                                        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1), OP)     \
                                                                        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
                                                                        \
        "add     $136, %0               \n\t"                           \
        "add       %6, %1               \n\t"                           \
        "decl      %2                   \n\t"                           \
        "jnz       1b                   \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 14 * (x86_reg)dstStride)                              \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
                                                    uint8_t *src,       \
                                                    int dstStride,      \
                                                    int srcStride)      \
{                                                                       \
    uint64_t temp[9 * 2];                                               \
    uint64_t *temp_ptr = temp;                                          \
    int count = 9;                                                      \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 9*8(%1)       \n\t"                           \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 2;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq   8(%0), %%mm1            \n\t"                           \
        "movq  16(%0), %%mm2            \n\t"                           \
        "movq  24(%0), %%mm3            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
                                                                        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
                                                                        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add       %4, %1               \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
                                                                        \
        "add      $72, %0               \n\t"                           \
        "add       %6, %1               \n\t"                           \
        "decl      %2                   \n\t"                           \
        "jnz       1b                   \n\t"                           \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 6 * (x86_reg)dstStride)                               \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
}                                                                       \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
                                            stride, 8);                 \
}                                                                       \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
                                 stride, 8);                            \
}                                                                       \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
}                                                                       \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
                                 stride, 8);                            \
}                                                                       \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
}                                                                       \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
                                             stride, stride, 16);       \
}                                                                       \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
                                  stride, stride, 16);                  \
}                                                                       \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
}                                                                       \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,              \
                                  stride, stride, 16);                  \
}                                                                       \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgusb        "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMX2_OP(a, b, temp, size)           \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,       PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP,  AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,       PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
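
/* ROUNDER selects between proper rounding, (x + 16) >> 5 via ff_pw_16, and
 * the bias-reduced variant (x + 15) >> 5 via ff_pw_15 that the no_rnd
 * motion-compensation functions require. */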
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                              \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                              \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
                                               S1, S2);                        \
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                        \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                            \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                            \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                               \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =    \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =    \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                           \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =    \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                           \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,      \
                                                         uint8_t *src,      \
                                                         int stride)        \
{                                                                           \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);    \
}                                                                           \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,      \
                                                         uint8_t *src,      \
                                                         int stride)        \
{                                                                           \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,              \
                                            stride, SIZE);                  \
}                                                                           \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)                \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                \

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
#if HAVE_YASM

typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;

static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              x86_reg linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
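
/* The clamping above exploits the fact that once the requested block lies
 * entirely outside the frame, moving it further away cannot change the
 * replicated edge pixels; src_x/src_y are therefore folded back into
 * [1 - block_w, w - 1] x [1 - block_h, h - 1] before the directly copyable
 * region is computed. */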
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             x86_reg linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             x86_reg linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
1867 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1868 int stride, int h, int ox, int oy,
1869 int dxx, int dxy, int dyx, int dyy,
1870 int shift, int r, int width, int height,
1871 emulated_edge_mc_func *emu_edge_fn)
1874 const int ix = ox >> (16 + shift);
1875 const int iy = oy >> (16 + shift);
1876 const int oxs = ox >> 4;
1877 const int oys = oy >> 4;
1878 const int dxxs = dxx >> 4;
1879 const int dxys = dxy >> 4;
1880 const int dyxs = dyx >> 4;
1881 const int dyys = dyy >> 4;
1882 const uint16_t r4[4] = { r, r, r, r };
1883 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1884 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1885 const uint64_t shift2 = 2 * shift;
1886     uint8_t edge_buf[(h + 1) * stride];
1887     int x, y;
1889 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1890 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1891 const int dxh = dxy * (h - 1);
1892 const int dyw = dyx * (w - 1);
1893 if ( // non-constant fullpel offset (3% of blocks)
1894 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1895 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1896 // uses more than 16 bits of subpel mv (only at huge resolution)
1897 || (dxx | dxy | dyx | dyy) & 15) {
1898         // FIXME could still use mmx for some of the rows
1899         ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1900                  shift, r, width, height);
1901         return;
1902     }
1904 src += ix + iy * stride;
1905 if ((unsigned)ix >= width - w ||
1906 (unsigned)iy >= height - h) {
1907         emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1908         src = edge_buf;
1909     }
1911     __asm__ volatile (
1912 "movd %0, %%mm6 \n\t"
1913 "pxor %%mm7, %%mm7 \n\t"
1914 "punpcklwd %%mm6, %%mm6 \n\t"
1915 "punpcklwd %%mm6, %%mm6 \n\t"
1919 for (x = 0; x < w; x += 4) {
1920 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1921 oxs - dxys + dxxs * (x + 1),
1922 oxs - dxys + dxxs * (x + 2),
1923 oxs - dxys + dxxs * (x + 3) };
1924 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1925 oys - dyys + dyxs * (x + 1),
1926 oys - dyys + dyxs * (x + 2),
1927 oys - dyys + dyxs * (x + 3) };
1929         for (y = 0; y < h; y++) {
1930             __asm__ volatile (
1931 "movq %0, %%mm4 \n\t"
1932 "movq %1, %%mm5 \n\t"
1933 "paddw %2, %%mm4 \n\t"
1934 "paddw %3, %%mm5 \n\t"
1935 "movq %%mm4, %0 \n\t"
1936 "movq %%mm5, %1 \n\t"
1937 "psrlw $12, %%mm4 \n\t"
1938 "psrlw $12, %%mm5 \n\t"
1939 : "+m"(*dx4), "+m"(*dy4)
1940 : "m"(*dxy4), "m"(*dyy4)
1944 "movq %%mm6, %%mm2 \n\t"
1945 "movq %%mm6, %%mm1 \n\t"
1946 "psubw %%mm4, %%mm2 \n\t"
1947 "psubw %%mm5, %%mm1 \n\t"
1948 "movq %%mm2, %%mm0 \n\t"
1949 "movq %%mm4, %%mm3 \n\t"
1950 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1951 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1952 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1953 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1955 "movd %4, %%mm5 \n\t"
1956 "movd %3, %%mm4 \n\t"
1957 "punpcklbw %%mm7, %%mm5 \n\t"
1958 "punpcklbw %%mm7, %%mm4 \n\t"
1959 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1960 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1962 "movd %2, %%mm5 \n\t"
1963 "movd %1, %%mm4 \n\t"
1964 "punpcklbw %%mm7, %%mm5 \n\t"
1965 "punpcklbw %%mm7, %%mm4 \n\t"
1966 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1967 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1968 "paddw %5, %%mm1 \n\t"
1969 "paddw %%mm3, %%mm2 \n\t"
1970 "paddw %%mm1, %%mm0 \n\t"
1971 "paddw %%mm2, %%mm0 \n\t"
1973 "psrlw %6, %%mm0 \n\t"
1974 "packuswb %%mm0, %%mm0 \n\t"
1975 "movd %%mm0, %0 \n\t"
1977 : "=m"(dst[x + y * stride])
1978 : "m"(src[0]), "m"(src[1]),
1979 "m"(src[stride]), "m"(src[stride + 1]),
1980 "m"(*r4), "m"(shift2)
1984         src += 4 - h * stride;
1985     }
1986 }
1988 #if HAVE_YASM
1989 #if ARCH_X86_32
1990 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1991 int stride, int h, int ox, int oy,
1992 int dxx, int dxy, int dyx, int dyy,
1993                     int shift, int r, int width, int height)
1994 {
1995 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1996 width, height, &emulated_edge_mc_mmx);
1999 static void gmc_sse(uint8_t *dst, uint8_t *src,
2000 int stride, int h, int ox, int oy,
2001 int dxx, int dxy, int dyx, int dyy,
2002                     int shift, int r, int width, int height)
2003 {
2004 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2005 width, height, &emulated_edge_mc_sse);
2008 static void gmc_mmx(uint8_t *dst, uint8_t *src,
2009 int stride, int h, int ox, int oy,
2010 int dxx, int dxy, int dyx, int dyy,
2011                     int shift, int r, int width, int height)
2012 {
2013 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2014 width, height, &ff_emulated_edge_mc_8);
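/* Scalar model of the bilinear kernel the inline asm in gmc() evaluates
 * per output pixel (a sketch with a hypothetical name; ff_gmc_c() in
 * dsputil.c is the canonical C fallback). s is the subpel scale
 * (1 << shift), dx/dy the subpel fractions, r the rounding constant. */
static inline uint8_t gmc_bilinear_sketch(const uint8_t *src, int stride,
                                          int dx, int dy, int s, int r,
                                          int shift2)
{
    unsigned v = (s - dx) * (s - dy) * src[0] +
                 dx       * (s - dy) * src[1] +
                 (s - dx) * dy       * src[stride] +
                 dx       * dy       * src[stride + 1] + r;
    v >>= shift2;
    return v > 255 ? 255 : v; /* same saturation packuswb performs */
}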
2018 #define PREFETCH(name, op) \
2019 static void name(void *mem, int stride, int h) \
2020 { \
2021     const uint8_t *p = mem; \
2022     do { \
2023         __asm__ volatile (#op" %0" :: "m"(*p)); \
2024         p += stride; \
2025     } while (--h); \
2026 }
2028 PREFETCH(prefetch_mmx2, prefetcht0)
2029 PREFETCH(prefetch_3dnow, prefetch)
2030 #undef PREFETCH
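/* For reference, PREFETCH(prefetch_mmx2, prefetcht0) expands to a walker
 * that touches one cache line per row:
 *
 *     static void prefetch_mmx2(void *mem, int stride, int h)
 *     {
 *         const uint8_t *p = mem;
 *         do {
 *             __asm__ volatile ("prefetcht0 %0" :: "m"(*p));
 *             p += stride;
 *         } while (--h);
 *     }
 */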
2032 #include "h264_qpel_mmx.c"
2034 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
2035 int stride, int h, int x, int y);
2036 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
2037 int stride, int h, int x, int y);
2038 void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
2039 int stride, int h, int x, int y);
2041 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
2042 int stride, int h, int x, int y);
2043 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
2044 int stride, int h, int x, int y);
2045 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
2046 int stride, int h, int x, int y);
2048 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
2049 int stride, int h, int x, int y);
2050 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
2051 int stride, int h, int x, int y);
2053 void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2054 int stride, int h, int x, int y);
2055 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2056 int stride, int h, int x, int y);
2058 void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2059 int stride, int h, int x, int y);
2060 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2061 int stride, int h, int x, int y);
2063 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
2064 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
2065 (uint8_t *dst, uint8_t *src, \
2066 int stride, int h, int x, int y);
2068 CHROMA_MC(put, 2, 10, mmxext)
2069 CHROMA_MC(avg, 2, 10, mmxext)
2070 CHROMA_MC(put, 4, 10, mmxext)
2071 CHROMA_MC(avg, 4, 10, mmxext)
2072 CHROMA_MC(put, 8, 10, sse2)
2073 CHROMA_MC(avg, 8, 10, sse2)
2074 CHROMA_MC(put, 8, 10, avx)
2075 CHROMA_MC(avg, 8, 10, avx)
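/* Scalar model (hypothetical name) of the H.264 chroma MC that the
 * prototypes above implement: bilinear filtering with eighth-pel weights
 * x, y in 0..7 and a +32 rounding bias before the >> 6. */
static void put_h264_chroma_mc_sketch(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y, int n)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < n; i++)
            dst[i] = (A * src[i]          + B * src[i + 1] +
                      C * src[i + stride] + D * src[i + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}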
2077 /* CAVS-specific */
2078 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2079 {
2080     put_pixels8_mmx(dst, src, stride, 8);
2081 }
2083 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2084 {
2085     avg_pixels8_mmx(dst, src, stride, 8);
2086 }
2088 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2089 {
2090     put_pixels16_mmx(dst, src, stride, 16);
2091 }
2093 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2094 {
2095     avg_pixels16_mmx(dst, src, stride, 16);
2096 }
2098 /* VC-1-specific */
2099 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
2100                                int stride, int rnd)
2101 {
2102     put_pixels8_mmx(dst, src, stride, 8);
2103 }
2105 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
2106                                int stride, int rnd)
2107 {
2108     avg_pixels8_mmx2(dst, src, stride, 8);
2109 }
2111 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
2112  * converted */
2113 #if CONFIG_GPL
2114 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
2115                                     DCTELEM *block)
2116 {
2117     ff_mmx_idct(block);
2118     ff_put_pixels_clamped_mmx(block, dest, line_size);
2119 }
2121 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
2122                                     DCTELEM *block)
2123 {
2124     ff_mmx_idct(block);
2125     ff_add_pixels_clamped_mmx(block, dest, line_size);
2126 }
2128 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
2129                                      DCTELEM *block)
2130 {
2131     ff_mmxext_idct(block);
2132     ff_put_pixels_clamped_mmx(block, dest, line_size);
2133 }
2135 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
2136                                      DCTELEM *block)
2137 {
2138     ff_mmxext_idct(block);
2139     ff_add_pixels_clamped_mmx(block, dest, line_size);
2140 }
2141 #endif
2143 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2144 {
2145     ff_idct_xvid_mmx(block);
2146     ff_put_pixels_clamped_mmx(block, dest, line_size);
2147 }
2149 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2150 {
2151     ff_idct_xvid_mmx(block);
2152     ff_add_pixels_clamped_mmx(block, dest, line_size);
2153 }
2155 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2156 {
2157     ff_idct_xvid_mmx2(block);
2158     ff_put_pixels_clamped_mmx(block, dest, line_size);
2159 }
2161 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2162 {
2163     ff_idct_xvid_mmx2(block);
2164     ff_add_pixels_clamped_mmx(block, dest, line_size);
2165 }
2167 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2168 {
2169     int i;
2170     __asm__ volatile ("pxor %%mm7, %%mm7":);
2171     for (i = 0; i < blocksize; i += 2) {
2172         __asm__ volatile (
2173 "movq %0, %%mm0 \n\t"
2174 "movq %1, %%mm1 \n\t"
2175 "movq %%mm0, %%mm2 \n\t"
2176 "movq %%mm1, %%mm3 \n\t"
2177 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2178 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2179 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2180 "pxor %%mm2, %%mm1 \n\t"
2181 "movq %%mm3, %%mm4 \n\t"
2182 "pand %%mm1, %%mm3 \n\t"
2183 "pandn %%mm1, %%mm4 \n\t"
2184 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2185 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2186 "movq %%mm3, %1 \n\t"
2187 "movq %%mm0, %0 \n\t"
2188 : "+m"(mag[i]), "+m"(ang[i])
2192 __asm__ volatile ("femms");
2195 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2196 {
2197     int i;
2199     __asm__ volatile (
2200 "movaps %0, %%xmm5 \n\t"
2201 :: "m"(ff_pdw_80000000[0])
2203 for (i = 0; i < blocksize; i += 4) {
2205 "movaps %0, %%xmm0 \n\t"
2206 "movaps %1, %%xmm1 \n\t"
2207 "xorps %%xmm2, %%xmm2 \n\t"
2208 "xorps %%xmm3, %%xmm3 \n\t"
2209 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2210 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2211 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2212 "xorps %%xmm2, %%xmm1 \n\t"
2213 "movaps %%xmm3, %%xmm4 \n\t"
2214 "andps %%xmm1, %%xmm3 \n\t"
2215 "andnps %%xmm1, %%xmm4 \n\t"
2216 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2217 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2218 "movaps %%xmm3, %1 \n\t"
2219 "movaps %%xmm0, %0 \n\t"
2220 : "+m"(mag[i]), "+m"(ang[i])
2229 #define MIX5(mono, stereo) \
2230 __asm__ volatile ( \
2231 "movss 0(%2), %%xmm5 \n" \
2232 "movss 8(%2), %%xmm6 \n" \
2233 "movss 24(%2), %%xmm7 \n" \
2234 "shufps $0, %%xmm5, %%xmm5 \n" \
2235 "shufps $0, %%xmm6, %%xmm6 \n" \
2236 "shufps $0, %%xmm7, %%xmm7 \n" \
2238 "movaps (%0, %1), %%xmm0 \n" \
2239 "movaps 0x400(%0, %1), %%xmm1 \n" \
2240 "movaps 0x800(%0, %1), %%xmm2 \n" \
2241 "movaps 0xc00(%0, %1), %%xmm3 \n" \
2242 "movaps 0x1000(%0, %1), %%xmm4 \n" \
2243 "mulps %%xmm5, %%xmm0 \n" \
2244 "mulps %%xmm6, %%xmm1 \n" \
2245 "mulps %%xmm5, %%xmm2 \n" \
2246 "mulps %%xmm7, %%xmm3 \n" \
2247 "mulps %%xmm7, %%xmm4 \n" \
2248 stereo("addps %%xmm1, %%xmm0 \n") \
2249 "addps %%xmm1, %%xmm2 \n" \
2250 "addps %%xmm3, %%xmm0 \n" \
2251 "addps %%xmm4, %%xmm2 \n" \
2252 mono("addps %%xmm2, %%xmm0 \n") \
2253 "movaps %%xmm0, (%0, %1) \n" \
2254 stereo("movaps %%xmm2, 0x400(%0, %1) \n") \
2258 : "r"(samples[0] + len), "r"(matrix) \
2259 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2260 "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
2264 #define MIX_MISC(stereo) \
2265 __asm__ volatile ( \
2266     "1: \n" \
2267 "movaps (%3, %0), %%xmm0 \n" \
2268 stereo("movaps %%xmm0, %%xmm1 \n") \
2269 "mulps %%xmm4, %%xmm0 \n" \
2270 stereo("mulps %%xmm5, %%xmm1 \n") \
2271 "lea 1024(%3, %0), %1 \n" \
2274 "movaps (%1), %%xmm2 \n" \
2275 stereo("movaps %%xmm2, %%xmm3 \n") \
2276 "mulps (%4, %2), %%xmm2 \n" \
2277 stereo("mulps 16(%4, %2), %%xmm3 \n") \
2278 "addps %%xmm2, %%xmm0 \n" \
2279 stereo("addps %%xmm3, %%xmm1 \n") \
2280 "add $1024, %1 \n" \
2283 "movaps %%xmm0, (%3, %0) \n" \
2284 stereo("movaps %%xmm1, 1024(%3, %0) \n") \
2287 : "+&r"(i), "=&r"(j), "=&r"(k) \
2288 : "r"(samples[0] + len), "r"(matrix_simd + in_ch), \
2289 "g"((intptr_t) - 32 * (in_ch - 1)) \
2293 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
2294                             int out_ch, int in_ch, int len)
2295 {
2296     int (*matrix_cmp)[2] = (int(*)[2])matrix;
2297     intptr_t i, j, k;
2299     i = -len * sizeof(float);
2300 if (in_ch == 5 && out_ch == 2 &&
2301 !(matrix_cmp[0][1] | matrix_cmp[2][0] |
2302 matrix_cmp[3][1] | matrix_cmp[4][0] |
2303 (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
2304         (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
2305         MIX5(IF0, IF1);
2306 } else if (in_ch == 5 && out_ch == 1 &&
2307 matrix_cmp[0][0] == matrix_cmp[2][0] &&
2308                matrix_cmp[3][0] == matrix_cmp[4][0]) {
2309         MIX5(IF1, IF0);
2310     } else {
2311 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2312         j = 2 * in_ch * sizeof(float);
2313         __asm__ volatile (
2314             "1: \n"
2315             "sub $8, %0 \n"
2316 "movss (%2, %0), %%xmm4 \n"
2317 "movss 4(%2, %0), %%xmm5 \n"
2318 "shufps $0, %%xmm4, %%xmm4 \n"
2319 "shufps $0, %%xmm5, %%xmm5 \n"
2320 "movaps %%xmm4, (%1, %0, 4) \n"
2321 "movaps %%xmm5, 16(%1, %0, 4) \n"
2324 : "r"(matrix_simd), "r"(matrix)
2335 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1,
2336                               int len)
2337 {
2338     x86_reg i = (len - 4) * 4;
2339     __asm__ volatile (
2340         "1: \n\t"
2341         "movq (%2, %0), %%mm0 \n\t"
2342 "movq 8(%2, %0), %%mm1 \n\t"
2343 "pfmul (%3, %0), %%mm0 \n\t"
2344 "pfmul 8(%3, %0), %%mm1 \n\t"
2345 "movq %%mm0, (%1, %0) \n\t"
2346 "movq %%mm1, 8(%1, %0) \n\t"
2351 : "r"(dst), "r"(src0), "r"(src1)
2356 static void vector_fmul_sse(float *dst, const float *src0, const float *src1,
2357                             int len)
2358 {
2359     x86_reg i = (len - 8) * 4;
2360     __asm__ volatile (
2361         "1: \n\t"
2362         "movaps (%2, %0), %%xmm0 \n\t"
2363 "movaps 16(%2, %0), %%xmm1 \n\t"
2364 "mulps (%3, %0), %%xmm0 \n\t"
2365 "mulps 16(%3, %0), %%xmm1 \n\t"
2366 "movaps %%xmm0, (%1, %0) \n\t"
2367 "movaps %%xmm1, 16(%1, %0) \n\t"
2371 : "r"(dst), "r"(src0), "r"(src1)
2376 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0,
2377                                        const float *src1, int len)
2378 {
2379     x86_reg i = len * 4 - 16;
2380     __asm__ volatile (
2381         "1: \n\t"
2382         "pswapd 8(%1), %%mm0 \n\t"
2383 "pswapd (%1), %%mm1 \n\t"
2384 "pfmul (%3, %0), %%mm0 \n\t"
2385 "pfmul 8(%3, %0), %%mm1 \n\t"
2386 "movq %%mm0, (%2, %0) \n\t"
2387 "movq %%mm1, 8(%2, %0) \n\t"
2391 : "+r"(i), "+r"(src1)
2392 : "r"(dst), "r"(src0)
2394 __asm__ volatile ("femms");
2397 static void vector_fmul_reverse_sse(float *dst, const float *src0,
2398                                     const float *src1, int len)
2399 {
2400     x86_reg i = len * 4 - 32;
2401     __asm__ volatile (
2402         "1: \n\t"
2403         "movaps 16(%1), %%xmm0 \n\t"
2404 "movaps (%1), %%xmm1 \n\t"
2405 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2406 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2407 "mulps (%3, %0), %%xmm0 \n\t"
2408 "mulps 16(%3, %0), %%xmm1 \n\t"
2409 "movaps %%xmm0, (%2, %0) \n\t"
2410 "movaps %%xmm1, 16(%2, %0) \n\t"
2414 : "+r"(i), "+r"(src1)
2415 : "r"(dst), "r"(src0)
2419 static void vector_fmul_add_3dnow(float *dst, const float *src0,
2420                                   const float *src1, const float *src2, int len)
2421 {
2422     x86_reg i = (len - 4) * 4;
2423     __asm__ volatile (
2424         "1: \n\t"
2425         "movq (%2, %0), %%mm0 \n\t"
2426 "movq 8(%2, %0), %%mm1 \n\t"
2427 "pfmul (%3, %0), %%mm0 \n\t"
2428 "pfmul 8(%3, %0), %%mm1 \n\t"
2429 "pfadd (%4, %0), %%mm0 \n\t"
2430 "pfadd 8(%4, %0), %%mm1 \n\t"
2431 "movq %%mm0, (%1, %0) \n\t"
2432 "movq %%mm1, 8(%1, %0) \n\t"
2436 : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2439 __asm__ volatile ("femms");
2442 static void vector_fmul_add_sse(float *dst, const float *src0,
2443                                 const float *src1, const float *src2, int len)
2444 {
2445     x86_reg i = (len - 8) * 4;
2446     __asm__ volatile (
2447         "1: \n\t"
2448         "movaps (%2, %0), %%xmm0 \n\t"
2449 "movaps 16(%2, %0), %%xmm1 \n\t"
2450 "mulps (%3, %0), %%xmm0 \n\t"
2451 "mulps 16(%3, %0), %%xmm1 \n\t"
2452 "addps (%4, %0), %%xmm0 \n\t"
2453 "addps 16(%4, %0), %%xmm1 \n\t"
2454 "movaps %%xmm0, (%1, %0) \n\t"
2455 "movaps %%xmm1, 16(%1, %0) \n\t"
2459 : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2464 #if HAVE_6REGS
2465 static void vector_fmul_window_3dnow2(float *dst, const float *src0,
2466                                       const float *src1, const float *win,
2467                                       int len)
2468 {
2469     x86_reg i = -len * 4;
2470     x86_reg j = len * 4 - 8;
2471     __asm__ volatile (
2472         "1: \n"
2473         "pswapd (%5, %1), %%mm1 \n"
2474 "movq (%5, %0), %%mm0 \n"
2475 "pswapd (%4, %1), %%mm5 \n"
2476 "movq (%3, %0), %%mm4 \n"
2477 "movq %%mm0, %%mm2 \n"
2478 "movq %%mm1, %%mm3 \n"
2479 "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
2480 "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
2481 "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
2482 "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
2483 "pfadd %%mm3, %%mm2 \n"
2484 "pfsub %%mm0, %%mm1 \n"
2485 "pswapd %%mm2, %%mm2 \n"
2486 "movq %%mm1, (%2, %0) \n"
2487 "movq %%mm2, (%2, %1) \n"
2493 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2497 static void vector_fmul_window_sse(float *dst, const float *src0,
2498                                    const float *src1, const float *win, int len)
2499 {
2500     x86_reg i = -len * 4;
2501     x86_reg j = len * 4 - 16;
2502     __asm__ volatile (
2503         "1: \n"
2504         "movaps (%5, %1), %%xmm1 \n"
2505 "movaps (%5, %0), %%xmm0 \n"
2506 "movaps (%4, %1), %%xmm5 \n"
2507 "movaps (%3, %0), %%xmm4 \n"
2508 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2509 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2510 "movaps %%xmm0, %%xmm2 \n"
2511 "movaps %%xmm1, %%xmm3 \n"
2512 "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
2513 "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
2514 "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
2515 "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
2516 "addps %%xmm3, %%xmm2 \n"
2517 "subps %%xmm0, %%xmm1 \n"
2518 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2519 "movaps %%xmm1, (%2, %0) \n"
2520 "movaps %%xmm2, (%2, %1) \n"
2525 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2528 #endif /* HAVE_6REGS */
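/* Scalar model of the windowed overlap-add above (sketch following the C
 * fallback's conventions): each iteration produces the symmetric pair
 * dst[i] / dst[j] from window positions win[i] / win[j]. */
static void vector_fmul_window_sketch(float *dst, const float *src0,
                                      const float *src1, const float *win,
                                      int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}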
2530 static void vector_clipf_sse(float *dst, const float *src,
2531                              float min, float max, int len)
2532 {
2533     x86_reg i = (len - 16) * 4;
2534     __asm__ volatile (
2535         "movss %3, %%xmm4 \n\t"
2536 "movss %4, %%xmm5 \n\t"
2537 "shufps $0, %%xmm4, %%xmm4 \n\t"
2538 "shufps $0, %%xmm5, %%xmm5 \n\t"
2540 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
2541 "movaps 16(%2, %0), %%xmm1 \n\t"
2542 "movaps 32(%2, %0), %%xmm2 \n\t"
2543 "movaps 48(%2, %0), %%xmm3 \n\t"
2544 "maxps %%xmm4, %%xmm0 \n\t"
2545 "maxps %%xmm4, %%xmm1 \n\t"
2546 "maxps %%xmm4, %%xmm2 \n\t"
2547 "maxps %%xmm4, %%xmm3 \n\t"
2548 "minps %%xmm5, %%xmm0 \n\t"
2549 "minps %%xmm5, %%xmm1 \n\t"
2550 "minps %%xmm5, %%xmm2 \n\t"
2551 "minps %%xmm5, %%xmm3 \n\t"
2552 "movaps %%xmm0, (%1, %0) \n\t"
2553 "movaps %%xmm1, 16(%1, %0) \n\t"
2554 "movaps %%xmm2, 32(%1, %0) \n\t"
2555 "movaps %%xmm3, 48(%1, %0) \n\t"
2559 : "r"(dst), "r"(src), "m"(min), "m"(max)
2564 void ff_vp3_idct_mmx(int16_t *input_data);
2565 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2566 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2568 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
2569 const DCTELEM *block);
2571 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2572 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2574 void ff_vp3_idct_sse2(int16_t *input_data);
2575 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2576 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2578 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
2579                                     int order);
2580 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2581                                     int order);
2582 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
2583                                              const int16_t *v3,
2584                                              int order, int mul);
2585 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2586                                              const int16_t *v3,
2587                                              int order, int mul);
2588 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2589                                               const int16_t *v3,
2590                                               int order, int mul);
2592 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2593 const int16_t *window, unsigned int len);
2594 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2595 const int16_t *window, unsigned int len);
2596 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2597 const int16_t *window, unsigned int len);
2598 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2599 const int16_t *window, unsigned int len);
2600 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2601 const int16_t *window, unsigned int len);
2602 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2603 const int16_t *window, unsigned int len);
2605 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2606 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2608 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
2609 const uint8_t *diff, int w,
2610 int *left, int *left_top);
2611 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2612                                       int w, int left);
2613 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2614                                      int w, int left);
2616 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2618 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2619 int32_t min, int32_t max, unsigned int len);
2620 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2621 int32_t min, int32_t max, unsigned int len);
2622 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2623 int32_t min, int32_t max, unsigned int len);
2624 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2625 int32_t min, int32_t max, unsigned int len);
2627 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2628 const float *src1, int len);
2629 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2630 const float *src1, int len);
2632 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2633     do { \
2634 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2635 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2636 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2637 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2638 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2639 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2640 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2641 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2642 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2643 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2644 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2645 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2646 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2647 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2648 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2649     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2650     } while (0)
2652 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2653     do { \
2654 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2655 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2656 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2657     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2658     } while (0)
2660 #define H264_QPEL_FUNCS(x, y, CPU) \
2661     do { \
2662 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2663 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2664 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2665     c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2666     } while (0)
2668 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2669     do { \
2670 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2671 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2672 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2673     c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2674     } while (0)
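/* For reference, SET_HPEL_FUNCS(put, 0, 16, mmx) expands to:
 *
 *     do {
 *         c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *         c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *         c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *         c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 *     } while (0);
 *
 * i.e. tab[IDX][0..3] cover the (0,0), (1/2,0), (0,1/2) and (1/2,1/2)
 * half-pel positions for one block size. */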
2676 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2677 {
2678 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2680 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2681 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2682 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2684 if (!high_bit_depth) {
2685 c->clear_block = clear_block_mmx;
2686 c->clear_blocks = clear_blocks_mmx;
2687 c->draw_edges = draw_edges_mmx;
2689 SET_HPEL_FUNCS(put, 0, 16, mmx);
2690 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2691 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2692 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2693 SET_HPEL_FUNCS(put, 1, 8, mmx);
2694 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2695 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2696         SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2697     }
2699 #if ARCH_X86_32 || !HAVE_YASM
2700     c->gmc = gmc_mmx;
2701 #endif
2702 #if ARCH_X86_32 && HAVE_YASM
2703 if (!high_bit_depth)
2704         c->emulated_edge_mc = emulated_edge_mc_mmx;
2705 #endif
2707 c->add_bytes = add_bytes_mmx;
2709 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2710 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2711         c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2712     }
2715 if (!high_bit_depth && CONFIG_H264CHROMA) {
2716 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
2717         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2718     }
2720     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2721 #endif
2722 }
2725 static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
2726                               int mm_flags)
2727 {
2728 const int bit_depth = avctx->bits_per_raw_sample;
2729 const int high_bit_depth = bit_depth > 8;
2731 c->prefetch = prefetch_mmx2;
2733 if (!high_bit_depth) {
2734 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2735 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2737 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2738 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2739 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2741 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2742 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2744 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2745 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2746         c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2747     }
2749 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2750 if (!high_bit_depth) {
2751 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2752 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2753 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2754 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2756 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2757             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2758         }
2760 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2761 c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
2762             c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
2763         }
2764     }
2765 if (CONFIG_VP3_DECODER && HAVE_YASM)
2766 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2768 if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
2769 avctx->codec_id == CODEC_ID_THEORA)) {
2770 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2771         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2772     }
2774 if (CONFIG_H264QPEL) {
2775 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2776 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2777 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2778 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2779 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2780 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2782 if (!high_bit_depth) {
2783 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2784 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2785 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2786 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2787 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2788 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2789         } else if (bit_depth == 10) {
2790 #if HAVE_YASM
2791 #if !ARCH_X86_64
2792 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2793 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2794 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2795             SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2796 #endif
2797 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2798             SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2799 #endif
2800         }
2802 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2803 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2804 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2805         SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2806     }
2809 if (!high_bit_depth && CONFIG_H264CHROMA) {
2810 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
2811 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
2812 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
2813         c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
2814     }
2815 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2816 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2817 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2818 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2819         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2820     }
2822 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2824 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2825 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2827 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2828         c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2829     } else {
2830         c->apply_window_int16 = ff_apply_window_int16_mmxext;
2831     }
2832 #endif
2833 }
2835 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2836                                int mm_flags)
2837 {
2838 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2840 c->prefetch = prefetch_3dnow;
2842 if (!high_bit_depth) {
2843 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2844 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2846 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2847 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2848 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2850 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2851 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2853 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2854 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2855 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2857 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2858 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2859 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2860 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2861 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2863 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2864             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2865         }
2866     }
2868 if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
2869 avctx->codec_id == CODEC_ID_THEORA)) {
2870 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2871         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2872     }
2874 if (CONFIG_H264QPEL) {
2875 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2876 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2877 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2878 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2879 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2880 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2882 if (!high_bit_depth) {
2883 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2884 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2885 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2886 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2887 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2888             SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2889         }
2891 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2892 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2893 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2894         SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2895     }
2898 if (!high_bit_depth && CONFIG_H264CHROMA) {
2899 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
2900         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2901     }
2902 #endif
2904 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2905 c->vector_fmul = vector_fmul_3dnow;
2906 c->vector_fmul_add = vector_fmul_add_3dnow;
2908 #if HAVE_7REGS
2909     c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2910 #endif
2911 }
2913 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
2914                                 int mm_flags)
2915 {
2916     c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2917 #if HAVE_6REGS
2918     c->vector_fmul_window = vector_fmul_window_3dnow2;
2919 #endif
2920 }
2922 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2923 {
2924 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2926 if (!high_bit_depth) {
2927 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2928 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2929 c->clear_block = clear_block_sse;
2930             c->clear_blocks = clear_blocks_sse;
2931         }
2932     }
2934 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2935 c->ac3_downmix = ac3_downmix_sse;
2936 c->vector_fmul = vector_fmul_sse;
2937 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2939 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2940 c->vector_fmul_add = vector_fmul_add_sse;
2942 #if HAVE_6REGS
2943     c->vector_fmul_window = vector_fmul_window_sse;
2944 #endif
2946 c->vector_clipf = vector_clipf_sse;
2948 #if HAVE_YASM
2949     c->scalarproduct_float = ff_scalarproduct_float_sse;
2950 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2952 if (!high_bit_depth)
2953         c->emulated_edge_mc = emulated_edge_mc_sse;
2954 #endif
2955 }
2958 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2959                               int mm_flags)
2960 {
2961 const int bit_depth = avctx->bits_per_raw_sample;
2962 const int high_bit_depth = bit_depth > 8;
2964 if (mm_flags & AV_CPU_FLAG_3DNOW) {
2965 // these functions are slower than mmx on AMD, but faster on Intel
2966 if (!high_bit_depth) {
2967 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2968 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2969 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2970 if (CONFIG_H264QPEL)
2971                 H264_QPEL_FUNCS(0, 0, sse2);
2972         }
2973     }
2975 if (!high_bit_depth && CONFIG_H264QPEL) {
2976 H264_QPEL_FUNCS(0, 1, sse2);
2977 H264_QPEL_FUNCS(0, 2, sse2);
2978 H264_QPEL_FUNCS(0, 3, sse2);
2979 H264_QPEL_FUNCS(1, 1, sse2);
2980 H264_QPEL_FUNCS(1, 2, sse2);
2981 H264_QPEL_FUNCS(1, 3, sse2);
2982 H264_QPEL_FUNCS(2, 1, sse2);
2983 H264_QPEL_FUNCS(2, 2, sse2);
2984 H264_QPEL_FUNCS(2, 3, sse2);
2985 H264_QPEL_FUNCS(3, 1, sse2);
2986 H264_QPEL_FUNCS(3, 2, sse2);
2987         H264_QPEL_FUNCS(3, 3, sse2);
2988     }
2990 #if HAVE_YASM
2991 if (bit_depth == 10) {
2992 if (CONFIG_H264QPEL) {
2993 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2994 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2995 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2996 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2997 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2998 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2999             H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
3000         }
3001 if (CONFIG_H264CHROMA) {
3002 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
3003             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
3004         }
3005     }
3007 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
3008 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
3009 if (mm_flags & AV_CPU_FLAG_ATOM) {
3010         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
3011     } else {
3012         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
3013     }
3014 if (avctx->flags & CODEC_FLAG_BITEXACT) {
3015 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
3016 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
3017         c->apply_window_int16 = ff_apply_window_int16_sse2;
3018     }
3019     c->bswap_buf = ff_bswap32_buf_sse2;
3020 #endif
3021 }
3023 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
3024                                int mm_flags)
3025 {
3026 #if HAVE_SSSE3
3027 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
3028 const int bit_depth = avctx->bits_per_raw_sample;
3030 if (!high_bit_depth && CONFIG_H264QPEL) {
3031 H264_QPEL_FUNCS(1, 0, ssse3);
3032 H264_QPEL_FUNCS(1, 1, ssse3);
3033 H264_QPEL_FUNCS(1, 2, ssse3);
3034 H264_QPEL_FUNCS(1, 3, ssse3);
3035 H264_QPEL_FUNCS(2, 0, ssse3);
3036 H264_QPEL_FUNCS(2, 1, ssse3);
3037 H264_QPEL_FUNCS(2, 2, ssse3);
3038 H264_QPEL_FUNCS(2, 3, ssse3);
3039 H264_QPEL_FUNCS(3, 0, ssse3);
3040 H264_QPEL_FUNCS(3, 1, ssse3);
3041 H264_QPEL_FUNCS(3, 2, ssse3);
3042         H264_QPEL_FUNCS(3, 3, ssse3);
3043     }
3045     else if (bit_depth == 10 && CONFIG_H264QPEL) {
3046 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
3047 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
3048         H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
3049     }
3050 if (!high_bit_depth && CONFIG_H264CHROMA) {
3051 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
3052 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
3053 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
3054         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
3055     }
3056 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
3057 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
3058 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
3060 if (mm_flags & AV_CPU_FLAG_ATOM)
3061         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
3062     else
3063         c->apply_window_int16 = ff_apply_window_int16_ssse3;
3064 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
3065 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
3066     c->bswap_buf = ff_bswap32_buf_ssse3;
3067 #endif
3068 }
3071 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
3072                               int mm_flags)
3073 {
3074 #if HAVE_YASM
3075     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
3076 #endif
3077 }
3079 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
3080 {
3081 #if HAVE_AVX && HAVE_YASM
3082 const int bit_depth = avctx->bits_per_raw_sample;
3084 if (bit_depth == 10) {
3085 // AVX implies !cache64.
3086 // TODO: Port cache(32|64) detection from x264.
3087 if (CONFIG_H264QPEL) {
3088 H264_QPEL_FUNCS_10(1, 0, sse2);
3089 H264_QPEL_FUNCS_10(2, 0, sse2);
3090             H264_QPEL_FUNCS_10(3, 0, sse2);
3091         }
3093 if (CONFIG_H264CHROMA) {
3094 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
3095             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
3096         }
3097     }
3098     c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
3099 #endif
3100 }
3102 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
3103 {
3104     int mm_flags = av_get_cpu_flags();
3106 #if 0
3107     av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3108 if (mm_flags & AV_CPU_FLAG_MMX)
3109 av_log(avctx, AV_LOG_INFO, " mmx");
3110 if (mm_flags & AV_CPU_FLAG_MMX2)
3111 av_log(avctx, AV_LOG_INFO, " mmx2");
3112 if (mm_flags & AV_CPU_FLAG_3DNOW)
3113 av_log(avctx, AV_LOG_INFO, " 3dnow");
3114 if (mm_flags & AV_CPU_FLAG_SSE)
3115 av_log(avctx, AV_LOG_INFO, " sse");
3116 if (mm_flags & AV_CPU_FLAG_SSE2)
3117 av_log(avctx, AV_LOG_INFO, " sse2");
3118 av_log(avctx, AV_LOG_INFO, "\n");
3121 if (mm_flags & AV_CPU_FLAG_MMX) {
3122 const int idct_algo = avctx->idct_algo;
3124 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
3125 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
3126 c->idct_put = ff_simple_idct_put_mmx;
3127 c->idct_add = ff_simple_idct_add_mmx;
3128 c->idct = ff_simple_idct_mmx;
3129                 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
3130 #if CONFIG_GPL
3131             } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
3132 if (mm_flags & AV_CPU_FLAG_MMX2) {
3133 c->idct_put = ff_libmpeg2mmx2_idct_put;
3134 c->idct_add = ff_libmpeg2mmx2_idct_add;
3135                     c->idct = ff_mmxext_idct;
3136                 } else {
3137                     c->idct_put = ff_libmpeg2mmx_idct_put;
3138 c->idct_add = ff_libmpeg2mmx_idct_add;
3139                     c->idct = ff_mmx_idct;
3140                 }
3141                 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
3142 #endif
3143             } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
3144 CONFIG_VP6_DECODER) &&
3145 idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
3146 if (mm_flags & AV_CPU_FLAG_SSE2) {
3147 c->idct_put = ff_vp3_idct_put_sse2;
3148 c->idct_add = ff_vp3_idct_add_sse2;
3149 c->idct = ff_vp3_idct_sse2;
3150                     c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
3151                 } else {
3152                     c->idct_put = ff_vp3_idct_put_mmx;
3153 c->idct_add = ff_vp3_idct_add_mmx;
3154 c->idct = ff_vp3_idct_mmx;
3155                     c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
3156                 }
3157             } else if (idct_algo == FF_IDCT_CAVS) {
3158 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
3159 } else if (idct_algo == FF_IDCT_XVIDMMX) {
3160 if (mm_flags & AV_CPU_FLAG_SSE2) {
3161 c->idct_put = ff_idct_xvid_sse2_put;
3162 c->idct_add = ff_idct_xvid_sse2_add;
3163 c->idct = ff_idct_xvid_sse2;
3164 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
3165 } else if (mm_flags & AV_CPU_FLAG_MMX2) {
3166 c->idct_put = ff_idct_xvid_mmx2_put;
3167 c->idct_add = ff_idct_xvid_mmx2_add;
3168                     c->idct = ff_idct_xvid_mmx2;
3169                 } else {
3170                     c->idct_put = ff_idct_xvid_mmx_put;
3171 c->idct_add = ff_idct_xvid_mmx_add;
3172                     c->idct = ff_idct_xvid_mmx;
3173                 }
3174             }
3175         }
3177         dsputil_init_mmx(c, avctx, mm_flags);
3178     }
3180 if (mm_flags & AV_CPU_FLAG_MMX2)
3181 dsputil_init_mmx2(c, avctx, mm_flags);
3183 if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
3184 dsputil_init_3dnow(c, avctx, mm_flags);
3186 if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
3187 dsputil_init_3dnow2(c, avctx, mm_flags);
3189 if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
3190 dsputil_init_sse(c, avctx, mm_flags);
3192 if (mm_flags & AV_CPU_FLAG_SSE2)
3193 dsputil_init_sse2(c, avctx, mm_flags);
3195 if (mm_flags & AV_CPU_FLAG_SSSE3)
3196 dsputil_init_ssse3(c, avctx, mm_flags);
3198 if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
3199 dsputil_init_sse4(c, avctx, mm_flags);
3201 if (mm_flags & AV_CPU_FLAG_AVX)
3202 dsputil_init_avx(c, avctx, mm_flags);
3204 if (CONFIG_ENCODERS)
3205         ff_dsputilenc_init_mmx(c, avctx);
3206 }
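/* Typical use (sketch): the generic dsputil_init() calls this once at
 * codec-init time, after which the codec goes through the selected
 * function pointers:
 *
 *     DSPContext dsp;
 *     ff_dsputil_init_mmx(&dsp, avctx);
 *     dsp.idct_put(dest, line_size, block);
 */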