/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#define JUMPALIGN()     __asm__ volatile (".p2align 3" ::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// For shared libraries it is better to generate these constants in registers
// instead of loading them from memory (PIC-safe); pcmpeqd -> -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq  "#rega", "#regr"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq  "#rega", "#regr"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "psubb "#regb", "#regr"             \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq "#rega", "#regr"              \n\t"                   \
    "movq "#regc", "#regp"              \n\t"                   \
    "pand "#regb", "#regr"              \n\t"                   \
    "pand "#regd", "#regp"              \n\t"                   \
    "pxor "#rega", "#regb"              \n\t"                   \
    "pxor "#regc", "#regd"              \n\t"                   \
    "pand  %%mm6, "#regb"               \n\t"                   \
    "pand  %%mm6, "#regd"               \n\t"                   \
    "psrlq    $1, "#regb"               \n\t"                   \
    "psrlq    $1, "#regd"               \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq "#rega", "#regr"              \n\t"                   \
    "movq "#regc", "#regp"              \n\t"                   \
    "por  "#regb", "#regr"              \n\t"                   \
    "por  "#regd", "#regp"              \n\t"                   \
    "pxor "#rega", "#regb"              \n\t"                   \
    "pxor "#regc", "#regd"              \n\t"                   \
    "pand  %%mm6, "#regb"               \n\t"                   \
    "pand  %%mm6, "#regd"               \n\t"                   \
    "psrlq    $1, "#regd"               \n\t"                   \
    "psrlq    $1, "#regb"               \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y)                x ## _no_rnd_ ## y ## _mmx
#define SET_RND                  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX rounding */

#define DEF(x, y)                x ## _ ## y ## _mmx
#define SET_RND                  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in the MMX2 instruction set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq        %3, %%mm0          \n\t"
        "movq       8%3, %%mm1          \n\t"
        "movq      16%3, %%mm2          \n\t"
        "movq      24%3, %%mm3          \n\t"
        "movq      32%3, %%mm4          \n\t"
        "movq      40%3, %%mm5          \n\t"
        "movq      48%3, %%mm6          \n\t"
        "movq      56%3, %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "m"(*p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this block were an exact copy of the one above, the compiler would
    // generate some very strange code, so the block pointer is passed in a
    // register here instead of as a memory operand.
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1    \n\t"               \
    "movq     16 + "#off"(%2), %%mm2    \n\t"               \
    "movq     32 + "#off"(%2), %%mm3    \n\t"               \
    "movq     48 + "#off"(%2), %%mm4    \n\t"               \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"               \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"               \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"               \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"               \
    "paddb %%mm0, %%mm1                 \n\t"               \
    "paddb %%mm0, %%mm2                 \n\t"               \
    "paddb %%mm0, %%mm3                 \n\t"               \
    "paddb %%mm0, %%mm4                 \n\t"               \
    "movq  %%mm1, (%0)                  \n\t"               \
    "movq  %%mm2, (%0, %3)              \n\t"               \
    "movq  %%mm3, (%0, %3, 2)           \n\t"               \
    "movq  %%mm4, (%0, %1)              \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
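
/* Plain-C reference: each residual is added to the prediction and the sum
 * is clamped to 0..255 (paddsw + packuswb above; illustrative sketch only): */
#if 0
static void add_pixels_clamped_c_ref(const DCTELEM *block, uint8_t *pixels,
                                     int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            int v = pixels[i * line_size + j] + block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
    }
}
#endif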
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq      %%mm0, (%2)          \n\t"
        "movq      %%mm4, 8(%2)         \n\t"
        "movq      %%mm1, (%2, %3)      \n\t"
        "movq      %%mm5, 8(%2, %3)     \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq      %%mm0, (%2)          \n\t"
        "movq      %%mm4, 8(%2)         \n\t"
        "movq      %%mm1, (%2, %3)      \n\t"
        "movq      %%mm5, 8(%2, %3)     \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl            $4, %0          \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz             1b              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2       ), %%xmm0      \n\t"
        "pavgb  (%2, %3   ), %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4   ), %%xmm3      \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl            $4, %0          \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz             1b              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7, (%0, %%"REG_a")    \n\t"           \
        "movq %%mm7, 8(%0, %%"REG_a")   \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a);                                    \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory");
}

static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0                  \n"
        "mov        %1, %%"REG_a"               \n"
        "1:                                     \n"
        "movaps %%xmm0,    (%0, %%"REG_a")      \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")      \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")      \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")      \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")      \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")      \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")      \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")      \n"
        "add      $128, %%"REG_a"               \n"
        "js         1b                          \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a);
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        : "memory");
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "movzbl (%3, %4), %2            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        : "memory");
    *left     = l;
    *left_top = tl;
}
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd      (%1), %%mm0          \n\t"
        "add         %3, %1             \n\t"
        "movd      (%1), %%mm1          \n\t"
        "movd (%1,%3,1), %%mm2          \n\t"
        "movd (%1,%3,2), %%mm3          \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq      %%mm0, %%mm1         \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "add          %2, %0            \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "movd      %%mm1, (%0,%2,1)     \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd      %%mm1, (%0,%2,2)     \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image with size 'width' x 'height'.
 * This MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;

    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd      (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0, -8(%0)        \n\t"
            "movq  -8(%0, %2), %%mm1        \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1, (%0, %2)      \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                             \n\t"
            "movd      (%0), %%mm0          \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0, -8(%0)        \n\t"
            "movq      %%mm0, -16(%0)       \n\t"
            "movq  -8(%0, %2), %%mm1        \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1,  (%0, %2)     \n\t"
            "movq      %%mm1, 8(%0, %2)     \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
    "movq               "#in7", "#m3"   \n\t" /* d */                     \
    "movq               "#in0", %%mm5   \n\t" /* D */                     \
    "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
    "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
    "movq               "#in1", %%mm5   \n\t" /* C */                     \
    "movq               "#in2", %%mm6   \n\t" /* B */                     \
    "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
    "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
    "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
    "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
    "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
    "paddw              "#rnd", %%mm4   \n\t" /* + rnd */                 \
    "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                  $5, %%mm5   \n\t"                             \
    "packuswb            %%mm5, %%mm5   \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
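
/* QPEL_V_LOW evaluates one row of the approximated MPEG-4 qpel filter; per
 * pixel it computes out = clip((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5), where
 * x1..x4 are the sums of symmetric tap pairs. Scalar reference (illustrative
 * sketch only): */
#if 0
static uint8_t qpel_row_ref(int x1, int x2, int x3, int x4, int rnd)
{
    int v = (20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5;
    return v < 0 ? 0 : v > 255 ? 255 : v;   /* packuswb saturation */
}
#endif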
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7                  \n\t"                         \
        "1:                                 \n\t"                         \
        "movq (%0), %%mm0                   \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm1                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm2                  \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */          \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */          \
        "movq %%mm2, %%mm3                  \n\t" /* ABCDEFGH */          \
        "movq %%mm2, %%mm4                  \n\t" /* ABCDEFGH */          \
        "psllq $8, %%mm2                    \n\t" /* 0ABCDEFG */          \
        "psllq $16, %%mm3                   \n\t" /* 00ABCDEF */          \
        "psllq $24, %%mm4                   \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw %%mm3, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm2, %%mm6                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm6                 \n\t" /* c - 2b */            \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw %%mm4, %%mm0                 \n\t" /* a */                 \
        "paddw %%mm1, %%mm5                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw %%mm5, %%mm0                 \n\t" /* 20a - d */           \
        "paddw %6, %%mm6                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        "movq %%mm0, %5                     \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq 5(%0), %%mm0                  \n\t" /* FGHIJKLM */          \
        "movq %%mm0, %%mm5                  \n\t" /* FGHIJKLM */          \
        "movq %%mm0, %%mm6                  \n\t" /* FGHIJKLM */          \
        "psrlq $8, %%mm0                    \n\t" /* GHIJKLM0 */          \
        "psrlq $16, %%mm5                   \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw %%mm0, %%mm2                 \n\t" /* b */                 \
        "paddw %%mm5, %%mm3                 \n\t" /* c */                 \
        "paddw %%mm2, %%mm2                 \n\t" /* 2b */                \
        "psubw %%mm2, %%mm3                 \n\t" /* c - 2b */            \
        "movq %%mm6, %%mm2                  \n\t" /* FGHIJKLM */          \
        "psrlq $24, %%mm6                   \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw %%mm2, %%mm1                 \n\t" /* a */                 \
        "paddw %%mm6, %%mm4                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw %%mm4, %%mm3                 \n\t" /* - 6b +3c - d */      \
        "paddw %6, %%mm1                    \n\t"                         \
        "paddw %%mm1, %%mm3                 \n\t" /* 20a - 6b +3c - d */  \
        "psraw $5, %%mm3                    \n\t"                         \
        "movq %5, %%mm1                     \n\t"                         \
        "packuswb %%mm3, %%mm1              \n\t"                         \
        OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq 9(%0), %%mm1                  \n\t" /* JKLMNOPQ */          \
        "movq %%mm1, %%mm4                  \n\t" /* JKLMNOPQ */          \
        "movq %%mm1, %%mm3                  \n\t" /* JKLMNOPQ */          \
        "psrlq $8, %%mm1                    \n\t" /* KLMNOPQ0 */          \
        "psrlq $16, %%mm4                   \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw %%mm1, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm4, %%mm0                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm0                 \n\t" /* c - 2b */            \
        "movq %%mm3, %%mm5                  \n\t" /* JKLMNOPQ */          \
        "psrlq $24, %%mm3                   \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw %%mm3, %%mm2                 \n\t" /* d */                 \
        "psubw %%mm2, %%mm0                 \n\t" /* -6b + 3c - d */      \
        "movq %%mm5, %%mm2                  \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw %%mm2, %%mm6                 \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw %6, %%mm0                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw %%mm5, %%mm3                 \n\t" /* a */                 \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0O0P0Q0Q */          \
        "paddw %%mm4, %%mm6                 \n\t" /* b */                 \
        "pshufw $0xBE, %%mm5, %%mm4         \n\t" /* 0P0Q0Q0P */          \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0Q0Q0P0O */          \
        "paddw %%mm1, %%mm4                 \n\t" /* c */                 \
        "paddw %%mm2, %%mm5                 \n\t" /* d */                 \
        "paddw %%mm6, %%mm6                 \n\t" /* 2b */                \
        "psubw %%mm6, %%mm4                 \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */           \
        "psubw %%mm5, %%mm3                 \n\t" /* -6b + 3c - d */      \
        "paddw %6, %%mm4                    \n\t"                         \
        "paddw %%mm3, %%mm4                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm4                    \n\t"                         \
        "packuswb %%mm4, %%mm0              \n\t"                         \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
        "add %3, %0                         \n\t"                         \
        "add %4, %1                         \n\t"                         \
        "decl %2                            \n\t"                         \
        "jnz 1b                             \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,          \
                                                   uint8_t *src,          \
                                                   int dstStride,         \
                                                   int srcStride,         \
                                                   int h)                 \
{                                                                         \
    int i;                                                                \
    int16_t temp[16];                                                     \
                                                                          \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);        \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);        \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);        \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);        \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);        \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);        \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);        \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 +   \
                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);        \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 +   \
                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);        \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 +   \
                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);        \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 +   \
                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);        \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 +   \
                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);        \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 +   \
                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);        \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 +   \
                   (src[11] + src[16]) *  3 - (src[10] + src[16]);        \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 +   \
                   (src[12] + src[16]) *  3 - (src[11] + src[15]);        \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 +   \
                   (src[13] + src[15]) *  3 - (src[12] + src[14]);        \
        __asm__ volatile (                                                \
            "movq      (%0), %%mm0      \n\t"                             \
            "movq     8(%0), %%mm1      \n\t"                             \
            "paddw       %2, %%mm0      \n\t"                             \
            "paddw       %2, %%mm1      \n\t"                             \
            "psraw       $5, %%mm0      \n\t"                             \
            "psraw       $5, %%mm1      \n\t"                             \
            "packuswb %%mm1, %%mm0      \n\t"                             \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            "movq    16(%0), %%mm0      \n\t"                             \
            "movq    24(%0), %%mm1      \n\t"                             \
            "paddw       %2, %%mm0      \n\t"                             \
            "paddw       %2, %%mm1      \n\t"                             \
            "psraw       $5, %%mm0      \n\t"                             \
            "psraw       $5, %%mm1      \n\t"                             \
            "packuswb %%mm1, %%mm0      \n\t"                             \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                              \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory"                                                    \
            );                                                            \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
                                                 uint8_t *src,            \
                                                 int dstStride,           \
                                                 int srcStride,           \
                                                 int h)                   \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7                  \n\t"                         \
        "1:                                 \n\t"                         \
        "movq (%0), %%mm0                   \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm1                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm2                  \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */          \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */          \
        "movq %%mm2, %%mm3                  \n\t" /* ABCDEFGH */          \
        "movq %%mm2, %%mm4                  \n\t" /* ABCDEFGH */          \
        "psllq $8, %%mm2                    \n\t" /* 0ABCDEFG */          \
        "psllq $16, %%mm3                   \n\t" /* 00ABCDEF */          \
        "psllq $24, %%mm4                   \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw %%mm3, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm2, %%mm6                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm6                 \n\t" /* c - 2b */            \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw %%mm4, %%mm0                 \n\t" /* a */                 \
        "paddw %%mm1, %%mm5                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw %%mm5, %%mm0                 \n\t" /* 20a - d */           \
        "paddw %5, %%mm6                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd 5(%0), %%mm5                  \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0G0H0I0I */          \
        "paddw %%mm5, %%mm1                 \n\t" /* a */                 \
        "paddw %%mm6, %%mm2                 \n\t" /* b */                 \
        "pshufw $0xBE, %%mm5, %%mm6         \n\t" /* 0H0I0I0H */          \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0I0I0H0G */          \
        "paddw %%mm6, %%mm3                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm4                 \n\t" /* d */                 \
        "paddw %%mm2, %%mm2                 \n\t" /* 2b */                \
        "psubw %%mm2, %%mm3                 \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "psubw %%mm4, %%mm3                 \n\t" /* -6b + 3c - d */      \
        "paddw %5, %%mm1                    \n\t"                         \
        "paddw %%mm1, %%mm3                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm3                    \n\t"                         \
        "packuswb %%mm3, %%mm0              \n\t"                         \
        OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
        "add %3, %0                         \n\t"                         \
        "add %4, %1                         \n\t"                         \
        "decl %2                            \n\t"                         \
        "jnz 1b                             \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    int i;                                                                \
    int16_t temp[8];                                                      \
                                                                          \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +        \
                  (src[1] + src[3]) *  3 - (src[2] + src[4]);             \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +        \
                  (src[0] + src[4]) *  3 - (src[1] + src[5]);             \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +        \
                  (src[0] + src[5]) *  3 - (src[0] + src[6]);             \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +        \
                  (src[1] + src[6]) *  3 - (src[0] + src[7]);             \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +        \
                  (src[2] + src[7]) *  3 - (src[1] + src[8]);             \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +        \
                  (src[3] + src[8]) *  3 - (src[2] + src[8]);             \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +        \
                  (src[4] + src[8]) *  3 - (src[3] + src[7]);             \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +        \
                  (src[5] + src[7]) *  3 - (src[4] + src[6]);             \
        __asm__ volatile (                                                \
            "movq      (%0), %%mm0      \n\t"                             \
            "movq     8(%0), %%mm1      \n\t"                             \
            "paddw       %2, %%mm0      \n\t"                             \
            "paddw       %2, %%mm1      \n\t"                             \
            "psraw       $5, %%mm0      \n\t"                             \
            "psraw       $5, %%mm1      \n\t"                             \
            "packuswb %%mm1, %%mm0      \n\t"                             \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory"                                                    \
            );                                                            \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                            \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,        \
                                                     uint8_t *src,        \
                                                     int dstStride,       \
                                                     int srcStride)       \
{                                                                         \
    uint64_t temp[17 * 4];                                                \
    uint64_t *temp_ptr = temp;                                            \
    int count = 17;                                                       \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7              \n\t"                             \
        "1:                             \n\t"                             \
        "movq (%0), %%mm0               \n\t"                             \
        "movq (%0), %%mm1               \n\t"                             \
        "movq 8(%0), %%mm2              \n\t"                             \
        "movq 8(%0), %%mm3              \n\t"                             \
        "punpcklbw %%mm7, %%mm0         \n\t"                             \
        "punpckhbw %%mm7, %%mm1         \n\t"                             \
        "punpcklbw %%mm7, %%mm2         \n\t"                             \
        "punpckhbw %%mm7, %%mm3         \n\t"                             \
        "movq %%mm0, (%1)               \n\t"                             \
        "movq %%mm1, 17 * 8(%1)         \n\t"                             \
        "movq %%mm2, 2 * 17 * 8(%1)     \n\t"                             \
        "movq %%mm3, 3 * 17 * 8(%1)     \n\t"                             \
        "add $8, %1                     \n\t"                             \
        "add %3, %0                     \n\t"                             \
        "decl %2                        \n\t"                             \
        "jnz 1b                         \n\t"                             \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory"                                                        \
        );                                                                \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 4;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor %%mm7, %%mm7          \n\t" */                           \
        "1:                            \n\t"                              \
        "movq (%0), %%mm0              \n\t"                              \
        "movq 8(%0), %%mm1             \n\t"                              \
        "movq 16(%0), %%mm2            \n\t"                              \
        "movq 24(%0), %%mm3            \n\t"                              \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0),  32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0),  40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0),  48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0),  56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0),  64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0),  72(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0),  80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0),  88(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0                  \n\t"                              \
        "add %6, %1                    \n\t"                              \
        "decl %2                       \n\t"                              \
        "jnz 1b                        \n\t"                              \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 14 * (x86_reg)dstStride)                                \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
                                                                          \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,         \
                                                    uint8_t *src,         \
                                                    int dstStride,        \
                                                    int srcStride)        \
{                                                                         \
    uint64_t temp[9 * 2];                                                 \
    uint64_t *temp_ptr = temp;                                            \
    int count = 9;                                                        \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7              \n\t"                             \
        "1:                             \n\t"                             \
        "movq (%0), %%mm0               \n\t"                             \
        "movq (%0), %%mm1               \n\t"                             \
        "punpcklbw %%mm7, %%mm0         \n\t"                             \
        "punpckhbw %%mm7, %%mm1         \n\t"                             \
        "movq %%mm0, (%1)               \n\t"                             \
        "movq %%mm1, 9*8(%1)            \n\t"                             \
        "add $8, %1                     \n\t"                             \
        "add %3, %0                     \n\t"                             \
        "decl %2                        \n\t"                             \
        "jnz 1b                         \n\t"                             \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory"                                                        \
        );                                                                \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 2;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor %%mm7, %%mm7          \n\t" */                           \
        "1:                            \n\t"                              \
        "movq (%0), %%mm0              \n\t"                              \
        "movq 8(%0), %%mm1             \n\t"                              \
        "movq 16(%0), %%mm2            \n\t"                              \
        "movq 24(%0), %%mm3            \n\t"                              \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0                   \n\t"                              \
        "add %6, %1                    \n\t"                              \
        "decl %2                       \n\t"                              \
        "jnz 1b                        \n\t"                              \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 6 * (x86_reg)dstStride)                                 \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,             \
                                            stride, 8);                   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,              \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,         \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[9];                                                     \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,    \
                                           int stride)                    \
{                                                                         \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                    \
                                             stride, stride, 16);         \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                     \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,                \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgusb        "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMX2_OP(a, b, temp, size)           \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"
QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,       PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP,  AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,       PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                              \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                              \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                  \
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                        \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                              \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,    \
                                               S1, S2);                        \
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                    \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                        \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                        \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                           \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =    \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                            \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =    \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                       \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =    \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,  \
                                                         uint8_t *src, \
                                                         int stride)   \
{                                                                       \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
}                                                                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,  \
                                                         uint8_t *src, \
                                                         int stride)   \
{                                                                       \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,          \
                                            stride, SIZE);              \
}                                                                       \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,           1,       0)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,          -1,       0)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,           stride,  0)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,     -stride,  0)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,           stride,  1)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,           stride, -1)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,     -stride,  1)            \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)            \

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;

static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              int linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}

#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
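/* The clamping in emulated_edge_mc() moves a fully out-of-picture block to
 * the nearest position whose copy region still overlaps the picture; the
 * asm core then copies that region and replicates the border pixels
 * outward. A minimal scalar sketch of the horizontal replication step
 * (illustrative only; edge_replicate_row is not part of the real code):
 */
#if 0
static void edge_replicate_row(uint8_t *row, int start_x, int end_x,
                               int block_w)
{
    int x;
    for (x = 0; x < start_x; x++)     // left of the copied region
        row[x] = row[start_x];
    for (x = end_x; x < block_w; x++) // right of the copied region
        row[x] = row[end_x - 1];
}
#endif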
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);

static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
    uint8_t edge_buf[(h + 1) * stride];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if ((unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1 << shift)
    );
    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4 \n\t"
                "movq      %1, %%mm5 \n\t"
                "paddw     %2, %%mm4 \n\t"
                "paddw     %3, %%mm5 \n\t"
                "movq   %%mm4, %0    \n\t"
                "movq   %%mm5, %1    \n\t"
                "psrlw    $12, %%mm4 \n\t"
                "psrlw    $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif
static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
#define PREFETCH(name, op)                          \
static void name(void *mem, int stride, int h)      \
{                                                   \
    const uint8_t *p = mem;                         \
    do {                                            \
        __asm__ volatile (#op" %0" :: "m"(*p));     \
        p += stride;                                \
    } while (--h);                                  \
}

PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
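/* Both generated functions walk one line of the block per iteration and ask
 * the CPU to pull the start of that line into cache ahead of the actual
 * motion-compensation reads: prefetcht0 is the SSE hint into all cache
 * levels, prefetch the 3DNow! equivalent. */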
#include "h264_qpel_mmx.c"

void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    avg_pixels8_mmx2(dst, src, stride, 8);
}
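/* The mc00 (full-pel) positions need no filtering at all, so both the CAVS
 * and VC-1 entry points above simply reuse the generic 8x8/16x16 copy and
 * average kernels. */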
/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif

static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
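/* All IDCT wrappers follow the same pattern: run the named IDCT in place on
 * the coefficient block, then either store the clamped result (put) or add
 * it to the existing prediction (add) via the shared MMX clamp helpers. */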
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0 \n\t"
            "movq       %1, %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld     $31, %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps %0, %%xmm5 \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}
#define IF1(x) x
#define IF0(x)

#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss           0(%2), %%xmm5          \n"             \
        "movss           8(%2), %%xmm6          \n"             \
        "movss          24(%2), %%xmm7          \n"             \
        "shufps     $0, %%xmm5, %%xmm5          \n"             \
        "shufps     $0, %%xmm6, %%xmm6          \n"             \
        "shufps     $0, %%xmm7, %%xmm7          \n"             \
        "1:                                     \n"             \
        "movaps       (%0, %1), %%xmm0          \n"             \
        "movaps  0x400(%0, %1), %%xmm1          \n"             \
        "movaps  0x800(%0, %1), %%xmm2          \n"             \
        "movaps  0xc00(%0, %1), %%xmm3          \n"             \
        "movaps 0x1000(%0, %1), %%xmm4          \n"             \
        "mulps          %%xmm5, %%xmm0          \n"             \
        "mulps          %%xmm6, %%xmm1          \n"             \
        "mulps          %%xmm5, %%xmm2          \n"             \
        "mulps          %%xmm7, %%xmm3          \n"             \
        "mulps          %%xmm7, %%xmm4          \n"             \
 stereo("addps          %%xmm1, %%xmm0          \n")            \
        "addps          %%xmm1, %%xmm2          \n"             \
        "addps          %%xmm3, %%xmm0          \n"             \
        "addps          %%xmm4, %%xmm2          \n"             \
   mono("addps          %%xmm2, %%xmm0          \n")            \
        "movaps         %%xmm0, (%0, %1)        \n"             \
 stereo("movaps         %%xmm2, 0x400(%0, %1)   \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i)                                              \
        : "r"(samples[0] + len), "r"(matrix)                    \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "1:                                 \n"                 \
        "movaps     (%3, %0), %%xmm0        \n"                 \
 stereo("movaps       %%xmm0, %%xmm1        \n")                \
        "mulps        %%xmm4, %%xmm0        \n"                 \
 stereo("mulps        %%xmm5, %%xmm1        \n")                \
        "lea    1024(%3, %0), %1            \n"                 \
        "mov              %5, %2            \n"                 \
        "2:                                 \n"                 \
        "movaps         (%1), %%xmm2        \n"                 \
 stereo("movaps       %%xmm2, %%xmm3        \n")                \
        "mulps      (%4, %2), %%xmm2        \n"                 \
 stereo("mulps    16(%4, %2), %%xmm3        \n")                \
        "addps        %%xmm2, %%xmm0        \n"                 \
 stereo("addps        %%xmm3, %%xmm1        \n")                \
        "add           $1024, %1            \n"                 \
        "add             $32, %2            \n"                 \
        "jl                2b               \n"                 \
        "movaps       %%xmm0, (%3, %0)      \n"                 \
 stereo("movaps       %%xmm1, 1024(%3, %0)  \n")                \
        "add             $16, %0            \n"                 \
        "jl                1b               \n"                 \
        : "+&r"(i), "=&r"(j), "=&r"(k)                          \
        : "r"(samples[0] + len), "r"(matrix_simd + in_ch),      \
          "g"((intptr_t) - 32 * (in_ch - 1))                    \
        : "memory"                                              \
    );
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k;

    i = -len * sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0]   |
          matrix_cmp[3][1] | matrix_cmp[4][0]   |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1:                         \n"
            "sub             $8, %0     \n"
            "movss     (%2, %0), %%xmm4 \n"
            "movss    4(%2, %0), %%xmm5 \n"
            "shufps $0, %%xmm4, %%xmm4  \n"
            "shufps $0, %%xmm5, %%xmm5  \n"
            "movaps %%xmm4,   (%1, %0, 4) \n"
            "movaps %%xmm5, 16(%1, %0, 4) \n"
            "jg              1b         \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}
#if HAVE_6REGS
static void vector_fmul_window_3dnow2(float *dst, const float *src0,
                                      const float *src1, const float *win,
                                      int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                       \n"
        "pswapd (%5, %1), %%mm1   \n"
        "movq   (%5, %0), %%mm0   \n"
        "pswapd (%4, %1), %%mm5   \n"
        "movq   (%3, %0), %%mm4   \n"
        "movq      %%mm0, %%mm2   \n"
        "movq      %%mm1, %%mm3   \n"
        "pfmul     %%mm4, %%mm2   \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3   \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1   \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0   \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2   \n"
        "pfsub     %%mm0, %%mm1   \n"
        "pswapd    %%mm2, %%mm2   \n"
        "movq      %%mm1, (%2, %0) \n"
        "movq      %%mm2, (%2, %1) \n"
        "sub          $8, %1      \n"
        "add          $8, %0      \n"
        "jl           1b          \n"
        "femms                    \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS */
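/* Both windowing loops above compute the standard MDCT overlap-add
 * butterfly, walking inward from both ends of the window. A scalar sketch
 * of the same operation (for illustration; the generic C version in
 * dsputil is the canonical one):
 */
#if 0
static void vector_fmul_window_scalar(float *dst, const float *src0,
                                      const float *src1, const float *win,
                                      int len)
{
    int i, j;

    dst  += len;
    win  += len;
    src0 += len;

    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}
#endif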
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
                             const DCTELEM *block);

void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
                                        const uint8_t *diff, int w,
                                        int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                     int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                         const float *src1, int len);
void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                         const float *src1, int len);
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                    \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                        \
    do {                                                                                  \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                           \
    do {                                                                                        \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU;  \
    } while (0)
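/* Indexing convention behind these macros: in the qpel tables, entry
 * x + y * 4 holds the mcXY function for quarter-pel offset (x/4, y/4); in
 * the hpel tables, entries 0..3 are the integer, x2 (horizontal half-pel),
 * y2 (vertical half-pel) and xy2 (diagonal) cases. */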
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif
#if ARCH_X86_32 && HAVE_YASM
    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
}
static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

    c->prefetch = prefetch_mmx2;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        }
    }

    if (CONFIG_VP3_DECODER && HAVE_YASM) {
        c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
        c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
    }

    if (CONFIG_VP3_DECODER && HAVE_YASM)
        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;

    if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
                               avctx->codec_id == CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
        } else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
#endif
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    }
#endif
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
                               avctx->codec_id == CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;

#if HAVE_7REGS
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
}
static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
{
#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
    c->ac3_downmix             = ac3_downmix_sse;
#if HAVE_YASM
    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
    c->vector_fmul_add     = ff_vector_fmul_add_sse;
#endif

#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_sse;
#endif

    c->vector_clipf = vector_clipf_sse;

#if HAVE_YASM
    c->scalarproduct_float          = ff_scalarproduct_float_sse;
    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_sse;
    c->gmc = gmc_sse;
#endif
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }

#if HAVE_YASM
    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
#if HAVE_YASM
    else if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif
#endif
}
static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_YASM
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
}
static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX && HAVE_YASM
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    c->vector_fmul_reverse          = ff_vector_fmul_reverse_avx;
    c->vector_fmul_add              = ff_vector_fmul_add_avx;
#endif
}
void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & AV_CPU_FLAG_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & AV_CPU_FLAG_MMX2)
        av_log(avctx, AV_LOG_INFO, " mmx2");
    if (mm_flags & AV_CPU_FLAG_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & AV_CPU_FLAG_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & AV_CPU_FLAG_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int idct_algo = avctx->idct_algo;

        if (avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
            } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
                        CONFIG_VP6_DECODER) &&
                       idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_vp3_idct_put_sse2;
                    c->idct_add              = ff_vp3_idct_add_sse2;
                    c->idct                  = ff_vp3_idct_sse2;
                    c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
                } else {
                    c->idct_put              = ff_vp3_idct_put_mmx;
                    c->idct_add              = ff_vp3_idct_add_mmx;
                    c->idct                  = ff_vp3_idct_mmx;
                    c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
                }
            } else if (idct_algo == FF_IDCT_CAVS) {
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_idct_xvid_mmx2_put;
                    c->idct_add = ff_idct_xvid_mmx2_add;
                    c->idct     = ff_idct_xvid_mmx2;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMX2)
        dsputil_init_mmx2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
        dsputil_init_3dnow2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}