/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

#if HAVE_INLINE_ASM
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
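
/* Constant naming: ff_pw_* hold the value replicated in packed 16-bit
 * words, ff_pb_* in packed bytes, ff_pdw_* in packed 32-bit dwords and
 * ff_pd_* in packed doubles.  The 8-byte uint64_t constants are wide
 * enough for MMX registers only; the 16-byte xmm_reg ones can also be
 * loaded into SSE2 registers. */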
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to access the constants this way
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)

#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq  "#rega", "#regr"             \n\t"           \
    "pand  "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "paddb "#regb", "#regr"             \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq  "#rega", "#regr"             \n\t"           \
    "por   "#regb", "#regr"             \n\t"           \
    "pxor  "#rega", "#regb"             \n\t"           \
    "pand "#regfe", "#regb"             \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "psubb "#regb", "#regr"             \n\t"
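
/* Per-byte scalar model of the two averaging macros above (a sketch,
 * not compiled):
 *     no_rnd: (a & b) + (((a ^ b) & 0xFE) >> 1)   == (a + b)     >> 1
 *     rnd:    (a | b) - (((a ^ b) & 0xFE) >> 1)   == (a + b + 1) >> 1
 * The 0xFE mask in regfe stops psrlq from shifting the low bit of one
 * byte into the top of its lower neighbour, so a single 64-bit shift
 * acts as eight independent per-byte shifts. */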
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)              PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)        PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)               PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx
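
/* A plain copy is identical with and without rounding, and gains
 * nothing from MMX2 or 3DNow!, so all the put/put_no_rnd variants
 * above simply alias the plain MMX versions. */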
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;
    // An exact copy of the code above would make the compiler
    // generate some very strange code, hence this second asm block.
    __asm__ volatile (
        "movq       (%3), %%mm0         \n\t"
        "movq      8(%3), %%mm1         \n\t"
        "movq     16(%3), %%mm2         \n\t"
        "movq     24(%3), %%mm3         \n\t"
        "movq     32(%3), %%mm4         \n\t"
        "movq     40(%3), %%mm5         \n\t"
        "movq     48(%3), %%mm6         \n\t"
        "movq     56(%3), %%mm7         \n\t"
        "packuswb  %%mm1, %%mm0         \n\t"
        "packuswb  %%mm3, %%mm2         \n\t"
        "packuswb  %%mm5, %%mm4         \n\t"
        "packuswb  %%mm7, %%mm6         \n\t"
        "movq      %%mm0, (%0)          \n\t"
        "movq      %%mm2, (%0, %1)      \n\t"
        "movq      %%mm4, (%0, %1, 2)   \n\t"
        "movq      %%mm6, (%0, %2)      \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
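
/* The "clamped" in the name refers to packuswb above: it saturates
 * each signed 16-bit coefficient into the unsigned 0..255 pixel range
 * while packing two registers of words into one register of bytes. */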
#define put_signed_pixels_clamped_mmx_half(off)         \
    "movq          "#off"(%2), %%mm1    \n\t"           \
    "movq     16 + "#off"(%2), %%mm2    \n\t"           \
    "movq     32 + "#off"(%2), %%mm3    \n\t"           \
    "movq     48 + "#off"(%2), %%mm4    \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"           \
    "paddb              %%mm0, %%mm1    \n\t"           \
    "paddb              %%mm0, %%mm2    \n\t"           \
    "paddb              %%mm0, %%mm3    \n\t"           \
    "paddb              %%mm0, %%mm4    \n\t"           \
    "movq               %%mm1, (%0)     \n\t"           \
    "movq               %%mm2, (%0, %3) \n\t"           \
    "movq               %%mm3, (%0, %3, 2) \n\t"        \
    "movq               %%mm4, (%0, %1) \n\t"
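
/* The trick above: packsswb first clamps the coefficients to the
 * signed range -128..127, then adding the 0x80 bias from ff_pb_80
 * (loaded into mm0 by the caller) maps them to 0..255 with plain
 * byte adds. */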
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea         (%3, %3, 2), %1    \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0    \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq       (%2), %%mm0     \n\t"
            "movq      8(%2), %%mm1     \n\t"
            "movq     16(%2), %%mm2     \n\t"
            "movq     24(%2), %%mm3     \n\t"
            "movq         %0, %%mm4     \n\t"
            "movq         %1, %%mm6     \n\t"
            "movq      %%mm4, %%mm5     \n\t"
            "punpcklbw %%mm7, %%mm4     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw    %%mm4, %%mm0     \n\t"
            "paddsw    %%mm5, %%mm1     \n\t"
            "movq      %%mm6, %%mm5     \n\t"
            "punpcklbw %%mm7, %%mm6     \n\t"
            "punpckhbw %%mm7, %%mm5     \n\t"
            "paddsw    %%mm6, %%mm2     \n\t"
            "paddsw    %%mm5, %%mm3     \n\t"
            "packuswb  %%mm1, %%mm0     \n\t"
            "packuswb  %%mm3, %%mm2     \n\t"
            "movq      %%mm0, %0        \n\t"
            "movq      %%mm2, %1        \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
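
/* Bytes cannot be added to the 16-bit coefficients directly, so each
 * pixel row is zero-extended to words (punpcklbw/punpckhbw against
 * mm7 == 0), added with signed saturation and packed back with
 * unsigned saturation into the 0..255 range. */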
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1),     %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd  %%mm0,    (%2)           \n\t"
        "movd  %%mm1,    (%2, %3)       \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "movd  (%1),     %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd  %%mm0,    (%2)           \n\t"
        "movd  %%mm1,    (%2, %3)       \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "subl         $4, %0            \n\t"
        "jnz          1b                \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1),     %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq  %%mm0,    (%2)           \n\t"
        "movq  %%mm1,    (%2, %3)       \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "movq  (%1),     %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq  %%mm0,    (%2)           \n\t"
        "movq  %%mm1,    (%2, %3)       \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "subl         $4, %0            \n\t"
        "jnz          1b                \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align      3                \n\t"
        "1:                             \n\t"
        "movq   (%1),     %%mm0         \n\t"
        "movq  8(%1),     %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq  %%mm0,      (%2)         \n\t"
        "movq  %%mm4,     8(%2)         \n\t"
        "movq  %%mm1,      (%2, %3)     \n\t"
        "movq  %%mm5,     8(%2, %3)     \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "movq   (%1),     %%mm0         \n\t"
        "movq  8(%1),     %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq  %%mm0,      (%2)         \n\t"
        "movq  %%mm4,     8(%2)         \n\t"
        "movq  %%mm1,      (%2, %3)     \n\t"
        "movq  %%mm5,     8(%2, %3)     \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "subl         $4, %0            \n\t"
        "jnz          1b                \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1),        %%xmm0      \n\t"
        "movdqu (%1, %3),    %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4),    %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa %%xmm0,      (%2)        \n\t"
        "movdqa %%xmm1,      (%2, %3)    \n\t"
        "movdqa %%xmm2,      (%2, %3, 2) \n\t"
        "movdqa %%xmm3,      (%2, %4)    \n\t"
        "subl   $4, %0                   \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz    1b                       \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}
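
/* movdqu copes with the unaligned source rows; the movdqa stores
 * assume the destination is 16-byte aligned (they would fault
 * otherwise), which holds for the buffers these functions are
 * used on. */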
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1),        %%xmm0      \n\t"
        "movdqu (%1, %3),    %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4),    %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2),        %%xmm0      \n\t"
        "pavgb  (%2, %3),    %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4),    %%xmm3      \n\t"
        "movdqa %%xmm0,      (%2)        \n\t"
        "movdqa %%xmm1,      (%2, %3)    \n\t"
        "movdqa %%xmm2,      (%2, %3, 2) \n\t"
        "movdqa %%xmm3,      (%2, %4)    \n\t"
        "subl   $4, %0                   \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz    1b                       \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}
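
/* pavgb computes (a + b + 1) >> 1 per byte, giving the rounded
 * average of the loaded rows with the pixels already in the
 * destination. */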
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}

CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
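
/* The blocks pointer is biased by 128 * n (one 64-coefficient block
 * is 128 bytes) and %%REG_a counts up from -128 * n to 0, so "js 1b"
 * can loop on the sign flag without a separate compare. */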
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
        );
}
static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;

    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n"
        "mov          %2, %1            \n"
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n"
        "cmovg        %1, %0            \n"
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n"
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n"
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "add          $1, %4            \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = dst[w - 1];
    *left_top = top[w - 1];
}
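
/* Scalar model of what the cmov sequence computes (HuffYUV median
 * prediction; a sketch, not compiled):
 *     for (i = 0; i < w; i++) {
 *         pred   = mid_pred(l, top[i], l + top[i] - tl);
 *         dst[i] = diff[i] + pred;
 *         tl     = top[i];
 *         l      = dst[i];
 *     }
 */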
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd      (%1),        %%mm0       \n\t"
        "add       %3,          %1          \n\t"
        "movd      (%1),        %%mm1       \n\t"
        "movd      (%1, %3, 1), %%mm2       \n\t"
        "movd      (%1, %3, 2), %%mm3       \n\t"
        "punpcklbw %%mm1,       %%mm0       \n\t"
        "punpcklbw %%mm3,       %%mm2       \n\t"
        "movq      %%mm0,       %%mm1       \n\t"
        "punpcklwd %%mm2,       %%mm0       \n\t"
        "punpckhwd %%mm2,       %%mm1       \n\t"
        "movd      %%mm0,       (%0)        \n\t"
        "add       %2,          %0          \n\t"
        "punpckhdq %%mm0,       %%mm0       \n\t"
        "movd      %%mm0,       (%0)        \n\t"
        "movd      %%mm1,       (%0, %2, 1) \n\t"
        "punpckhdq %%mm1,       %%mm1       \n\t"
        "movd      %%mm1,       (%0, %2, 2) \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
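
/* In the macro above %0..%3 are the four 8-pixel rows (p0..p3) around
 * the edge being filtered, %4 is 2 * strength and %5 is ff_pb_FC, used
 * while clipping the filter delta; the filtered p1/p2 come back in
 * mm3/mm4 and the filtered p0/p3 in mm5/mm6, to be stored by the
 * caller. */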
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width x height;
 * this MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0, -8(%0)        \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1, (%0, %2)      \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0, -8(%0)        \n\t"
            "movq      %%mm0, -16(%0)       \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1,  (%0, %2)     \n\t"
            "movq      %%mm1, 8(%0, %2)     \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                    \
                   in0, in1, in2, in7, out, OP)                         \
    "paddw     "#m4", "#m3"             \n\t" /* x1 */                  \
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */                  \
    "pmullw    "#m3", %%mm4             \n\t" /* 20x1 */                \
    "movq     "#in7", "#m3"             \n\t" /* d */                   \
    "movq     "#in0", %%mm5             \n\t" /* D */                   \
    "paddw     "#m3", %%mm5             \n\t" /* x4 */                  \
    "psubw     %%mm5, %%mm4             \n\t" /* 20x1 - x4 */           \
    "movq     "#in1", %%mm5             \n\t" /* C */                   \
    "movq     "#in2", %%mm6             \n\t" /* B */                   \
    "paddw     "#m6", %%mm5             \n\t" /* x3 */                  \
    "paddw     "#m5", %%mm6             \n\t" /* x2 */                  \
    "paddw     %%mm6, %%mm6             \n\t" /* 2x2 */                 \
    "psubw     %%mm6, %%mm5             \n\t" /* -2x2 + x3 */           \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */          \
    "paddw    "#rnd", %%mm4             \n\t" /* 20x1 - x4 + rnd */     \
    "paddw     %%mm4, %%mm5             \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw        $5, %%mm5             \n\t"                           \
    "packuswb  %%mm5, %%mm5             \n\t"                           \
    OP(%%mm5, out, %%mm7, d)
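
/* One output step of the MPEG-4 quarter-pel FIR with taps
 * (-1, 3, -6, 20, 20, -6, 3, -1).  Scalar model of the macro above
 * (a sketch): out = av_clip_uint8((20 * x1 - 6 * x2 + 3 * x3 - x4
 * + rnd) >> 5), where x1..x4 are the sums of the symmetric tap pairs
 * named in the comments. */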
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %6, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        "movq      %%mm0, %5                \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
        "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
        "paddw        %6, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
        "psraw        $5, %%mm3             \n\t"                         \
        "movq         %5, %%mm1             \n\t"                         \
        "packuswb  %%mm3, %%mm1             \n\t"                         \
        OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw        %6, %%mm0             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw     %%mm5, %%mm3             \n\t" /* a */                 \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0O0P0Q0Q */          \
        "paddw     %%mm4, %%mm6             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm4      \n\t" /* 0P0Q0Q0P */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0Q0Q0P0O */          \
        "paddw     %%mm1, %%mm4             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm5             \n\t" /* d */                 \
        "paddw     %%mm6, %%mm6             \n\t" /* 2b */                \
        "psubw     %%mm6, %%mm4             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */           \
        "psubw     %%mm5, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %6, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm4             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm4             \n\t"                         \
        "packuswb  %%mm4, %%mm0             \n\t"                         \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,        \
                                                   uint8_t *src,        \
                                                   int dstStride,       \
                                                   int srcStride,       \
                                                   int h)               \
{                                                                       \
    int i;                                                              \
    int16_t temp[16];                                                   \
    /* quick HACK, XXX FIXME MUST be optimized */                       \
    for (i = 0; i < h; i++) {                                           \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);      \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);      \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);      \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);      \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);      \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);      \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);      \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);      \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);      \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);      \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);      \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);      \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);      \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
                   (src[11] + src[16]) *  3 - (src[10] + src[16]);      \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
                   (src[12] + src[16]) *  3 - (src[11] + src[15]);      \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
                   (src[13] + src[15]) *  3 - (src[12] + src[14]);      \
        __asm__ volatile (                                              \
            "movq      (%0), %%mm0      \n\t"                           \
            "movq     8(%0), %%mm1      \n\t"                           \
            "paddw       %2, %%mm0      \n\t"                           \
            "paddw       %2, %%mm1      \n\t"                           \
            "psraw       $5, %%mm0      \n\t"                           \
            "psraw       $5, %%mm1      \n\t"                           \
            "packuswb %%mm1, %%mm0      \n\t"                           \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                             \
            "movq    16(%0), %%mm0      \n\t"                           \
            "movq    24(%0), %%mm1      \n\t"                           \
            "paddw       %2, %%mm0      \n\t"                           \
            "paddw       %2, %%mm1      \n\t"                           \
            "psraw       $5, %%mm0      \n\t"                           \
            "psraw       $5, %%mm1      \n\t"                           \
            "packuswb %%mm1, %%mm0      \n\t"                           \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                            \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                        \
            : "memory"                                                  \
            );                                                          \
        dst += dstStride;                                               \
        src += srcStride;                                               \
    }                                                                   \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
                                                 uint8_t *src,            \
                                                 int dstStride,           \
                                                 int srcStride,           \
                                                 int h)                   \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %5, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
        "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
        "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %5, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm3             \n\t"                         \
        "packuswb  %%mm3, %%mm0             \n\t"                         \
        OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,         \
                                                  uint8_t *src,         \
                                                  int dstStride,        \
                                                  int srcStride,        \
                                                  int h)                \
{                                                                       \
    int i;                                                              \
    int16_t temp[8];                                                    \
    /* quick HACK, XXX FIXME MUST be optimized */                       \
    for (i = 0; i < h; i++) {                                           \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +      \
                  (src[1] + src[3]) *  3 - (src[2] + src[4]);           \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +      \
                  (src[0] + src[4]) *  3 - (src[1] + src[5]);           \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +      \
                  (src[0] + src[5]) *  3 - (src[0] + src[6]);           \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +      \
                  (src[1] + src[6]) *  3 - (src[0] + src[7]);           \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +      \
                  (src[2] + src[7]) *  3 - (src[1] + src[8]);           \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +      \
                  (src[3] + src[8]) *  3 - (src[2] + src[8]);           \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +      \
                  (src[4] + src[8]) *  3 - (src[3] + src[7]);           \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +      \
                  (src[5] + src[7]) *  3 - (src[4] + src[6]);           \
        __asm__ volatile (                                              \
            "movq      (%0), %%mm0      \n\t"                           \
            "movq     8(%0), %%mm1      \n\t"                           \
            "paddw       %2, %%mm0      \n\t"                           \
            "paddw       %2, %%mm1      \n\t"                           \
            "psraw       $5, %%mm0      \n\t"                           \
            "psraw       $5, %%mm1      \n\t"                           \
            "packuswb %%mm1, %%mm0      \n\t"                           \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                             \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                        \
            : "memory"                                                  \
            );                                                          \
        dst += dstStride;                                               \
        src += srcStride;                                               \
    }                                                                   \
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
                                                     uint8_t *src,      \
                                                     int dstStride,     \
                                                     int srcStride)     \
{                                                                       \
    uint64_t temp[17 * 4];                                              \
    uint64_t *temp_ptr = temp;                                          \
    int count = 17;                                                     \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "movq      8(%0), %%mm2         \n\t"                           \
        "movq      8(%0), %%mm3         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm2         \n\t"                           \
        "punpckhbw %%mm7, %%mm3         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 17 * 8(%1)    \n\t"                           \
        "movq      %%mm2, 2 * 17 * 8(%1) \n\t"                          \
        "movq      %%mm3, 3 * 17 * 8(%1) \n\t"                          \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 4;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq   8(%0), %%mm1            \n\t"                           \
        "movq  16(%0), %%mm2            \n\t"                           \
        "movq  24(%0), %%mm3            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
                                                                        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
                                                                        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
                                                                        \
        "add      $136, %0              \n\t"                           \
        "add        %6, %1              \n\t"                           \
        "decl       %2                  \n\t"                           \
        "jnz        1b                  \n\t"                           \
                                                                        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 14 * (x86_reg)dstStride)                              \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
                                                    uint8_t *src,       \
                                                    int dstStride,      \
                                                    int srcStride)      \
{                                                                       \
    uint64_t temp[9 * 2];                                               \
    uint64_t *temp_ptr = temp;                                          \
    int count = 9;                                                      \
                                                                        \
    /* FIXME unroll */                                                  \
    __asm__ volatile (                                                  \
        "pxor      %%mm7, %%mm7         \n\t"                           \
        "1:                             \n\t"                           \
        "movq       (%0), %%mm0         \n\t"                           \
        "movq       (%0), %%mm1         \n\t"                           \
        "punpcklbw %%mm7, %%mm0         \n\t"                           \
        "punpckhbw %%mm7, %%mm1         \n\t"                           \
        "movq      %%mm0, (%1)          \n\t"                           \
        "movq      %%mm1, 9*8(%1)       \n\t"                           \
        "add          $8, %1            \n\t"                           \
        "add          %3, %0            \n\t"                           \
        "decl         %2                \n\t"                           \
        "jnz          1b                \n\t"                           \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
        : "r"((x86_reg)srcStride)                                       \
        : "memory"                                                      \
        );                                                              \
                                                                        \
    temp_ptr = temp;                                                    \
    count    = 2;                                                       \
                                                                        \
    /* FIXME reorder for speed */                                       \
    __asm__ volatile (                                                  \
        /* "pxor  %%mm7, %%mm7          \n\t" */                        \
        "1:                             \n\t"                           \
        "movq    (%0), %%mm0            \n\t"                           \
        "movq   8(%0), %%mm1            \n\t"                           \
        "movq  16(%0), %%mm2            \n\t"                           \
        "movq  24(%0), %%mm3            \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
                                                                        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
                                                                        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                           \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
                                                                        \
        "add       $72, %0              \n\t"                           \
        "add        %6, %1              \n\t"                           \
        "decl       %2                  \n\t"                           \
        "jnz        1b                  \n\t"                           \
                                                                        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
          "g"(4 - 6 * (x86_reg)dstStride)                               \
        : "memory"                                                      \
        );                                                              \
}                                                                       \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
                                            stride, 8);                 \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
                                                stride, 8);             \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
                                 stride, 8);                            \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t temp[8];                                                   \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
                                 stride, 8);                            \
}                                                                       \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + stride, halfH, 8,     \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + stride + 1, halfH, 8, \
                                     stride, 9);                        \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[8 + 9];                                               \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
                                     stride, 9);                        \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
                                                                        \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                         int stride)                    \
{                                                                       \
    uint64_t half[9];                                                   \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
                                                stride, 9);             \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
}                                                                       \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                           int stride)                  \
{                                                                       \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
                                             stride, stride, 16);       \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
                                                 stride, 16);           \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
                                  stride, stride, 16);                  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t temp[32];                                                  \
    uint8_t * const half = (uint8_t*)temp;                              \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
                                                 stride);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,              \
                                  stride, stride, 16);                  \
}                                                                       \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + stride, halfH, 16,   \
                                      stride, 17);                      \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + stride + 1, halfH,   \
                                      16, stride, 17);                  \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[16 * 2 + 17 * 2];                                     \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
    uint8_t * const halfHV = ((uint8_t*)half);                          \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
                                                 16, 16);               \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
                                  16, 16);                              \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
                                      stride, 17);                      \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}                                                                       \
                                                                        \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                          int stride)                   \
{                                                                       \
    uint64_t half[17 * 2];                                              \
    uint8_t * const halfH = ((uint8_t*)half);                           \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
                                                 stride, 17);           \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgusb        "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMX2_OP(a, b, temp, size)           \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"
QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,       PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP,  AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,       PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
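
/* ROUNDER is the bias added before the >> 5 in the lowpass filters:
 * ff_pw_16 rounds to nearest for the put/avg variants, while the
 * _no_rnd_ variants use ff_pw_15 so the result is biased down by one,
 * which the no-rounding motion compensation path expects. */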
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                       \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);           \
}
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                 \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                       \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
                                               S1, S2);                 \
}
#define QPEL_2TAP(OPNAME, SIZE, MMX)                                    \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                        \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                        \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                           \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                            \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                       \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,  \
                                                         uint8_t *src,  \
                                                         int stride)    \
{                                                                       \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
}                                                                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,  \
                                                         uint8_t *src,  \
                                                         int stride)    \
{                                                                       \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,          \
                                            stride, SIZE);              \
}                                                                       \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)
QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
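
/* RV40's (3,3) quarter-pel position reduces to the halfpel xy2
 * average, so these entry points reuse the generic pixels*_xy2
 * helpers instead of a dedicated filter. */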
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              x86_reg linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src  += w - 1 - src_x;
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += 1 - block_w - src_x;
        src_x = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;

    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
1891 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1893 int block_w, int block_h,
1894 int src_x, int src_y, int w, int h)
1896 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1897 w, h, &ff_emu_edge_core_mmx);
1901 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1903 int block_w, int block_h,
1904 int src_x, int src_y, int w, int h)
1906 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1907 w, h, &ff_emu_edge_core_sse);
1909 #endif /* HAVE_YASM */
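
/*
 * Reference sketch (not part of the original file): the effect the edge
 * emulation above achieves. Conceptually every block pixel is fetched with
 * its picture coordinates clamped inside the frame, replicating border
 * pixels; the asm cores do this with one block copy plus row/column
 * replication. Coordinates here are relative to the picture origin.
 */
#if 0
static void emulated_edge_mc_c_sketch(uint8_t *buf, const uint8_t *src,
                                      int linesize, int block_w, int block_h,
                                      int src_x, int src_y, int w, int h)
{
    int x, y;
    for (y = 0; y < block_h; y++)
        for (x = 0; x < block_w; x++)
            buf[y * linesize + x] =
                src[av_clip(src_y + y, 0, h - 1) * linesize +
                    av_clip(src_x + x, 0, w - 1)];
}
#endif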

#if HAVE_INLINE_ASM

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);

static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox >> (16 + shift);
    const int iy   = oy >> (16 + shift);
    const int oxs  = ox >> 4;
    const int oys  = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
    uint8_t edge_buf[(h + 1) * stride];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if ((unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1 << shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq    %0, %%mm4 \n\t"
                "movq    %1, %%mm5 \n\t"
                "paddw   %2, %%mm4 \n\t"
                "paddw   %3, %%mm5 \n\t"
                "movq %%mm4, %0    \n\t"
                "movq %%mm5, %1    \n\t"
                "psrlw  $12, %%mm4 \n\t"
                "psrlw  $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}

#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif
static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
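
/*
 * Reference sketch (not part of the original file): the per-pixel bilinear
 * blend performed by the MMX loop in gmc() above, following its inline
 * comments; s = 1 << shift is kept broadcast in mm6, and dx/dy are the
 * per-pixel fractional offsets held in mm4/mm5.
 */
#if 0
static uint8_t gmc_pixel_c_sketch(const uint8_t *src, int stride,
                                  int dx, int dy, int s, int r, int shift)
{
    int v = src[0]          * (s - dx) * (s - dy) +
            src[1]          *  dx      * (s - dy) +
            src[stride]     * (s - dx) *  dy      +
            src[stride + 1] *  dx      *  dy      + r;
    return av_clip_uint8(v >> (2 * shift));
}
#endif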

#define PREFETCH(name, op)                      \
static void name(void *mem, int stride, int h)  \
{                                               \
    const uint8_t *p = mem;                     \
    do {                                        \
        __asm__ volatile (#op" %0" :: "m"(*p)); \
        p += stride;                            \
    } while (--h);                              \
}

PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#endif /* HAVE_INLINE_ASM */
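
/*
 * Illustration (not part of the original file): PREFETCH(prefetch_mmx2,
 * prefetcht0) above expands to roughly the following; one prefetch per row
 * pulls the block into cache ahead of the motion-compensation reads.
 */
#if 0
static void prefetch_mmx2_expanded(void *mem, int stride, int h)
{
    const uint8_t *p = mem;
    do {
        __asm__ volatile ("prefetcht0 %0" :: "m"(*p));
        p += stride;
    } while (--h);
}
#endif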
#include "h264_qpel.c"

void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmx2)
CHROMA_MC(avg, 2, 10, mmx2)
CHROMA_MC(put, 4, 10, mmx2)
CHROMA_MC(avg, 4, 10, mmx2)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
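
/*
 * Illustration (not part of the original file): CHROMA_MC(put, 8, 10, sse2)
 * above expands to the declaration
 *
 *     void ff_put_h264_chroma_mc8_10_sse2(uint8_t *dst, uint8_t *src,
 *                                         int stride, int h, int x, int y);
 */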

#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    avg_pixels8_mmx2(dst, src, stride, 8);
}

/* XXX: Those functions should be removed ASAP once all IDCTs are
 * converted to assembly. */
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
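
/*
 * Reference sketch (not part of the original file): the two store modes the
 * wrappers above combine with the Xvid IDCT. "put" overwrites the
 * destination with the clamped coefficients, "add" accumulates them onto
 * the existing prediction before clamping.
 */
#if 0
static void put_add_pixels_clamped_c_sketch(uint8_t *dest, int line_size,
                                            const DCTELEM *block, int add)
{
    int x, y;
    for (y = 0; y < 8; y++)
        for (x = 0; x < 8; x++)
            dest[y * line_size + x] =
                av_clip_uint8(block[y * 8 + x] +
                              (add ? dest[y * line_size + x] : 0));
}
#endif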

static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0 \n\t"
            "movq       %1, %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld     $31, %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}

static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps %0, %%xmm5 \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}
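
/*
 * Reference (not part of the original file): the scalar version of the
 * coupling step, cf. vorbis_inverse_coupling() in dsputil.c; the SIMD loops
 * above compute the same result branchlessly via sign masks.
 */
#if 0
static void vorbis_inverse_coupling_c_sketch(float *mag, float *ang,
                                             int blocksize)
{
    int i;
    for (i = 0; i < blocksize; i++) {
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) {
                ang[i] = mag[i] - ang[i];
            } else {
                float temp = ang[i];
                ang[i]     = mag[i];
                mag[i]    += temp;
            }
        } else {
            if (ang[i] > 0.0) {
                ang[i] += mag[i];
            } else {
                float temp = ang[i];
                ang[i]     = mag[i];
                mag[i]    -= temp;
            }
        }
    }
}
#endif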

#define IF1(x) x
#define IF0(x)

#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss           0(%2), %%xmm5          \n"             \
        "movss           8(%2), %%xmm6          \n"             \
        "movss          24(%2), %%xmm7          \n"             \
        "shufps     $0, %%xmm5, %%xmm5          \n"             \
        "shufps     $0, %%xmm6, %%xmm6          \n"             \
        "shufps     $0, %%xmm7, %%xmm7          \n"             \
        "1:                                     \n"             \
        "movaps       (%0, %1), %%xmm0          \n"             \
        "movaps  0x400(%0, %1), %%xmm1          \n"             \
        "movaps  0x800(%0, %1), %%xmm2          \n"             \
        "movaps  0xc00(%0, %1), %%xmm3          \n"             \
        "movaps 0x1000(%0, %1), %%xmm4          \n"             \
        "mulps          %%xmm5, %%xmm0          \n"             \
        "mulps          %%xmm6, %%xmm1          \n"             \
        "mulps          %%xmm5, %%xmm2          \n"             \
        "mulps          %%xmm7, %%xmm3          \n"             \
        "mulps          %%xmm7, %%xmm4          \n"             \
 stereo("addps          %%xmm1, %%xmm0          \n")            \
        "addps          %%xmm1, %%xmm2          \n"             \
        "addps          %%xmm3, %%xmm0          \n"             \
        "addps          %%xmm4, %%xmm2          \n"             \
   mono("addps          %%xmm2, %%xmm0          \n")            \
        "movaps         %%xmm0, (%0, %1)        \n"             \
 stereo("movaps         %%xmm2, 0x400(%0, %1)   \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i)                                              \
        : "r"(samples[0] + len), "r"(matrix)                    \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );

#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "1:                                     \n"             \
        "movaps       (%3, %0), %%xmm0          \n"             \
 stereo("movaps         %%xmm0, %%xmm1          \n")            \
        "mulps          %%xmm4, %%xmm0          \n"             \
 stereo("mulps          %%xmm5, %%xmm1          \n")            \
        "lea      1024(%3, %0), %1              \n"             \
        "mov                %5, %2              \n"             \
        "2:                                     \n"             \
        "movaps           (%1), %%xmm2          \n"             \
 stereo("movaps         %%xmm2, %%xmm3          \n")            \
        "mulps        (%4, %2), %%xmm2          \n"             \
 stereo("mulps      16(%4, %2), %%xmm3          \n")            \
        "addps          %%xmm2, %%xmm0          \n"             \
 stereo("addps          %%xmm3, %%xmm1          \n")            \
        "add             $1024, %1              \n"             \
        "add               $32, %2              \n"             \
        "jl                 2b                  \n"             \
        "movaps         %%xmm0, (%3, %0)        \n"             \
 stereo("movaps         %%xmm1, 1024(%3, %0)    \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i), "=&r"(j), "=&r"(k)                          \
        : "r"(samples[0] + len), "r"(matrix_simd + in_ch),      \
          "g"((intptr_t) - 32 * (in_ch - 1))                    \
        : "memory"                                              \
    );

static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k;

    i = -len * sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0]   |
          matrix_cmp[3][1] | matrix_cmp[4][0]   |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1:                           \n"
            "sub             $8, %0       \n"
            "movss     (%2, %0), %%xmm4   \n"
            "movss    4(%2, %0), %%xmm5   \n"
            "shufps $0, %%xmm4, %%xmm4    \n"
            "shufps $0, %%xmm5, %%xmm5    \n"
            "movaps %%xmm4,   (%1, %0, 4) \n"
            "movaps %%xmm5, 16(%1, %0, 4) \n"
            "jg 1b                        \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}
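
/*
 * Reference sketch (not part of the original file): the generic downmix the
 * SIMD paths above specialize. Each output channel is a matrix-weighted sum
 * of all input channels, computed in place; MIX5 hardcodes the common
 * 5.0-to-stereo and 5.0-to-mono cases, MIX_MISC handles arbitrary matrices.
 */
#if 0
static void ac3_downmix_c_sketch(float (*samples)[256], float (*matrix)[2],
                                 int out_ch, int in_ch, int len)
{
    int i, j, k;
    for (i = 0; i < len; i++) {
        float v[2] = { 0, 0 };
        for (k = 0; k < in_ch; k++)
            for (j = 0; j < out_ch; j++)
                v[j] += samples[k][i] * matrix[k][j];
        for (j = 0; j < out_ch; j++)
            samples[j][i] = v[j];
    }
}
#endif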

#if HAVE_6REGS
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                      \n"
        "pswapd (%5, %1), %%mm1  \n"
        "movq   (%5, %0), %%mm0  \n"
        "pswapd (%4, %1), %%mm5  \n"
        "movq   (%3, %0), %%mm4  \n"
        "movq      %%mm0, %%mm2  \n"
        "movq      %%mm1, %%mm3  \n"
        "pfmul     %%mm4, %%mm2  \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3  \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1  \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0  \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2  \n"
        "pfsub     %%mm0, %%mm1  \n"
        "pswapd    %%mm2, %%mm2  \n"
        "movq      %%mm1, (%2, %0) \n"
        "movq      %%mm2, (%2, %1) \n"
        "sub          $8, %1     \n"
        "add          $8, %0     \n"
        "jl           1b         \n"
        "femms                   \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}

static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS */
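
/*
 * Reference (not part of the original file): the scalar windowing the SIMD
 * versions above implement, cf. the C fallback in dsputil. dst[i] and
 * dst[j] are produced pairwise from both ends of the window, which is why
 * the asm runs two indices in opposite directions.
 */
#if 0
static void vector_fmul_window_c_sketch(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}
#endif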

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4 \n\t"
        "movss          %4, %%xmm5 \n\t"
        "shufps $0, %%xmm4, %%xmm4 \n\t"
        "shufps $0, %%xmm5, %%xmm5 \n\t"
        "1:                        \n\t"
        "movaps   (%2, %0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1 \n\t"
        "movaps 32(%2, %0), %%xmm2 \n\t"
        "movaps 48(%2, %0), %%xmm3 \n\t"
        "maxps      %%xmm4, %%xmm0 \n\t"
        "maxps      %%xmm4, %%xmm1 \n\t"
        "maxps      %%xmm4, %%xmm2 \n\t"
        "maxps      %%xmm4, %%xmm3 \n\t"
        "minps      %%xmm5, %%xmm0 \n\t"
        "minps      %%xmm5, %%xmm1 \n\t"
        "minps      %%xmm5, %%xmm2 \n\t"
        "minps      %%xmm5, %%xmm3 \n\t"
        "movaps     %%xmm0,   (%1, %0) \n\t"
        "movaps     %%xmm1, 16(%1, %0) \n\t"
        "movaps     %%xmm2, 32(%1, %0) \n\t"
        "movaps     %%xmm3, 48(%1, %0) \n\t"
        "sub           $64, %0     \n\t"
        "jge            1b         \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}

#endif /* HAVE_INLINE_ASM */
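
/*
 * Reference sketch (not part of the original file): a scalar equivalent of
 * the SSE clamp above (16 floats per iteration there), ignoring the NaN
 * edge cases where maxps/minps differ from a plain ternary.
 */
#if 0
static void vector_clipf_c_sketch(float *dst, const float *src,
                                  float min, float max, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = FFMIN(FFMAX(src[i], min), max);
}
#endif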

int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
                                        const uint8_t *diff, int w,
                                        int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                     int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                              \
    do {                                                                         \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                    \
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;     \
        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    } while (0)
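
/*
 * Illustration (not part of the original file): one entry of
 * SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, ) pastes together to
 *
 *     c->put_qpel_pixels_tab[0][5] = put_qpel16_mc11_mmx2;
 *
 * i.e. table slot x + y * 4 holds the routine for quarter-pel offset
 * (x/4, y/4), and a non-empty PREFIX selects the "ff_"-prefixed yasm
 * versions.
 */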

static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
#if ARCH_X86_32
    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}

static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_mmx2;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
    }
#endif /* HAVE_INLINE_ASM */

    if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */

        if (!high_bit_depth) {
#if HAVE_INLINE_ASM
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
#endif /* HAVE_INLINE_ASM */
        } else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
#endif /* HAVE_YASM */
        }

#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
    }

    c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    }
#endif /* HAVE_YASM */
}

static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;

#if HAVE_7REGS
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}

static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
                                  int mm_flags)
{
#if HAVE_6REGS && HAVE_INLINE_ASM
    c->vector_fmul_window = vector_fmul_window_3dnowext;
#endif
}

static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
    c->ac3_downmix             = ac3_downmix_sse;

#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_sse;
#endif

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
    c->vector_fmul_add     = ff_vector_fmul_add_sse;

    c->scalarproduct_float          = ff_scalarproduct_float_sse;
    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_sse;
    c->gmc = gmc_sse;
#endif /* HAVE_YASM */
}

static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth = avctx->bits_per_raw_sample;

#if HAVE_INLINE_ASM
    const int high_bit_depth = bit_depth > 8;

    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_YASM */
}

static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

#if HAVE_INLINE_ASM
    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_YASM */
#endif /* HAVE_SSSE3 */
}

static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_YASM
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
}

static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX && HAVE_YASM
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    c->vector_fmul_reverse          = ff_vector_fmul_reverse_avx;
    c->vector_fmul_add              = ff_vector_fmul_add_avx;
#endif
}

void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
            } else if (idct_algo == FF_IDCT_CAVS) {
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmx2_put;
                    c->idct_add = ff_idct_xvid_mmx2_add;
                    c->idct     = ff_idct_xvid_mmx2;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmx2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
        dsputil_init_3dnowext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}
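
/*
 * Design note (not part of the original file): the per-ISA initializers are
 * called in ascending order of capability and may overwrite pointers set by
 * an earlier pass, so each DSPContext member ends up holding the fastest
 * implementation the detected CPU supports; e.g. c->bswap_buf is set to
 * ff_bswap32_buf_sse2 by the SSE2 pass and then replaced by
 * ff_bswap32_buf_ssse3 on SSSE3-capable CPUs.
 */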