/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
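/* Naming convention for the constants below: ff_pw_* are packed 16-bit
 * words, ff_pb_* packed bytes and ff_pd_* packed doubles; the suffix is the
 * per-element value (hex for the pb tables). */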
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };

#if HAVE_INLINE_ASM
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// For shared libraries it is better to generate these constants in
// registers than to access them through memory.
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq   "#rega", "#regr"            \n\t"           \
    "pand   "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "paddb  "#regb", "#regr"            \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq   "#rega", "#regr"            \n\t"           \
    "por    "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq      $1, "#regb"             \n\t"           \
    "psubb  "#regb", "#regr"            \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq     $1, "#regb"              \n\t"                   \
    "psrlq     $1, "#regd"              \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq     $1, "#regd"              \n\t"                   \
    "psrlq     $1, "#regb"              \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y)                  x ## _no_rnd_ ## y ## _mmx
#define SET_RND                    MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f)   PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)          PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)         PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"
/***********************************/
/* MMX rounding */

#define DEF(x, y)                  x ## _ ## y ## _mmx
#define SET_RND                    MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f)   PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)          PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_avg_template.c"
/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_avg_template.c"
#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmx2          put_pixels16_mmx
#define put_pixels8_mmx2           put_pixels8_mmx
#define put_pixels4_mmx2           put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2   put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2    put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow         put_pixels16_mmx
#define put_pixels8_3dnow          put_pixels8_mmx
#define put_pixels4_3dnow          put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow   put_no_rnd_pixels8_mmx
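/* A full-pel copy involves no averaging and therefore no rounding, so the
 * rounded / no-rnd / MMX2 / 3DNow! put variants above can all share the
 * plain MMX implementation. */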
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");

    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code.
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off)        \
    "movq         "#off"(%2), %%mm1     \n\t"          \
    "movq    16 + "#off"(%2), %%mm2     \n\t"          \
    "movq    32 + "#off"(%2), %%mm3     \n\t"          \
    "movq    48 + "#off"(%2), %%mm4     \n\t"          \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"          \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"          \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"          \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"          \
    "paddb %%mm0, %%mm1                 \n\t"          \
    "paddb %%mm0, %%mm2                 \n\t"          \
    "paddb %%mm0, %%mm3                 \n\t"          \
    "paddb %%mm0, %%mm4                 \n\t"          \
    "movq %%mm1, (%0)                   \n\t"          \
    "movq %%mm2, (%0, %3)               \n\t"          \
    "movq %%mm3, (%0, %3, 2)            \n\t"          \
    "movq %%mm4, (%0, %1)               \n\t"
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea         (%3, %3, 2), %1    \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0    \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl           $4, %0           \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz            1b               \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2       ), %%xmm0      \n\t"
        "pavgb  (%2, %3   ), %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4   ), %%xmm3      \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl           $4, %0           \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz            1b               \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory");
}
static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        ".p2align      4                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        );
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}
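/* The cmov chain below implements HuffYUV median prediction; a scalar
 * sketch of what it computes per byte (illustrative reference only,
 * l = running left pixel, tl = top-left, mid_pred from libavutil): */
#if 0
static void median_pred_ref(uint8_t *dst, const uint8_t *top,
                            const uint8_t *diff, int w, int *l, int *tl)
{
    int i;
    for (i = 0; i < w; i++) {
        int pred = mid_pred(*l, top[i], *l + top[i] - *tl); /* median of 3 */
        *tl = top[i];
        *l  = dst[i] = (uint8_t)(diff[i] + pred);
    }
}
#endif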
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "add    (%6, %4), %b0           \n"
        "mov    %b0, (%5, %4)           \n"
        "add          $1, %4            \n"
        "jnz          1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        );
    *left     = dst[w - 1];
    *left_top = top[w - 1];
}
static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd      (%1), %%mm0          \n\t"
        "add         %3, %1             \n\t"
        "movd      (%1), %%mm1          \n\t"
        "movd (%1,%3,1), %%mm2          \n\t"
        "movd (%1,%3,2), %%mm3          \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq      %%mm0, %%mm1         \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "add         %2, %0             \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "movd      %%mm1, (%0,%2,1)     \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd      %%mm1, (%0,%2,2)     \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp,     src,              8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1:                              \n\t"
            "movd            (%0), %%mm0     \n\t"
            "punpcklbw      %%mm0, %%mm0     \n\t"
            "punpcklwd      %%mm0, %%mm0     \n\t"
            "punpckldq      %%mm0, %%mm0     \n\t"
            "movq           %%mm0, -8(%0)    \n\t"
            "movq           %%mm0, -16(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1     \n\t"
            "punpckhbw      %%mm1, %%mm1     \n\t"
            "punpckhwd      %%mm1, %%mm1     \n\t"
            "punpckhdq      %%mm1, %%mm1     \n\t"
            "movq           %%mm1,  (%0, %2) \n\t"
            "movq           %%mm1, 8(%0, %2) \n\t"
            "add               %1, %0        \n\t"
            "cmp               %3, %0        \n\t"
            "jb                1b            \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "movd           %%mm0, -4(%0)   \n\t"
            "movd      -4(%0, %2), %%mm1    \n\t"
            "punpcklbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movd           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw  "#m4", "#m3"                \n\t" /* x1 */                    \
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */                    \
    "pmullw "#m3", %%mm4                \n\t" /* 20x1 */                  \
    "movq "#in7", "#m3"                 \n\t" /* d */                     \
    "movq "#in0", %%mm5                 \n\t" /* D */                     \
    "paddw "#m3", %%mm5                 \n\t" /* x4 */                    \
    "psubw %%mm5, %%mm4                 \n\t" /* 20x1 - x4 */             \
    "movq "#in1", %%mm5                 \n\t" /* C */                     \
    "movq "#in2", %%mm6                 \n\t" /* B */                     \
    "paddw "#m6", %%mm5                 \n\t" /* x3 */                    \
    "paddw "#m5", %%mm6                 \n\t" /* x2 */                    \
    "paddw %%mm6, %%mm6                 \n\t" /* 2x2 */                   \
    "psubw %%mm6, %%mm5                 \n\t" /* -2x2 + x3 */             \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */            \
    "paddw "#rnd", %%mm4                \n\t" /* x2 */                    \
    "paddw %%mm4, %%mm5                 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw    $5, %%mm5                 \n\t"                             \
    "packuswb %%mm5, %%mm5              \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
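/* QPEL_V_LOW evaluates one column of the MPEG-4 half-pel lowpass filter
 * with taps [-1 3 -6 20 20 -6 3 -1] / 32; a scalar sketch over eight
 * consecutive samples s0..s7 (cf. the explicit 3DNow! fallback below):
 *
 *     out = av_clip_uint8((20 * (s3 + s4) - 6 * (s2 + s5)
 *                          + 3 * (s1 + s6) - (s0 + s7) + rnd) >> 5);
 */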
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %6, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        "movq      %%mm0, %5                \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
        "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
        "paddw        %6, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
        "psraw        $5, %%mm3             \n\t"                         \
        "movq         %5, %%mm1             \n\t"                         \
        "packuswb  %%mm3, %%mm1             \n\t"                         \
        OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw        %6, %%mm0             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw     %%mm5, %%mm3             \n\t" /* a */                 \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0O0P0Q0Q */          \
        "paddw     %%mm4, %%mm6             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm4      \n\t" /* 0P0Q0Q0P */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0Q0Q0P0O */          \
        "paddw     %%mm1, %%mm4             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm5             \n\t" /* d */                 \
        "paddw     %%mm6, %%mm6             \n\t" /* 2b */                \
        "psubw     %%mm6, %%mm4             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */           \
        "psubw     %%mm5, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %6, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm4             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm4             \n\t"                         \
        "packuswb  %%mm4, %%mm0             \n\t"                         \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,          \
                                                   uint8_t *src,          \
                                                   int dstStride,         \
                                                   int srcStride,         \
                                                   int h)                 \
{                                                                         \
    int i;                                                                \
    int16_t temp[16];                                                     \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
                   (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]);         \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
                   (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]);         \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
                   (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]);         \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
                   (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]);         \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
                   (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]);         \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
                   (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]);         \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
                   (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]);         \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 +   \
                   (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]);         \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 +   \
                   (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]);         \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 +   \
                   (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]);         \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 +   \
                   (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]);         \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 +   \
                   (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]);         \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 +   \
                   (src[10] + src[15]) * 3 - (src[ 9] + src[16]);         \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 +   \
                   (src[11] + src[16]) * 3 - (src[10] + src[16]);         \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 +   \
                   (src[12] + src[16]) * 3 - (src[11] + src[15]);         \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 +   \
                   (src[13] + src[15]) * 3 - (src[12] + src[14]);         \
        __asm__ volatile (                                                \
            "movq      (%0), %%mm0          \n\t"                         \
            "movq     8(%0), %%mm1          \n\t"                         \
            "paddw       %2, %%mm0          \n\t"                         \
            "paddw       %2, %%mm1          \n\t"                         \
            "psraw       $5, %%mm0          \n\t"                         \
            "psraw       $5, %%mm1          \n\t"                         \
            "packuswb %%mm1, %%mm0          \n\t"                         \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            "movq    16(%0), %%mm0          \n\t"                         \
            "movq    24(%0), %%mm1          \n\t"                         \
            "paddw       %2, %%mm0          \n\t"                         \
            "paddw       %2, %%mm1          \n\t"                         \
            "psraw       $5, %%mm0          \n\t"                         \
            "psraw       $5, %%mm1          \n\t"                         \
            "packuswb %%mm1, %%mm0          \n\t"                         \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                              \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory");                                                  \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
                                                 uint8_t *src,            \
                                                 int dstStride,           \
                                                 int srcStride,           \
                                                 int h)                   \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %5, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
        "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
        "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %5, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm3             \n\t"                         \
        "packuswb  %%mm3, %%mm0             \n\t"                         \
        OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    int i;                                                                \
    int16_t temp[8];                                                      \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +        \
                  (src[1] + src[3]) * 3 - (src[2] + src[4]);              \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +        \
                  (src[0] + src[4]) * 3 - (src[1] + src[5]);              \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +        \
                  (src[0] + src[5]) * 3 - (src[0] + src[6]);              \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +        \
                  (src[1] + src[6]) * 3 - (src[0] + src[7]);              \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +        \
                  (src[2] + src[7]) * 3 - (src[1] + src[8]);              \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +        \
                  (src[3] + src[8]) * 3 - (src[2] + src[8]);              \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +        \
                  (src[4] + src[8]) * 3 - (src[3] + src[7]);              \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +        \
                  (src[5] + src[7]) * 3 - (src[4] + src[6]);              \
        __asm__ volatile (                                                \
            "movq      (%0), %%mm0          \n\t"                         \
            "movq     8(%0), %%mm1          \n\t"                         \
            "paddw       %2, %%mm0          \n\t"                         \
            "paddw       %2, %%mm1          \n\t"                         \
            "psraw       $5, %%mm0          \n\t"                         \
            "psraw       $5, %%mm1          \n\t"                         \
            "packuswb %%mm1, %%mm0          \n\t"                         \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory");                                                  \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                            \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,        \
                                                     uint8_t *src,        \
                                                     int dstStride,       \
                                                     int srcStride)       \
{                                                                         \
    uint64_t temp[17 * 4];                                                \
    uint64_t *temp_ptr = temp;                                            \
    int count = 17;                                                       \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq       (%0), %%mm1             \n\t"                         \
        "movq      8(%0), %%mm2             \n\t"                         \
        "movq      8(%0), %%mm3             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpckhbw %%mm7, %%mm1             \n\t"                         \
        "punpcklbw %%mm7, %%mm2             \n\t"                         \
        "punpckhbw %%mm7, %%mm3             \n\t"                         \
        "movq      %%mm0, (%1)              \n\t"                         \
        "movq      %%mm1, 17 * 8(%1)        \n\t"                         \
        "movq      %%mm2, 2 * 17 * 8(%1)    \n\t"                         \
        "movq      %%mm3, 3 * 17 * 8(%1)    \n\t"                         \
        "add          $8, %1                \n\t"                         \
        "add          %3, %0                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory");                                                      \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 4;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor  %%mm7, %%mm7 \n\t" */                                   \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq      8(%0), %%mm1             \n\t"                         \
        "movq     16(%0), %%mm2             \n\t"                         \
        "movq     24(%0), %%mm3             \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0),  32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0),  40(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0),  48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0),  56(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0),  64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0),  72(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0),  80(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0),  88(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add        $136, %0                \n\t"                         \
        "add          %6, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 14 * (x86_reg)dstStride)                                \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,         \
                                                    uint8_t *src,         \
                                                    int dstStride,        \
                                                    int srcStride)        \
{                                                                         \
    uint64_t temp[9 * 2];                                                 \
    uint64_t *temp_ptr = temp;                                            \
    int count = 9;                                                        \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq       (%0), %%mm1             \n\t"                         \
        "punpcklbw %%mm7, %%mm0             \n\t"                         \
        "punpckhbw %%mm7, %%mm1             \n\t"                         \
        "movq      %%mm0, (%1)              \n\t"                         \
        "movq      %%mm1, 9*8(%1)           \n\t"                         \
        "add          $8, %1                \n\t"                         \
        "add          %3, %0                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory");                                                      \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 2;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor  %%mm7, %%mm7 \n\t" */                                   \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t"                         \
        "movq      8(%0), %%mm1             \n\t"                         \
        "movq     16(%0), %%mm2             \n\t"                         \
        "movq     24(%0), %%mm3             \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add          %4, %1                \n\t"                         \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1),     OP) \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add         $72, %0                \n\t"                         \
        "add          %6, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 6 * (x86_reg)dstStride)                                 \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,             \
                                            stride, 8);                   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,              \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,         \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[9];                                                     \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,    \
                                           int stride)                    \
{                                                                         \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                    \
                                             stride, stride, 16);         \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                     \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,                  \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgusb        "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMX2_OP(a, b, temp, size)           \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,       PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP,  AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,       PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
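/* Each quarter-pel position is approximated with at most a 2-tap half-pel
 * average instead of the 8-tap MPEG-4 filter above; e.g. the mc20 case
 * reduces to put_pixels_x2, i.e. dst[x] = (src[x] + src[x + 1] + 1) >> 1.
 * Cheaper, but not bit-exact with the spec filter. */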
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                             \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                 \
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)   \
{                                                                             \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,   \
                                               S1, S2);                       \
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                          \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_  ## MMX)                             \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_  ## MMX)                             \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                                 \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =      \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                  \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                             \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,        \
                                                         uint8_t *src,        \
                                                         int stride)          \
{                                                                             \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);      \
}                                                                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,        \
                                                         uint8_t *src,        \
                                                         int stride)          \
{                                                                             \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,                \
                                            stride, SIZE);                    \
}                                                                             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
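/* The wrapper below clamps (src_x, src_y) so that the asm core only ever
 * reads pixels inside the w x h frame, then lets the core replicate the
 * border into buf.  Illustrative call for a 16x16 block hanging 4 pixels
 * off the left frame edge (parameter order as in the wrappers further down):
 *
 *     emulated_edge_mc(buf, src, linesize, 16, 16, -4, 0, w, h,
 *                      &ff_emu_edge_core_mmx);
 */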
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              x86_reg linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;

    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
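
/* Usage sketch for the edge emulation above (the values are invented):
 * fetching a 17x17 area whose top-left corner lies at (-3, -2) in a
 * 64x48 picture could be done with
 *
 *     emulated_edge_mc_sse(buf, src - 2 * linesize - 3, linesize,
 *                          17, 17, -3, -2, 64, 48);
 *
 * which copies the in-picture samples into buf and replicates the picture
 * border into the out-of-bounds rows and columns, so motion compensation
 * can then read from buf without any bounds checks. */
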
#if HAVE_INLINE_ASM

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox >> (16 + shift);
    const int iy   = oy >> (16 + shift);
    const int oxs  = ox >> 4;
    const int oys  = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
    uint8_t edge_buf[(h + 1) * stride];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if ((unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1 << shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq      %0, %%mm4    \n\t"
                "movq      %1, %%mm5    \n\t"
                "paddw     %2, %%mm4    \n\t"
                "paddw     %3, %%mm5    \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw    $12, %%mm4    \n\t"
                "psrlw    $12, %%mm5    \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" //       dx *       dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) *       dy
                "pmullw    %%mm4, %%mm1 \n\t" //       dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
#if HAVE_YASM
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}

static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
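
/* What one output pixel of the MMX kernel above computes, in plain C: a
 * bilinear blend of the four neighbouring source samples with s = 1 << shift
 * and fractional offsets dx, dy in [0, s).  Sketch only (gmc_blend_ref_c is
 * an invented name); the full scalar fallback is ff_gmc_c(). */
static av_unused int gmc_blend_ref_c(const uint8_t *src, int stride,
                                     int dx, int dy, int shift, int r)
{
    const int s = 1 << shift;
    return ((s - dx) * (s - dy) * src[0]          +
            dx       * (s - dy) * src[1]          +
            (s - dx) * dy       * src[stride]     +
            dx       * dy       * src[stride + 1] + r) >> (2 * shift);
}
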
#define PREFETCH(name, op)                      \
static void name(void *mem, int stride, int h)  \
{                                               \
    const uint8_t *p = mem;                     \
    do {                                        \
        __asm__ volatile (#op" %0" :: "m"(*p)); \
        p += stride;                            \
    } while (--h);                              \
}

PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
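
/* For reference, PREFETCH(prefetch_mmx2, prefetcht0) above expands to a
 * function that touches one cache line per row (sketch of the expansion):
 *
 * static void prefetch_mmx2(void *mem, int stride, int h)
 * {
 *     const uint8_t *p = mem;
 *     do {
 *         __asm__ volatile ("prefetcht0 %0" :: "m"(*p));
 *         p += stride;
 *     } while (--h);
 * }
 */
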
#endif /* HAVE_INLINE_ASM */

#include "h264_qpel.c"
void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                              (uint8_t *dst, uint8_t *src,              \
                               int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmx2)
CHROMA_MC(avg, 2, 10, mmx2)
CHROMA_MC(put, 4, 10, mmx2)
CHROMA_MC(avg, 4, 10, mmx2)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
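
/* Expansion example: CHROMA_MC(put, 8, 10, sse2) above declares
 *
 *     void ff_put_h264_chroma_mc8_10_sse2(uint8_t *dst, uint8_t *src,
 *                                         int stride, int h, int x, int y);
 *
 * i.e. the 8-wide, 10-bit put variant for SSE2. */
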
#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    avg_pixels8_mmx2(dst, src, stride, 8);
}
/* only used in VP3/5/6 */
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
//    START_TIMER
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "movq   (%1,%4), %%mm2          \n\t"
        "movq   (%2,%4), %%mm3          \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, (%3,%4)          \n\t"

        "movq   (%1,%4,2), %%mm0        \n\t"
        "movq   (%2,%4,2), %%mm1        \n\t"
        "movq   (%1,%5), %%mm2          \n\t"
        "movq   (%2,%5), %%mm3          \n\t"
        "lea    (%1,%4,4), %1           \n\t"
        "lea    (%2,%4,4), %2           \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3,%4,2)        \n\t"
        "movq   %%mm5, (%3,%5)          \n\t"
        "lea    (%3,%4,4), %3           \n\t"

        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
        :"memory");
//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}

static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_vp_no_rnd_pixels8_l2_mmx(dst,   a,   b,   stride, h);
    put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
}
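
/* Scalar reference for the no-rounding average above: VP3-style averaging
 * truncates instead of rounding up, i.e. (a + b) >> 1 rather than the usual
 * (a + b + 1) >> 1.  Sketch only; the helper name is invented here. */
static av_unused void put_no_rnd_pixels8_l2_ref_c(uint8_t *dst,
                                                  const uint8_t *a,
                                                  const uint8_t *b,
                                                  int stride, int h)
{
    int x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (a[x] + b[x]) >> 1; // truncating average
        dst += stride;
        a   += stride;
        b   += stride;
    }
}
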
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmx2)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst   , src[0]   , stride, h);
    put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst   , src[0]   , stride, h);
    avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
#endif
/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0   \n\t"
            "movq       %1, %%mm1   \n\t"
            "movq    %%mm0, %%mm2   \n\t"
            "movq    %%mm1, %%mm3   \n\t"
            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1   \n\t"
            "movq    %%mm3, %%mm4   \n\t"
            "pand    %%mm1, %%mm3   \n\t"
            "pandn   %%mm1, %%mm4   \n\t"
            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movq    %%mm3, %1      \n\t"
            "movq    %%mm0, %0      \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps  %0, %%xmm5 \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps  %0, %%xmm0     \n\t"
            "movaps  %1, %%xmm1     \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}
#if HAVE_6REGS
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n"
        "femms                          \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS */
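
/* Scalar reference for vector_fmul_window above: an overlap-add windowing in
 * which the second half is read and written in reverse order, matching the
 * per-lane products noted in the asm comments.  Sketch only; the helper name
 * is invented here. */
static av_unused void vector_fmul_window_ref_c(float *dst, const float *src0,
                                               const float *src1,
                                               const float *win, int len)
{
    int i, j;

    dst  += len;
    win  += len;
    src0 += len;

    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];

        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}
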
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
#endif /* HAVE_INLINE_ASM */
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
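
/* Scalar reference for the primitives above: scalarproduct_int16 returns the
 * dot product of v1 and v2, while the and_madd variant also updates
 * v1[i] += v3[i] * mul as it accumulates.  Sketch only; the helper name is
 * invented here. */
static av_unused int32_t scalarproduct_and_madd_int16_ref_c(int16_t *v1,
                                                            const int16_t *v2,
                                                            const int16_t *v3,
                                                            int order, int mul)
{
    int32_t res = 0;

    while (order--) {
        res   += *v1 * *v2++; // accumulate before updating v1
        *v1++ += mul * *v3++;
    }
    return res;
}
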
void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
                                        const uint8_t *diff, int w,
                                        int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4 (uint8_t *dst, const uint8_t *src,
                                       int w, int left);
2495 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2496 const float *src1, int len);
2497 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2498 const float *src1, int len);
2500 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2501 const float *src2, int len);
2502 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2503 const float *src2, int len);
2505 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2506 int32_t min, int32_t max, unsigned int len);
2507 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2508 int32_t min, int32_t max, unsigned int len);
2509 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2510 int32_t min, int32_t max, unsigned int len);
2511 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2512 int32_t min, int32_t max, unsigned int len);
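
/* Scalar reference for the integer clip variants above (helper name invented
 * here): */
static av_unused void vector_clip_int32_ref_c(int32_t *dst, const int32_t *src,
                                              int32_t min, int32_t max,
                                              unsigned int len)
{
    unsigned int i;
    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
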
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                  \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;      \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;   \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;   \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;  \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                        \
    do {                                                                                  \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                              \
    do {                                                                                           \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU;    \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU;    \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU;    \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU;    \
    } while (0)
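
/* Usage note: the qpel tables are indexed by x + y * 4, where (x, y) is the
 * quarter-pel position.  For example, H264_QPEL_FUNCS(0, 0, sse2) expands to
 * (whitespace aside):
 *
 *     c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2;
 *     c->put_h264_qpel_pixels_tab[1][0] = put_h264_qpel8_mc00_sse2;
 *     c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2;
 *     c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_sse2;
 */
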
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    c->put_no_rnd_pixels_l2[0] = put_vp_no_rnd_pixels16_l2_mmx;
    c->put_no_rnd_pixels_l2[1] = put_vp_no_rnd_pixels8_l2_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
#if ARCH_X86_32
    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}
static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_mmx2;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
    }
#endif /* HAVE_INLINE_ASM */

    if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */

        if (!high_bit_depth) {
#if HAVE_INLINE_ASM
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
#endif /* HAVE_INLINE_ASM */
        } else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
#endif /* HAVE_YASM */
        }

#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
    }

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    }
#endif /* HAVE_YASM */
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}
static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
                                  int mm_flags)
{
#if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_3dnowext;
#endif
}
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;

#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_sse;
#endif

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
    c->vector_fmul_add     = ff_vector_fmul_add_sse;

    c->scalarproduct_float          = ff_scalarproduct_float_sse;
    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_sse;
    c->gmc = gmc_sse;
#endif /* HAVE_YASM */
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth = avctx->bits_per_raw_sample;

#if HAVE_INLINE_ASM
    const int high_bit_depth = bit_depth > 8;

    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;

    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    }

    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_YASM */
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

#if HAVE_SSSE3_INLINE
    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
#endif /* HAVE_SSSE3_INLINE */

#if HAVE_SSSE3_EXTERNAL
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }

    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;

    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;

    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX_EXTERNAL
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }

    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    c->vector_fmul_reverse          = ff_vector_fmul_reverse_avx;
    c->vector_fmul_add              = ff_vector_fmul_add_avx;
#endif /* HAVE_AVX_EXTERNAL */
}
void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmx2_put;
                    c->idct_add = ff_idct_xvid_mmx2_add;
                    c->idct     = ff_idct_xvid_mmx2;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmx2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
        dsputil_init_3dnowext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}