/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
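
/* Naming scheme for the constants above (a reading aid, not authoritative):
 * ff_pw_N is N replicated into packed 16-bit words, ff_pb_N is the byte
 * 0xNN replicated into packed bytes, and ff_pd_N is N replicated into
 * packed doubles, e.g. ff_pw_16 = eight words of 16 and ff_pb_80 =
 * sixteen bytes of 0x80. */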

#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3" ::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to access constants this way
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw        $15, %%"#regd"    \n\t"           \
        "psllw         $1, %%"#regd"    \n\t" ::)
#endif

// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq   "#rega", "#regr"            \n\t"           \
    "pand   "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq   "#rega", "#regr"            \n\t"           \
    "por    "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "psubb  "#regb", "#regr"            \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand   %%mm6, "#regb"              \n\t"                   \
    "pand   %%mm6, "#regd"              \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"

/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND  MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND  MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
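
/* For reference (sketch of the expansion, not authoritative): with the
 * definitions above, the template's DEF(put, pixels8) expanded to
 * put_no_rnd_pixels8_mmx in the first pass and to put_pixels8_mmx here,
 * with SET_RND and PAVGB/PAVGBP selecting the matching rounding flavour. */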

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in the MMX2 instruction set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB

#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx
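
/* The aliases above exploit the fact that a plain copy involves no
 * averaging, so the rounding and no-rounding put_pixels variants are
 * identical and the MMX2/3DNow! builds can reuse the MMX copy loops. */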

/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler would
    // generate some very strange code, so it is written out again.
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}

#define put_signed_pixels_clamped_mmx_half(off)         \
    "movq          "#off"(%2), %%mm1    \n\t"           \
    "movq     16 + "#off"(%2), %%mm2    \n\t"           \
    "movq     32 + "#off"(%2), %%mm3    \n\t"           \
    "movq     48 + "#off"(%2), %%mm4    \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"           \
    "paddb              %%mm0, %%mm1    \n\t"           \
    "paddb              %%mm0, %%mm2    \n\t"           \
    "paddb              %%mm0, %%mm3    \n\t"           \
    "paddb              %%mm0, %%mm4    \n\t"           \
    "movq               %%mm1, (%0)     \n\t"           \
    "movq               %%mm2, (%0, %3) \n\t"           \
    "movq               %%mm3, (%0, %3, 2) \n\t"        \
    "movq               %%mm4, (%0, %1) \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea       (%3, %3, 2), %1      \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea       (%0, %3, 4), %0      \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}

void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}

static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory");
}

static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa %%xmm0,    (%2)          \n\t"
        "movdqa %%xmm1,    (%2, %3)      \n\t"
        "movdqa %%xmm2,    (%2, %3, 2)   \n\t"
        "movdqa %%xmm3,    (%2, %4)      \n\t"
        "subl            $4, %0          \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz             1b              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2       ), %%xmm0      \n\t"
        "pavgb  (%2, %3   ), %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4   ), %%xmm3      \n\t"
        "movdqa %%xmm0,    (%2)          \n\t"
        "movdqa %%xmm1,    (%2, %3)      \n\t"
        "movdqa %%xmm2,    (%2, %3, 2)   \n\t"
        "movdqa %%xmm3,    (%2, %4)      \n\t"
        "subl            $4, %0          \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz             1b              \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory");
}

#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a);                                    \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)

static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory");
}

static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a);
}

static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15));
    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile (
        "mov          %7, %3            \n"
        "1:                             \n"
        "movzbl (%3, %4), %2            \n"
        "mov          %2, %k3           \n"
        "sub         %b1, %b3           \n"
        "add         %b0, %b3           \n" // t + l - tl
        "mov          %2, %1            \n" // t
        "cmp          %0, %2            \n"
        "cmovg        %0, %2            \n" // min(t, l)
        "cmovg        %1, %0            \n" // max(t, l)
        "cmp         %k3, %0            \n"
        "cmovg       %k3, %0            \n" // min(max(t, l), t + l - tl)
        "mov          %7, %3            \n"
        "cmp          %2, %0            \n"
        "cmovl        %2, %0            \n" // median of t, l, t + l - tl
        "add    (%6, %4), %b0           \n"
        "mov         %b0, (%5, %4)      \n"
        "inc          %4                \n"
        "jl           1b                \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        : "memory");
    *left     = l;
    *left_top = tl;
}

static inline void transpose4x4(uint8_t *dst, uint8_t *src,
                                x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd      (%1), %%mm0          \n\t"
        "add         %3, %1             \n\t"
        "movd      (%1), %%mm1          \n\t"
        "movd  (%1, %3, 1), %%mm2       \n\t"
        "movd  (%1, %3, 2), %%mm3       \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq      %%mm0, %%mm1         \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "add         %2, %0             \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd      %%mm0, (%0)          \n\t"
        "movd      %%mm1, (%0, %2, 1)   \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd      %%mm1, (%0, %2, 2)   \n\t"
        : "+&r"(dst),
          "+&r"(src)
        : "r"(dst_stride),
          "r"(src_stride)
        : "memory");
}

#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"

static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC));
    }
}

static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC));

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}

/* Draw the edges of width 'w' of an image of size width, height;
 * this MMX version can only handle w == 4, w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height));
    } else if (w == 16) {
        __asm__ volatile (
            "1:                                 \n\t"
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jb                1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height));
    } else {
        /* w == 4 */
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "movd           %%mm0, -4(%0)   \n\t"
            "movd      -4(%0, %2), %%mm1    \n\t"
            "punpcklbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movd           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height));
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w));
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w));
        }
    }
}

#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
    "movq               "#in7", "#m3"   \n\t" /* d */                     \
    "movq               "#in0", %%mm5   \n\t" /* D */                     \
    "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
    "psubw              %%mm5, %%mm4    \n\t" /* 20x1 - x4 */             \
    "movq               "#in1", %%mm5   \n\t" /* C */                     \
    "movq               "#in2", %%mm6   \n\t" /* B */                     \
    "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
    "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
    "paddw              %%mm6, %%mm6    \n\t" /* 2x2 */                   \
    "psubw              %%mm6, %%mm5    \n\t" /* -2x2 + x3 */             \
    "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
    "paddw              "#rnd", %%mm4   \n\t" /* x2 */                    \
    "paddw              %%mm4, %%mm5    \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                 $5, %%mm5    \n\t"                             \
    "packuswb           %%mm5, %%mm5    \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
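
/* Per the annotations inside QPEL_V_LOW, each invocation evaluates one
 * column of the MPEG-4 8-tap half-pel filter:
 *
 *     out = clip_uint8((20 * x1 - 6 * x2 + 3 * x3 - x4 + rnd) >> 5)
 *
 * where x1..x4 are sums of tap pairs mirrored around the output position
 * (the (a+b), (c+d), ... sums of the scalar fallbacks further down). Note
 * that the pw_20/pw_3 parameters are unused: the body reaches for
 * MANGLE(ff_pw_20)/MANGLE(ff_pw_3) directly and only "#rnd" varies. */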

#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %6, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        "movq      %%mm0, %5                \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
        "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
        "paddw        %6, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
        "psraw        $5, %%mm3             \n\t"                         \
        "movq         %5, %%mm1             \n\t"                         \
        "packuswb  %%mm3, %%mm1             \n\t"                         \
        OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw        %6, %%mm0             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw     %%mm5, %%mm3             \n\t" /* a */                 \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0O0P0Q0Q */          \
        "paddw     %%mm4, %%mm6             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm4      \n\t" /* 0P0Q0Q0P */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0Q0Q0P0O */          \
        "paddw     %%mm1, %%mm4             \n\t" /* c */                 \
        "paddw     %%mm2, %%mm5             \n\t" /* d */                 \
        "paddw     %%mm6, %%mm6             \n\t" /* 2b */                \
        "psubw     %%mm6, %%mm4             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */           \
        "psubw     %%mm5, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %6, %%mm4             \n\t"                         \
        "paddw     %%mm3, %%mm4             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm4             \n\t"                         \
        "packuswb  %%mm4, %%mm0             \n\t"                         \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
                                                                          \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,          \
                                                   uint8_t *src,          \
                                                   int dstStride,         \
                                                   int srcStride,         \
                                                   int h)                 \
{                                                                         \
    int i;                                                                \
    int16_t temp[16];                                                     \
                                                                          \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);        \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);        \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);        \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);        \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);        \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);        \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);        \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 +   \
                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);        \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 +   \
                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);        \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 +   \
                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);        \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 +   \
                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);        \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 +   \
                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);        \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 +   \
                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);        \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 +   \
                   (src[11] + src[16]) *  3 - (src[10] + src[16]);        \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 +   \
                   (src[12] + src[16]) *  3 - (src[11] + src[15]);        \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 +   \
                   (src[13] + src[15]) *  3 - (src[12] + src[14]);        \
        __asm__ volatile (                                                \
            "movq      (%0), %%mm0          \n\t"                         \
            "movq     8(%0), %%mm1          \n\t"                         \
            "paddw       %2, %%mm0          \n\t"                         \
            "paddw       %2, %%mm1          \n\t"                         \
            "psraw       $5, %%mm0          \n\t"                         \
            "psraw       $5, %%mm1          \n\t"                         \
            "packuswb %%mm1, %%mm0          \n\t"                         \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            "movq    16(%0), %%mm0          \n\t"                         \
            "movq    24(%0), %%mm1          \n\t"                         \
            "paddw       %2, %%mm0          \n\t"                         \
            "paddw       %2, %%mm1          \n\t"                         \
            "psraw       $5, %%mm0          \n\t"                         \
            "psraw       $5, %%mm1          \n\t"                         \
            "packuswb %%mm1, %%mm0          \n\t"                         \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                              \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory");                                                  \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
                                                 uint8_t *src,            \
                                                 int dstStride,           \
                                                 int srcStride,           \
                                                 int h)                   \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7             \n\t"                         \
        "1:                                 \n\t"                         \
        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
        "paddw        %5, %%mm6             \n\t"                         \
        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm0             \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
        "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
        "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
        "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
        "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
        "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
        "paddw        %5, %%mm1             \n\t"                         \
        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
        "psraw        $5, %%mm3             \n\t"                         \
        "packuswb  %%mm3, %%mm0             \n\t"                         \
        OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
                                                                          \
        "add          %3, %0                \n\t"                         \
        "add          %4, %1                \n\t"                         \
        "decl         %2                    \n\t"                         \
        "jnz          1b                    \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    int i;                                                                \
    int16_t temp[8];                                                      \
                                                                          \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +        \
                  (src[1] + src[3]) *  3 - (src[2] + src[4]);             \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +        \
                  (src[0] + src[4]) *  3 - (src[1] + src[5]);             \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +        \
                  (src[0] + src[5]) *  3 - (src[0] + src[6]);             \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +        \
                  (src[1] + src[6]) *  3 - (src[0] + src[7]);             \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +        \
                  (src[2] + src[7]) *  3 - (src[1] + src[8]);             \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +        \
                  (src[3] + src[8]) *  3 - (src[2] + src[8]);             \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +        \
                  (src[4] + src[8]) *  3 - (src[3] + src[7]);             \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +        \
                  (src[5] + src[7]) *  3 - (src[4] + src[6]);             \
        __asm__ volatile (                                                \
            "movq      (%0), %%mm0          \n\t"                         \
            "movq     8(%0), %%mm1          \n\t"                         \
            "paddw       %2, %%mm0          \n\t"                         \
            "paddw       %2, %%mm1          \n\t"                         \
            "psraw       $5, %%mm0          \n\t"                         \
            "psraw       $5, %%mm1          \n\t"                         \
            "packuswb %%mm1, %%mm0          \n\t"                         \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory");                                                  \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}

#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                            \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,        \
                                                     uint8_t *src,        \
                                                     int dstStride,       \
                                                     int srcStride)       \
{                                                                         \
    uint64_t temp[17 * 4];                                                \
    uint64_t *temp_ptr = temp;                                            \
    int count = 17;                                                       \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7         \n\t"                             \
        "1:                             \n\t"                             \
        "movq       (%0), %%mm0         \n\t"                             \
        "movq       (%0), %%mm1         \n\t"                             \
        "movq      8(%0), %%mm2         \n\t"                             \
        "movq      8(%0), %%mm3         \n\t"                             \
        "punpcklbw %%mm7, %%mm0         \n\t"                             \
        "punpckhbw %%mm7, %%mm1         \n\t"                             \
        "punpcklbw %%mm7, %%mm2         \n\t"                             \
        "punpckhbw %%mm7, %%mm3         \n\t"                             \
        "movq      %%mm0, (%1)          \n\t"                             \
        "movq      %%mm1, 17 * 8(%1)    \n\t"                             \
        "movq      %%mm2, 2 * 17 * 8(%1) \n\t"                            \
        "movq      %%mm3, 3 * 17 * 8(%1) \n\t"                            \
        "add          $8, %1            \n\t"                             \
        "add          %3, %0            \n\t"                             \
        "decl         %2                \n\t"                             \
        "jnz          1b                \n\t"                             \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory");                                                      \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 4;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor  %%mm7, %%mm7          \n\t" */                          \
        "1:                             \n\t"                             \
        "movq       (%0), %%mm0         \n\t"                             \
        "movq      8(%0), %%mm1         \n\t"                             \
        "movq     16(%0), %%mm2         \n\t"                             \
        "movq     24(%0), %%mm3         \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
                                                                          \
        "add        $136, %0            \n\t"                             \
        "add          %6, %1            \n\t"                             \
        "decl         %2                \n\t"                             \
        "jnz          1b                \n\t"                             \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 14 * (x86_reg)dstStride)                                \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,         \
                                                    uint8_t *src,         \
                                                    int dstStride,        \
                                                    int srcStride)        \
{                                                                         \
    uint64_t temp[9 * 2];                                                 \
    uint64_t *temp_ptr = temp;                                            \
    int count = 9;                                                        \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor      %%mm7, %%mm7         \n\t"                             \
        "1:                             \n\t"                             \
        "movq       (%0), %%mm0         \n\t"                             \
        "movq       (%0), %%mm1         \n\t"                             \
        "punpcklbw %%mm7, %%mm0         \n\t"                             \
        "punpckhbw %%mm7, %%mm1         \n\t"                             \
        "movq      %%mm0, (%1)          \n\t"                             \
        "movq      %%mm1, 9 * 8(%1)     \n\t"                             \
        "add          $8, %1            \n\t"                             \
        "add          %3, %0            \n\t"                             \
        "decl         %2                \n\t"                             \
        "jnz          1b                \n\t"                             \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory");                                                      \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 2;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor  %%mm7, %%mm7          \n\t" */                          \
        "1:                             \n\t"                             \
        "movq       (%0), %%mm0         \n\t"                             \
        "movq      8(%0), %%mm1         \n\t"                             \
        "movq     16(%0), %%mm2         \n\t"                             \
        "movq     24(%0), %%mm3         \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
                                                                          \
        "add         $72, %0            \n\t"                             \
        "add          %6, %1            \n\t"                             \
        "decl         %2                \n\t"                             \
        "jnz          1b                \n\t"                             \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 6 * (x86_reg)dstStride)                                 \
        : "memory");                                                      \
}                                                                         \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,             \
                                            stride, 8);                   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,              \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,         \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[9];                                                     \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,    \
                                           int stride)                    \
{                                                                         \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                    \
                                             stride, stride, 16);         \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                     \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,                \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}

#define PUT_OP(a, b, temp, size)                \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgusb        "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

#define AVG_MMX2_OP(a, b, temp, size)           \
    "mov"#size"        "#b", "#temp"    \n\t"   \
    "pavgb          "#temp", "#a"       \n\t"   \
    "mov"#size"        "#a", "#b"       \n\t"

QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,      PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,      PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
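
/* The instantiations above stamp out the full quarter-pel function grid;
 * each mcXY name encodes the quarter-sample offset, X horizontal and Y
 * vertical, e.g. put_qpel8_mc10_mmx2, avg_qpel16_mc33_3dnow or
 * put_no_rnd_qpel16_mc02_mmx2. */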

/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)  \
{                                                                             \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                 \
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                       \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
                                                                 uint8_t *src, \
                                                                 int stride)  \
{                                                                             \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,   \
                                               S1, S2);                       \
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                          \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                              \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                              \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                                 \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =      \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                  \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                             \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =      \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,        \
                                                         uint8_t *src,        \
                                                         int stride)          \
{                                                                             \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);      \
}                                                                             \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,        \
                                                         uint8_t *src,        \
                                                         int stride)          \
{                                                                             \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,                \
                                            stride, SIZE);                    \
}                                                                             \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)                   \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                  \

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)

void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

#endif /* HAVE_INLINE_ASM */
1864 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
1865 x86_reg linesize, x86_reg start_y,
1866 x86_reg end_y, x86_reg block_h,
1867 x86_reg start_x, x86_reg end_x,
1869 extern emu_edge_core_func ff_emu_edge_core_mmx;
1870 extern emu_edge_core_func ff_emu_edge_core_sse;
static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              x86_reg linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add + start_y) * linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y,
            block_h, start_x, end_x, block_w);
}
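/* Editorial note: the clamping above shifts a block lying entirely outside
 * the picture back to the nearest position where it still overlaps it by one
 * row/column, so the core function always has at least one valid source
 * pixel to replicate from when padding the edges. */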
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             x86_reg linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             x86_reg linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
    uint8_t edge_buf[(h + 1) * stride];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if ((unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd         %0, %%mm6 \n\t"
        "pxor      %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(*(uint32_t *)r4)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq     %0, %%mm4 \n\t"
                "movq     %1, %%mm5 \n\t"
                "paddw    %2, %%mm4 \n\t"
                "paddw    %3, %%mm5 \n\t"
                "movq  %%mm4, %0    \n\t"
                "movq  %%mm5, %1    \n\t"
                "psrlw   $12, %%mm4 \n\t"
                "psrlw   $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}
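/* Editorial note: taken together, the two asm blocks above compute a bilinear
 * blend of the four neighbouring source pixels per output pixel,
 *   dst = (src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy) +
 *          src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy + rounder) >> shift2,
 * with dx/dy the per-pixel subpel fractions kept in %%mm4/%%mm5 and s the
 * constant broadcast into %%mm6. */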
#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif

static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
#define PREFETCH(name, op)                      \
static void name(void *mem, int stride, int h)  \
{                                               \
    const uint8_t *p = mem;                     \
    do {                                        \
        __asm__ volatile (#op" %0" :: "m"(*p)); \
        p += stride;                            \
    } while (--h);                              \
}

PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#endif /* HAVE_INLINE_ASM */
#include "h264_qpel_mmx.c"

void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
#define CHROMA_MC(OP, NUM, DEPTH, OPT)                              \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT  \
                                      (uint8_t *dst, uint8_t *src,  \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmx2)
CHROMA_MC(avg, 2, 10, mmx2)
CHROMA_MC(put, 4, 10, mmx2)
CHROMA_MC(avg, 4, 10, mmx2)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
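/* For reference (editorial note): each CHROMA_MC() line above expands to a
 * single prototype, e.g. CHROMA_MC(put, 2, 10, mmx2) declares
 *   void ff_put_h264_chroma_mc2_10_mmx2(uint8_t *dst, uint8_t *src,
 *                                       int stride, int h, int x, int y);
 * matching the assembly implementations linked in from elsewhere. */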
#if HAVE_INLINE_ASM

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    avg_pixels8_mmx2(dst, src, stride, 8);
}
/* only used in VP3/5/6 */
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
//    START_TIMER
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "movq   (%1,%4), %%mm2          \n\t"
        "movq   (%2,%4), %%mm3          \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, (%3,%4)          \n\t"

        "movq   (%1,%4,2), %%mm0        \n\t"
        "movq   (%2,%4,2), %%mm1        \n\t"
        "movq   (%1,%5), %%mm2          \n\t"
        "movq   (%2,%5), %%mm3          \n\t"
        "lea    (%1,%4,4), %1           \n\t"
        "lea    (%2,%4,4), %2           \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3,%4,2)        \n\t"
        "movq   %%mm5, (%3,%5)          \n\t"
        "lea    (%3,%4,4), %3           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
        :"memory");
//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}

static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_vp_no_rnd_pixels8_l2_mmx(dst,     a,     b,     stride, h);
    put_vp_no_rnd_pixels8_l2_mmx(dst + 8, a + 8, b + 8, stride, h);
}
#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmx2)
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst   , src[0]   , stride, h);
    put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst   , src[0]   , stride, h);
    avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
#endif
/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq       %0, %%mm0   \n\t"
            "movq       %1, %%mm1   \n\t"
            "movq    %%mm0, %%mm2   \n\t"
            "movq    %%mm1, %%mm3   \n\t"
            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1   \n\t"
            "movq    %%mm3, %%mm4   \n\t"
            "pand    %%mm1, %%mm3   \n\t"
            "pandn   %%mm1, %%mm4   \n\t"
            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movq    %%mm3, %1      \n\t"
            "movq    %%mm0, %0      \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps %0, %%xmm5  \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}
#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss           0(%2), %%xmm5          \n"             \
        "movss           8(%2), %%xmm6          \n"             \
        "movss          24(%2), %%xmm7          \n"             \
        "shufps     $0, %%xmm5, %%xmm5          \n"             \
        "shufps     $0, %%xmm6, %%xmm6          \n"             \
        "shufps     $0, %%xmm7, %%xmm7          \n"             \
        "1:                                     \n"             \
        "movaps       (%0, %1), %%xmm0          \n"             \
        "movaps  0x400(%0, %1), %%xmm1          \n"             \
        "movaps  0x800(%0, %1), %%xmm2          \n"             \
        "movaps  0xc00(%0, %1), %%xmm3          \n"             \
        "movaps 0x1000(%0, %1), %%xmm4          \n"             \
        "mulps          %%xmm5, %%xmm0          \n"             \
        "mulps          %%xmm6, %%xmm1          \n"             \
        "mulps          %%xmm5, %%xmm2          \n"             \
        "mulps          %%xmm7, %%xmm3          \n"             \
        "mulps          %%xmm7, %%xmm4          \n"             \
 stereo("addps          %%xmm1, %%xmm0          \n")            \
        "addps          %%xmm1, %%xmm2          \n"             \
        "addps          %%xmm3, %%xmm0          \n"             \
        "addps          %%xmm4, %%xmm2          \n"             \
   mono("addps          %%xmm2, %%xmm0          \n")            \
        "movaps         %%xmm0, (%0, %1)        \n"             \
 stereo("movaps         %%xmm2, 0x400(%0, %1)   \n")            \
        "add               $16, %0              \n"             \
        "jl                 1b                  \n"             \
        : "+&r"(i)                                              \
        : "r"(samples[0] + len), "r"(matrix)                    \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );
#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "1:                                 \n"                 \
        "movaps       (%3, %0), %%xmm0      \n"                 \
 stereo("movaps         %%xmm0, %%xmm1      \n")                \
        "mulps          %%xmm4, %%xmm0      \n"                 \
 stereo("mulps          %%xmm5, %%xmm1      \n")                \
        "lea      1024(%3, %0), %1          \n"                 \
        "mov                %5, %2          \n"                 \
        "2:                                 \n"                 \
        "movaps           (%1), %%xmm2      \n"                 \
 stereo("movaps         %%xmm2, %%xmm3      \n")                \
        "mulps        (%4, %2), %%xmm2      \n"                 \
 stereo("mulps      16(%4, %2), %%xmm3      \n")                \
        "addps          %%xmm2, %%xmm0      \n"                 \
 stereo("addps          %%xmm3, %%xmm1      \n")                \
        "add             $1024, %1          \n"                 \
        "add               $32, %2          \n"                 \
        "jl                 2b              \n"                 \
        "movaps         %%xmm0, (%3, %0)    \n"                 \
 stereo("movaps         %%xmm1, 1024(%3, %0) \n")               \
        "add               $16, %0          \n"                 \
        "jl                 1b              \n"                 \
        : "+&r"(i), "=&r"(j), "=&r"(k)                          \
        : "r"(samples[0] + len), "r"(matrix_simd + in_ch),      \
          "g"((intptr_t) - 32 * (in_ch - 1))                    \
        : "memory"                                              \
    );
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k;

    i = -len * sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0]   |
          matrix_cmp[3][1] | matrix_cmp[4][0]   |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1:                                 \n"
            "sub             $8, %0             \n"
            "movss     (%2, %0), %%xmm4         \n"
            "movss    4(%2, %0), %%xmm5         \n"
            "shufps $0, %%xmm4, %%xmm4          \n"
            "shufps $0, %%xmm5, %%xmm5          \n"
            "movaps %%xmm4,   (%1, %0, 4)       \n"
            "movaps %%xmm5, 16(%1, %0, 4)       \n"
            "jg              1b                 \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

#if HAVE_6REGS
static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1:                             \n"
        "pswapd (%5, %1), %%mm1         \n"
        "movq   (%5, %0), %%mm0         \n"
        "pswapd (%4, %1), %%mm5         \n"
        "movq   (%3, %0), %%mm4         \n"
        "movq      %%mm0, %%mm2         \n"
        "movq      %%mm1, %%mm3         \n"
        "pfmul     %%mm4, %%mm2         \n" // src0[len + i] * win[len + i]
        "pfmul     %%mm5, %%mm3         \n" // src1[j]       * win[len + j]
        "pfmul     %%mm4, %%mm1         \n" // src0[len + i] * win[len + j]
        "pfmul     %%mm5, %%mm0         \n" // src1[j]       * win[len + i]
        "pfadd     %%mm3, %%mm2         \n"
        "pfsub     %%mm0, %%mm1         \n"
        "pswapd    %%mm2, %%mm2         \n"
        "movq      %%mm1, (%2, %0)      \n"
        "movq      %%mm2, (%2, %1)      \n"
        "sub          $8, %1            \n"
        "add          $8, %0            \n"
        "jl           1b                \n"
        "femms                          \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1:                             \n"
        "movaps      (%5, %1), %%xmm1   \n"
        "movaps      (%5, %0), %%xmm0   \n"
        "movaps      (%4, %1), %%xmm5   \n"
        "movaps      (%3, %0), %%xmm4   \n"
        "shufps $0x1b, %%xmm1, %%xmm1   \n"
        "shufps $0x1b, %%xmm5, %%xmm5   \n"
        "movaps        %%xmm0, %%xmm2   \n"
        "movaps        %%xmm1, %%xmm3   \n"
        "mulps         %%xmm4, %%xmm2   \n" // src0[len + i] * win[len + i]
        "mulps         %%xmm5, %%xmm3   \n" // src1[j]       * win[len + j]
        "mulps         %%xmm4, %%xmm1   \n" // src0[len + i] * win[len + j]
        "mulps         %%xmm5, %%xmm0   \n" // src1[j]       * win[len + i]
        "addps         %%xmm3, %%xmm2   \n"
        "subps         %%xmm0, %%xmm1   \n"
        "shufps $0x1b, %%xmm2, %%xmm2   \n"
        "movaps        %%xmm1, (%2, %0) \n"
        "movaps        %%xmm2, (%2, %1) \n"
        "sub              $16, %1       \n"
        "add              $16, %0       \n"
        "jl                1b           \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS */
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps   (%2, %0), %%xmm0      \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1      \n\t"
        "movaps 32(%2, %0), %%xmm2      \n\t"
        "movaps 48(%2, %0), %%xmm3      \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}

#endif /* HAVE_INLINE_ASM */
int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
                                        const uint8_t *diff, int w,
                                        int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
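/* For reference (editorial note): e.g. SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, )
 * fills c->put_qpel_pixels_tab[0][0..15] with put_qpel16_mc00_mmx2 ..
 * put_qpel16_mc33_mmx2; the 16 slots are indexed as x + y * 4 over the
 * quarter-pel positions (mc##x##y), matching H264_QPEL_FUNCS below. */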
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                  \
    do {                                                                     \
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU;  \
        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU;  \
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU;  \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc  ## x ## y ## _ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc  ## x ## y ## _10_ ## CPU; \
    } while (0)
static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    c->put_no_rnd_pixels_l2[0] = put_vp_no_rnd_pixels16_l2_mmx;
    c->put_no_rnd_pixels_l2[1] = put_vp_no_rnd_pixels8_l2_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
#if ARCH_X86_32
    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_YASM */
}
static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_mmx2;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
    }
#endif /* HAVE_INLINE_ASM */

    if (CONFIG_H264QPEL) {
#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */

        if (!high_bit_depth) {
#if HAVE_INLINE_ASM
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
#endif /* HAVE_INLINE_ASM */
        } else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
#endif
        }

#if HAVE_INLINE_ASM
        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
#endif /* HAVE_INLINE_ASM */
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmx2;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmx2;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmx2;
    }

    c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    }
#endif /* HAVE_YASM */
}
static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;

#if HAVE_7REGS
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif /* HAVE_YASM */
}
static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
                                  int mm_flags)
{
#if HAVE_6REGS && HAVE_INLINE_ASM
    c->vector_fmul_window = vector_fmul_window_3dnowext;
#endif
}
static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

#if HAVE_INLINE_ASM
    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
    c->ac3_downmix             = ac3_downmix_sse;

#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_sse;
#endif

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
    c->vector_fmul_add     = ff_vector_fmul_add_sse;

    c->scalarproduct_float          = ff_scalarproduct_float_sse;
    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_sse;
    c->gmc = gmc_sse;
#endif /* HAVE_YASM */
}
static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth = avctx->bits_per_raw_sample;

#if HAVE_INLINE_ASM
    const int high_bit_depth = bit_depth > 8;

    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }
#endif /* HAVE_INLINE_ASM */

#if HAVE_YASM
    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_YASM */
}
static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

#if HAVE_INLINE_ASM
    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM
    if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_YASM */
#endif /* HAVE_SSSE3 */
}
static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_YASM
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
}
static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX && HAVE_YASM
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    c->vector_fmul_reverse          = ff_vector_fmul_reverse_avx;
    c->vector_fmul_add              = ff_vector_fmul_add_avx;
#endif
}
void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_INLINE_ASM
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
            } else if (idct_algo == FF_IDCT_CAVS) {
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                    c->idct_put = ff_idct_xvid_mmx2_put;
                    c->idct_add = ff_idct_xvid_mmx2_add;
                    c->idct     = ff_idct_xvid_mmx2;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }
#endif /* HAVE_INLINE_ASM */

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmx2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
        dsputil_init_3dnowext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}