2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
34 #include "diracdsp_mmx.h"
39 /* pixel operations */
40 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
41 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
43 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
44 { 0x8000000080000000ULL, 0x8000000080000000ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
53 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
60 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
67 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
77 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
79 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
81 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
82 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
83 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
85 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
86 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
88 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
89 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
91 #define MOVQ_BFE(regd) \
93 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
94 "paddb %%"#regd", %%"#regd" \n\t" ::)
97 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
98 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
100 // for shared library it's better to use this way for accessing constants
102 #define MOVQ_BONE(regd) \
104 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
105 "psrlw $15, %%"#regd" \n\t" \
106 "packuswb %%"#regd", %%"#regd" \n\t" ::)
108 #define MOVQ_WTWO(regd) \
110 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
111 "psrlw $15, %%"#regd" \n\t" \
112 "psllw $1, %%"#regd" \n\t"::)
116 // using regr as temporary and for the output result
117 // first argument is unmodifed and second is trashed
118 // regfe is supposed to contain 0xfefefefefefefefe
119 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
120 "movq "#rega", "#regr" \n\t" \
121 "pand "#regb", "#regr" \n\t" \
122 "pxor "#rega", "#regb" \n\t" \
123 "pand "#regfe", "#regb" \n\t" \
124 "psrlq $1, "#regb" \n\t" \
125 "paddb "#regb", "#regr" \n\t"
127 #define PAVGB_MMX(rega, regb, regr, regfe) \
128 "movq "#rega", "#regr" \n\t" \
129 "por "#regb", "#regr" \n\t" \
130 "pxor "#rega", "#regb" \n\t" \
131 "pand "#regfe", "#regb" \n\t" \
132 "psrlq $1, "#regb" \n\t" \
133 "psubb "#regb", "#regr" \n\t"
135 // mm6 is supposed to contain 0xfefefefefefefefe
136 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
137 "movq "#rega", "#regr" \n\t" \
138 "movq "#regc", "#regp" \n\t" \
139 "pand "#regb", "#regr" \n\t" \
140 "pand "#regd", "#regp" \n\t" \
141 "pxor "#rega", "#regb" \n\t" \
142 "pxor "#regc", "#regd" \n\t" \
143 "pand %%mm6, "#regb" \n\t" \
144 "pand %%mm6, "#regd" \n\t" \
145 "psrlq $1, "#regb" \n\t" \
146 "psrlq $1, "#regd" \n\t" \
147 "paddb "#regb", "#regr" \n\t" \
148 "paddb "#regd", "#regp" \n\t"
150 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
151 "movq "#rega", "#regr" \n\t" \
152 "movq "#regc", "#regp" \n\t" \
153 "por "#regb", "#regr" \n\t" \
154 "por "#regd", "#regp" \n\t" \
155 "pxor "#rega", "#regb" \n\t" \
156 "pxor "#regc", "#regd" \n\t" \
157 "pand %%mm6, "#regb" \n\t" \
158 "pand %%mm6, "#regd" \n\t" \
159 "psrlq $1, "#regd" \n\t" \
160 "psrlq $1, "#regb" \n\t" \
161 "psubb "#regb", "#regr" \n\t" \
162 "psubb "#regd", "#regp" \n\t"
164 /***********************************/
165 /* MMX no rounding */
166 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
167 #define SET_RND MOVQ_WONE
168 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
169 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
170 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
172 #include "dsputil_mmx_rnd_template.c"
178 /***********************************/
181 #define DEF(x, y) x ## _ ## y ## _mmx
182 #define SET_RND MOVQ_WTWO
183 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
184 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
186 #include "dsputil_mmx_rnd_template.c"
194 /***********************************/
197 #define DEF(x) x ## _3dnow
198 #define PAVGB "pavgusb"
201 #include "dsputil_mmx_avg_template.c"
207 /***********************************/
210 #define DEF(x) x ## _mmx2
212 /* Introduced only in MMX2 set */
213 #define PAVGB "pavgb"
216 #include "dsputil_mmx_avg_template.c"
222 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
223 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
224 #define put_pixels16_mmx2 put_pixels16_mmx
225 #define put_pixels8_mmx2 put_pixels8_mmx
226 #define put_pixels4_mmx2 put_pixels4_mmx
227 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
228 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
229 #define put_pixels16_3dnow put_pixels16_mmx
230 #define put_pixels8_3dnow put_pixels8_mmx
231 #define put_pixels4_3dnow put_pixels4_mmx
232 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
233 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
235 /***********************************/
238 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
244 /* read the pixels */
249 "movq %3, %%mm0 \n\t"
250 "movq 8%3, %%mm1 \n\t"
251 "movq 16%3, %%mm2 \n\t"
252 "movq 24%3, %%mm3 \n\t"
253 "movq 32%3, %%mm4 \n\t"
254 "movq 40%3, %%mm5 \n\t"
255 "movq 48%3, %%mm6 \n\t"
256 "movq 56%3, %%mm7 \n\t"
257 "packuswb %%mm1, %%mm0 \n\t"
258 "packuswb %%mm3, %%mm2 \n\t"
259 "packuswb %%mm5, %%mm4 \n\t"
260 "packuswb %%mm7, %%mm6 \n\t"
261 "movq %%mm0, (%0) \n\t"
262 "movq %%mm2, (%0, %1) \n\t"
263 "movq %%mm4, (%0, %1, 2) \n\t"
264 "movq %%mm6, (%0, %2) \n\t"
265 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
268 pix += line_size * 4;
271 // if here would be an exact copy of the code above
272 // compiler would generate some very strange code
275 "movq (%3), %%mm0 \n\t"
276 "movq 8(%3), %%mm1 \n\t"
277 "movq 16(%3), %%mm2 \n\t"
278 "movq 24(%3), %%mm3 \n\t"
279 "movq 32(%3), %%mm4 \n\t"
280 "movq 40(%3), %%mm5 \n\t"
281 "movq 48(%3), %%mm6 \n\t"
282 "movq 56(%3), %%mm7 \n\t"
283 "packuswb %%mm1, %%mm0 \n\t"
284 "packuswb %%mm3, %%mm2 \n\t"
285 "packuswb %%mm5, %%mm4 \n\t"
286 "packuswb %%mm7, %%mm6 \n\t"
287 "movq %%mm0, (%0) \n\t"
288 "movq %%mm2, (%0, %1) \n\t"
289 "movq %%mm4, (%0, %1, 2) \n\t"
290 "movq %%mm6, (%0, %2) \n\t"
291 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
295 #define put_signed_pixels_clamped_mmx_half(off) \
296 "movq "#off"(%2), %%mm1 \n\t" \
297 "movq 16 + "#off"(%2), %%mm2 \n\t" \
298 "movq 32 + "#off"(%2), %%mm3 \n\t" \
299 "movq 48 + "#off"(%2), %%mm4 \n\t" \
300 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
301 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
302 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
303 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
304 "paddb %%mm0, %%mm1 \n\t" \
305 "paddb %%mm0, %%mm2 \n\t" \
306 "paddb %%mm0, %%mm3 \n\t" \
307 "paddb %%mm0, %%mm4 \n\t" \
308 "movq %%mm1, (%0) \n\t" \
309 "movq %%mm2, (%0, %3) \n\t" \
310 "movq %%mm3, (%0, %3, 2) \n\t" \
311 "movq %%mm4, (%0, %1) \n\t"
313 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
316 x86_reg line_skip = line_size;
320 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
321 "lea (%3, %3, 2), %1 \n\t"
322 put_signed_pixels_clamped_mmx_half(0)
323 "lea (%0, %3, 4), %0 \n\t"
324 put_signed_pixels_clamped_mmx_half(64)
325 : "+&r"(pixels), "=&r"(line_skip3)
326 : "r"(block), "r"(line_skip)
330 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
337 /* read the pixels */
344 "movq (%2), %%mm0 \n\t"
345 "movq 8(%2), %%mm1 \n\t"
346 "movq 16(%2), %%mm2 \n\t"
347 "movq 24(%2), %%mm3 \n\t"
348 "movq %0, %%mm4 \n\t"
349 "movq %1, %%mm6 \n\t"
350 "movq %%mm4, %%mm5 \n\t"
351 "punpcklbw %%mm7, %%mm4 \n\t"
352 "punpckhbw %%mm7, %%mm5 \n\t"
353 "paddsw %%mm4, %%mm0 \n\t"
354 "paddsw %%mm5, %%mm1 \n\t"
355 "movq %%mm6, %%mm5 \n\t"
356 "punpcklbw %%mm7, %%mm6 \n\t"
357 "punpckhbw %%mm7, %%mm5 \n\t"
358 "paddsw %%mm6, %%mm2 \n\t"
359 "paddsw %%mm5, %%mm3 \n\t"
360 "packuswb %%mm1, %%mm0 \n\t"
361 "packuswb %%mm3, %%mm2 \n\t"
362 "movq %%mm0, %0 \n\t"
363 "movq %%mm2, %1 \n\t"
364 : "+m"(*pix), "+m"(*(pix + line_size))
367 pix += line_size * 2;
372 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
373 int line_size, int h)
376 "lea (%3, %3), %%"REG_a" \n\t"
379 "movd (%1 ), %%mm0 \n\t"
380 "movd (%1, %3), %%mm1 \n\t"
381 "movd %%mm0, (%2) \n\t"
382 "movd %%mm1, (%2, %3) \n\t"
383 "add %%"REG_a", %1 \n\t"
384 "add %%"REG_a", %2 \n\t"
385 "movd (%1 ), %%mm0 \n\t"
386 "movd (%1, %3), %%mm1 \n\t"
387 "movd %%mm0, (%2) \n\t"
388 "movd %%mm1, (%2, %3) \n\t"
389 "add %%"REG_a", %1 \n\t"
390 "add %%"REG_a", %2 \n\t"
393 : "+g"(h), "+r"(pixels), "+r"(block)
394 : "r"((x86_reg)line_size)
399 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
400 int line_size, int h)
403 "lea (%3, %3), %%"REG_a" \n\t"
406 "movq (%1 ), %%mm0 \n\t"
407 "movq (%1, %3), %%mm1 \n\t"
408 "movq %%mm0, (%2) \n\t"
409 "movq %%mm1, (%2, %3) \n\t"
410 "add %%"REG_a", %1 \n\t"
411 "add %%"REG_a", %2 \n\t"
412 "movq (%1 ), %%mm0 \n\t"
413 "movq (%1, %3), %%mm1 \n\t"
414 "movq %%mm0, (%2) \n\t"
415 "movq %%mm1, (%2, %3) \n\t"
416 "add %%"REG_a", %1 \n\t"
417 "add %%"REG_a", %2 \n\t"
420 : "+g"(h), "+r"(pixels), "+r"(block)
421 : "r"((x86_reg)line_size)
426 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
427 int line_size, int h)
430 "lea (%3, %3), %%"REG_a" \n\t"
433 "movq (%1 ), %%mm0 \n\t"
434 "movq 8(%1 ), %%mm4 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq 8(%1, %3), %%mm5 \n\t"
437 "movq %%mm0, (%2) \n\t"
438 "movq %%mm4, 8(%2) \n\t"
439 "movq %%mm1, (%2, %3) \n\t"
440 "movq %%mm5, 8(%2, %3) \n\t"
441 "add %%"REG_a", %1 \n\t"
442 "add %%"REG_a", %2 \n\t"
443 "movq (%1 ), %%mm0 \n\t"
444 "movq 8(%1 ), %%mm4 \n\t"
445 "movq (%1, %3), %%mm1 \n\t"
446 "movq 8(%1, %3), %%mm5 \n\t"
447 "movq %%mm0, (%2) \n\t"
448 "movq %%mm4, 8(%2) \n\t"
449 "movq %%mm1, (%2, %3) \n\t"
450 "movq %%mm5, 8(%2, %3) \n\t"
451 "add %%"REG_a", %1 \n\t"
452 "add %%"REG_a", %2 \n\t"
455 : "+g"(h), "+r"(pixels), "+r"(block)
456 : "r"((x86_reg)line_size)
461 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
462 int line_size, int h)
466 "movdqu (%1 ), %%xmm0 \n\t"
467 "movdqu (%1, %3 ), %%xmm1 \n\t"
468 "movdqu (%1, %3, 2), %%xmm2 \n\t"
469 "movdqu (%1, %4 ), %%xmm3 \n\t"
470 "lea (%1, %3, 4), %1 \n\t"
471 "movdqa %%xmm0, (%2) \n\t"
472 "movdqa %%xmm1, (%2, %3) \n\t"
473 "movdqa %%xmm2, (%2, %3, 2) \n\t"
474 "movdqa %%xmm3, (%2, %4) \n\t"
476 "lea (%2, %3, 4), %2 \n\t"
478 : "+g"(h), "+r"(pixels), "+r"(block)
479 : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
484 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
485 int line_size, int h)
489 "movdqu (%1 ), %%xmm0 \n\t"
490 "movdqu (%1, %3 ), %%xmm1 \n\t"
491 "movdqu (%1, %3, 2), %%xmm2 \n\t"
492 "movdqu (%1, %4 ), %%xmm3 \n\t"
493 "lea (%1, %3, 4), %1 \n\t"
494 "pavgb (%2 ), %%xmm0 \n\t"
495 "pavgb (%2, %3 ), %%xmm1 \n\t"
496 "pavgb (%2, %3, 2), %%xmm2 \n\t"
497 "pavgb (%2, %4), %%xmm3 \n\t"
498 "movdqa %%xmm0, (%2) \n\t"
499 "movdqa %%xmm1, (%2, %3) \n\t"
500 "movdqa %%xmm2, (%2, %3, 2) \n\t"
501 "movdqa %%xmm3, (%2, %4) \n\t"
503 "lea (%2, %3, 4), %2 \n\t"
505 : "+g"(h), "+r"(pixels), "+r"(block)
506 : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
511 #define CLEAR_BLOCKS(name, n) \
512 static void name(DCTELEM *blocks) \
515 "pxor %%mm7, %%mm7 \n\t" \
516 "mov %1, %%"REG_a" \n\t" \
518 "movq %%mm7, (%0, %%"REG_a") \n\t" \
519 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
520 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
521 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
522 "add $32, %%"REG_a" \n\t" \
524 :: "r"(((uint8_t *)blocks) + 128 * n), \
529 CLEAR_BLOCKS(clear_blocks_mmx, 6)
530 CLEAR_BLOCKS(clear_block_mmx, 1)
532 static void clear_block_sse(DCTELEM *block)
535 "xorps %%xmm0, %%xmm0 \n"
536 "movaps %%xmm0, (%0) \n"
537 "movaps %%xmm0, 16(%0) \n"
538 "movaps %%xmm0, 32(%0) \n"
539 "movaps %%xmm0, 48(%0) \n"
540 "movaps %%xmm0, 64(%0) \n"
541 "movaps %%xmm0, 80(%0) \n"
542 "movaps %%xmm0, 96(%0) \n"
543 "movaps %%xmm0, 112(%0) \n"
549 static void clear_blocks_sse(DCTELEM *blocks)
552 "xorps %%xmm0, %%xmm0 \n"
553 "mov %1, %%"REG_a" \n"
555 "movaps %%xmm0, (%0, %%"REG_a") \n"
556 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
557 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
558 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
559 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
560 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
561 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
562 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
563 "add $128, %%"REG_a" \n"
565 :: "r"(((uint8_t *)blocks) + 128 * 6),
571 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
577 "movq (%1, %0), %%mm0 \n\t"
578 "movq (%2, %0), %%mm1 \n\t"
579 "paddb %%mm0, %%mm1 \n\t"
580 "movq %%mm1, (%2, %0) \n\t"
581 "movq 8(%1, %0), %%mm0 \n\t"
582 "movq 8(%2, %0), %%mm1 \n\t"
583 "paddb %%mm0, %%mm1 \n\t"
584 "movq %%mm1, 8(%2, %0) \n\t"
590 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
593 dst[i + 0] += src[i + 0];
597 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
598 const uint8_t *diff, int w,
599 int *left, int *left_top)
603 int l = *left & 0xff;
604 int tl = *left_top & 0xff;
609 "movzbl (%3, %4), %2 \n"
622 "add (%6, %4), %b0 \n"
623 "mov %b0, (%5, %4) \n"
626 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
627 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
634 #define H263_LOOP_FILTER \
635 "pxor %%mm7, %%mm7 \n\t" \
636 "movq %0, %%mm0 \n\t" \
637 "movq %0, %%mm1 \n\t" \
638 "movq %3, %%mm2 \n\t" \
639 "movq %3, %%mm3 \n\t" \
640 "punpcklbw %%mm7, %%mm0 \n\t" \
641 "punpckhbw %%mm7, %%mm1 \n\t" \
642 "punpcklbw %%mm7, %%mm2 \n\t" \
643 "punpckhbw %%mm7, %%mm3 \n\t" \
644 "psubw %%mm2, %%mm0 \n\t" \
645 "psubw %%mm3, %%mm1 \n\t" \
646 "movq %1, %%mm2 \n\t" \
647 "movq %1, %%mm3 \n\t" \
648 "movq %2, %%mm4 \n\t" \
649 "movq %2, %%mm5 \n\t" \
650 "punpcklbw %%mm7, %%mm2 \n\t" \
651 "punpckhbw %%mm7, %%mm3 \n\t" \
652 "punpcklbw %%mm7, %%mm4 \n\t" \
653 "punpckhbw %%mm7, %%mm5 \n\t" \
654 "psubw %%mm2, %%mm4 \n\t" \
655 "psubw %%mm3, %%mm5 \n\t" \
656 "psllw $2, %%mm4 \n\t" \
657 "psllw $2, %%mm5 \n\t" \
658 "paddw %%mm0, %%mm4 \n\t" \
659 "paddw %%mm1, %%mm5 \n\t" \
660 "pxor %%mm6, %%mm6 \n\t" \
661 "pcmpgtw %%mm4, %%mm6 \n\t" \
662 "pcmpgtw %%mm5, %%mm7 \n\t" \
663 "pxor %%mm6, %%mm4 \n\t" \
664 "pxor %%mm7, %%mm5 \n\t" \
665 "psubw %%mm6, %%mm4 \n\t" \
666 "psubw %%mm7, %%mm5 \n\t" \
667 "psrlw $3, %%mm4 \n\t" \
668 "psrlw $3, %%mm5 \n\t" \
669 "packuswb %%mm5, %%mm4 \n\t" \
670 "packsswb %%mm7, %%mm6 \n\t" \
671 "pxor %%mm7, %%mm7 \n\t" \
672 "movd %4, %%mm2 \n\t" \
673 "punpcklbw %%mm2, %%mm2 \n\t" \
674 "punpcklbw %%mm2, %%mm2 \n\t" \
675 "punpcklbw %%mm2, %%mm2 \n\t" \
676 "psubusb %%mm4, %%mm2 \n\t" \
677 "movq %%mm2, %%mm3 \n\t" \
678 "psubusb %%mm4, %%mm3 \n\t" \
679 "psubb %%mm3, %%mm2 \n\t" \
680 "movq %1, %%mm3 \n\t" \
681 "movq %2, %%mm4 \n\t" \
682 "pxor %%mm6, %%mm3 \n\t" \
683 "pxor %%mm6, %%mm4 \n\t" \
684 "paddusb %%mm2, %%mm3 \n\t" \
685 "psubusb %%mm2, %%mm4 \n\t" \
686 "pxor %%mm6, %%mm3 \n\t" \
687 "pxor %%mm6, %%mm4 \n\t" \
688 "paddusb %%mm2, %%mm2 \n\t" \
689 "packsswb %%mm1, %%mm0 \n\t" \
690 "pcmpgtb %%mm0, %%mm7 \n\t" \
691 "pxor %%mm7, %%mm0 \n\t" \
692 "psubb %%mm7, %%mm0 \n\t" \
693 "movq %%mm0, %%mm1 \n\t" \
694 "psubusb %%mm2, %%mm0 \n\t" \
695 "psubb %%mm0, %%mm1 \n\t" \
696 "pand %5, %%mm1 \n\t" \
697 "psrlw $2, %%mm1 \n\t" \
698 "pxor %%mm7, %%mm1 \n\t" \
699 "psubb %%mm7, %%mm1 \n\t" \
700 "movq %0, %%mm5 \n\t" \
701 "movq %3, %%mm6 \n\t" \
702 "psubb %%mm1, %%mm5 \n\t" \
703 "paddb %%mm1, %%mm6 \n\t"
705 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
707 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
708 const int strength = ff_h263_loop_filter_strength[qscale];
713 "movq %%mm3, %1 \n\t"
714 "movq %%mm4, %2 \n\t"
715 "movq %%mm5, %0 \n\t"
716 "movq %%mm6, %3 \n\t"
717 : "+m"(*(uint64_t*)(src - 2 * stride)),
718 "+m"(*(uint64_t*)(src - 1 * stride)),
719 "+m"(*(uint64_t*)(src + 0 * stride)),
720 "+m"(*(uint64_t*)(src + 1 * stride))
721 : "g"(2 * strength), "m"(ff_pb_FC)
726 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
728 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
729 const int strength = ff_h263_loop_filter_strength[qscale];
730 DECLARE_ALIGNED(8, uint64_t, temp)[4];
731 uint8_t *btemp = (uint8_t*)temp;
735 transpose4x4(btemp, src, 8, stride);
736 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
738 H263_LOOP_FILTER // 5 3 4 6
744 : "g"(2 * strength), "m"(ff_pb_FC)
748 "movq %%mm5, %%mm1 \n\t"
749 "movq %%mm4, %%mm0 \n\t"
750 "punpcklbw %%mm3, %%mm5 \n\t"
751 "punpcklbw %%mm6, %%mm4 \n\t"
752 "punpckhbw %%mm3, %%mm1 \n\t"
753 "punpckhbw %%mm6, %%mm0 \n\t"
754 "movq %%mm5, %%mm3 \n\t"
755 "movq %%mm1, %%mm6 \n\t"
756 "punpcklwd %%mm4, %%mm5 \n\t"
757 "punpcklwd %%mm0, %%mm1 \n\t"
758 "punpckhwd %%mm4, %%mm3 \n\t"
759 "punpckhwd %%mm0, %%mm6 \n\t"
760 "movd %%mm5, (%0) \n\t"
761 "punpckhdq %%mm5, %%mm5 \n\t"
762 "movd %%mm5, (%0, %2) \n\t"
763 "movd %%mm3, (%0, %2, 2) \n\t"
764 "punpckhdq %%mm3, %%mm3 \n\t"
765 "movd %%mm3, (%0, %3) \n\t"
766 "movd %%mm1, (%1) \n\t"
767 "punpckhdq %%mm1, %%mm1 \n\t"
768 "movd %%mm1, (%1, %2) \n\t"
769 "movd %%mm6, (%1, %2, 2) \n\t"
770 "punpckhdq %%mm6, %%mm6 \n\t"
771 "movd %%mm6, (%1, %3) \n\t"
773 "r"(src + 4 * stride),
774 "r"((x86_reg)stride),
775 "r"((x86_reg)(3 * stride))
780 /* Draw the edges of width 'w' of an image of size width, height
781 * this MMX version can only handle w == 8 || w == 16. */
782 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
783 int w, int h, int sides)
785 uint8_t *ptr, *last_line;
788 last_line = buf + (height - 1) * wrap;
794 "movd (%0), %%mm0 \n\t"
795 "punpcklbw %%mm0, %%mm0 \n\t"
796 "punpcklwd %%mm0, %%mm0 \n\t"
797 "punpckldq %%mm0, %%mm0 \n\t"
798 "movq %%mm0, -8(%0) \n\t"
799 "movq -8(%0, %2), %%mm1 \n\t"
800 "punpckhbw %%mm1, %%mm1 \n\t"
801 "punpckhwd %%mm1, %%mm1 \n\t"
802 "punpckhdq %%mm1, %%mm1 \n\t"
803 "movq %%mm1, (%0, %2) \n\t"
808 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
813 "movd (%0), %%mm0 \n\t"
814 "punpcklbw %%mm0, %%mm0 \n\t"
815 "punpcklwd %%mm0, %%mm0 \n\t"
816 "punpckldq %%mm0, %%mm0 \n\t"
817 "movq %%mm0, -8(%0) \n\t"
818 "movq %%mm0, -16(%0) \n\t"
819 "movq -8(%0, %2), %%mm1 \n\t"
820 "punpckhbw %%mm1, %%mm1 \n\t"
821 "punpckhwd %%mm1, %%mm1 \n\t"
822 "punpckhdq %%mm1, %%mm1 \n\t"
823 "movq %%mm1, (%0, %2) \n\t"
824 "movq %%mm1, 8(%0, %2) \n\t"
829 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
833 /* top and bottom (and hopefully also the corners) */
834 if (sides & EDGE_TOP) {
835 for (i = 0; i < h; i += 4) {
836 ptr = buf - (i + 1) * wrap - w;
839 "movq (%1, %0), %%mm0 \n\t"
840 "movq %%mm0, (%0) \n\t"
841 "movq %%mm0, (%0, %2) \n\t"
842 "movq %%mm0, (%0, %2, 2) \n\t"
843 "movq %%mm0, (%0, %3) \n\t"
848 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
849 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
854 if (sides & EDGE_BOTTOM) {
855 for (i = 0; i < h; i += 4) {
856 ptr = last_line + (i + 1) * wrap - w;
859 "movq (%1, %0), %%mm0 \n\t"
860 "movq %%mm0, (%0) \n\t"
861 "movq %%mm0, (%0, %2) \n\t"
862 "movq %%mm0, (%0, %2, 2) \n\t"
863 "movq %%mm0, (%0, %3) \n\t"
868 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
869 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
870 "r"(ptr + width + 2 * w)
876 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
877 in0, in1, in2, in7, out, OP) \
878 "paddw "#m4", "#m3" \n\t" /* x1 */ \
879 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
880 "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
881 "movq "#in7", "#m3" \n\t" /* d */ \
882 "movq "#in0", %%mm5 \n\t" /* D */ \
883 "paddw "#m3", %%mm5 \n\t" /* x4 */ \
884 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
885 "movq "#in1", %%mm5 \n\t" /* C */ \
886 "movq "#in2", %%mm6 \n\t" /* B */ \
887 "paddw "#m6", %%mm5 \n\t" /* x3 */ \
888 "paddw "#m5", %%mm6 \n\t" /* x2 */ \
889 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
890 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
891 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
892 "paddw "#rnd", %%mm4 \n\t" /* x2 */ \
893 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
894 "psraw $5, %%mm5 \n\t" \
895 "packuswb %%mm5, %%mm5 \n\t" \
896 OP(%%mm5, out, %%mm7, d)
898 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW) \
899 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, \
908 "pxor %%mm7, %%mm7 \n\t" \
910 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
911 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
912 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
913 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
914 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
915 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
916 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
917 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
918 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
919 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
920 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
921 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
922 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
923 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
924 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
925 "paddw %%mm3, %%mm5 \n\t" /* b */ \
926 "paddw %%mm2, %%mm6 \n\t" /* c */ \
927 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
928 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
929 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
930 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
931 "paddw %%mm4, %%mm0 \n\t" /* a */ \
932 "paddw %%mm1, %%mm5 \n\t" /* d */ \
933 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
934 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
935 "paddw %6, %%mm6 \n\t" \
936 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
937 "psraw $5, %%mm0 \n\t" \
938 "movq %%mm0, %5 \n\t" \
939 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
941 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
942 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
943 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
944 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
945 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
946 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
947 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
948 "paddw %%mm0, %%mm2 \n\t" /* b */ \
949 "paddw %%mm5, %%mm3 \n\t" /* c */ \
950 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
951 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
952 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
953 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
954 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
955 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
956 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
957 "paddw %%mm2, %%mm1 \n\t" /* a */ \
958 "paddw %%mm6, %%mm4 \n\t" /* d */ \
959 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
960 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
961 "paddw %6, %%mm1 \n\t" \
962 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
963 "psraw $5, %%mm3 \n\t" \
964 "movq %5, %%mm1 \n\t" \
965 "packuswb %%mm3, %%mm1 \n\t" \
966 OP_MMX2(%%mm1, (%1), %%mm4, q) \
967 /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
969 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
970 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
971 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
972 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
973 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
974 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
975 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
976 "paddw %%mm1, %%mm5 \n\t" /* b */ \
977 "paddw %%mm4, %%mm0 \n\t" /* c */ \
978 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
979 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
980 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
981 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
982 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
983 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
984 "paddw %%mm3, %%mm2 \n\t" /* d */ \
985 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
986 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
987 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
988 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
989 "paddw %%mm2, %%mm6 \n\t" /* a */ \
990 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
991 "paddw %6, %%mm0 \n\t" \
992 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
993 "psraw $5, %%mm0 \n\t" \
994 /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
995 /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
997 "paddw %%mm5, %%mm3 \n\t" /* a */ \
998 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
999 "paddw %%mm4, %%mm6 \n\t" /* b */ \
1000 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
1001 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
1002 "paddw %%mm1, %%mm4 \n\t" /* c */ \
1003 "paddw %%mm2, %%mm5 \n\t" /* d */ \
1004 "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
1005 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
1006 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
1007 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
1008 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
1009 "paddw %6, %%mm4 \n\t" \
1010 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
1011 "psraw $5, %%mm4 \n\t" \
1012 "packuswb %%mm4, %%mm0 \n\t" \
1013 OP_MMX2(%%mm0, 8(%1), %%mm4, q) \
1019 : "+a"(src), "+c"(dst), "+D"(h) \
1020 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
1021 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
1026 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, \
1034 /* quick HACK, XXX FIXME MUST be optimized */ \
1035 for (i = 0; i < h; i++) { \
1036 temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 + \
1037 (src[ 1] + src[ 3]) * 3 - (src[ 2] + src[ 4]); \
1038 temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 + \
1039 (src[ 0] + src[ 4]) * 3 - (src[ 1] + src[ 5]); \
1040 temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 + \
1041 (src[ 0] + src[ 5]) * 3 - (src[ 0] + src[ 6]); \
1042 temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 + \
1043 (src[ 1] + src[ 6]) * 3 - (src[ 0] + src[ 7]); \
1044 temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 + \
1045 (src[ 2] + src[ 7]) * 3 - (src[ 1] + src[ 8]); \
1046 temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 + \
1047 (src[ 3] + src[ 8]) * 3 - (src[ 2] + src[ 9]); \
1048 temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 + \
1049 (src[ 4] + src[ 9]) * 3 - (src[ 3] + src[10]); \
1050 temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 + \
1051 (src[ 5] + src[10]) * 3 - (src[ 4] + src[11]); \
1052 temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 + \
1053 (src[ 6] + src[11]) * 3 - (src[ 5] + src[12]); \
1054 temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 + \
1055 (src[ 7] + src[12]) * 3 - (src[ 6] + src[13]); \
1056 temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 + \
1057 (src[ 8] + src[13]) * 3 - (src[ 7] + src[14]); \
1058 temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + \
1059 (src[ 9] + src[14]) * 3 - (src[ 8] + src[15]); \
1060 temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + \
1061 (src[10] + src[15]) * 3 - (src[ 9] + src[16]); \
1062 temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + \
1063 (src[11] + src[16]) * 3 - (src[10] + src[16]); \
1064 temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + \
1065 (src[12] + src[16]) * 3 - (src[11] + src[15]); \
1066 temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + \
1067 (src[13] + src[15]) * 3 - (src[12] + src[14]); \
1068 __asm__ volatile ( \
1069 "movq (%0), %%mm0 \n\t" \
1070 "movq 8(%0), %%mm1 \n\t" \
1071 "paddw %2, %%mm0 \n\t" \
1072 "paddw %2, %%mm1 \n\t" \
1073 "psraw $5, %%mm0 \n\t" \
1074 "psraw $5, %%mm1 \n\t" \
1075 "packuswb %%mm1, %%mm0 \n\t" \
1076 OP_3DNOW(%%mm0, (%1), %%mm1, q) \
1077 "movq 16(%0), %%mm0 \n\t" \
1078 "movq 24(%0), %%mm1 \n\t" \
1079 "paddw %2, %%mm0 \n\t" \
1080 "paddw %2, %%mm1 \n\t" \
1081 "psraw $5, %%mm0 \n\t" \
1082 "psraw $5, %%mm1 \n\t" \
1083 "packuswb %%mm1, %%mm0 \n\t" \
1084 OP_3DNOW(%%mm0, 8(%1), %%mm1, q) \
1085 :: "r"(temp), "r"(dst), "m"(ROUNDER) \
1093 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, \
1099 __asm__ volatile ( \
1100 "pxor %%mm7, %%mm7 \n\t" \
1102 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
1103 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
1104 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
1105 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
1106 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
1107 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
1108 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
1109 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
1110 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
1111 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
1112 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
1113 "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
1114 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
1115 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
1116 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
1117 "paddw %%mm3, %%mm5 \n\t" /* b */ \
1118 "paddw %%mm2, %%mm6 \n\t" /* c */ \
1119 "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
1120 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
1121 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
1122 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
1123 "paddw %%mm4, %%mm0 \n\t" /* a */ \
1124 "paddw %%mm1, %%mm5 \n\t" /* d */ \
1125 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
1126 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
1127 "paddw %5, %%mm6 \n\t" \
1128 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
1129 "psraw $5, %%mm0 \n\t" \
1130 /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
1132 "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
1133 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
1134 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
1135 "paddw %%mm5, %%mm1 \n\t" /* a */ \
1136 "paddw %%mm6, %%mm2 \n\t" /* b */ \
1137 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
1138 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
1139 "paddw %%mm6, %%mm3 \n\t" /* c */ \
1140 "paddw %%mm5, %%mm4 \n\t" /* d */ \
1141 "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
1142 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
1143 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
1144 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
1145 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
1146 "paddw %5, %%mm1 \n\t" \
1147 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
1148 "psraw $5, %%mm3 \n\t" \
1149 "packuswb %%mm3, %%mm0 \n\t" \
1150 OP_MMX2(%%mm0, (%1), %%mm4, q) \
1156 : "+a"(src), "+c"(dst), "+d"(h) \
1157 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
1158 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
1163 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, \
1171 /* quick HACK, XXX FIXME MUST be optimized */ \
1172 for (i = 0; i < h; i++) { \
1173 temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + \
1174 (src[1] + src[3]) * 3 - (src[2] + src[4]); \
1175 temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + \
1176 (src[0] + src[4]) * 3 - (src[1] + src[5]); \
1177 temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + \
1178 (src[0] + src[5]) * 3 - (src[0] + src[6]); \
1179 temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + \
1180 (src[1] + src[6]) * 3 - (src[0] + src[7]); \
1181 temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + \
1182 (src[2] + src[7]) * 3 - (src[1] + src[8]); \
1183 temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + \
1184 (src[3] + src[8]) * 3 - (src[2] + src[8]); \
1185 temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + \
1186 (src[4] + src[8]) * 3 - (src[3] + src[7]); \
1187 temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + \
1188 (src[5] + src[7]) * 3 - (src[4] + src[6]); \
1189 __asm__ volatile ( \
1190 "movq (%0), %%mm0 \n\t" \
1191 "movq 8(%0), %%mm1 \n\t" \
1192 "paddw %2, %%mm0 \n\t" \
1193 "paddw %2, %%mm1 \n\t" \
1194 "psraw $5, %%mm0 \n\t" \
1195 "psraw $5, %%mm1 \n\t" \
1196 "packuswb %%mm1, %%mm0 \n\t" \
1197 OP_3DNOW(%%mm0, (%1), %%mm1, q) \
1198 :: "r"(temp), "r"(dst), "m"(ROUNDER) \
1206 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1207 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
1212 uint64_t temp[17 * 4]; \
1213 uint64_t *temp_ptr = temp; \
1216 /* FIXME unroll */ \
1217 __asm__ volatile ( \
1218 "pxor %%mm7, %%mm7 \n\t" \
1220 "movq (%0), %%mm0 \n\t" \
1221 "movq (%0), %%mm1 \n\t" \
1222 "movq 8(%0), %%mm2 \n\t" \
1223 "movq 8(%0), %%mm3 \n\t" \
1224 "punpcklbw %%mm7, %%mm0 \n\t" \
1225 "punpckhbw %%mm7, %%mm1 \n\t" \
1226 "punpcklbw %%mm7, %%mm2 \n\t" \
1227 "punpckhbw %%mm7, %%mm3 \n\t" \
1228 "movq %%mm0, (%1) \n\t" \
1229 "movq %%mm1, 17 * 8(%1) \n\t" \
1230 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1231 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1236 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1237 : "r"((x86_reg)srcStride) \
1244 /* FIXME reorder for speed */ \
1245 __asm__ volatile ( \
1246 /* "pxor %%mm7, %%mm7 \n\t" */ \
1248 "movq (%0), %%mm0 \n\t" \
1249 "movq 8(%0), %%mm1 \n\t" \
1250 "movq 16(%0), %%mm2 \n\t" \
1251 "movq 24(%0), %%mm3 \n\t" \
1252 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1253 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1255 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1257 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1259 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1260 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1262 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1263 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1265 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1266 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1268 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1269 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1271 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
1273 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
1275 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1276 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1278 "add $136, %0 \n\t" \
1283 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1284 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1285 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1286 "g"(4 - 14 * (x86_reg)dstStride) \
1291 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
1296 uint64_t temp[9 * 2]; \
1297 uint64_t *temp_ptr = temp; \
1300 /* FIXME unroll */ \
1301 __asm__ volatile ( \
1302 "pxor %%mm7, %%mm7 \n\t" \
1304 "movq (%0), %%mm0 \n\t" \
1305 "movq (%0), %%mm1 \n\t" \
1306 "punpcklbw %%mm7, %%mm0 \n\t" \
1307 "punpckhbw %%mm7, %%mm1 \n\t" \
1308 "movq %%mm0, (%1) \n\t" \
1309 "movq %%mm1, 9*8(%1) \n\t" \
1314 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1315 : "r"((x86_reg)srcStride) \
1322 /* FIXME reorder for speed */ \
1323 __asm__ volatile ( \
1324 /* "pxor %%mm7, %%mm7 \n\t" */ \
1326 "movq (%0), %%mm0 \n\t" \
1327 "movq 8(%0), %%mm1 \n\t" \
1328 "movq 16(%0), %%mm2 \n\t" \
1329 "movq 24(%0), %%mm3 \n\t" \
1330 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1331 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1333 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1335 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1337 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1339 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1341 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1342 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1344 "add $72, %0 \n\t" \
1349 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1350 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1351 /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1352 "g"(4 - 6 * (x86_reg)dstStride) \
1357 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1360 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
1363 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1367 uint8_t * const half = (uint8_t*)temp; \
1368 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1370 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1373 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1376 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1380 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1384 uint8_t * const half = (uint8_t*)temp; \
1385 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1387 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
1391 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1395 uint8_t * const half = (uint8_t*)temp; \
1396 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1397 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1400 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1403 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1406 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1410 uint8_t * const half = (uint8_t*)temp; \
1411 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1412 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
1416 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1419 uint64_t half[8 + 9]; \
1420 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1421 uint8_t * const halfHV = ((uint8_t*)half); \
1422 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1424 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1425 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1426 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1429 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1432 uint64_t half[8 + 9]; \
1433 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1434 uint8_t * const halfHV = ((uint8_t*)half); \
1435 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1437 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1439 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1440 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1443 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1446 uint64_t half[8 + 9]; \
1447 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1448 uint8_t * const halfHV = ((uint8_t*)half); \
1449 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1451 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1452 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1453 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1456 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1459 uint64_t half[8 + 9]; \
1460 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1461 uint8_t * const halfHV = ((uint8_t*)half); \
1462 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1464 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1466 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1467 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1470 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1473 uint64_t half[8 + 9]; \
1474 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1475 uint8_t * const halfHV = ((uint8_t*)half); \
1476 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1478 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1479 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1482 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1485 uint64_t half[8 + 9]; \
1486 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1487 uint8_t * const halfHV = ((uint8_t*)half); \
1488 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1490 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1491 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1494 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1497 uint64_t half[8 + 9]; \
1498 uint8_t * const halfH = ((uint8_t*)half); \
1499 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1501 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1502 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1505 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1508 uint64_t half[8 + 9]; \
1509 uint8_t * const halfH = ((uint8_t*)half); \
1510 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1512 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1514 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1517 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1521 uint8_t * const halfH = ((uint8_t*)half); \
1522 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1524 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1527 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1530 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1533 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1536 uint64_t temp[32]; \
1537 uint8_t * const half = (uint8_t*)temp; \
1538 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1540 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1543 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1546 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1547 stride, stride, 16); \
1550 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1553 uint64_t temp[32]; \
1554 uint8_t * const half = (uint8_t*)temp; \
1555 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1557 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1558 stride, stride, 16); \
1561 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1564 uint64_t temp[32]; \
1565 uint8_t * const half = (uint8_t*)temp; \
1566 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1568 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1571 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1574 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1577 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1580 uint64_t temp[32]; \
1581 uint8_t * const half = (uint8_t*)temp; \
1582 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1584 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1585 stride, stride, 16); \
1588 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1591 uint64_t half[16 * 2 + 17 * 2]; \
1592 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1593 uint8_t * const halfHV = ((uint8_t*)half); \
1594 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1596 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1598 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1600 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1603 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1606 uint64_t half[16 * 2 + 17 * 2]; \
1607 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1608 uint8_t * const halfHV = ((uint8_t*)half); \
1609 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1611 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1613 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1615 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1618 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1621 uint64_t half[16 * 2 + 17 * 2]; \
1622 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1623 uint8_t * const halfHV = ((uint8_t*)half); \
1624 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1626 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1628 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1630 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1634 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1637 uint64_t half[16 * 2 + 17 * 2]; \
1638 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1639 uint8_t * const halfHV = ((uint8_t*)half); \
1640 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1642 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1644 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1646 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1650 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1653 uint64_t half[16 * 2 + 17 * 2]; \
1654 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1655 uint8_t * const halfHV = ((uint8_t*)half); \
1656 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1658 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1660 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1663 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1666 uint64_t half[16 * 2 + 17 * 2]; \
1667 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1668 uint8_t * const halfHV = ((uint8_t*)half); \
1669 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1671 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1673 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1677 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1680 uint64_t half[17 * 2]; \
1681 uint8_t * const halfH = ((uint8_t*)half); \
1682 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1684 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1686 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1689 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1692 uint64_t half[17 * 2]; \
1693 uint8_t * const halfH = ((uint8_t*)half); \
1694 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1696 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1698 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1701 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1704 uint64_t half[17 * 2]; \
1705 uint8_t * const halfH = ((uint8_t*)half); \
1706 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1708 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1711 #define PUT_OP(a, b, temp, size) \
1712 "mov"#size" "#a", "#b" \n\t"
1714 #define AVG_3DNOW_OP(a, b, temp, size) \
1715 "mov"#size" "#b", "#temp" \n\t" \
1716 "pavgusb "#temp", "#a" \n\t" \
1717 "mov"#size" "#a", "#b" \n\t"
1719 #define AVG_MMX2_OP(a, b, temp, size) \
1720 "mov"#size" "#b", "#temp" \n\t" \
1721 "pavgb "#temp", "#a" \n\t" \
1722 "mov"#size" "#a", "#b" \n\t"
1724 QPEL_BASE(put_, ff_pw_16, _, PUT_OP, PUT_OP)
1725 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMX2_OP, AVG_3DNOW_OP)
1726 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1727 QPEL_OP(put_, ff_pw_16, _, PUT_OP, 3dnow)
1728 QPEL_OP(avg_, ff_pw_16, _, AVG_3DNOW_OP, 3dnow)
1729 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1730 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmx2)
1731 QPEL_OP(avg_, ff_pw_16, _, AVG_MMX2_OP, mmx2)
1732 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1734 /***********************************/
1735 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
1737 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
1738 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1742 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
1745 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
1746 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1750 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
1754 #define QPEL_2TAP(OPNAME, SIZE, MMX) \
1755 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
1756 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
1757 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
1758 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
1759 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
1760 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
1761 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
1762 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
1763 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
1764 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
1768 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
1770 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
1774 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
1777 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
1778 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
1779 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
1780 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
1781 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
1782 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
1783 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
1784 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
1786 QPEL_2TAP(put_, 16, mmx2)
1787 QPEL_2TAP(avg_, 16, mmx2)
1788 QPEL_2TAP(put_, 8, mmx2)
1789 QPEL_2TAP(avg_, 8, mmx2)
1790 QPEL_2TAP(put_, 16, 3dnow)
1791 QPEL_2TAP(avg_, 16, 3dnow)
1792 QPEL_2TAP(put_, 8, 3dnow)
1793 QPEL_2TAP(avg_, 8, 3dnow)
1797 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
1798 x86_reg linesize, x86_reg start_y,
1799 x86_reg end_y, x86_reg block_h,
1800 x86_reg start_x, x86_reg end_x,
1802 extern emu_edge_core_func ff_emu_edge_core_mmx;
1803 extern emu_edge_core_func ff_emu_edge_core_sse;
1805 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
1807 int block_w, int block_h,
1808 int src_x, int src_y,
1810 emu_edge_core_func *core_fn)
1812 int start_y, start_x, end_y, end_x, src_y_add = 0;
1815 src_y_add = h - 1 - src_y;
1817 } else if (src_y <= -block_h) {
1818 src_y_add = 1 - block_h - src_y;
1819 src_y = 1 - block_h;
1822 src += w - 1 - src_x;
1824 } else if (src_x <= -block_w) {
1825 src += 1 - block_w - src_x;
1826 src_x = 1 - block_w;
1829 start_y = FFMAX(0, -src_y);
1830 start_x = FFMAX(0, -src_x);
1831 end_y = FFMIN(block_h, h-src_y);
1832 end_x = FFMIN(block_w, w-src_x);
1833 assert(start_x < end_x && block_w > 0);
1834 assert(start_y < end_y && block_h > 0);
1836 // fill in the to-be-copied part plus all above/below
1837 src += (src_y_add + start_y) * linesize + start_x;
1839 core_fn(buf, src, linesize, start_y, end_y,
1840 block_h, start_x, end_x, block_w);
1844 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
1846 int block_w, int block_h,
1847 int src_x, int src_y, int w, int h)
1849 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1850 w, h, &ff_emu_edge_core_mmx);
1854 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
1856 int block_w, int block_h,
1857 int src_x, int src_y, int w, int h)
1859 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1860 w, h, &ff_emu_edge_core_sse);
1862 #endif /* HAVE_YASM */
1864 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1865 int linesize, int block_w, int block_h,
1866 int src_x, int src_y, int w, int h);
1868 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1869 int stride, int h, int ox, int oy,
1870 int dxx, int dxy, int dyx, int dyy,
1871 int shift, int r, int width, int height,
1872 emulated_edge_mc_func *emu_edge_fn)
1875 const int ix = ox >> (16 + shift);
1876 const int iy = oy >> (16 + shift);
1877 const int oxs = ox >> 4;
1878 const int oys = oy >> 4;
1879 const int dxxs = dxx >> 4;
1880 const int dxys = dxy >> 4;
1881 const int dyxs = dyx >> 4;
1882 const int dyys = dyy >> 4;
1883 const uint16_t r4[4] = { r, r, r, r };
1884 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1885 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1886 const uint64_t shift2 = 2 * shift;
1887 uint8_t edge_buf[(h + 1) * stride];
1890 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1891 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1892 const int dxh = dxy * (h - 1);
1893 const int dyw = dyx * (w - 1);
1894 if ( // non-constant fullpel offset (3% of blocks)
1895 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1896 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1897 // uses more than 16 bits of subpel mv (only at huge resolution)
1898 || (dxx | dxy | dyx | dyy) & 15) {
1899 // FIXME could still use mmx for some of the rows
1900 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1901 shift, r, width, height);
1905 src += ix + iy * stride;
1906 if ((unsigned)ix >= width - w ||
1907 (unsigned)iy >= height - h) {
1908 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1913 "movd %0, %%mm6 \n\t"
1914 "pxor %%mm7, %%mm7 \n\t"
1915 "punpcklwd %%mm6, %%mm6 \n\t"
1916 "punpcklwd %%mm6, %%mm6 \n\t"
1920 for (x = 0; x < w; x += 4) {
1921 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1922 oxs - dxys + dxxs * (x + 1),
1923 oxs - dxys + dxxs * (x + 2),
1924 oxs - dxys + dxxs * (x + 3) };
1925 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1926 oys - dyys + dyxs * (x + 1),
1927 oys - dyys + dyxs * (x + 2),
1928 oys - dyys + dyxs * (x + 3) };
1930 for (y = 0; y < h; y++) {
1932 "movq %0, %%mm4 \n\t"
1933 "movq %1, %%mm5 \n\t"
1934 "paddw %2, %%mm4 \n\t"
1935 "paddw %3, %%mm5 \n\t"
1936 "movq %%mm4, %0 \n\t"
1937 "movq %%mm5, %1 \n\t"
1938 "psrlw $12, %%mm4 \n\t"
1939 "psrlw $12, %%mm5 \n\t"
1940 : "+m"(*dx4), "+m"(*dy4)
1941 : "m"(*dxy4), "m"(*dyy4)
1945 "movq %%mm6, %%mm2 \n\t"
1946 "movq %%mm6, %%mm1 \n\t"
1947 "psubw %%mm4, %%mm2 \n\t"
1948 "psubw %%mm5, %%mm1 \n\t"
1949 "movq %%mm2, %%mm0 \n\t"
1950 "movq %%mm4, %%mm3 \n\t"
1951 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1952 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1953 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1954 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1956 "movd %4, %%mm5 \n\t"
1957 "movd %3, %%mm4 \n\t"
1958 "punpcklbw %%mm7, %%mm5 \n\t"
1959 "punpcklbw %%mm7, %%mm4 \n\t"
1960 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1961 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1963 "movd %2, %%mm5 \n\t"
1964 "movd %1, %%mm4 \n\t"
1965 "punpcklbw %%mm7, %%mm5 \n\t"
1966 "punpcklbw %%mm7, %%mm4 \n\t"
1967 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1968 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1969 "paddw %5, %%mm1 \n\t"
1970 "paddw %%mm3, %%mm2 \n\t"
1971 "paddw %%mm1, %%mm0 \n\t"
1972 "paddw %%mm2, %%mm0 \n\t"
1974 "psrlw %6, %%mm0 \n\t"
1975 "packuswb %%mm0, %%mm0 \n\t"
1976 "movd %%mm0, %0 \n\t"
1978 : "=m"(dst[x + y * stride])
1979 : "m"(src[0]), "m"(src[1]),
1980 "m"(src[stride]), "m"(src[stride + 1]),
1981 "m"(*r4), "m"(shift2)
1985 src += 4 - h * stride;
1991 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1992 int stride, int h, int ox, int oy,
1993 int dxx, int dxy, int dyx, int dyy,
1994 int shift, int r, int width, int height)
1996 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1997 width, height, &emulated_edge_mc_mmx);
2000 static void gmc_sse(uint8_t *dst, uint8_t *src,
2001 int stride, int h, int ox, int oy,
2002 int dxx, int dxy, int dyx, int dyy,
2003 int shift, int r, int width, int height)
2005 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2006 width, height, &emulated_edge_mc_sse);
2009 static void gmc_mmx(uint8_t *dst, uint8_t *src,
2010 int stride, int h, int ox, int oy,
2011 int dxx, int dxy, int dyx, int dyy,
2012 int shift, int r, int width, int height)
2014 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
2015 width, height, &ff_emulated_edge_mc_8);
2019 #define PREFETCH(name, op) \
2020 static void name(void *mem, int stride, int h) \
2022 const uint8_t *p = mem; \
2024 __asm__ volatile (#op" %0" :: "m"(*p)); \
2029 PREFETCH(prefetch_mmx2, prefetcht0)
2030 PREFETCH(prefetch_3dnow, prefetch)
2033 #include "h264_qpel_mmx.c"
2035 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
2036 int stride, int h, int x, int y);
2037 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
2038 int stride, int h, int x, int y);
2039 void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
2040 int stride, int h, int x, int y);
2042 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
2043 int stride, int h, int x, int y);
2044 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
2045 int stride, int h, int x, int y);
2046 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
2047 int stride, int h, int x, int y);
2049 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
2050 int stride, int h, int x, int y);
2051 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
2052 int stride, int h, int x, int y);
2054 void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2055 int stride, int h, int x, int y);
2056 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2057 int stride, int h, int x, int y);
2059 void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
2060 int stride, int h, int x, int y);
2061 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
2062 int stride, int h, int x, int y);
2064 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
2065 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
2066 (uint8_t *dst, uint8_t *src, \
2067 int stride, int h, int x, int y);
2069 CHROMA_MC(put, 2, 10, mmxext)
2070 CHROMA_MC(avg, 2, 10, mmxext)
2071 CHROMA_MC(put, 4, 10, mmxext)
2072 CHROMA_MC(avg, 4, 10, mmxext)
2073 CHROMA_MC(put, 8, 10, sse2)
2074 CHROMA_MC(avg, 8, 10, sse2)
2075 CHROMA_MC(put, 8, 10, avx)
2076 CHROMA_MC(avg, 8, 10, avx)
2079 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2081 put_pixels8_mmx(dst, src, stride, 8);
2084 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2086 avg_pixels8_mmx(dst, src, stride, 8);
2089 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2091 put_pixels16_mmx(dst, src, stride, 16);
2094 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
2096 avg_pixels16_mmx(dst, src, stride, 16);
2100 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
2101 int stride, int rnd)
2103 put_pixels8_mmx(dst, src, stride, 8);
2106 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
2107 int stride, int rnd)
2109 avg_pixels8_mmx2(dst, src, stride, 8);
2112 /* only used in VP3/5/6 */
2113 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
2119 "movq (%1), %%mm0 \n\t"
2120 "movq (%2), %%mm1 \n\t"
2121 "movq (%1,%4), %%mm2 \n\t"
2122 "movq (%2,%4), %%mm3 \n\t"
2123 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
2124 "movq %%mm4, (%3) \n\t"
2125 "movq %%mm5, (%3,%4) \n\t"
2127 "movq (%1,%4,2), %%mm0 \n\t"
2128 "movq (%2,%4,2), %%mm1 \n\t"
2129 "movq (%1,%5), %%mm2 \n\t"
2130 "movq (%2,%5), %%mm3 \n\t"
2131 "lea (%1,%4,4), %1 \n\t"
2132 "lea (%2,%4,4), %2 \n\t"
2133 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
2134 "movq %%mm4, (%3,%4,2) \n\t"
2135 "movq %%mm5, (%3,%5) \n\t"
2136 "lea (%3,%4,4), %3 \n\t"
2139 :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
2140 :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
2142 // STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
2144 static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
2146 put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
2147 put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
2150 #if CONFIG_DIRAC_DECODER
2151 #define DIRAC_PIXOP(OPNAME, EXT)\
2152 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2154 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
2156 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2158 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
2160 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
2162 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
2163 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
2166 DIRAC_PIXOP(put, mmx)
2167 DIRAC_PIXOP(avg, mmx)
2168 DIRAC_PIXOP(avg, mmx2)
2170 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2172 put_pixels16_sse2(dst, src[0], stride, h);
2174 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2176 avg_pixels16_sse2(dst, src[0], stride, h);
2178 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2180 put_pixels16_sse2(dst , src[0] , stride, h);
2181 put_pixels16_sse2(dst+16, src[0]+16, stride, h);
2183 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
2185 avg_pixels16_sse2(dst , src[0] , stride, h);
2186 avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
2190 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
2193 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
2197 ff_put_pixels_clamped_mmx(block, dest, line_size);
2200 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
2204 ff_add_pixels_clamped_mmx(block, dest, line_size);
2207 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
2210 ff_mmxext_idct(block);
2211 ff_put_pixels_clamped_mmx(block, dest, line_size);
2214 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
2217 ff_mmxext_idct(block);
2218 ff_add_pixels_clamped_mmx(block, dest, line_size);
2222 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2224 ff_idct_xvid_mmx(block);
2225 ff_put_pixels_clamped_mmx(block, dest, line_size);
2228 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2230 ff_idct_xvid_mmx(block);
2231 ff_add_pixels_clamped_mmx(block, dest, line_size);
2234 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2236 ff_idct_xvid_mmx2(block);
2237 ff_put_pixels_clamped_mmx(block, dest, line_size);
2240 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2242 ff_idct_xvid_mmx2(block);
2243 ff_add_pixels_clamped_mmx(block, dest, line_size);
2246 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2249 __asm__ volatile ("pxor %%mm7, %%mm7":);
2250 for (i = 0; i < blocksize; i += 2) {
2252 "movq %0, %%mm0 \n\t"
2253 "movq %1, %%mm1 \n\t"
2254 "movq %%mm0, %%mm2 \n\t"
2255 "movq %%mm1, %%mm3 \n\t"
2256 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2257 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2258 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2259 "pxor %%mm2, %%mm1 \n\t"
2260 "movq %%mm3, %%mm4 \n\t"
2261 "pand %%mm1, %%mm3 \n\t"
2262 "pandn %%mm1, %%mm4 \n\t"
2263 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2264 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2265 "movq %%mm3, %1 \n\t"
2266 "movq %%mm0, %0 \n\t"
2267 : "+m"(mag[i]), "+m"(ang[i])
2271 __asm__ volatile ("femms");
2274 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2279 "movaps %0, %%xmm5 \n\t"
2280 :: "m"(ff_pdw_80000000[0])
2282 for (i = 0; i < blocksize; i += 4) {
2284 "movaps %0, %%xmm0 \n\t"
2285 "movaps %1, %%xmm1 \n\t"
2286 "xorps %%xmm2, %%xmm2 \n\t"
2287 "xorps %%xmm3, %%xmm3 \n\t"
2288 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2289 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2290 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2291 "xorps %%xmm2, %%xmm1 \n\t"
2292 "movaps %%xmm3, %%xmm4 \n\t"
2293 "andps %%xmm1, %%xmm3 \n\t"
2294 "andnps %%xmm1, %%xmm4 \n\t"
2295 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2296 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2297 "movaps %%xmm3, %1 \n\t"
2298 "movaps %%xmm0, %0 \n\t"
2299 : "+m"(mag[i]), "+m"(ang[i])
2308 #define MIX5(mono, stereo) \
2309 __asm__ volatile ( \
2310 "movss 0(%2), %%xmm5 \n" \
2311 "movss 8(%2), %%xmm6 \n" \
2312 "movss 24(%2), %%xmm7 \n" \
2313 "shufps $0, %%xmm5, %%xmm5 \n" \
2314 "shufps $0, %%xmm6, %%xmm6 \n" \
2315 "shufps $0, %%xmm7, %%xmm7 \n" \
2317 "movaps (%0, %1), %%xmm0 \n" \
2318 "movaps 0x400(%0, %1), %%xmm1 \n" \
2319 "movaps 0x800(%0, %1), %%xmm2 \n" \
2320 "movaps 0xc00(%0, %1), %%xmm3 \n" \
2321 "movaps 0x1000(%0, %1), %%xmm4 \n" \
2322 "mulps %%xmm5, %%xmm0 \n" \
2323 "mulps %%xmm6, %%xmm1 \n" \
2324 "mulps %%xmm5, %%xmm2 \n" \
2325 "mulps %%xmm7, %%xmm3 \n" \
2326 "mulps %%xmm7, %%xmm4 \n" \
2327 stereo("addps %%xmm1, %%xmm0 \n") \
2328 "addps %%xmm1, %%xmm2 \n" \
2329 "addps %%xmm3, %%xmm0 \n" \
2330 "addps %%xmm4, %%xmm2 \n" \
2331 mono("addps %%xmm2, %%xmm0 \n") \
2332 "movaps %%xmm0, (%0, %1) \n" \
2333 stereo("movaps %%xmm2, 0x400(%0, %1) \n") \
2337 : "r"(samples[0] + len), "r"(matrix) \
2338 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2339 "%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
2343 #define MIX_MISC(stereo) \
2344 __asm__ volatile ( \
2346 "movaps (%3, %0), %%xmm0 \n" \
2347 stereo("movaps %%xmm0, %%xmm1 \n") \
2348 "mulps %%xmm4, %%xmm0 \n" \
2349 stereo("mulps %%xmm5, %%xmm1 \n") \
2350 "lea 1024(%3, %0), %1 \n" \
2353 "movaps (%1), %%xmm2 \n" \
2354 stereo("movaps %%xmm2, %%xmm3 \n") \
2355 "mulps (%4, %2), %%xmm2 \n" \
2356 stereo("mulps 16(%4, %2), %%xmm3 \n") \
2357 "addps %%xmm2, %%xmm0 \n" \
2358 stereo("addps %%xmm3, %%xmm1 \n") \
2359 "add $1024, %1 \n" \
2362 "movaps %%xmm0, (%3, %0) \n" \
2363 stereo("movaps %%xmm1, 1024(%3, %0) \n") \
2366 : "+&r"(i), "=&r"(j), "=&r"(k) \
2367 : "r"(samples[0] + len), "r"(matrix_simd + in_ch), \
2368 "g"((intptr_t) - 32 * (in_ch - 1)) \
2372 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
2373 int out_ch, int in_ch, int len)
2375 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2378 i = -len * sizeof(float);
2379 if (in_ch == 5 && out_ch == 2 &&
2380 !(matrix_cmp[0][1] | matrix_cmp[2][0] |
2381 matrix_cmp[3][1] | matrix_cmp[4][0] |
2382 (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
2383 (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
2385 } else if (in_ch == 5 && out_ch == 1 &&
2386 matrix_cmp[0][0] == matrix_cmp[2][0] &&
2387 matrix_cmp[3][0] == matrix_cmp[4][0]) {
2390 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2391 j = 2 * in_ch * sizeof(float);
2395 "movss (%2, %0), %%xmm4 \n"
2396 "movss 4(%2, %0), %%xmm5 \n"
2397 "shufps $0, %%xmm4, %%xmm4 \n"
2398 "shufps $0, %%xmm5, %%xmm5 \n"
2399 "movaps %%xmm4, (%1, %0, 4) \n"
2400 "movaps %%xmm5, 16(%1, %0, 4) \n"
2403 : "r"(matrix_simd), "r"(matrix)
2414 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1,
2417 x86_reg i = (len - 4) * 4;
2420 "movq (%2, %0), %%mm0 \n\t"
2421 "movq 8(%2, %0), %%mm1 \n\t"
2422 "pfmul (%3, %0), %%mm0 \n\t"
2423 "pfmul 8(%3, %0), %%mm1 \n\t"
2424 "movq %%mm0, (%1, %0) \n\t"
2425 "movq %%mm1, 8(%1, %0) \n\t"
2430 : "r"(dst), "r"(src0), "r"(src1)
2435 static void vector_fmul_sse(float *dst, const float *src0, const float *src1,
2438 x86_reg i = (len - 8) * 4;
2441 "movaps (%2, %0), %%xmm0 \n\t"
2442 "movaps 16(%2, %0), %%xmm1 \n\t"
2443 "mulps (%3, %0), %%xmm0 \n\t"
2444 "mulps 16(%3, %0), %%xmm1 \n\t"
2445 "movaps %%xmm0, (%1, %0) \n\t"
2446 "movaps %%xmm1, 16(%1, %0) \n\t"
2450 : "r"(dst), "r"(src0), "r"(src1)
2455 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0,
2456 const float *src1, int len)
2458 x86_reg i = len * 4 - 16;
2461 "pswapd 8(%1), %%mm0 \n\t"
2462 "pswapd (%1), %%mm1 \n\t"
2463 "pfmul (%3, %0), %%mm0 \n\t"
2464 "pfmul 8(%3, %0), %%mm1 \n\t"
2465 "movq %%mm0, (%2, %0) \n\t"
2466 "movq %%mm1, 8(%2, %0) \n\t"
2470 : "+r"(i), "+r"(src1)
2471 : "r"(dst), "r"(src0)
2473 __asm__ volatile ("femms");
2476 static void vector_fmul_reverse_sse(float *dst, const float *src0,
2477 const float *src1, int len)
2479 x86_reg i = len * 4 - 32;
2482 "movaps 16(%1), %%xmm0 \n\t"
2483 "movaps (%1), %%xmm1 \n\t"
2484 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2485 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2486 "mulps (%3, %0), %%xmm0 \n\t"
2487 "mulps 16(%3, %0), %%xmm1 \n\t"
2488 "movaps %%xmm0, (%2, %0) \n\t"
2489 "movaps %%xmm1, 16(%2, %0) \n\t"
2493 : "+r"(i), "+r"(src1)
2494 : "r"(dst), "r"(src0)
2498 static void vector_fmul_add_3dnow(float *dst, const float *src0,
2499 const float *src1, const float *src2, int len)
2501 x86_reg i = (len - 4) * 4;
2504 "movq (%2, %0), %%mm0 \n\t"
2505 "movq 8(%2, %0), %%mm1 \n\t"
2506 "pfmul (%3, %0), %%mm0 \n\t"
2507 "pfmul 8(%3, %0), %%mm1 \n\t"
2508 "pfadd (%4, %0), %%mm0 \n\t"
2509 "pfadd 8(%4, %0), %%mm1 \n\t"
2510 "movq %%mm0, (%1, %0) \n\t"
2511 "movq %%mm1, 8(%1, %0) \n\t"
2515 : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2518 __asm__ volatile ("femms");
2521 static void vector_fmul_add_sse(float *dst, const float *src0,
2522 const float *src1, const float *src2, int len)
2524 x86_reg i = (len - 8) * 4;
2527 "movaps (%2, %0), %%xmm0 \n\t"
2528 "movaps 16(%2, %0), %%xmm1 \n\t"
2529 "mulps (%3, %0), %%xmm0 \n\t"
2530 "mulps 16(%3, %0), %%xmm1 \n\t"
2531 "addps (%4, %0), %%xmm0 \n\t"
2532 "addps 16(%4, %0), %%xmm1 \n\t"
2533 "movaps %%xmm0, (%1, %0) \n\t"
2534 "movaps %%xmm1, 16(%1, %0) \n\t"
2538 : "r"(dst), "r"(src0), "r"(src1), "r"(src2)
2544 static void vector_fmul_window_3dnow2(float *dst, const float *src0,
2545 const float *src1, const float *win,
2548 x86_reg i = -len * 4;
2549 x86_reg j = len * 4 - 8;
2552 "pswapd (%5, %1), %%mm1 \n"
2553 "movq (%5, %0), %%mm0 \n"
2554 "pswapd (%4, %1), %%mm5 \n"
2555 "movq (%3, %0), %%mm4 \n"
2556 "movq %%mm0, %%mm2 \n"
2557 "movq %%mm1, %%mm3 \n"
2558 "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
2559 "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
2560 "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
2561 "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
2562 "pfadd %%mm3, %%mm2 \n"
2563 "pfsub %%mm0, %%mm1 \n"
2564 "pswapd %%mm2, %%mm2 \n"
2565 "movq %%mm1, (%2, %0) \n"
2566 "movq %%mm2, (%2, %1) \n"
2572 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2576 static void vector_fmul_window_sse(float *dst, const float *src0,
2577 const float *src1, const float *win, int len)
2579 x86_reg i = -len * 4;
2580 x86_reg j = len * 4 - 16;
2583 "movaps (%5, %1), %%xmm1 \n"
2584 "movaps (%5, %0), %%xmm0 \n"
2585 "movaps (%4, %1), %%xmm5 \n"
2586 "movaps (%3, %0), %%xmm4 \n"
2587 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2588 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2589 "movaps %%xmm0, %%xmm2 \n"
2590 "movaps %%xmm1, %%xmm3 \n"
2591 "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
2592 "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
2593 "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
2594 "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
2595 "addps %%xmm3, %%xmm2 \n"
2596 "subps %%xmm0, %%xmm1 \n"
2597 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2598 "movaps %%xmm1, (%2, %0) \n"
2599 "movaps %%xmm2, (%2, %1) \n"
2604 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2607 #endif /* HAVE_6REGS */
2609 static void vector_clipf_sse(float *dst, const float *src,
2610 float min, float max, int len)
2612 x86_reg i = (len - 16) * 4;
2614 "movss %3, %%xmm4 \n\t"
2615 "movss %4, %%xmm5 \n\t"
2616 "shufps $0, %%xmm4, %%xmm4 \n\t"
2617 "shufps $0, %%xmm5, %%xmm5 \n\t"
2619 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
2620 "movaps 16(%2, %0), %%xmm1 \n\t"
2621 "movaps 32(%2, %0), %%xmm2 \n\t"
2622 "movaps 48(%2, %0), %%xmm3 \n\t"
2623 "maxps %%xmm4, %%xmm0 \n\t"
2624 "maxps %%xmm4, %%xmm1 \n\t"
2625 "maxps %%xmm4, %%xmm2 \n\t"
2626 "maxps %%xmm4, %%xmm3 \n\t"
2627 "minps %%xmm5, %%xmm0 \n\t"
2628 "minps %%xmm5, %%xmm1 \n\t"
2629 "minps %%xmm5, %%xmm2 \n\t"
2630 "minps %%xmm5, %%xmm3 \n\t"
2631 "movaps %%xmm0, (%1, %0) \n\t"
2632 "movaps %%xmm1, 16(%1, %0) \n\t"
2633 "movaps %%xmm2, 32(%1, %0) \n\t"
2634 "movaps %%xmm3, 48(%1, %0) \n\t"
2638 : "r"(dst), "r"(src), "m"(min), "m"(max)
2643 void ff_vp3_idct_mmx(int16_t *input_data);
2644 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2645 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2647 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
2648 const DCTELEM *block);
2650 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2651 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2653 void ff_vp3_idct_sse2(int16_t *input_data);
2654 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2655 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2657 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
2659 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2661 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
2663 int order, int mul);
2664 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2666 int order, int mul);
2667 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2669 int order, int mul);
2671 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2672 const int16_t *window, unsigned int len);
2673 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2674 const int16_t *window, unsigned int len);
2675 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2676 const int16_t *window, unsigned int len);
2677 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2678 const int16_t *window, unsigned int len);
2679 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2680 const int16_t *window, unsigned int len);
2681 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2682 const int16_t *window, unsigned int len);
2684 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2685 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2687 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
2688 const uint8_t *diff, int w,
2689 int *left, int *left_top);
2690 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2692 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2695 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2697 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2698 int32_t min, int32_t max, unsigned int len);
2699 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2700 int32_t min, int32_t max, unsigned int len);
2701 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2702 int32_t min, int32_t max, unsigned int len);
2703 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2704 int32_t min, int32_t max, unsigned int len);
2706 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2707 const float *src1, int len);
2708 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2709 const float *src1, int len);
2711 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2713 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2714 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2715 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2716 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2717 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2718 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2719 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2720 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2721 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2722 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2723 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2724 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2725 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2726 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2727 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2728 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2731 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2733 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2734 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2735 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2736 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2739 #define H264_QPEL_FUNCS(x, y, CPU) \
2741 c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2742 c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2743 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2744 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2747 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2749 c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2750 c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2751 c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2752 c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2755 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2757 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2759 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2760 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2761 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2763 if (!high_bit_depth) {
2764 c->clear_block = clear_block_mmx;
2765 c->clear_blocks = clear_blocks_mmx;
2766 c->draw_edges = draw_edges_mmx;
2768 SET_HPEL_FUNCS(put, 0, 16, mmx);
2769 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2770 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2771 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2772 SET_HPEL_FUNCS(put, 1, 8, mmx);
2773 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2774 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2775 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2778 #if ARCH_X86_32 || !HAVE_YASM
2781 #if ARCH_X86_32 && HAVE_YASM
2782 if (!high_bit_depth)
2783 c->emulated_edge_mc = emulated_edge_mc_mmx;
2786 c->add_bytes = add_bytes_mmx;
2788 c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
2789 c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
2791 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2792 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2793 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2797 if (!high_bit_depth && CONFIG_H264CHROMA) {
2798 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
2799 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2802 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2807 static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
2810 const int bit_depth = avctx->bits_per_raw_sample;
2811 const int high_bit_depth = bit_depth > 8;
2813 c->prefetch = prefetch_mmx2;
2815 if (!high_bit_depth) {
2816 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2817 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2819 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2820 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2821 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2823 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2824 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2826 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2827 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2828 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2831 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2832 if (!high_bit_depth) {
2833 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2834 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2835 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2836 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2838 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2839 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2842 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2843 c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
2844 c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
2847 if (CONFIG_VP3_DECODER && HAVE_YASM)
2848 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2850 if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
2851 avctx->codec_id == CODEC_ID_THEORA)) {
2852 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2853 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2856 if (CONFIG_H264QPEL) {
2857 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2858 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2859 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2860 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2861 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2862 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2864 if (!high_bit_depth) {
2865 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2866 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2867 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2868 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2869 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2870 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2871 } else if (bit_depth == 10) {
2874 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2875 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2876 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2877 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2879 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2880 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2884 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2885 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2886 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2887 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2891 if (!high_bit_depth && CONFIG_H264CHROMA) {
2892 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
2893 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
2894 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
2895 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
2897 if (bit_depth == 10 && CONFIG_H264CHROMA) {
2898 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2899 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2900 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2901 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2904 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2906 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2907 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2909 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2910 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2912 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2917 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2920 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2922 c->prefetch = prefetch_3dnow;
2924 if (!high_bit_depth) {
2925 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2926 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2928 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2929 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2930 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2932 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2933 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2935 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2936 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2937 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2939 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2940 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2941 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2942 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2943 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2945 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2946 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2950 if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
2951 avctx->codec_id == CODEC_ID_THEORA)) {
2952 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2953 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2956 if (CONFIG_H264QPEL) {
2957 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2958 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2959 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2960 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2961 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2962 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2964 if (!high_bit_depth) {
2965 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2966 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2967 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2968 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2969 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2970 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2973 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2974 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2975 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2976 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2980 if (!high_bit_depth && CONFIG_H264CHROMA) {
2981 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
2982 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2986 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2987 c->vector_fmul = vector_fmul_3dnow;
2988 c->vector_fmul_add = vector_fmul_add_3dnow;
2991 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2995 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
2998 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
3000 c->vector_fmul_window = vector_fmul_window_3dnow2;
3004 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
3006 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
3008 if (!high_bit_depth) {
3009 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
3010 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
3011 c->clear_block = clear_block_sse;
3012 c->clear_blocks = clear_blocks_sse;
3016 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
3017 c->ac3_downmix = ac3_downmix_sse;
3018 c->vector_fmul = vector_fmul_sse;
3019 c->vector_fmul_reverse = vector_fmul_reverse_sse;
3021 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
3022 c->vector_fmul_add = vector_fmul_add_sse;
3025 c->vector_fmul_window = vector_fmul_window_sse;
3028 c->vector_clipf = vector_clipf_sse;
3031 c->scalarproduct_float = ff_scalarproduct_float_sse;
3032 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
3034 if (!high_bit_depth)
3035 c->emulated_edge_mc = emulated_edge_mc_sse;
3040 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
3043 const int bit_depth = avctx->bits_per_raw_sample;
3044 const int high_bit_depth = bit_depth > 8;
3046 if (mm_flags & AV_CPU_FLAG_3DNOW) {
3047 // these functions are slower than mmx on AMD, but faster on Intel
3048 if (!high_bit_depth) {
3049 c->put_pixels_tab[0][0] = put_pixels16_sse2;
3050 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
3051 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
3052 if (CONFIG_H264QPEL)
3053 H264_QPEL_FUNCS(0, 0, sse2);
3057 if (!high_bit_depth && CONFIG_H264QPEL) {
3058 H264_QPEL_FUNCS(0, 1, sse2);
3059 H264_QPEL_FUNCS(0, 2, sse2);
3060 H264_QPEL_FUNCS(0, 3, sse2);
3061 H264_QPEL_FUNCS(1, 1, sse2);
3062 H264_QPEL_FUNCS(1, 2, sse2);
3063 H264_QPEL_FUNCS(1, 3, sse2);
3064 H264_QPEL_FUNCS(2, 1, sse2);
3065 H264_QPEL_FUNCS(2, 2, sse2);
3066 H264_QPEL_FUNCS(2, 3, sse2);
3067 H264_QPEL_FUNCS(3, 1, sse2);
3068 H264_QPEL_FUNCS(3, 2, sse2);
3069 H264_QPEL_FUNCS(3, 3, sse2);
3073 if (bit_depth == 10) {
3074 if (CONFIG_H264QPEL) {
3075 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
3076 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
3077 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
3078 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
3079 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
3080 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
3081 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
3083 if (CONFIG_H264CHROMA) {
3084 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
3085 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
3089 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
3090 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
3091 if (mm_flags & AV_CPU_FLAG_ATOM) {
3092 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
3094 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
3096 if (avctx->flags & CODEC_FLAG_BITEXACT) {
3097 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
3098 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
3099 c->apply_window_int16 = ff_apply_window_int16_sse2;
3101 c->bswap_buf = ff_bswap32_buf_sse2;
3105 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
3109 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
3110 const int bit_depth = avctx->bits_per_raw_sample;
3112 if (!high_bit_depth && CONFIG_H264QPEL) {
3113 H264_QPEL_FUNCS(1, 0, ssse3);
3114 H264_QPEL_FUNCS(1, 1, ssse3);
3115 H264_QPEL_FUNCS(1, 2, ssse3);
3116 H264_QPEL_FUNCS(1, 3, ssse3);
3117 H264_QPEL_FUNCS(2, 0, ssse3);
3118 H264_QPEL_FUNCS(2, 1, ssse3);
3119 H264_QPEL_FUNCS(2, 2, ssse3);
3120 H264_QPEL_FUNCS(2, 3, ssse3);
3121 H264_QPEL_FUNCS(3, 0, ssse3);
3122 H264_QPEL_FUNCS(3, 1, ssse3);
3123 H264_QPEL_FUNCS(3, 2, ssse3);
3124 H264_QPEL_FUNCS(3, 3, ssse3);
3127 else if (bit_depth == 10 && CONFIG_H264QPEL) {
3128 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
3129 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
3130 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
3132 if (!high_bit_depth && CONFIG_H264CHROMA) {
3133 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
3134 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
3135 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
3136 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
3138 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
3139 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
3140 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
3142 if (mm_flags & AV_CPU_FLAG_ATOM)
3143 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
3145 c->apply_window_int16 = ff_apply_window_int16_ssse3;
3146 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
3147 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
3148 c->bswap_buf = ff_bswap32_buf_ssse3;
3153 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
3157 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
3161 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
3163 #if HAVE_AVX && HAVE_YASM
3164 const int bit_depth = avctx->bits_per_raw_sample;
3166 if (bit_depth == 10) {
3167 // AVX implies !cache64.
3168 // TODO: Port cache(32|64) detection from x264.
3169 if (CONFIG_H264QPEL) {
3170 H264_QPEL_FUNCS_10(1, 0, sse2);
3171 H264_QPEL_FUNCS_10(2, 0, sse2);
3172 H264_QPEL_FUNCS_10(3, 0, sse2);
3175 if (CONFIG_H264CHROMA) {
3176 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
3177 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
3180 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
3184 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
3186 int mm_flags = av_get_cpu_flags();
3189 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
3190 if (mm_flags & AV_CPU_FLAG_MMX)
3191 av_log(avctx, AV_LOG_INFO, " mmx");
3192 if (mm_flags & AV_CPU_FLAG_MMX2)
3193 av_log(avctx, AV_LOG_INFO, " mmx2");
3194 if (mm_flags & AV_CPU_FLAG_3DNOW)
3195 av_log(avctx, AV_LOG_INFO, " 3dnow");
3196 if (mm_flags & AV_CPU_FLAG_SSE)
3197 av_log(avctx, AV_LOG_INFO, " sse");
3198 if (mm_flags & AV_CPU_FLAG_SSE2)
3199 av_log(avctx, AV_LOG_INFO, " sse2");
3200 av_log(avctx, AV_LOG_INFO, "\n");
3203 if (mm_flags & AV_CPU_FLAG_MMX) {
3204 const int idct_algo = avctx->idct_algo;
3206 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
3207 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
3208 c->idct_put = ff_simple_idct_put_mmx;
3209 c->idct_add = ff_simple_idct_add_mmx;
3210 c->idct = ff_simple_idct_mmx;
3211 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
3213 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
3214 if (mm_flags & AV_CPU_FLAG_MMX2) {
3215 c->idct_put = ff_libmpeg2mmx2_idct_put;
3216 c->idct_add = ff_libmpeg2mmx2_idct_add;
3217 c->idct = ff_mmxext_idct;
3219 c->idct_put = ff_libmpeg2mmx_idct_put;
3220 c->idct_add = ff_libmpeg2mmx_idct_add;
3221 c->idct = ff_mmx_idct;
3223 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
3225 } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
3226 CONFIG_VP6_DECODER) &&
3227 idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
3228 if (mm_flags & AV_CPU_FLAG_SSE2) {
3229 c->idct_put = ff_vp3_idct_put_sse2;
3230 c->idct_add = ff_vp3_idct_add_sse2;
3231 c->idct = ff_vp3_idct_sse2;
3232 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
3234 c->idct_put = ff_vp3_idct_put_mmx;
3235 c->idct_add = ff_vp3_idct_add_mmx;
3236 c->idct = ff_vp3_idct_mmx;
3237 c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
3239 } else if (idct_algo == FF_IDCT_CAVS) {
3240 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
3241 } else if (idct_algo == FF_IDCT_XVIDMMX) {
3242 if (mm_flags & AV_CPU_FLAG_SSE2) {
3243 c->idct_put = ff_idct_xvid_sse2_put;
3244 c->idct_add = ff_idct_xvid_sse2_add;
3245 c->idct = ff_idct_xvid_sse2;
3246 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
3247 } else if (mm_flags & AV_CPU_FLAG_MMX2) {
3248 c->idct_put = ff_idct_xvid_mmx2_put;
3249 c->idct_add = ff_idct_xvid_mmx2_add;
3250 c->idct = ff_idct_xvid_mmx2;
3252 c->idct_put = ff_idct_xvid_mmx_put;
3253 c->idct_add = ff_idct_xvid_mmx_add;
3254 c->idct = ff_idct_xvid_mmx;
3259 dsputil_init_mmx(c, avctx, mm_flags);
3262 if (mm_flags & AV_CPU_FLAG_MMX2)
3263 dsputil_init_mmx2(c, avctx, mm_flags);
3265 if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
3266 dsputil_init_3dnow(c, avctx, mm_flags);
3268 if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
3269 dsputil_init_3dnow2(c, avctx, mm_flags);
3271 if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
3272 dsputil_init_sse(c, avctx, mm_flags);
3274 if (mm_flags & AV_CPU_FLAG_SSE2)
3275 dsputil_init_sse2(c, avctx, mm_flags);
3277 if (mm_flags & AV_CPU_FLAG_SSSE3)
3278 dsputil_init_ssse3(c, avctx, mm_flags);
3280 if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
3281 dsputil_init_sse4(c, avctx, mm_flags);
3283 if (mm_flags & AV_CPU_FLAG_AVX)
3284 dsputil_init_avx(c, avctx, mm_flags);
3286 if (CONFIG_ENCODERS)
3287 ff_dsputilenc_init_mmx(c, avctx);