/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   = 0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  = 0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    = 0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to access constants this way
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
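// The underlying identity: the truncating byte average is
// (a & b) + (((a ^ b) & 0xFE) >> 1) -- AND keeps the common bits, XOR holds
// the differing ones, and masking with 0xFE before the shift prevents
// carries between the packed bytes. PAVGB_MMX below computes the rounding
// variant (a | b) - (((a ^ b) & 0xFE) >> 1).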
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq   "#rega", "#regr"            \n\t"           \
    "pand   "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq   "#rega", "#regr"            \n\t"           \
    "por    "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "psubb  "#regb", "#regr"            \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand    %%mm6, "#regb"             \n\t"                   \
    "pand    %%mm6, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand    %%mm6, "#regb"             \n\t"                   \
    "pand    %%mm6, "#regd"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
#define put_no_rnd_pixels16_mmx   put_pixels16_mmx
#define put_no_rnd_pixels8_mmx    put_pixels8_mmx
#define put_pixels16_mmx2         put_pixels16_mmx
#define put_pixels8_mmx2          put_pixels8_mmx
#define put_pixels4_mmx2          put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2  put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2   put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow        put_pixels16_mmx
#define put_pixels8_3dnow         put_pixels8_mmx
#define put_pixels4_3dnow         put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow  put_no_rnd_pixels8_mmx
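/* A plain copy involves no rounding and gains nothing from MMX2/3DNow!
 * instructions, so all of these variants can alias the plain MMX copy. */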
/***********************************/
/* standard MMX */

void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p   = block;
    pix = pixels;
    /* unrolled loop */
    __asm__ volatile (
        "movq        %3, %%mm0          \n\t"
        "movq       8%3, %%mm1          \n\t"
        "movq      16%3, %%mm2          \n\t"
        "movq      24%3, %%mm3          \n\t"
        "movq      32%3, %%mm4          \n\t"
        "movq      40%3, %%mm5          \n\t"
        "movq      48%3, %%mm6          \n\t"
        "movq      56%3, %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "m"(*p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // if this were an exact copy of the code above, the compiler would
    // generate some very strange code, thus using "r"
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1    \n\t"               \
    "movq     16 + "#off"(%2), %%mm2    \n\t"               \
    "movq     32 + "#off"(%2), %%mm3    \n\t"               \
    "movq     48 + "#off"(%2), %%mm4    \n\t"               \
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"               \
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"               \
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"               \
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"               \
    "paddb %%mm0, %%mm1                 \n\t"               \
    "paddb %%mm0, %%mm2                 \n\t"               \
    "paddb %%mm0, %%mm3                 \n\t"               \
    "paddb %%mm0, %%mm4                 \n\t"               \
    "movq  %%mm1, (%0)                  \n\t"               \
    "movq  %%mm2, (%0, %3)              \n\t"               \
    "movq  %%mm3, (%0, %3, 2)           \n\t"               \
    "movq  %%mm4, (%0, %1)              \n\t"

void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea         (%3, %3, 2), %1    \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0    \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
                               int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movd  (%1    ), %%mm0          \n\t"
        "movd  (%1, %3), %%mm1          \n\t"
        "movd     %%mm0, (%2)           \n\t"
        "movd     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        "lea    (%3, %3), %%"REG_a"     \n\t"
        ".p2align      3                \n\t"
        "1:                             \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq      %%mm0,  (%2)         \n\t"
        "movq      %%mm4, 8(%2)         \n\t"
        "movq      %%mm1,  (%2, %3)     \n\t"
        "movq      %%mm5, 8(%2, %3)     \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "movq   (%1    ), %%mm0         \n\t"
        "movq  8(%1    ), %%mm4         \n\t"
        "movq   (%1, %3), %%mm1         \n\t"
        "movq  8(%1, %3), %%mm5         \n\t"
        "movq      %%mm0,  (%2)         \n\t"
        "movq      %%mm4, 8(%2)         \n\t"
        "movq      %%mm1,  (%2, %3)     \n\t"
        "movq      %%mm5, 8(%2, %3)     \n\t"
        "add   %%"REG_a", %1            \n\t"
        "add   %%"REG_a", %2            \n\t"
        "subl         $4, %0            \n\t"
        "jnz          1b                \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
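/* Note: the SSE2 versions below load with movdqu, so the source may be
 * unaligned, but store with movdqa -- block and line_size are assumed to
 * be 16-byte aligned. */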
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl         $4, %0             \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz          1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}

static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    __asm__ volatile (
        "1:                              \n\t"
        "movdqu (%1       ), %%xmm0      \n\t"
        "movdqu (%1, %3   ), %%xmm1      \n\t"
        "movdqu (%1, %3, 2), %%xmm2      \n\t"
        "movdqu (%1, %4   ), %%xmm3      \n\t"
        "lea    (%1, %3, 4), %1          \n\t"
        "pavgb  (%2       ), %%xmm0      \n\t"
        "pavgb  (%2, %3   ), %%xmm1      \n\t"
        "pavgb  (%2, %3, 2), %%xmm2      \n\t"
        "pavgb  (%2, %4   ), %%xmm3      \n\t"
        "movdqa %%xmm0, (%2)             \n\t"
        "movdqa %%xmm1, (%2, %3)         \n\t"
        "movdqa %%xmm2, (%2, %3, 2)      \n\t"
        "movdqa %%xmm3, (%2, %4)         \n\t"
        "subl         $4, %0             \n\t"
        "lea    (%2, %3, 4), %2          \n\t"
        "jnz          1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L * line_size)
        : "memory"
        );
}
#define CLEAR_BLOCKS(name, n)                           \
static void name(DCTELEM *blocks)                       \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1, %%"REG_a"          \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a                                      \
        );                                              \
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
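/* One DCT block is 64 DCTELEMs = 128 bytes, hence the 128 * n bias: %0
 * points at the end of the data while REG_a counts up from -128 * n to 0. */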
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory"
        );
}

static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1, %%"REG_a"           \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128, %%"REG_a"           \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a
        );
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        : "memory");
    /* scalar tail for the last few bytes */
    for (; i < w; i++)
        dst[i + 0] += src[i + 0];
}
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
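    /* HuffYUV median prediction: each output byte is diff[] plus the median
     * of l (left), t (top) and l + t - tl; the loop below evaluates the
     * median branchlessly with cmov. */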
609 "movzbl (%3, %4), %2 \n"
622 "add (%6, %4), %b0 \n"
623 "mov %b0, (%5, %4) \n"
626 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
627 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
static void transpose4x4(uint8_t *dst, uint8_t *src,
                         x86_reg dst_stride, x86_reg src_stride)
{
    __asm__ volatile ( // FIXME could save 1 instruction if done as 8x4 ...
        "movd      (%1), %%mm0          \n\t"
        "add         %3, %1             \n\t"
        "movd      (%1), %%mm1          \n\t"
        "movd (%1,%3,1), %%mm2          \n\t"
        "movd (%1,%3,2), %%mm3          \n\t"
        "punpcklbw %%mm1, %%mm0         \n\t"
        "punpcklbw %%mm3, %%mm2         \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "punpcklwd %%mm2, %%mm0         \n\t"
        "punpckhwd %%mm2, %%mm1         \n\t"
        "movd %%mm0, (%0)               \n\t"
        "add     %2, %0                 \n\t"
        "punpckhdq %%mm0, %%mm0         \n\t"
        "movd %%mm0, (%0)               \n\t"
        "movd %%mm1, (%0,%2,1)          \n\t"
        "punpckhdq %%mm1, %%mm1         \n\t"
        "movd %%mm1, (%0,%2,2)          \n\t"
        : "+&r"(dst), "+&r"(src)
        : "r"(dst_stride), "r"(src_stride)
        : "memory");
}
#define H263_LOOP_FILTER                        \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movq         %0, %%mm0             \n\t"   \
    "movq         %0, %%mm1             \n\t"   \
    "movq         %3, %%mm2             \n\t"   \
    "movq         %3, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm0             \n\t"   \
    "punpckhbw %%mm7, %%mm1             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "psubw     %%mm2, %%mm0             \n\t"   \
    "psubw     %%mm3, %%mm1             \n\t"   \
    "movq         %1, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "movq         %2, %%mm5             \n\t"   \
    "punpcklbw %%mm7, %%mm2             \n\t"   \
    "punpckhbw %%mm7, %%mm3             \n\t"   \
    "punpcklbw %%mm7, %%mm4             \n\t"   \
    "punpckhbw %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm2, %%mm4             \n\t"   \
    "psubw     %%mm3, %%mm5             \n\t"   \
    "psllw        $2, %%mm4             \n\t"   \
    "psllw        $2, %%mm5             \n\t"   \
    "paddw     %%mm0, %%mm4             \n\t"   \
    "paddw     %%mm1, %%mm5             \n\t"   \
    "pxor      %%mm6, %%mm6             \n\t"   \
    "pcmpgtw   %%mm4, %%mm6             \n\t"   \
    "pcmpgtw   %%mm5, %%mm7             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "pxor      %%mm7, %%mm5             \n\t"   \
    "psubw     %%mm6, %%mm4             \n\t"   \
    "psubw     %%mm7, %%mm5             \n\t"   \
    "psrlw        $3, %%mm4             \n\t"   \
    "psrlw        $3, %%mm5             \n\t"   \
    "packuswb  %%mm5, %%mm4             \n\t"   \
    "packsswb  %%mm7, %%mm6             \n\t"   \
    "pxor      %%mm7, %%mm7             \n\t"   \
    "movd         %4, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "punpcklbw %%mm2, %%mm2             \n\t"   \
    "psubusb   %%mm4, %%mm2             \n\t"   \
    "movq      %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm4, %%mm3             \n\t"   \
    "psubb     %%mm3, %%mm2             \n\t"   \
    "movq         %1, %%mm3             \n\t"   \
    "movq         %2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm3             \n\t"   \
    "psubusb   %%mm2, %%mm4             \n\t"   \
    "pxor      %%mm6, %%mm3             \n\t"   \
    "pxor      %%mm6, %%mm4             \n\t"   \
    "paddusb   %%mm2, %%mm2             \n\t"   \
    "packsswb  %%mm1, %%mm0             \n\t"   \
    "pcmpgtb   %%mm0, %%mm7             \n\t"   \
    "pxor      %%mm7, %%mm0             \n\t"   \
    "psubb     %%mm7, %%mm0             \n\t"   \
    "movq      %%mm0, %%mm1             \n\t"   \
    "psubusb   %%mm2, %%mm0             \n\t"   \
    "psubb     %%mm0, %%mm1             \n\t"   \
    "pand         %5, %%mm1             \n\t"   \
    "psrlw        $2, %%mm1             \n\t"   \
    "pxor      %%mm7, %%mm1             \n\t"   \
    "psubb     %%mm7, %%mm1             \n\t"   \
    "movq         %0, %%mm5             \n\t"   \
    "movq         %3, %%mm6             \n\t"   \
    "psubb     %%mm1, %%mm5             \n\t"   \
    "paddb     %%mm1, %%mm6             \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];

        __asm__ volatile (
            H263_LOOP_FILTER
            "movq %%mm3, %1             \n\t"
            "movq %%mm4, %2             \n\t"
            "movq %%mm5, %0             \n\t"
            "movq %%mm6, %3             \n\t"
            : "+m"(*(uint64_t*)(src - 2 * stride)),
              "+m"(*(uint64_t*)(src - 1 * stride)),
              "+m"(*(uint64_t*)(src + 0 * stride)),
              "+m"(*(uint64_t*)(src + 1 * stride))
            : "g"(2 * strength), "m"(ff_pb_FC)
            );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        const int strength = ff_h263_loop_filter_strength[qscale];
        DECLARE_ALIGNED(8, uint64_t, temp)[4];
        uint8_t *btemp = (uint8_t*)temp;

        src -= 2;

        transpose4x4(btemp, src, 8, stride);
        transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
        __asm__ volatile (
            H263_LOOP_FILTER // 5 3 4 6
            : "+m"(temp[0]),
              "+m"(temp[1]),
              "+m"(temp[2]),
              "+m"(temp[3])
            : "g"(2 * strength), "m"(ff_pb_FC)
            );

        __asm__ volatile (
            "movq      %%mm5, %%mm1         \n\t"
            "movq      %%mm4, %%mm0         \n\t"
            "punpcklbw %%mm3, %%mm5         \n\t"
            "punpcklbw %%mm6, %%mm4         \n\t"
            "punpckhbw %%mm3, %%mm1         \n\t"
            "punpckhbw %%mm6, %%mm0         \n\t"
            "movq      %%mm5, %%mm3         \n\t"
            "movq      %%mm1, %%mm6         \n\t"
            "punpcklwd %%mm4, %%mm5         \n\t"
            "punpcklwd %%mm0, %%mm1         \n\t"
            "punpckhwd %%mm4, %%mm3         \n\t"
            "punpckhwd %%mm0, %%mm6         \n\t"
            "movd      %%mm5, (%0)          \n\t"
            "punpckhdq %%mm5, %%mm5         \n\t"
            "movd      %%mm5, (%0, %2)      \n\t"
            "movd      %%mm3, (%0, %2, 2)   \n\t"
            "punpckhdq %%mm3, %%mm3         \n\t"
            "movd      %%mm3, (%0, %3)      \n\t"
            "movd      %%mm1, (%1)          \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%1, %2)      \n\t"
            "movd      %%mm6, (%1, %2, 2)   \n\t"
            "punpckhdq %%mm6, %%mm6         \n\t"
            "movd      %%mm6, (%1, %3)      \n\t"
            :: "r"(src),
               "r"(src + 4 * stride),
               "r"((x86_reg)stride),
               "r"((x86_reg)(3 * stride))
            : "memory");
    }
}
/* Draw the edges of width 'w' of an image of size width, height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0, -8(%0)        \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1, (%0, %2)      \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "punpckldq %%mm0, %%mm0         \n\t"
            "movq      %%mm0,  -8(%0)       \n\t"
            "movq      %%mm0, -16(%0)       \n\t"
            "movq -8(%0, %2), %%mm1         \n\t"
            "punpckhbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movq      %%mm1,  (%0, %2)     \n\t"
            "movq      %%mm1, 8(%0, %2)     \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        __asm__ volatile (
            "1:                             \n\t"
            "movd       (%0), %%mm0         \n\t"
            "punpcklbw %%mm0, %%mm0         \n\t"
            "punpcklwd %%mm0, %%mm0         \n\t"
            "movd      %%mm0, -4(%0)        \n\t"
            "movd -4(%0, %2), %%mm1         \n\t"
            "punpcklbw %%mm1, %%mm1         \n\t"
            "punpckhwd %%mm1, %%mm1         \n\t"
            "punpckhdq %%mm1, %%mm1         \n\t"
            "movd      %%mm1, (%0, %2)      \n\t"
            "add          %1, %0            \n\t"
            "cmp          %3, %0            \n\t"
            "jb           1b                \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                         \n\t"
                "movq (%1, %0), %%mm0       \n\t"
                "movq    %%mm0, (%0)        \n\t"
                "movq    %%mm0, (%0, %2)    \n\t"
                "movq    %%mm0, (%0, %2, 2) \n\t"
                "movq    %%mm0, (%0, %3)    \n\t"
                "add        $8, %0          \n\t"
                "cmp        %4, %0          \n\t"
                "jb         1b              \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                         \n\t"
                "movq (%1, %0), %%mm0       \n\t"
                "movq    %%mm0, (%0)        \n\t"
                "movq    %%mm0, (%0, %2)    \n\t"
                "movq    %%mm0, (%0, %2, 2) \n\t"
                "movq    %%mm0, (%0, %3)    \n\t"
                "add        $8, %0          \n\t"
                "cmp        %4, %0          \n\t"
                "jb         1b              \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
                   in0, in1, in2, in7, out, OP)                           \
    "paddw             "#m4", "#m3"     \n\t" /* x1 */                    \
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */                    \
    "pmullw            "#m3", %%mm4     \n\t" /* 20x1 */                  \
    "movq             "#in7", "#m3"     \n\t" /* d */                     \
    "movq             "#in0", %%mm5     \n\t" /* D */                     \
    "paddw             "#m3", %%mm5     \n\t" /* x4 */                    \
    "psubw             %%mm5, %%mm4     \n\t" /* 20x1 - x4 */             \
    "movq             "#in1", %%mm5     \n\t" /* C */                     \
    "movq             "#in2", %%mm6     \n\t" /* B */                     \
    "paddw             "#m6", %%mm5     \n\t" /* x3 */                    \
    "paddw             "#m5", %%mm6     \n\t" /* x2 */                    \
    "paddw             %%mm6, %%mm6     \n\t" /* 2x2 */                   \
    "psubw             %%mm6, %%mm5     \n\t" /* -2x2 + x3 */             \
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */            \
    "paddw            "#rnd", %%mm4     \n\t" /* 20x1 - x4 + rnd */       \
    "paddw             %%mm4, %%mm5     \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
    "psraw                $5, %%mm5     \n\t"                             \
    "packuswb          %%mm5, %%mm5     \n\t"                             \
    OP(%%mm5, out, %%mm7, d)
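/* QPEL_V_LOW evaluates four output pixels of the MPEG-4 quarter-pel 8-tap
 * FIR (-1, 3, -6, 20, 20, -6, 3, -1): with x1..x4 the sums of symmetric tap
 * pairs, the result is (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, saturated to
 * 8 bits by the final packuswb. */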
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)                \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    uint64_t temp;                                                        \
                                                                          \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7                  \n\t"                         \
        "1:                                 \n\t"                         \
        "movq  (%0), %%mm0                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm1                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm2                  \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */          \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */          \
        "movq %%mm2, %%mm3                  \n\t" /* ABCDEFGH */          \
        "movq %%mm2, %%mm4                  \n\t" /* ABCDEFGH */          \
        "psllq  $8, %%mm2                   \n\t" /* 0ABCDEFG */          \
        "psllq $16, %%mm3                   \n\t" /* 00ABCDEF */          \
        "psllq $24, %%mm4                   \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw %%mm3, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm2, %%mm6                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm6                 \n\t" /* c - 2b */            \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw %%mm4, %%mm0                 \n\t" /* a */                 \
        "paddw %%mm1, %%mm5                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw %%mm5, %%mm0                 \n\t" /* 20a - d */           \
        "paddw %6, %%mm6                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        "movq %%mm0, %5                     \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movq 5(%0), %%mm0                  \n\t" /* FGHIJKLM */          \
        "movq %%mm0, %%mm5                  \n\t" /* FGHIJKLM */          \
        "movq %%mm0, %%mm6                  \n\t" /* FGHIJKLM */          \
        "psrlq  $8, %%mm0                   \n\t" /* GHIJKLM0 */          \
        "psrlq $16, %%mm5                   \n\t" /* HIJKLM00 */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
        "paddw %%mm0, %%mm2                 \n\t" /* b */                 \
        "paddw %%mm5, %%mm3                 \n\t" /* c */                 \
        "paddw %%mm2, %%mm2                 \n\t" /* 2b */                \
        "psubw %%mm2, %%mm3                 \n\t" /* c - 2b */            \
        "movq %%mm6, %%mm2                  \n\t" /* FGHIJKLM */          \
        "psrlq $24, %%mm6                   \n\t" /* IJKLM000 */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "paddw %%mm2, %%mm1                 \n\t" /* a */                 \
        "paddw %%mm6, %%mm4                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "psubw %%mm4, %%mm3                 \n\t" /* - 6b +3c - d */      \
        "paddw %6, %%mm1                    \n\t"                         \
        "paddw %%mm1, %%mm3                 \n\t" /* 20a - 6b +3c - d */  \
        "psraw $5, %%mm3                    \n\t"                         \
        "movq %5, %%mm1                     \n\t"                         \
        "packuswb %%mm3, %%mm1              \n\t"                         \
        OP_MMX2(%%mm1, (%1), %%mm4, q)                                    \
        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
                                                                          \
        "movq 9(%0), %%mm1                  \n\t" /* JKLMNOPQ */          \
        "movq %%mm1, %%mm4                  \n\t" /* JKLMNOPQ */          \
        "movq %%mm1, %%mm3                  \n\t" /* JKLMNOPQ */          \
        "psrlq  $8, %%mm1                   \n\t" /* KLMNOPQ0 */          \
        "psrlq $16, %%mm4                   \n\t" /* LMNOPQ00 */          \
        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
        "paddw %%mm1, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm4, %%mm0                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm0                 \n\t" /* c - 2b */            \
        "movq %%mm3, %%mm5                  \n\t" /* JKLMNOPQ */          \
        "psrlq $24, %%mm3                   \n\t" /* MNOPQ000 */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
        "paddw %%mm3, %%mm2                 \n\t" /* d */                 \
        "psubw %%mm2, %%mm0                 \n\t" /* -6b + 3c - d */      \
        "movq %%mm5, %%mm2                  \n\t" /* JKLMNOPQ */          \
        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
        "paddw %%mm2, %%mm6                 \n\t" /* a */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
        "paddw %6, %%mm0                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
                                                                          \
        "paddw %%mm5, %%mm3                 \n\t" /* a */                 \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0O0P0Q0Q */          \
        "paddw %%mm4, %%mm6                 \n\t" /* b */                 \
        "pshufw $0xBE, %%mm5, %%mm4         \n\t" /* 0P0Q0Q0P */          \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0Q0Q0P0O */          \
        "paddw %%mm1, %%mm4                 \n\t" /* c */                 \
        "paddw %%mm2, %%mm5                 \n\t" /* d */                 \
        "paddw %%mm6, %%mm6                 \n\t" /* 2b */                \
        "psubw %%mm6, %%mm4                 \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm4    \n\t" /* 3c - 6b */           \
        "psubw %%mm5, %%mm3                 \n\t" /* -6b + 3c - d */      \
        "paddw %6, %%mm4                    \n\t"                         \
        "paddw %%mm3, %%mm4                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm4                    \n\t"                         \
        "packuswb %%mm4, %%mm0              \n\t"                         \
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)                                   \
        "add %3, %0                         \n\t"                         \
        "add %4, %1                         \n\t"                         \
        "decl %2                            \n\t"                         \
        "jnz 1b                             \n\t"                         \
        : "+a"(src), "+c"(dst), "+D"(h)                                   \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst,          \
                                                   uint8_t *src,          \
                                                   int dstStride,         \
                                                   int srcStride,         \
                                                   int h)                 \
{                                                                         \
    int i;                                                                \
    int16_t temp[16];                                                     \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[ 0] = (src[ 0] + src[ 1]) * 20 - (src[ 0] + src[ 2]) * 6 +   \
                   (src[ 1] + src[ 3]) *  3 - (src[ 2] + src[ 4]);        \
        temp[ 1] = (src[ 1] + src[ 2]) * 20 - (src[ 0] + src[ 3]) * 6 +   \
                   (src[ 0] + src[ 4]) *  3 - (src[ 1] + src[ 5]);        \
        temp[ 2] = (src[ 2] + src[ 3]) * 20 - (src[ 1] + src[ 4]) * 6 +   \
                   (src[ 0] + src[ 5]) *  3 - (src[ 0] + src[ 6]);        \
        temp[ 3] = (src[ 3] + src[ 4]) * 20 - (src[ 2] + src[ 5]) * 6 +   \
                   (src[ 1] + src[ 6]) *  3 - (src[ 0] + src[ 7]);        \
        temp[ 4] = (src[ 4] + src[ 5]) * 20 - (src[ 3] + src[ 6]) * 6 +   \
                   (src[ 2] + src[ 7]) *  3 - (src[ 1] + src[ 8]);        \
        temp[ 5] = (src[ 5] + src[ 6]) * 20 - (src[ 4] + src[ 7]) * 6 +   \
                   (src[ 3] + src[ 8]) *  3 - (src[ 2] + src[ 9]);        \
        temp[ 6] = (src[ 6] + src[ 7]) * 20 - (src[ 5] + src[ 8]) * 6 +   \
                   (src[ 4] + src[ 9]) *  3 - (src[ 3] + src[10]);        \
        temp[ 7] = (src[ 7] + src[ 8]) * 20 - (src[ 6] + src[ 9]) * 6 +   \
                   (src[ 5] + src[10]) *  3 - (src[ 4] + src[11]);        \
        temp[ 8] = (src[ 8] + src[ 9]) * 20 - (src[ 7] + src[10]) * 6 +   \
                   (src[ 6] + src[11]) *  3 - (src[ 5] + src[12]);        \
        temp[ 9] = (src[ 9] + src[10]) * 20 - (src[ 8] + src[11]) * 6 +   \
                   (src[ 7] + src[12]) *  3 - (src[ 6] + src[13]);        \
        temp[10] = (src[10] + src[11]) * 20 - (src[ 9] + src[12]) * 6 +   \
                   (src[ 8] + src[13]) *  3 - (src[ 7] + src[14]);        \
        temp[11] = (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 +   \
                   (src[ 9] + src[14]) *  3 - (src[ 8] + src[15]);        \
        temp[12] = (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 +   \
                   (src[10] + src[15]) *  3 - (src[ 9] + src[16]);        \
        temp[13] = (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 +   \
                   (src[11] + src[16]) *  3 - (src[10] + src[16]);        \
        temp[14] = (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 +   \
                   (src[12] + src[16]) *  3 - (src[11] + src[15]);        \
        temp[15] = (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 +   \
                   (src[13] + src[15]) *  3 - (src[12] + src[14]);        \
        __asm__ volatile (                                                \
            "movq   (%0), %%mm0         \n\t"                             \
            "movq  8(%0), %%mm1         \n\t"                             \
            "paddw    %2, %%mm0         \n\t"                             \
            "paddw    %2, %%mm1         \n\t"                             \
            "psraw    $5, %%mm0         \n\t"                             \
            "psraw    $5, %%mm1         \n\t"                             \
            "packuswb %%mm1, %%mm0      \n\t"                             \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            "movq 16(%0), %%mm0         \n\t"                             \
            "movq 24(%0), %%mm1         \n\t"                             \
            "paddw    %2, %%mm0         \n\t"                             \
            "paddw    %2, %%mm1         \n\t"                             \
            "psraw    $5, %%mm0         \n\t"                             \
            "psraw    $5, %%mm1         \n\t"                             \
            "packuswb %%mm1, %%mm0      \n\t"                             \
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)                              \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory"                                                    \
            );                                                            \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst,            \
                                                 uint8_t *src,            \
                                                 int dstStride,           \
                                                 int srcStride,           \
                                                 int h)                   \
{                                                                         \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7                  \n\t"                         \
        "1:                                 \n\t"                         \
        "movq  (%0), %%mm0                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm1                  \n\t" /* ABCDEFGH */          \
        "movq %%mm0, %%mm2                  \n\t" /* ABCDEFGH */          \
        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
        "pshufw $0x90, %%mm0, %%mm5         \n\t" /* 0A0A0B0C */          \
        "pshufw $0x41, %%mm0, %%mm6         \n\t" /* 0B0A0A0B */          \
        "movq %%mm2, %%mm3                  \n\t" /* ABCDEFGH */          \
        "movq %%mm2, %%mm4                  \n\t" /* ABCDEFGH */          \
        "psllq  $8, %%mm2                   \n\t" /* 0ABCDEFG */          \
        "psllq $16, %%mm3                   \n\t" /* 00ABCDEF */          \
        "psllq $24, %%mm4                   \n\t" /* 000ABCDE */          \
        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
        "paddw %%mm3, %%mm5                 \n\t" /* b */                 \
        "paddw %%mm2, %%mm6                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm5                 \n\t" /* 2b */                \
        "psubw %%mm5, %%mm6                 \n\t" /* c - 2b */            \
        "pshufw $0x06, %%mm0, %%mm5         \n\t" /* 0C0B0A0A */          \
        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
        "paddw %%mm4, %%mm0                 \n\t" /* a */                 \
        "paddw %%mm1, %%mm5                 \n\t" /* d */                 \
        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
        "psubw %%mm5, %%mm0                 \n\t" /* 20a - d */           \
        "paddw %5, %%mm6                    \n\t"                         \
        "paddw %%mm6, %%mm0                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm0                    \n\t"                         \
        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
                                                                          \
        "movd 5(%0), %%mm5                  \n\t" /* FGHI */              \
        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
        "pshufw $0xF9, %%mm5, %%mm6         \n\t" /* 0G0H0I0I */          \
        "paddw %%mm5, %%mm1                 \n\t" /* a */                 \
        "paddw %%mm6, %%mm2                 \n\t" /* b */                 \
        "pshufw $0xBE, %%mm5, %%mm6         \n\t" /* 0H0I0I0H */          \
        "pshufw $0x6F, %%mm5, %%mm5         \n\t" /* 0I0I0H0G */          \
        "paddw %%mm6, %%mm3                 \n\t" /* c */                 \
        "paddw %%mm5, %%mm4                 \n\t" /* d */                 \
        "paddw %%mm2, %%mm2                 \n\t" /* 2b */                \
        "psubw %%mm2, %%mm3                 \n\t" /* c - 2b */            \
        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
        "psubw %%mm4, %%mm3                 \n\t" /* -6b + 3c - d */      \
        "paddw %5, %%mm1                    \n\t"                         \
        "paddw %%mm1, %%mm3                 \n\t" /* 20a - 6b + 3c - d */ \
        "psraw $5, %%mm3                    \n\t"                         \
        "packuswb %%mm3, %%mm0              \n\t"                         \
        OP_MMX2(%%mm0, (%1), %%mm4, q)                                    \
        "add %3, %0                         \n\t"                         \
        "add %4, %1                         \n\t"                         \
        "decl %2                            \n\t"                         \
        "jnz 1b                             \n\t"                         \
        : "+a"(src), "+c"(dst), "+d"(h)                                   \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst,           \
                                                  uint8_t *src,           \
                                                  int dstStride,          \
                                                  int srcStride,          \
                                                  int h)                  \
{                                                                         \
    int i;                                                                \
    int16_t temp[8];                                                      \
    /* quick HACK, XXX FIXME MUST be optimized */                         \
    for (i = 0; i < h; i++) {                                             \
        temp[0] = (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 +        \
                  (src[1] + src[3]) *  3 - (src[2] + src[4]);             \
        temp[1] = (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 +        \
                  (src[0] + src[4]) *  3 - (src[1] + src[5]);             \
        temp[2] = (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 +        \
                  (src[0] + src[5]) *  3 - (src[0] + src[6]);             \
        temp[3] = (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 +        \
                  (src[1] + src[6]) *  3 - (src[0] + src[7]);             \
        temp[4] = (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 +        \
                  (src[2] + src[7]) *  3 - (src[1] + src[8]);             \
        temp[5] = (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 +        \
                  (src[3] + src[8]) *  3 - (src[2] + src[8]);             \
        temp[6] = (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 +        \
                  (src[4] + src[8]) *  3 - (src[3] + src[7]);             \
        temp[7] = (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 +        \
                  (src[5] + src[7]) *  3 - (src[4] + src[6]);             \
        __asm__ volatile (                                                \
            "movq   (%0), %%mm0         \n\t"                             \
            "movq  8(%0), %%mm1         \n\t"                             \
            "paddw    %2, %%mm0         \n\t"                             \
            "paddw    %2, %%mm1         \n\t"                             \
            "psraw    $5, %%mm0         \n\t"                             \
            "psraw    $5, %%mm1         \n\t"                             \
            "packuswb %%mm1, %%mm0      \n\t"                             \
            OP_3DNOW(%%mm0, (%1), %%mm1, q)                               \
            :: "r"(temp), "r"(dst), "m"(ROUNDER)                          \
            : "memory"                                                    \
            );                                                            \
        dst += dstStride;                                                 \
        src += srcStride;                                                 \
    }                                                                     \
}                                                                         \
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                            \
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,        \
                                                     uint8_t *src,        \
                                                     int dstStride,       \
                                                     int srcStride)       \
{                                                                         \
    uint64_t temp[17 * 4];                                                \
    uint64_t *temp_ptr = temp;                                            \
    int count = 17;                                                       \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7              \n\t"                             \
        "1:                             \n\t"                             \
        "movq  (%0), %%mm0              \n\t"                             \
        "movq  (%0), %%mm1              \n\t"                             \
        "movq 8(%0), %%mm2              \n\t"                             \
        "movq 8(%0), %%mm3              \n\t"                             \
        "punpcklbw %%mm7, %%mm0         \n\t"                             \
        "punpckhbw %%mm7, %%mm1         \n\t"                             \
        "punpcklbw %%mm7, %%mm2         \n\t"                             \
        "punpckhbw %%mm7, %%mm3         \n\t"                             \
        "movq %%mm0,          (%1)      \n\t"                             \
        "movq %%mm1,     17 * 8(%1)     \n\t"                             \
        "movq %%mm2, 2 * 17 * 8(%1)     \n\t"                             \
        "movq %%mm3, 3 * 17 * 8(%1)     \n\t"                             \
        "add $8, %1                     \n\t"                             \
        "add %3, %0                     \n\t"                             \
        "decl %2                        \n\t"                             \
        "jnz 1b                         \n\t"                             \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory"                                                        \
        );                                                                \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 4;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor %%mm7, %%mm7          \n\t" */                           \
        "1:                             \n\t"                             \
        "movq   (%0), %%mm0             \n\t"                             \
        "movq  8(%0), %%mm1             \n\t"                             \
        "movq 16(%0), %%mm2             \n\t"                             \
        "movq 24(%0), %%mm3             \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0),  32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0),  40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0),  48(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0),  56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0),  64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0),  72(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0),  80(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0),  88(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0),  96(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1), OP)   \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
        "add $136, %0                   \n\t"                             \
        "add %6, %1                     \n\t"                             \
        "decl %2                        \n\t"                             \
        "jnz 1b                         \n\t"                             \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 14 * (x86_reg)dstStride)                                \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,         \
                                                    uint8_t *src,         \
                                                    int dstStride,        \
                                                    int srcStride)        \
{                                                                         \
    uint64_t temp[9 * 2];                                                 \
    uint64_t *temp_ptr = temp;                                            \
    int count = 9;                                                        \
                                                                          \
    /* FIXME unroll */                                                    \
    __asm__ volatile (                                                    \
        "pxor %%mm7, %%mm7              \n\t"                             \
        "1:                             \n\t"                             \
        "movq  (%0), %%mm0              \n\t"                             \
        "movq  (%0), %%mm1              \n\t"                             \
        "punpcklbw %%mm7, %%mm0         \n\t"                             \
        "punpckhbw %%mm7, %%mm1         \n\t"                             \
        "movq %%mm0,    (%1)            \n\t"                             \
        "movq %%mm1, 9*8(%1)            \n\t"                             \
        "add $8, %1                     \n\t"                             \
        "add %3, %0                     \n\t"                             \
        "decl %2                        \n\t"                             \
        "jnz 1b                         \n\t"                             \
        : "+r"(src), "+r"(temp_ptr), "+r"(count)                          \
        : "r"((x86_reg)srcStride)                                         \
        : "memory"                                                        \
        );                                                                \
                                                                          \
    temp_ptr = temp;                                                      \
    count    = 2;                                                         \
                                                                          \
    /* FIXME reorder for speed */                                         \
    __asm__ volatile (                                                    \
        /* "pxor %%mm7, %%mm7          \n\t" */                           \
        "1:                             \n\t"                             \
        "movq   (%0), %%mm0             \n\t"                             \
        "movq  8(%0), %%mm1             \n\t"                             \
        "movq 16(%0), %%mm2             \n\t"                             \
        "movq 24(%0), %%mm3             \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
        "add %4, %1                     \n\t"                             \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
        "add $72, %0                    \n\t"                             \
        "add %6, %1                     \n\t"                             \
        "decl %2                        \n\t"                             \
        "jnz 1b                         \n\t"                             \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                          \
        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),           \
          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),                \
          "g"(4 - 6 * (x86_reg)dstStride)                                 \
        : "memory"                                                        \
        );                                                                \
}                                                                         \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,             \
                                            stride, 8);                   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,             \
                                                stride, 8);               \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,              \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);      \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t temp[8];                                                     \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);    \
    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,         \
                                 stride, 8);                              \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH  = ((uint8_t*)half) + 64;                       \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);     \
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);    \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[8 + 9];                                                 \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,            \
                                     stride, 9);                          \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
                                                                          \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,      \
                                         int stride)                      \
{                                                                         \
    uint64_t half[9];                                                     \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,            \
                                                stride, 9);               \
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);       \
}                                                                         \
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,    \
                                           int stride)                    \
{                                                                         \
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                    \
                                             stride, stride, 16);         \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,           \
                                                 stride, 16);             \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                     \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);   \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t temp[32];                                                    \
    uint8_t * const half = (uint8_t*)temp;                                \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,           \
                                                 stride);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,                \
                                  stride, stride, 16);                    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);    \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[16 * 2 + 17 * 2];                                       \
    uint8_t * const halfH  = ((uint8_t*)half) + 256;                      \
    uint8_t * const halfHV = ((uint8_t*)half);                            \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,           \
                                                 16, 16);                 \
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,        \
                                  16, 16);                                \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,              \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,          \
                                      stride, 17);                        \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}                                                                         \
                                                                          \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,     \
                                          int stride)                     \
{                                                                         \
    uint64_t half[17 * 2];                                                \
    uint8_t * const halfH = ((uint8_t*)half);                             \
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,          \
                                                 stride, 17);             \
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);     \
}
#define PUT_OP(a, b, temp, size)                \
    "mov"#size"      "#a", "#b"         \n\t"

#define AVG_3DNOW_OP(a, b, temp, size)          \
    "mov"#size"      "#b", "#temp"      \n\t"   \
    "pavgusb      "#temp", "#a"         \n\t"   \
    "mov"#size"      "#a", "#b"         \n\t"

#define AVG_MMX2_OP(a, b, temp, size)           \
    "mov"#size"      "#b", "#temp"      \n\t"   \
    "pavgb        "#temp", "#a"         \n\t"   \
    "mov"#size"      "#a", "#b"         \n\t"
QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP,      PUT_OP)
QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP,      PUT_OP)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       3dnow)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       3dnow)
QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,       mmx2)
QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMX2_OP,  mmx2)
QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,       mmx2)
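/* The _no_rnd_ instantiations use ff_pw_15 instead of ff_pw_16 as the
 * rounding constant of the (... + rnd) >> 5 step, biasing ties downwards
 * as no-rounding motion compensation requires. */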
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
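/* Each quarter-pel position is approximated here with simple two-tap
 * (half-pel style) averages instead of the full 8-tap filter, trading
 * accuracy for speed. */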
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)                               \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst,  \
                                                                 uint8_t *src,  \
                                                                 int stride)    \
{                                                                               \
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);                   \
}

#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)                         \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst,  \
                                                                 uint8_t *src,  \
                                                                 int stride)    \
{                                                                               \
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE,     \
                                               S1, S2);                         \
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)                                            \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)                                \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)                                \
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)                                   \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =        \
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;                                    \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =        \
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;                               \
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =        \
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;                               \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst,          \
                                                         uint8_t *src,          \
                                                         int stride)            \
{                                                                               \
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE);        \
}                                                                               \
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst,          \
                                                         uint8_t *src,          \
                                                         int stride)            \
{                                                                               \
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride,                  \
                                            stride, SIZE);                      \
}                                                                               \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,          1,       0)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,         -1,       0)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,          stride,  0)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,    -stride,  0)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,          stride,  1)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,          stride, -1)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,    -stride,  1)                     \
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1)                    \

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
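/* RV40 handles the (3,3) quarter-pel position with a plain (1,1) half-pel
 * average, so the generic xy2 helpers can be reused directly below. */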
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_mmx(dst, src, stride, 16);
}

void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_mmx(dst, src, stride, 8);
}

void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_mmx(dst, src, stride, 16);
}

#if HAVE_YASM
typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
                                x86_reg linesize, x86_reg start_y,
                                x86_reg end_y, x86_reg block_h,
                                x86_reg start_x, x86_reg end_x,
                                x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;

static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
                                              x86_reg linesize,
                                              int block_w, int block_h,
                                              int src_x, int src_y,
                                              int w, int h,
                                              emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add = 0;

    if (src_y >= h) {
        src_y_add = h - 1 - src_y;
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
        src_y_add = 1 - block_h - src_y;
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h - src_y);
    end_x   = FFMIN(block_w, w - src_x);
1896 assert(start_x < end_x && block_w > 0);
1897 assert(start_y < end_y && block_h > 0);
1899 // fill in the to-be-copied part plus all above/below
1900 src += (src_y_add + start_y) * linesize + start_x;
1902 core_fn(buf, src, linesize, start_y, end_y,
1903 block_h, start_x, end_x, block_w);
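
/* Scalar picture of what the edge emulation amounts to once the clamping
 * above has run: every pixel of the block is fetched from the nearest valid
 * position inside the w x h picture, i.e. out-of-picture reads become edge
 * replication. A sketch only, not part of the build; it assumes src points
 * at the block's nominal top-left, which may lie outside the picture: */
#if 0
static void emulated_edge_mc_c_sketch(uint8_t *buf, const uint8_t *src,
                                      int linesize, int block_w, int block_h,
                                      int src_x, int src_y, int w, int h)
{
    int x, y;
    for (y = 0; y < block_h; y++) {
        for (x = 0; x < block_w; x++) {
            int cx = av_clip(src_x + x, 0, w - 1) - src_x; // clamp horizontally
            int cy = av_clip(src_y + y, 0, h - 1) - src_y; // clamp vertically
            buf[y * linesize + x] = src[cy * linesize + cx];
        }
    }
}
#endif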

#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             int linesize,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */

typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   int linesize, int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);

static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;
    uint8_t edge_buf[(h + 1) * stride];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if ((unsigned)ix >= width  - w ||
        (unsigned)iy >= height - h) {
        emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1 << shift)
    );

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            __asm__ volatile (
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"

                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;
    }
}

#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif
static void gmc_sse(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
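
/* Per output pixel, the MMX kernel above evaluates the usual bilinear
 * resampling formula with s = 1 << shift (see the instruction comments):
 *
 *   dst = (src[0,0] * (s - dx) * (s - dy) + src[1,0] * dx * (s - dy) +
 *          src[0,1] * (s - dx) * dy + src[1,1] * dx * dy + r) >> (2 * shift)
 *
 * A scalar sketch for a single pixel, illustrative only (the real code
 * additionally saturates the result via packuswb): */
#if 0
static inline int gmc_bilin_sketch(const uint8_t *src, int stride,
                                   int dx, int dy, int shift, int r)
{
    const int s = 1 << shift; // weights of the four taps sum to s * s
    return (src[0]          * (s - dx) * (s - dy) +
            src[1]          *  dx      * (s - dy) +
            src[stride]     * (s - dx) *  dy      +
            src[stride + 1] *  dx      *  dy      + r) >> (2 * shift);
}
#endif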

#define PREFETCH(name, op)                      \
static void name(void *mem, int stride, int h)  \
{                                               \
    const uint8_t *p = mem;                     \
    do {                                        \
        __asm__ volatile (#op" %0" :: "m"(*p)); \
        p += stride;                            \
    } while (--h);                              \
}

PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264_qpel_mmx.c"

void ff_put_h264_chroma_mc8_mmx_rnd  (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx      (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2     (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3    (uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
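
/* All of the chroma MC functions declared above implement the H.264-style
 * eighth-pel bilinear interpolation, with x and y the fractional MV parts
 * in 0..7 and A..D the four neighbouring source pixels:
 *
 *   dst = ((8 - x) * (8 - y) * A + x * (8 - y) * B +
 *          (8 - x) * y * C + x * y * D + 32) >> 6
 *
 * (the avg variants additionally round-average with the existing dst).
 * Scalar sketch, illustrative only: */
#if 0
static void put_h264_chroma_mc_c_sketch(uint8_t *dst, uint8_t *src,
                                        int stride, int h, int size,
                                        int x, int y)
{
    int i, j;
    for (j = 0; j < h; j++) {
        for (i = 0; i < size; i++)
            dst[i] = ((8 - x) * (8 - y) * src[i]              +
                           x  * (8 - y) * src[i + 1]          +
                      (8 - x) *      y  * src[i + stride]     +
                           x  *      y  * src[i + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}
#endif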

/* CAVS-specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC-1-specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src,
                                int stride, int rnd)
{
    avg_pixels8_mmx2(dst, src, stride, 8);
}

/* only used in VP3/5/6 */
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
//    START_TIMER
    MOVQ_BFE(mm6);
    __asm__ volatile (
        "1:                             \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%2), %%mm1 \n\t"
        "movq (%1,%4), %%mm2 \n\t"
        "movq (%2,%4), %%mm3 \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3) \n\t"
        "movq %%mm5, (%3,%4) \n\t"

        "movq (%1,%4,2), %%mm0 \n\t"
        "movq (%2,%4,2), %%mm1 \n\t"
        "movq (%1,%5), %%mm2 \n\t"
        "movq (%2,%5), %%mm3 \n\t"
        "lea (%1,%4,4), %1 \n\t"
        "lea (%2,%4,4), %2 \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
        "movq %%mm4, (%3,%4,2) \n\t"
        "movq %%mm5, (%3,%5) \n\t"
        "lea (%3,%4,4), %3 \n\t"

        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
        :"memory");
//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}

static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_vp_no_rnd_pixels8_l2_mmx(dst,   a,   b,   stride, h);
    put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
}
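
/* PAVGBP_MMX_NO_RND averages without the +1 rounding bias, which is the
 * behaviour VP3/5/6 expect. Per byte the loops above compute, in scalar
 * terms (sketch, not built): */
#if 0
static void put_no_rnd_pixels_l2_c_sketch(uint8_t *dst, const uint8_t *a,
                                          const uint8_t *b, int stride, int h)
{
    int x, y;
    for (y = 0; y < h; y++)
        for (x = 0; x < 8; x++)
            dst[y * stride + x] = (a[y * stride + x] +
                                   b[y * stride + x]) >> 1; // truncating average
}
#endif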

#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmx2)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst   , src[0]   , stride, h);
    put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst   , src[0]   , stride, h);
    avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
#endif

/* XXX: Those functions should be suppressed ASAP when all IDCTs are
 * converted. */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
                                    DCTELEM *block)
{
    ff_mmx_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
                                     DCTELEM *block)
{
    ff_mmxext_idct(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif

static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
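
/* All IDCT wrappers above follow the same pattern: run the transform in
 * place on the coefficient block, then either store the clamped result
 * (put) or sum it into the existing pixels (add). In scalar terms, per
 * pixel:
 *
 *   put: dest[i] = av_clip_uint8(block[i]);
 *   add: dest[i] = av_clip_uint8(dest[i] + block[i]);
 */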

static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile ("pxor %%mm7, %%mm7":);
    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld $31, %%mm2 \n\t" // keep only the sign bit
            "pxor %%mm2, %%mm1 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"
            "pandn %%mm1, %%mm4 \n\t"
            "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    __asm__ volatile ("femms");
}

static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile (
        "movaps %0, %%xmm5 \n\t"
        :: "m"(ff_pdw_80000000[0])
    );
    for (i = 0; i < blocksize; i += 4) {
        __asm__ volatile (
            "movaps %0, %%xmm0 \n\t"
            "movaps %1, %%xmm1 \n\t"
            "xorps %%xmm2, %%xmm2 \n\t"
            "xorps %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps %%xmm2, %%xmm1 \n\t"
            "movaps %%xmm3, %%xmm4 \n\t"
            "andps %%xmm1, %%xmm3 \n\t"
            "andnps %%xmm1, %%xmm4 \n\t"
            "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
            "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
            "movaps %%xmm3, %1 \n\t"
            "movaps %%xmm0, %0 \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
}

#define IF1(x) x
#define IF0(x)
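
/* Scalar reference for the coupling undone above: the sign of the magnitude
 * channel steers how the angle channel is folded back into a left/right
 * pair. A sketch of the generic C path, illustrative only: */
#if 0
static void vorbis_inverse_coupling_c_sketch(float *mag, float *ang,
                                             int blocksize)
{
    int i;
    for (i = 0; i < blocksize; i++) {
        if (mag[i] > 0.0f) {
            if (ang[i] > 0.0f) {
                ang[i] = mag[i] - ang[i];
            } else {
                float temp = ang[i];
                ang[i]  = mag[i];
                mag[i] += temp;
            }
        } else {
            if (ang[i] > 0.0f) {
                ang[i] += mag[i];
            } else {
                float temp = ang[i];
                ang[i]  = mag[i];
                mag[i] -= temp;
            }
        }
    }
}
#endif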

#define MIX5(mono, stereo)                                      \
    __asm__ volatile (                                          \
        "movss 0(%2), %%xmm5 \n"                                \
        "movss 8(%2), %%xmm6 \n"                                \
        "movss 24(%2), %%xmm7 \n"                               \
        "shufps $0, %%xmm5, %%xmm5 \n"                          \
        "shufps $0, %%xmm6, %%xmm6 \n"                          \
        "shufps $0, %%xmm7, %%xmm7 \n"                          \
        "1: \n"                                                 \
        "movaps (%0, %1), %%xmm0 \n"                            \
        "movaps 0x400(%0, %1), %%xmm1 \n"                       \
        "movaps 0x800(%0, %1), %%xmm2 \n"                       \
        "movaps 0xc00(%0, %1), %%xmm3 \n"                       \
        "movaps 0x1000(%0, %1), %%xmm4 \n"                      \
        "mulps %%xmm5, %%xmm0 \n"                               \
        "mulps %%xmm6, %%xmm1 \n"                               \
        "mulps %%xmm5, %%xmm2 \n"                               \
        "mulps %%xmm7, %%xmm3 \n"                               \
        "mulps %%xmm7, %%xmm4 \n"                               \
 stereo("addps %%xmm1, %%xmm0 \n")                              \
        "addps %%xmm1, %%xmm2 \n"                               \
        "addps %%xmm3, %%xmm0 \n"                               \
        "addps %%xmm4, %%xmm2 \n"                               \
   mono("addps %%xmm2, %%xmm0 \n")                              \
        "movaps %%xmm0, (%0, %1) \n"                            \
 stereo("movaps %%xmm2, 0x400(%0, %1) \n")                      \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i)                                              \
        : "r"(samples[0] + len), "r"(matrix)                    \
        : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",      \
                       "%xmm4", "%xmm5", "%xmm6", "%xmm7",)     \
          "memory"                                              \
    );

#define MIX_MISC(stereo)                                        \
    __asm__ volatile (                                          \
        "1: \n"                                                 \
        "movaps (%3, %0), %%xmm0 \n"                            \
 stereo("movaps %%xmm0, %%xmm1 \n")                             \
        "mulps %%xmm4, %%xmm0 \n"                               \
 stereo("mulps %%xmm5, %%xmm1 \n")                              \
        "lea 1024(%3, %0), %1 \n"                               \
        "mov %5, %2 \n"                                         \
        "2: \n"                                                 \
        "movaps (%1), %%xmm2 \n"                                \
 stereo("movaps %%xmm2, %%xmm3 \n")                             \
        "mulps (%4, %2), %%xmm2 \n"                             \
 stereo("mulps 16(%4, %2), %%xmm3 \n")                          \
        "addps %%xmm2, %%xmm0 \n"                               \
 stereo("addps %%xmm3, %%xmm1 \n")                              \
        "add $1024, %1 \n"                                      \
        "add $32, %2 \n"                                        \
        "jl 2b \n"                                              \
        "movaps %%xmm0, (%3, %0) \n"                            \
 stereo("movaps %%xmm1, 1024(%3, %0) \n")                       \
        "add $16, %0 \n"                                        \
        "jl 1b \n"                                              \
        : "+&r"(i), "=&r"(j), "=&r"(k)                          \
        : "r"(samples[0] + len), "r"(matrix_simd + in_ch),      \
          "g"((intptr_t) - 32 * (in_ch - 1))                    \
        : "memory"                                              \
    );

static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2],
                            int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i, j, k;

    i = -len * sizeof(float);
    if (in_ch == 5 && out_ch == 2 &&
        !(matrix_cmp[0][1] | matrix_cmp[2][0] |
          matrix_cmp[3][1] | matrix_cmp[4][0] |
          (matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
          (matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
        MIX5(IF0, IF1);
    } else if (in_ch == 5 && out_ch == 1 &&
               matrix_cmp[0][0] == matrix_cmp[2][0] &&
               matrix_cmp[3][0] == matrix_cmp[4][0]) {
        MIX5(IF1, IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2 * in_ch * sizeof(float);
        __asm__ volatile (
            "1: \n"
            "sub $8, %0 \n"
            "movss (%2, %0), %%xmm4 \n"
            "movss 4(%2, %0), %%xmm5 \n"
            "shufps $0, %%xmm4, %%xmm4 \n"
            "shufps $0, %%xmm5, %%xmm5 \n"
            "movaps %%xmm4, (%1, %0, 4) \n"
            "movaps %%xmm5, 16(%1, %0, 4) \n"
            "jg 1b \n"
            : "+&r"(j)
            : "r"(matrix_simd), "r"(matrix)
            : "memory"
        );
        if (out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}
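
/* What the SSE paths above compute, in scalar terms: each output channel is
 * a per-sample dot product of the input channels with one column of the
 * downmix matrix, written back in place. MIX5 hardcodes the common
 * 5-channel-to-stereo/mono layouts; MIX_MISC handles arbitrary matrices.
 * Sketch only, not part of the build: */
#if 0
static void ac3_downmix_c_sketch(float (*samples)[256], float (*matrix)[2],
                                 int out_ch, int in_ch, int len)
{
    int i, j;
    for (i = 0; i < len; i++) {
        float v0 = 0, v1 = 0;
        for (j = 0; j < in_ch; j++) {
            v0 += samples[j][i] * matrix[j][0];
            v1 += samples[j][i] * matrix[j][1];
        }
        samples[0][i] = v0;     // left (or mono) output
        if (out_ch == 2)
            samples[1][i] = v1; // right output
    }
}
#endif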

#if HAVE_6REGS
static void vector_fmul_window_3dnow2(float *dst, const float *src0,
                                      const float *src1, const float *win,
                                      int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 8;
    __asm__ volatile (
        "1: \n"
        "pswapd (%5, %1), %%mm1 \n"
        "movq (%5, %0), %%mm0 \n"
        "pswapd (%4, %1), %%mm5 \n"
        "movq (%3, %0), %%mm4 \n"
        "movq %%mm0, %%mm2 \n"
        "movq %%mm1, %%mm3 \n"
        "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
        "pfmul %%mm5, %%mm3 \n" // src1[j]       * win[len + j]
        "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
        "pfmul %%mm5, %%mm0 \n" // src1[j]       * win[len + i]
        "pfadd %%mm3, %%mm2 \n"
        "pfsub %%mm0, %%mm1 \n"
        "pswapd %%mm2, %%mm2 \n"
        "movq %%mm1, (%2, %0) \n"
        "movq %%mm2, (%2, %1) \n"
        "sub $8, %1 \n"
        "add $8, %0 \n"
        "jl 1b \n"
        "femms \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}

static void vector_fmul_window_sse(float *dst, const float *src0,
                                   const float *src1, const float *win, int len)
{
    x86_reg i = -len * 4;
    x86_reg j =  len * 4 - 16;
    __asm__ volatile (
        "1: \n"
        "movaps (%5, %1), %%xmm1 \n"
        "movaps (%5, %0), %%xmm0 \n"
        "movaps (%4, %1), %%xmm5 \n"
        "movaps (%3, %0), %%xmm4 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
        "movaps %%xmm0, %%xmm2 \n"
        "movaps %%xmm1, %%xmm3 \n"
        "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
        "mulps %%xmm5, %%xmm3 \n" // src1[j]       * win[len + j]
        "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
        "mulps %%xmm5, %%xmm0 \n" // src1[j]       * win[len + i]
        "addps %%xmm3, %%xmm2 \n"
        "subps %%xmm0, %%xmm1 \n"
        "shufps $0x1b, %%xmm2, %%xmm2 \n"
        "movaps %%xmm1, (%2, %0) \n"
        "movaps %%xmm2, (%2, %1) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        : "+r"(i), "+r"(j)
        : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
    );
}
#endif /* HAVE_6REGS */
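
/* Both windowing kernels above implement the symmetric MDCT window overlap
 * that the per-instruction comments describe: with the pointers biased by
 * len as in the constraints,
 *
 *   dst[i] = src0[i] * win[j] - src1[j] * win[i]
 *   dst[j] = src0[i] * win[i] + src1[j] * win[j]
 *
 * for i = -len .. -1 and j = -i - 1. Scalar sketch, illustrative only: */
#if 0
static void vector_fmul_window_c_sketch(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i], s1 = src1[j];
        float wi = win[i],  wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}
#endif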

static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss %3, %%xmm4 \n\t"
        "movss %4, %%xmm5 \n\t"
        "shufps $0, %%xmm4, %%xmm4 \n\t"
        "shufps $0, %%xmm5, %%xmm5 \n\t"
        "1: \n\t"
        "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1 \n\t"
        "movaps 32(%2, %0), %%xmm2 \n\t"
        "movaps 48(%2, %0), %%xmm3 \n\t"
        "maxps %%xmm4, %%xmm0 \n\t"
        "maxps %%xmm4, %%xmm1 \n\t"
        "maxps %%xmm4, %%xmm2 \n\t"
        "maxps %%xmm4, %%xmm3 \n\t"
        "minps %%xmm5, %%xmm0 \n\t"
        "minps %%xmm5, %%xmm1 \n\t"
        "minps %%xmm5, %%xmm2 \n\t"
        "minps %%xmm5, %%xmm3 \n\t"
        "movaps %%xmm0, (%1, %0) \n\t"
        "movaps %%xmm1, 16(%1, %0) \n\t"
        "movaps %%xmm2, 32(%1, %0) \n\t"
        "movaps %%xmm3, 48(%1, %0) \n\t"
        "sub $64, %0 \n\t"
        "jge 1b \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
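
/* Scalar equivalent of the clipping loop above (the asm processes 64 bytes
 * per iteration, walking backwards from the end of the buffer): */
#if 0
static void vector_clipf_c_sketch(float *dst, const float *src,
                                  float min, float max, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = FFMIN(FFMAX(src[i], min), max);
}
#endif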

void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size,
                             const DCTELEM *block);

void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);
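
/* Semantics of the scalarproduct helpers, in scalar form: the plain variant
 * returns the dot product of v1 and v2; the and_madd variant additionally
 * updates v1 with a scaled copy of v3, as used by lossless audio decoders
 * (e.g. Monkey's Audio). Sketch only, not part of the build: */
#if 0
static int32_t scalarproduct_and_madd_int16_c_sketch(int16_t *v1,
                                                     const int16_t *v2,
                                                     const int16_t *v3,
                                                     int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;  // accumulate dot product
        *v1++ += mul * *v3++;  // adapt v1 in the same pass
    }
    return res;
}
#endif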

void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
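
/* The apply_window_int16 variants multiply 16-bit samples by a Q15 window
 * with rounding. Per sample, roughly (sketch; the real implementations also
 * exploit the window's symmetry and differ only in alignment/CPU details): */
#if 0
static void apply_window_int16_c_sketch(int16_t *output, const int16_t *input,
                                        const int16_t *window,
                                        unsigned int len)
{
    unsigned int i;
    for (i = 0; i < len; i++)
        output[i] = (input[i] * window[i] + (1 << 14)) >> 15; // round Q15 product
}
#endif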

void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top,
                                        const uint8_t *diff, int w,
                                        int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                       int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
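
/* Median prediction as used by HuffYUV: each output byte is the transmitted
 * residual plus the median of (left, top, left + top - topleft). Scalar
 * sketch of the mmx2 function's contract, assuming mid_pred() from
 * libavcodec/mathops.h; illustrative only: */
#if 0
static void add_hfyu_median_prediction_c_sketch(uint8_t *dst,
                                                const uint8_t *top,
                                                const uint8_t *diff, int w,
                                                int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;
    for (i = 0; i < w; i++) {
        l      = mid_pred(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i];
        lt     = top[i];
        dst[i] = l;
    }
    *left     = l;
    *left_top = lt;
}
#endif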

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);

extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                             \
    do {                                                                \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)

#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU;     \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU;      \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU;     \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU;      \
    } while (0)

#define H264_QPEL_FUNCS_10(x, y, CPU)                                                               \
    do {                                                                                            \
    c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU;     \
    c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU;      \
    c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU;     \
    c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU;      \
    } while (0)
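
/* The [IDX][x + 4 * y] layout used by the qpel tables maps the 16
 * quarter-pel positions of a motion vector to function pointers: index 0 is
 * the full-pel copy (mc00), index 2 the horizontal half-pel (mc20), index 8
 * the vertical half-pel (mc02), and so on, with x and y the quarter-pel
 * fractional parts of the MV. */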

static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block  = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges   = draw_edges_mmx;

        SET_HPEL_FUNCS(put,        0, 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(avg,        0, 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
        SET_HPEL_FUNCS(put,        1,  8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, 1,  8, mmx);
        SET_HPEL_FUNCS(avg,        1,  8, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, 1,  8, mmx);
    }

#if ARCH_X86_32 || !HAVE_YASM
    c->gmc = gmc_mmx;
#endif
#if ARCH_X86_32 && HAVE_YASM
    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

    c->add_bytes = add_bytes_mmx;

    c->put_no_rnd_pixels_l2[0] = put_vp_no_rnd_pixels16_l2_mmx;
    c->put_no_rnd_pixels_l2[1] = put_vp_no_rnd_pixels8_l2_mmx;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = h263_h_loop_filter_mmx;
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
    }

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif
}

static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

    c->prefetch = prefetch_mmx2;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    }

    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
        }
    }

    if (CONFIG_VP3_DECODER && HAVE_YASM) {
        c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
        c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
    }

    if (CONFIG_VP3_DECODER && HAVE_YASM)
        c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;

    if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
                               avctx->codec_id == CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
        } else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
#endif
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
    }
    if (bit_depth == 10 && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
        c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
    }

    c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;

    c->scalarproduct_int16          = ff_scalarproduct_int16_mmx2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
    }
#endif
}

static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->prefetch = prefetch_3dnow;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
        }
    }

    if (CONFIG_VP3_DECODER && (avctx->codec_id == CODEC_ID_VP3 ||
                               avctx->codec_id == CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
    }

    if (CONFIG_H264QPEL) {
        SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

        if (!high_bit_depth) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
        }

        SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
        SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
    }

#if HAVE_YASM
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
    }
#endif

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;

#if HAVE_7REGS
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif
}

static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
                                int mm_flags)
{
#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
}

static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block  = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;
        }
    }

    c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
    c->ac3_downmix             = ac3_downmix_sse;
#if HAVE_YASM
    c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
    c->vector_fmul_add     = ff_vector_fmul_add_sse;
#endif

#if HAVE_6REGS
    c->vector_fmul_window = vector_fmul_window_sse;
#endif

    c->vector_clipf = vector_clipf_sse;

#if HAVE_YASM
    c->scalarproduct_float          = ff_scalarproduct_float_sse;
    c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

    if (!high_bit_depth)
        c->emulated_edge_mc = emulated_edge_mc_sse;
    c->gmc = gmc_sse;
#endif
}

static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
    const int bit_depth      = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            if (CONFIG_H264QPEL)
                H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(0, 1, sse2);
        H264_QPEL_FUNCS(0, 2, sse2);
        H264_QPEL_FUNCS(0, 3, sse2);
        H264_QPEL_FUNCS(1, 1, sse2);
        H264_QPEL_FUNCS(1, 2, sse2);
        H264_QPEL_FUNCS(1, 3, sse2);
        H264_QPEL_FUNCS(2, 1, sse2);
        H264_QPEL_FUNCS(2, 2, sse2);
        H264_QPEL_FUNCS(2, 3, sse2);
        H264_QPEL_FUNCS(3, 1, sse2);
        H264_QPEL_FUNCS(3, 2, sse2);
        H264_QPEL_FUNCS(3, 3, sse2);
    }

#if HAVE_YASM
    if (bit_depth == 10) {
        if (CONFIG_H264QPEL) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
        }
        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
        }
    }

    c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
    } else {
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif
}

static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
                               int mm_flags)
{
#if HAVE_SSSE3
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth      = avctx->bits_per_raw_sample;

    if (!high_bit_depth && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS(1, 0, ssse3);
        H264_QPEL_FUNCS(1, 1, ssse3);
        H264_QPEL_FUNCS(1, 2, ssse3);
        H264_QPEL_FUNCS(1, 3, ssse3);
        H264_QPEL_FUNCS(2, 0, ssse3);
        H264_QPEL_FUNCS(2, 1, ssse3);
        H264_QPEL_FUNCS(2, 2, ssse3);
        H264_QPEL_FUNCS(2, 3, ssse3);
        H264_QPEL_FUNCS(3, 0, ssse3);
        H264_QPEL_FUNCS(3, 1, ssse3);
        H264_QPEL_FUNCS(3, 2, ssse3);
        H264_QPEL_FUNCS(3, 3, ssse3);
    }
#if HAVE_YASM
    else if (bit_depth == 10 && CONFIG_H264QPEL) {
        H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
        H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
    }
    if (!high_bit_depth && CONFIG_H264CHROMA) {
        c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
        c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
        c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
        c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
    }
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
    else
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif
#endif
}

static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                              int mm_flags)
{
#if HAVE_YASM
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
}

static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
{
#if HAVE_AVX && HAVE_YASM
    const int bit_depth = avctx->bits_per_raw_sample;

    if (bit_depth == 10) {
        // AVX implies !cache64.
        // TODO: Port cache(32|64) detection from x264.
        if (CONFIG_H264QPEL) {
            H264_QPEL_FUNCS_10(1, 0, sse2);
            H264_QPEL_FUNCS_10(2, 0, sse2);
            H264_QPEL_FUNCS_10(3, 0, sse2);
        }

        if (CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
            c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
        }
    }
    c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    c->vector_fmul_reverse          = ff_vector_fmul_reverse_avx;
    c->vector_fmul_add              = ff_vector_fmul_add_avx;
#endif
}

void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & AV_CPU_FLAG_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & AV_CPU_FLAG_MMX2)
        av_log(avctx, AV_LOG_INFO, " mmx2");
    if (mm_flags & AV_CPU_FLAG_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & AV_CPU_FLAG_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & AV_CPU_FLAG_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int idct_algo = avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
                c->idct_put              = ff_simple_idct_put_mmx;
                c->idct_add              = ff_simple_idct_add_mmx;
                c->idct                  = ff_simple_idct_mmx;
                c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
                if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_libmpeg2mmx2_idct_put;
                    c->idct_add = ff_libmpeg2mmx2_idct_add;
                    c->idct     = ff_mmxext_idct;
                } else {
                    c->idct_put = ff_libmpeg2mmx_idct_put;
                    c->idct_add = ff_libmpeg2mmx_idct_add;
                    c->idct     = ff_mmx_idct;
                }
                c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
#endif
            } else if ((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER ||
                        CONFIG_VP6_DECODER) &&
                       idct_algo == FF_IDCT_VP3 && HAVE_YASM) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_vp3_idct_put_sse2;
                    c->idct_add              = ff_vp3_idct_add_sse2;
                    c->idct                  = ff_vp3_idct_sse2;
                    c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
                } else {
                    c->idct_put              = ff_vp3_idct_put_mmx;
                    c->idct_add              = ff_vp3_idct_add_mmx;
                    c->idct                  = ff_vp3_idct_mmx;
                    c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
                }
            } else if (idct_algo == FF_IDCT_CAVS) {
                c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
            } else if (idct_algo == FF_IDCT_XVIDMMX) {
                if (mm_flags & AV_CPU_FLAG_SSE2) {
                    c->idct_put              = ff_idct_xvid_sse2_put;
                    c->idct_add              = ff_idct_xvid_sse2_add;
                    c->idct                  = ff_idct_xvid_sse2;
                    c->idct_permutation_type = FF_SSE2_IDCT_PERM;
                } else if (mm_flags & AV_CPU_FLAG_MMX2) {
                    c->idct_put = ff_idct_xvid_mmx2_put;
                    c->idct_add = ff_idct_xvid_mmx2_add;
                    c->idct     = ff_idct_xvid_mmx2;
                } else {
                    c->idct_put = ff_idct_xvid_mmx_put;
                    c->idct_add = ff_idct_xvid_mmx_add;
                    c->idct     = ff_idct_xvid_mmx;
                }
            }
        }

        dsputil_init_mmx(c, avctx, mm_flags);
    }

    if (mm_flags & AV_CPU_FLAG_MMX2)
        dsputil_init_mmx2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT)
        dsputil_init_3dnow2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_AVX)
        dsputil_init_avx(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}